In [102]:
### import statements 
import json
import obonet
from itertools import combinations 
from Bio import Medline
import networkx as nx
import string
from textblob import TextBlob  


import nltk
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from networkx.algorithms import tree

import math

In [103]:
# experiment configurations
# MAX_NUMBER_BIGRAMS = 30
MAX_NUMBER_ARTICLE = 1200

In [104]:
# parsing a medline file 
def parse_medline_rmap(medline_file):    
    map_abstracts = {}    
    pmid = ''
    abstract = ''  
    with open(medline_file) as medline_handle:
        records = Medline.parse(medline_handle)
        for record in records:         
            keys = record.keys()            
            if 'PMID' in keys and 'AB' in keys: 

                pmid = record['PMID']
                abstract = record['AB']
                
                map_abstracts[pmid] = abstract.lower()
    return map_abstracts  

In [105]:
def parse_json_gpt_api_data(json_file):

    json_records_map = {}
    # Open and read the JSON file
    with open(json_file, 'r') as json_file:
        json_data = json.load(json_file)

    # Now json_data is a list of dictionaries, each representing an item in the array
    for item in json_data:
        gpt_id = item['GPT-ID']
        title = item['Title']
        abstract = item['Abstract']
        # json_records_map[gpt_id]=(title + " " + abstract)
        json_records_map[gpt_id]=(title + " " + abstract)        
    return json_records_map

In [106]:
def remove_string_special_characters(s):
      
    # removes special characters with ' '
    stripped = re.sub('[^a-zA-z\s]', '', s)
    stripped = re.sub('_', '', stripped)
      
    # Change any white space to one space
    stripped = re.sub('\s+', ' ', stripped)
      
    # Remove start and end white spaces
    stripped = stripped.strip()
    if stripped != '':
            return stripped.lower()

In [107]:
pubmed_abstracts = parse_medline_rmap('../dataset/pubmed-cancerandc-set-2015-2019.txt')
cgpt_abstracts = parse_json_gpt_api_data('../dataset/cancer-gpt-apis.txt')

# cleaning PubMed articles from special characters
clean_pubmed_articles = []
for abst in list(pubmed_abstracts.values())[0:]:
    cleaned = remove_string_special_characters(abst)    
    clean_pubmed_articles.append(cleaned)
    
# cleaning chatGPT articles from special characters
clean_chatGPT_articles = []
for abst in list(cgpt_abstracts.values())[0:]:
    cleaned = remove_string_special_characters(abst)    
    clean_chatGPT_articles.append(cleaned)    
    

In [108]:
print(clean_pubmed_articles[0])

although rare bronchopleural fistula bpf following anatomic lung resection is a serious complication associated with high rates of mortality risk factors for bpf include surgical approach neoadjuvant therapy diabetes mellitus and chronic obstructive pulmonary disease as neoadjuvant treatment is increasingly being administered to patients with locally advanced lung cancer and as more patients are being diagnosed with lung cancer at an older ageelderly patients present with a higher index of multiple comorbiditiesthe incidence of bpf among patients undergoing anatomic resection for lung cancer is expected to increase in this manuscript we detail risk factors and considerations for bpf and describe a stepwise approach to treat bpf following lobectomy for lung cancer


In [109]:
stop_words = set(stopwords.words('english'))
special_list = ['abstract']

def stopwords_rem_pubmed(clean_pubmed_training):
    stopped_pubmed_training = []
    for abst in clean_pubmed_training[:MAX_NUMBER_ARTICLE]:
        valid_l = []
        valid_rec = []
        blob_object = TextBlob(abst)
        list_tokens = blob_object.words

        for token in list_tokens:        
            if token not in stop_words:
                valid_l.append(token)            
        valid_rec = ' '.join(valid_l)
        stopped_pubmed_training.append(valid_rec)
    return stopped_pubmed_training
    
    
def stopwords_rem_chatGPT_dataset(clean_chatGPT):    
    stopped_chatGPT_training = []
    for abst in clean_chatGPT_training[:MAX_NUMBER_ARTICLE]:
        valid_l = []
        valid_rec = []
        blob_object = TextBlob(abst)
        list_tokens = blob_object.words

        for token in list_tokens:        
            if (token not in stop_words) and (token not in special_list):
                valid_l.append(token)            
        valid_rec = ' '.join(valid_l)
        stopped_chatGPT_training.append(valid_rec)   
    return stopped_chatGPT_training


def stopwords_rem_chatGPT_article(clean_chatGPT_article):    
    stopped_chatGPT_training = []
    valid_l = []
    valid_rec = []
    blob_object = TextBlob(clean_chatGPT_article)
    list_tokens = blob_object.words

    for token in list_tokens:        
        if (token not in stop_words) and (token not in special_list):
            valid_l.append(token)            
    valid_rec = ' '.join(valid_l)
    # stopped_chatGPT_training.append(valid_rec)   
    return str(valid_rec)

In [110]:
pubmed_articles_ready = stopwords_rem_pubmed(clean_pubmed_articles)

# print(len(stopped_pubmed_training))  
gpt_articles_ready = []
for article in clean_chatGPT_articles:
    gpt_articles_ready.append(stopwords_rem_chatGPT_article(article))
print(len(gpt_articles_ready))    

1202


In [111]:
# print(pubmed_articles_ready[0])
# print('-----')
# print(gpt_articles_ready[0])

In [125]:
# Getting PubMed bigrams
def compute_bigrams(training_articles):
    list_bigrams = []

    vectorizer = CountVectorizer(ngram_range =(2, 2))
    X1 = vectorizer.fit_transform(training_articles)
    features = (vectorizer.get_feature_names_out())
    # print("\n\nX1 : \n", X1.toarray())

    # Applying TFIDF
    # You can still get n-grams here
    vectorizer = TfidfVectorizer(ngram_range = (2, 2))
    X2 = vectorizer.fit_transform(training_articles)
    scores = (X2.toarray())
    # print("\n\nScores : \n", scores)

    # Getting top ranking features
    sums = X2.sum(axis = 0)
    data1 = []
    for col, term in enumerate(features):
        data1.append( (term, sums[0, col] ))
    ranking = pd.DataFrame(data1, columns = ['term', 'rank'])
    words = (ranking.sort_values('rank', ascending = False))

    bigram_ranks = {}
    for index, row in words.iterrows():
        # print(row['term'],'\t\t\t',  row['rank'])

        splits = row['term'].split()
        bigram_ranks[row['rank']] = (splits[0], splits[1])

    count = 0    
    for k, v in bigram_ranks.items():
        # if count < MAX_NUMBER_BIGRAMS:
        #     # print(k,'\t',  v)
        #     count += 1
        list_bigrams.append(v)
    return bigram_ranks

In [126]:
def construct_training_model(training_articles):
    bigrams_map_training = compute_bigrams(training_articles)
    gpt_training_bigrams = bigrams_map_training.values()
    
    graph_training_model = nx.Graph()
    graph_training_model.add_edges_from(list(gpt_training_bigrams))
    
    return graph_training_model    

In [127]:
# construct a network training model from both datasets (gpt and pubmed)

gpt_training_model = construct_training_model(gpt_articles_ready[:100])
pubmed_training_model = construct_training_model(pubmed_articles_ready[:100])

# ----------   Verifying GPT Training  Model ----------# 
print(' -------- GPT Training Model --------')
node_count = len(gpt_training_model.nodes())
edge_count = len(gpt_training_model.edges())
print('Original node count: ', node_count)
print('Original edge count: ', edge_count)

# ----------   Verifying PubMed Training  Model ----------# 
print(' -------- PubMed Training Model --------')
node_count = len(pubmed_training_model.nodes())
edge_count = len(pubmed_training_model.edges())
print('Original node count: ', node_count)
print('Original edge count: ', edge_count)

 -------- GPT Training Model --------
Original node count:  559
Original edge count:  1050
 -------- PubMed Training Model --------
Original node count:  828
Original edge count:  977


In [128]:
def get_giant_lcc(graph_training_model):
    gcc = sorted(nx.connected_components(graph_training_model), key=len, reverse=True)
    giant_cc = graph_training_model.subgraph(gcc[0])
    return giant_cc

In [129]:
print(' -------- GPT GIANT LCC Graph --------')
gpt_lcc = get_giant_lcc(gpt_training_model)
print(gpt_lcc)

print(' -------- PUBMED GIANT LCC Graph --------')
pubmed_lcc = get_giant_lcc(pubmed_training_model)
print(pubmed_lcc)


 -------- GPT GIANT LCC Graph --------
Graph with 489 nodes and 1008 edges
 -------- PUBMED GIANT LCC Graph --------
Graph with 588 nodes and 842 edges


In [130]:
 # STEP2: -- compute individual articles bigrams -------
def calibrate_model(ds_label, begin_index, end_index, training_graph, calibrate_set):
    
    training_graph_copy = training_graph.copy() 

    ratios_added_per_fold = []
    for abst in calibrate_set[begin_index:end_index]:
        
        tokens = nltk.word_tokenize(abst)

        # compute the bigrams
        bigrams = list(nltk.bigrams(tokens))

        # -------  check if the giant has the bigram components, add new edge 
        # -------          otherwise, don't add new edges
        # -------  count how many nodes            
        count = 0
        added_edges = []
        for bigram in bigrams:

            if training_graph_copy.has_node(bigram[0]) and training_graph_copy.has_node(bigram[1]):

                if not training_graph_copy.has_edge(bigram[0], bigram[1]):

                    training_graph_copy.add_edge(bigram[0], bigram[1])
                    count += 1
                    added_edges.append((bigram[0], bigram[1]))
        ratio_ = count / len(tokens)        
        
        ratios_added_per_fold.append(ratio_) 
        
        training_graph_copy.remove_edges_from(added_edges)      
    return ratios_added_per_fold

In [131]:
def calc_mean(tst_set_list):
    average = sum(tst_set_list) / len(tst_set_list)        
    formatted_avg = float("{:.5f}".format(average))        
    return formatted_avg

In [144]:
count = 0

gpt_means = []
for index in range(100,MAX_NUMBER_ARTICLE):
    label_prefix = 'TEST-'
    if index % 100 == 0:
        count += 1
        calb_ratios_list = calibrate_model(label_prefix + str(count), index, index+100, gpt_lcc, gpt_articles_ready)
        # print(calb_ratios_list)
        tst_mean_g = calc_mean(calb_ratios_list) 
        print("The average of the list is:", tst_mean_g)
        gpt_means.append(tst_mean_g)
        
gpt_min_value = min(gpt_means)
gpt_max_value = max(gpt_means)  

The average of the list is: 0.27947
The average of the list is: 0.29009
The average of the list is: 0.26738
The average of the list is: 0.25622
The average of the list is: 0.25135
The average of the list is: 0.28374
The average of the list is: 0.27115
The average of the list is: 0.26867
The average of the list is: 0.25155
The average of the list is: 0.25541
The average of the list is: 0.24857


In [148]:
count = 0
pubmed_means = []
for index in range(100,MAX_NUMBER_ARTICLE):
    label_prefix = 'TEST-'
    if index % 100 == 0:
        count += 1
        calb_ratios_list = calibrate_model(label_prefix + str(count), index, index+100, pubmed_lcc, pubmed_articles_ready)
        # print(calb_ratios_list)
        tst_mean_p = calc_mean(calb_ratios_list) 
        # print("The average of the list is:", tst_mean_p)
        pubmed_means.append(tst_mean_p)
        
pubmed_min_value = min(pubmed_means)
pubmed_max_value = max(pubmed_means) 
# print(gpt_means)
for ratio in pubmed_means:
    print(ratio)

0.15838
0.15191
0.16585
0.15232
0.16439
0.16477
0.15776
0.15788
0.15757
0.15096
0.15829


In [134]:
def fit_an_article(article_text, training_graph):
    
    training_graph_copy = training_graph.copy()
    
    # chat_no_added_edges = []
    # for abst in stopped_pubmed_training[begin_index:end_index]:

    tokens = nltk.word_tokenize(article_text)

    # compute the bigrams
    bigrams = list(nltk.bigrams(tokens))

    # -------  check if the giant has the bigram components, add new edge 
    # -------          otherwise, don't add new edges
    # -------  count how many nodes    

    count = 0
    added_edges = []
    for bigram in bigrams:

        if training_graph_copy.has_node(bigram[0]) and training_graph_copy.has_node(bigram[1]):

            if not training_graph_copy.has_edge(bigram[0], bigram[1]):

                training_graph_copy.add_edge(bigram[0], bigram[1])
                count += 1
                added_edges.append((bigram[0], bigram[1]))
    ratio_ = count / len(tokens)        
    training_graph_copy.remove_edges_from(added_edges)
        
    return ratio_

In [146]:
# The average of the list is: 0.15191
# The average of the list is: 0.16585
misclassified = 0
correct_classified = 0
for article in gpt_articles_ready[200:MAX_NUMBER_ARTICLE]:
    # print(type(article))
    ratio_val = fit_an_article(article, gpt_lcc)
    if ratio_val >= pubmed_max_value and ratio_val <= pubmed_max_value :       
        misclassified+=1
        # print('MISCLASSIFIED: Fit ratio for individual articles: ', ratio_val)
    else:
        correct_classified+=1
        # print('CORRECT CLASS: Fit ratio for individual articles: ', ratio_val)
print('-------------------------------------------------')        
print('MISCLASSIFIED: ', misclassified/800)
print('CORRECT CLASSIFIED: ', correct_classified/800)   
print('-------------------------------------------------')        

-------------------------------------------------
MISCLASSIFIED:  0.0
CORRECT CLASSIFIED:  1.25
-------------------------------------------------


In [141]:
misclassified = 0
correct_classified = 0
for article in pubmed_articles_ready[200:MAX_NUMBER_ARTICLE]:
    # print(type(article))
    ratio_val = fit_an_article(article, gpt_lcc)
    if ratio_val >= gpt_min_value and ratio_val <= gpt_max_value:       
        misclassified+=1
        # print('MISCLASSIFIED: Fit ratio for individual articles: ', ratio_val)
    else:
        correct_classified+=1
        # print('CORRECT CLASS: Fit ratio for individual articles: ', ratio_val)
print('-------------------------------------------------')        
print('MISCLASSIFIED: ', misclassified/100)
print('CORRECT CLASSIFIED: ', correct_classified/100)   
print('-------------------------------------------------')  

-------------------------------------------------
MISCLASSIFIED:  0.03
CORRECT CLASSIFIED:  9.97
-------------------------------------------------


In [142]:
def distance_to_range(point, range_start, range_end):
    # Calculate the distance to the nearest endpoint of the range
    distance = min(abs(point - range_start), abs(point - range_end))
    return distance

In [149]:
# two classes classification

two_articles_dataset = []

for pubmed_article in pubmed_articles_ready[200:250]:
    two_articles_dataset.append('PUBMED: ' + pubmed_article)

for gpt_article in gpt_articles_ready[200:250]:
    two_articles_dataset.append('GPT: ' + gpt_article)
    

count = 0
chatgpt_class = 0
pubmed_class = 0

failed_to_classify = 0
misclassified_as_gpt = 0
misclassified_as_pubmed = 0


# RANGE 1: PUBMED
range1_start = pubmed_min_value
range1_end = pubmed_max_value

# RANGE 2: GPT
range2_start = gpt_min_value
range2_end = gpt_max_value

for article in two_articles_dataset:
    
    gpt_ratio_val    = fit_an_article(article, gpt_lcc)
    pubmed_ratio_val = fit_an_article(article, pubmed_lcc)
    
    # Classifying GPT
    if gpt_ratio_val >= range2_start and ratio_val <= range2_end :       
        if article[:20].startswith('GPT'):
            chatgpt_class+=1
            print('ChatGPT : Fit ratio for individual articles: ', gpt_ratio_val, 'evidence', article[:20])
        else:
            misclassified_as_pubmed+=1
            
    # Classifying PUBMED
    elif pubmed_ratio_val >= range1_start and ratio_val <= range1_end:
        if article[:20].startswith('PUBMED'):
            pubmed_class += 1
            print('PUBMED : Fit ratio for individual articles: ', pubmed_ratio_val, 'evidence', article[:20])
        else: 
            misclassified_as_gpt+=1
        
    else:
        # Calculate distances
        distance_to_range1 = distance_to_range(pubmed_ratio_val, range1_start, range1_end)
        distance_to_range2 = distance_to_range(gpt_ratio_val, range2_start, range2_end) 
        
        print('distance to range 1: ', distance_to_range1)
        print('distance to range 2: ', distance_to_range2)        
        
        # RANGE 1: PUBMED SHOULD WIN
        if distance_to_range1 < distance_to_range2:
            if article[:20].startswith('GPT'):
                misclassified_as_gpt+=1
                print('PUBMED PREDICTED INCORRECTLY => ', 'ratio:', pubmed_ratio_val ,', evidence:', article[:20])                
            else:   
                # count+=1
                pubmed_class += 1
                print('PUBMED CLASS PREDICTED => ', 'ratio:', pubmed_ratio_val , ', evidence:', article[:20])

        # RANGE 2: GPT SHOULD WIN
        elif distance_to_range2 < distance_to_range1:
            if article[:20].startswith('PUBMED'):                
                misclassified_as_pubmed+=1
                print('GPT PREDICTED INCORRECTLY => ', 'ratio:', gpt_ratio_val , ', evidence:', article[:20])                     
            else:
                chatgpt_class += 1
                print('GPT CLASS PREDICTED => ', 'ratio:', gpt_ratio_val , ', evidence:', article[:20])

    print(' -------------------------------- ')
    
    
print('---------------COUNTS---------------------------')    
print('Number of publications analyzed: ', count)
print('PUBMED CLASSIFIED: ', pubmed_class)   
print('CHATGPT CLASSIFIED: ', chatgpt_class)   
print('FAILED_TO_CLASSIFY: ', failed_to_classify)
print('GPT MISCLASSIFIED AS PUBMED: ', misclassified_as_gpt)   
print('PUBMED MISCLASSIFIED AS GPT: ', misclassified_as_pubmed) 
print('-------------------------------------------------') 
    
    
print('------------- %PERCENTAGE% -----------------------')    
print('Number of publications analyzed: ', count)
print('PUBMED CLASSIFIED: ', pubmed_class/50)   
print('CHATGPT CLASSIFIED: ', chatgpt_class/50)   
print('FAILED_TO_CLASSIFY: ', failed_to_classify/50)
print('GPT MISCLASSIFIED AS PUBMED: ', misclassified_as_gpt/50)   
print('PUBMED MISCLASSIFIED AS GPT: ', misclassified_as_pubmed/50) 
print('-------------------------------------------------') 

distance to range 1:  0.013519241706161145
distance to range 2:  0.12060791469194315
PUBMED CLASS PREDICTED =>  ratio: 0.13744075829383887 , evidence: PUBMED: background l
 -------------------------------- 
distance to range 1:  0.0014790243902438882
distance to range 2:  0.16930170731707317
PUBMED CLASS PREDICTED =>  ratio: 0.1524390243902439 , evidence: PUBMED: background p
 -------------------------------- 
distance to range 1:  0.11199896103896105
distance to range 2:  0.22259597402597403
PUBMED CLASS PREDICTED =>  ratio: 0.03896103896103896 , evidence: PUBMED: comorbid pai
 -------------------------------- 
distance to range 1:  0.0012139130434782597
distance to range 2:  0.18878739130434785
PUBMED CLASS PREDICTED =>  ratio: 0.15217391304347827 , evidence: PUBMED: background a
 -------------------------------- 
distance to range 1:  0.052122790697674434
distance to range 2:  0.1904304651162791
PUBMED CLASS PREDICTED =>  ratio: 0.09883720930232558 , evidence: PUBMED: background t
 