In [19]:
import pandas as pd
from xml.etree import ElementTree as ET
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

content_dir = '/Users/fernando/Documents/Research/LLM_SR_medicine/reviews_data/PICOS/'
file_path = '/Users/fernando/Documents/Research/LLM_SR_medicine/reviews_data/PICOS/All_incl_articles_20230630.xml'
tree = ET.parse(file_path)
root = tree.getroot()

In [20]:

def get_all_text(element, default=None):
    """Recursively get all text in the element and its descendants."""
    if element is not None:
        return ''.join(element.itertext())
    return default

def parse_record(record):
    """Parse a single record and return a dictionary."""
    row = {}
    for child in record.iter():
        tag = child.tag
        text = get_all_text(child)
        row[tag] = text
    return row

data = [parse_record(record) for record in root.findall('./records/record')]
df = pd.DataFrame(data)
df

Unnamed: 0,record,database,source-app,rec-number,foreign-keys,key,ref-type,contributors,titles,title,...,remote-database-name,authors,author,related-urls,url,custom6,orig-pub,publisher,secondary-authors,translated-title
0,All_incl_articles_20230630.enlEndNote1117Clini...,All_incl_articles_20230630.enl,EndNote,1,1,1,17,,Clinical practice guidelines for the diagnosis...,Clinical practice guidelines for the diagnosis...,...,,,,,,,,,,
1,All_incl_articles_20230630.enlEndNote2217Osteo...,All_incl_articles_20230630.enl,EndNote,2,2,2,17,,"Osteoporosis prevention, diagnosis, and therap...","Osteoporosis prevention, diagnosis, and therapy",...,,,,,,,,,,
2,All_incl_articles_20230630.enlEndNote3317Obesi...,All_incl_articles_20230630.enl,EndNote,3,3,3,17,,Obesity: preventing and managing the global ep...,Obesity: preventing and managing the global ep...,...,,,,,,,,,,
3,All_incl_articles_20230630.enlEndNote4417Unite...,All_incl_articles_20230630.enl,EndNote,4,4,4,17,,United Kingdom back pain exercise and manipula...,United Kingdom back pain exercise and manipula...,...,,,,,,,,,,
4,All_incl_articles_20230630.enlEndNote5517Physi...,All_incl_articles_20230630.enl,EndNote,5,5,5,17,,Physiotherapy rehabilitation after total knee ...,Physiotherapy rehabilitation after total knee ...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,All_incl_articles_20230630.enlEndNote404640461...,All_incl_articles_20230630.enl,EndNote,4046,4046,4046,17,"Zongo, N.Korsaga Somé, N.Ouédraogo, S.Ouédraog...","[Cancer of the vulva: Diagnostic stages, treat...","[Cancer of the vulva: Diagnostic stages, treat...",...,,"Zongo, N.Korsaga Somé, N.Ouédraogo, S.Ouédraog...","Dem, A.",,,,"Cancer de la vulve : stades diagnostiques, tra...",,,
4658,All_incl_articles_20230630.enlEndNote404740471...,All_incl_articles_20230630.enl,EndNote,4047,4047,4047,17,"Zorowitz, R. D.Smout, R. J.Gassaway, J. A.Horn...",Usage of pain medications during stroke rehabi...,Usage of pain medications during stroke rehabi...,...,,"Zorowitz, R. D.Smout, R. J.Gassaway, J. A.Horn...","Horn, S. D.",,,,,,,
4659,All_incl_articles_20230630.enlEndNote404840481...,All_incl_articles_20230630.enl,EndNote,4048,4048,4048,17,"Zucchi, B.Mangone, M.Agostini, F.Paoloni, M.Pe...",Movement Analysis with Inertial Measurement Un...,Movement Analysis with Inertial Measurement Un...,...,,"Zucchi, B.Mangone, M.Agostini, F.Paoloni, M.Pe...","Villani, C.",,,,,,,
4660,All_incl_articles_20230630.enlEndNote404940491...,All_incl_articles_20230630.enl,EndNote,4049,4049,4049,17,"Zügner, R.Tranberg, R.Lisovskaja, V.Kärrholm, J.",Different reliability of instrumented gait ana...,Different reliability of instrumented gait ana...,...,,"Zügner, R.Tranberg, R.Lisovskaja, V.Kärrholm, J.","Kärrholm, J.",,,,,,,


In [21]:

# We'll use TF-IDF to vectorize the lines and DataFrame text, then use cosine similarity to find the best matches.
df['concatenated_fields'] = df[[ 'contributors', 'titles', 'title','secondary-title', 'short-title','pages', 'volume', 'number',
       'edition', 'keywords', 'keyword', 'dates', 'year', 'pub-dates', 'date',
       'isbn', 'accession-num', 'authors',
       'author',]].fillna('').astype(str).apply(lambda x: ' '.join(x), axis=1)

pattern = r'(enlEndNote\d+[A-Za-z]+)'
print(len(df))
# Finding duplicated rows
df.drop_duplicates()# Showing duplicated rows
print(len(df))
df['uniqueid'] = df['record'].str.extract(pattern)
print(len(df['uniqueid'].unique()))
duplicates = df[df.duplicated(subset='uniqueid')]
df=df.drop_duplicates(subset='uniqueid', keep='first')

pattern = r'(enlEndNote\d+)'
duplicates['uniqueid'] = duplicates['record'].str.extract(pattern)
print(len(duplicates['uniqueid'].unique()))
df=pd.concat([df,duplicates])
print(len(df['uniqueid'].unique()))

df = df.drop_duplicates(subset='uniqueid', keep='first')
print(len(df['uniqueid'].unique()))

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', stop_words='english')


# Re-load the corrected text file since the variable was lost
with open(f'{content_dir}screening2_info/corrected_excluded_screening2.txt', 'r') as file:
    corrected_lines = file.readlines()

# Strip whitespace for a clean comparison
corrected_lines = [line.strip() for line in corrected_lines]


# We'll use TF-IDF to vectorize the lines and DataFrame text, then use cosine similarity to find the best matches.

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', stop_words='english')

# Vectorize the concatenated fields of the DataFrame
tfidf_matrix_df = vectorizer.fit_transform(df['concatenated_fields'])

# Vectorize the lines from the text file
tfidf_matrix_lines = vectorizer.transform(corrected_lines)

# Calculate the cosine similarity between each line and each row in the DataFrame
cosine_similarities = cosine_similarity(tfidf_matrix_lines, tfidf_matrix_df)

# For each line, find the index of the row with the highest cosine similarity
best_match_indices = np.argmax(cosine_similarities, axis=1)

# Get the similarity scores for the best matches
best_match_scores = cosine_similarities[np.arange(cosine_similarities.shape[0]), best_match_indices]

# Pair each line with its best matching row index and similarity score
matching_results = list(zip(best_match_indices, best_match_scores))



# Create a dataframe to hold the results for better visualization
matches_df = pd.DataFrame({
    'LineIndex': np.arange(len(corrected_lines)),
    'BestMatchRowIndex': best_match_indices,
    'SimilarityScore': best_match_scores
})

# Add the corresponding line and the matching text from the dataframe to the results dataframe
matches_df['Line'] = [corrected_lines[i] for i in matches_df['LineIndex']]
matches_df['BestMatchText'] = [df.iloc[i]['concatenated_fields'] for i in matches_df['BestMatchRowIndex']]

matches_df.loc[matches_df['LineIndex'] == 64, 'BestMatchRowIndex'] = 252
matches_df.loc[matches_df['LineIndex'] == 16, 'BestMatchRowIndex'] = 1414
matches_df[matches_df.BestMatchRowIndex.duplicated(keep=False)]

df['screening1'] = False
df['screening2'] = False

df['screening1'] = df.index.isin(matches_df['BestMatchRowIndex'])
df.screening1.value_counts()


with open(f'{content_dir}screening1_info/75_included.txt', 'r') as file:
    included75 = file.read().split('\n')

with open(f'{content_dir}screening1_info/citations_included.txt', 'r') as file:
    citations_review = file.read().split('\n')

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', stop_words='english')
# Strip whitespace for a clean comparison
corrected_lines = [line.strip() for line in citations_review]
# We'll use TF-IDF to vectorize the lines and DataFrame text, then use cosine similarity to find the best matches.

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', stop_words='english')

# Vectorize the concatenated fields of the DataFrame
tfidf_matrix_df = vectorizer.fit_transform(df['concatenated_fields'])

# Vectorize the lines from the text file
tfidf_matrix_lines = vectorizer.transform(citations_review)

# Calculate the cosine similarity between each line and each row in the DataFrame
cosine_similarities = cosine_similarity(tfidf_matrix_lines, tfidf_matrix_df)

# For each line, find the index of the row with the highest cosine similarity
best_match_indices = np.argmax(cosine_similarities, axis=1)

# Get the similarity scores for the best matches
best_match_scores = cosine_similarities[np.arange(cosine_similarities.shape[0]), best_match_indices]

# Pair each line with its best matching row index and similarity score
matching_results = list(zip(best_match_indices, best_match_scores))


# Create a dataframe to hold the results for better visualization
matches_df = pd.DataFrame({
    'LineIndex': np.arange(len(citations_review)),
    'BestMatchRowIndex': best_match_indices,
    'SimilarityScore': best_match_scores
})

# Add the corresponding line and the matching text from the dataframe to the results dataframe
matches_df['Line'] = [corrected_lines[i] for i in matches_df['LineIndex']]
matches_df['BestMatchText'] = [df.iloc[i]['concatenated_fields'] for i in matches_df['BestMatchRowIndex']]


df['screening2'] = df.index.isin(matches_df['BestMatchRowIndex'])
df.screening2.value_counts()


df.loc[df['screening2'] == True, 'screening1'] = True
print(df['screening1'].value_counts())
print(df['screening2'].value_counts())
print(df.groupby('screening1')['screening2'].value_counts())
print(df.groupby('screening1')['screening2'].value_counts().sum())

# Counting NaNs per column
nan_count = df.isna().sum()

print("\nNumber of NaNs per column:\n")
print(nan_count)
# df.to_pickle(f"{content_dir}preprocessed_articles.pkl")

4662
4662
4652
10
4662
4662


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates['uniqueid'] = duplicates['record'].str.extract(pattern)


screening1
False    4477
True      185
Name: count, dtype: int64
screening2
False    4587
True       75
Name: count, dtype: int64
screening1  screening2
False       False         4477
True        False          110
            True            75
Name: count, dtype: int64
4662

Number of NaNs per column:

record                         0
database                       0
source-app                     0
rec-number                     0
foreign-keys                   0
key                            0
ref-type                       0
contributors                   0
titles                         0
title                          1
style                          0
secondary-title                7
short-title                  613
pages                        193
volume                       175
number                       633
edition                      643
keywords                     385
keyword                      385
dates                          0
year                           2
p

In [22]:
df['uniqueid'] = df.index.astype(str)
# find indeces with no title or abstract or is nan
no_title_abstract = df[(df['titles'] == '') | (df['abstract'] == '') | (df['titles'].isna()) | (df['abstract'].isna())]
no_title_abstract.to_pickle(f"{content_dir}no_title_abstract.pkl")
df.drop("uniqueid", axis=1, inplace=True)

print(len(df),len(no_title_abstract))
full_records = df[~df.index.isin(no_title_abstract.index)]

print(full_records['screening1'].value_counts())
print(full_records.groupby('screening1')['screening2'].value_counts())
full_records.to_pickle("preprocessed_articles_filtered.pkl")
full_records.to_excel("preprocessed_articles_filtered.xlsx")


4662 161
screening1
False    4320
True      181
Name: count, dtype: int64
screening1  screening2
False       False         4320
True        False          108
            True            73
Name: count, dtype: int64


In [23]:
full_records

Unnamed: 0,record,database,source-app,rec-number,foreign-keys,key,ref-type,contributors,titles,title,...,related-urls,url,custom6,orig-pub,publisher,secondary-authors,translated-title,concatenated_fields,screening1,screening2
0,All_incl_articles_20230630.enlEndNote1117Clini...,All_incl_articles_20230630.enl,EndNote,1,1,1,17,,Clinical practice guidelines for the diagnosis...,Clinical practice guidelines for the diagnosis...,...,,,,,,,,Clinical practice guidelines for the diagnosi...,False,False
1,All_incl_articles_20230630.enlEndNote2217Osteo...,All_incl_articles_20230630.enl,EndNote,2,2,2,17,,"Osteoporosis prevention, diagnosis, and therap...","Osteoporosis prevention, diagnosis, and therapy",...,,,,,,,,"Osteoporosis prevention, diagnosis, and thera...",False,False
2,All_incl_articles_20230630.enlEndNote3317Obesi...,All_incl_articles_20230630.enl,EndNote,3,3,3,17,,Obesity: preventing and managing the global ep...,Obesity: preventing and managing the global ep...,...,,,,,,,,Obesity: preventing and managing the global e...,False,False
3,All_incl_articles_20230630.enlEndNote4417Unite...,All_incl_articles_20230630.enl,EndNote,4,4,4,17,,United Kingdom back pain exercise and manipula...,United Kingdom back pain exercise and manipula...,...,,,,,,,,United Kingdom back pain exercise and manipul...,False,False
4,All_incl_articles_20230630.enlEndNote5517Physi...,All_incl_articles_20230630.enl,EndNote,5,5,5,17,,Physiotherapy rehabilitation after total knee ...,Physiotherapy rehabilitation after total knee ...,...,,,,,,,,Physiotherapy rehabilitation after total knee...,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3152,All_incl_articles_20230630.enlEndNote272727271...,All_incl_articles_20230630.enl,EndNote,2727,2727,2727,17,"Østerås, N.Hagen, K. B.Grotle, M.Sand-Svartrud...",Exercise programme with telephone follow-up fo...,Exercise programme with telephone follow-up fo...,...,,,,,,,,"Østerås, N.Hagen, K. B.Grotle, M.Sand-Svartrud...",False,False
3153,All_incl_articles_20230630.enlEndNote272827281...,All_incl_articles_20230630.enl,EndNote,2728,2728,2728,17,"Østerås, N.van Bodegom-Vos, L.Dziedzic, K.Mose...",Implementing international osteoarthritis trea...,Implementing international osteoarthritis trea...,...,,,,,,,,"Østerås, N.van Bodegom-Vos, L.Dziedzic, K.Mose...",False,False
3165,All_incl_articles_20230630.enlEndNote454945491...,All_incl_articles_20230630.enl,EndNote,4549,4549,4549,17,"Õunpuu, SylviaPierz, KristanRethlefsen, Susan ...",Cost savings for single event multilevel surge...,Cost savings for single event multilevel surge...,...,,,,,,,,"Õunpuu, SylviaPierz, KristanRethlefsen, Susan ...",False,False
3889,All_incl_articles_20230630.enlEndNote338533851...,All_incl_articles_20230630.enl,EndNote,3385,3385,3385,17,"Śliwiński, Z.Frączek, E.Starczyńska, M.",Role of the physiotherapist in the orthopaedic...,Role of the physiotherapist in the orthopaedic...,...,,,,,,,,"Śliwiński, Z.Frączek, E.Starczyńska, M. Role o...",False,False
