<a href="https://colab.research.google.com/github/galenos-project/literature-mining/blob/main/06_16_brainstorm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting articles using Entrez

In [None]:
pip install pymed

Collecting pymed
  Downloading pymed-0.8.9-py3-none-any.whl (9.6 kB)
Installing collected packages: pymed
Successfully installed pymed-0.8.9


In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

In [None]:
from Bio import Entrez
import pandas as pd

# Contact address that NCBI E-utilities associates with these requests.
Entrez.email = "avahomiar@gmail.com"

# PubMed query: (mood/depression OR psychosis/schizophrenia terms) AND
# (psychedelic/hallucinogen terms), limited to publication years 2014-2024.
# PubMed writes a date range as start:end; "2014/2024" uses the date-part
# separator instead and does not express a range.
search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014:2024[Date - Publication])"

# Function to fetch PubMed IDs
def fetch_pubmed_ids(term, retmax=500):
    """Run a PubMed search and return the parsed esearch record.

    Args:
        term: PubMed query string.
        retmax: maximum number of IDs requested from esearch.

    Returns:
        The record produced by ``Entrez.read``; callers in this notebook read
        its 'IdList' and 'Count' entries.
    """
    handle = Entrez.esearch(db="pubmed", term=term, retmax=retmax, usehistory="y")
    try:
        return Entrez.read(handle)
    finally:
        # Close the handle even when parsing raises, so it is never leaked.
        handle.close()

# Function to fetch articles given a list of PubMed IDs
def fetch_articles(pubmed_ids, batch_size=200):
    """Download PubMed records for the given IDs, one efetch request per batch.

    Args:
        pubmed_ids: sequence of PubMed ID strings.
        batch_size: number of IDs sent in each efetch request.

    Returns:
        A list with the 'PubmedArticle' entries of every batch, in request
        order. An empty ID list yields an empty list without any request.
    """
    articles = []
    for start in range(0, len(pubmed_ids), batch_size):
        batch_ids = pubmed_ids[start:start + batch_size]
        handle = Entrez.efetch(db="pubmed", id=",".join(batch_ids), retmode="xml")
        try:
            batch_records = Entrez.read(handle)
        finally:
            # Close the handle even when parsing raises, so a failed batch
            # does not leak it.
            handle.close()
        articles.extend(batch_records['PubmedArticle'])
    return articles

# Fetch PubMed IDs
search_results = fetch_pubmed_ids(search_term, retmax=10000)
pubmed_ids = search_results['IdList']
total_results = int(search_results['Count'])

# Only the IDs in 'IdList' are fetched below, so report when the query matched
# more records than were returned instead of silently analysing a subset.
if total_results > len(pubmed_ids):
    print(f"Warning: the query matched {total_results} records but only {len(pubmed_ids)} IDs were returned")


def _format_pub_date(date):
    """Build 'Year-Month-Day' from the leading parts that are present.

    Stops at the first missing part, so a date with only a year and a month
    gives 'Year-Month' rather than 'Year-Month-'. When no year is present the
    free-text 'MedlineDate' entry is used if there is one, else ''.
    """
    parts = []
    for key in ('Year', 'Month', 'Day'):
        value = date.get(key, '')
        if not value:
            break
        parts.append(str(value))
    if parts:
        return '-'.join(parts)
    return str(date.get('MedlineDate', ''))


# Initialize list to hold article data
article_list = []

# Fetch articles in batches
batch_size = 200  # 200 IDs per efetch request; per the author, this avoids HTTP 400 errors
articles = fetch_articles(pubmed_ids, batch_size=batch_size)
for article in articles:
    citation = article['MedlineCitation']['Article']

    article_dict = {}
    article_dict['Title'] = citation.get('ArticleTitle', '')

    # A multi-section abstract arrives as a list of sections; flatten it.
    abstract = citation.get('Abstract', {}).get('AbstractText', '')
    if isinstance(abstract, list):
        article_dict['Abstract'] = ' '.join(abstract)
    else:
        article_dict['Abstract'] = abstract

    # Prefer the first ArticleDate entry, else the journal issue's PubDate.
    pub_date = ''
    article_dates = citation.get('ArticleDate', [])
    journal_date = citation.get('Journal', {}).get('JournalIssue', {}).get('PubDate', {})
    if article_dates:
        pub_date = _format_pub_date(article_dates[0])
    elif journal_date:
        pub_date = _format_pub_date(journal_date)

    article_dict['Publication Date'] = pub_date

    # Only the first keyword list is used; its entries are space-joined.
    keywords = article['MedlineCitation'].get('KeywordList', [[]])
    if keywords and keywords[0]:
        article_dict['Keywords'] = ' '.join(keywords[0])
    else:
        article_dict['Keywords'] = ''

    article_list.append(article_dict)

# Convert list of article details to a DataFrame
articles_df = pd.DataFrame(article_list)

# Save DataFrame to a CSV file
output_csv_path = '/content/pubmed_articles.csv'
articles_df.to_csv(output_csv_path, index=False)

# success message
print(f"Articles DataFrame saved to '{output_csv_path}'")

# Offer the CSV as a browser download (Colab-only helper)
from google.colab import files
files.download(output_csv_path)


Articles DataFrame saved to '/content/pubmed_articles.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Preprocessing text, compiling records and creating the corpus for LDA

In [None]:
import re
import nltk
import spacy
import gensim
import pandas as pd
from Bio import Entrez

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Download NLTK resources: the 'punkt' tokenizer data and the stopword lists.
nltk.download('punkt')
nltk.download('stopwords')

# Load English stopwords.
# NOTE(review): `stop_words` is not referenced again in the visible cells; the
# CountVectorizer further down uses its own stop_words='english' setting.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Set email for Entrez (contact address sent with the NCBI requests)
Entrez.email = "avahomiar@gmail.com"

# Define preprocessing function
def preprocess_text(text):
    """Clean a raw string before it goes into the corpus.

    Applies three substitutions in order: drop e-mail-like tokens, collapse
    every whitespace run to a single space, and remove single quotes. The
    result is not stripped, so leading/trailing blanks survive as one space.
    """
    substitutions = (
        (r'\S*@\S*\s?', ''),  # e-mail-like token plus one trailing whitespace char
        (r'\s+', ' '),        # newlines, tabs and repeated blanks
        (r"\'", ""),          # single quotes
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Function to fetch PubMed IDs using Entrez
def fetch_pubmed_ids(term, retmax=500):
    """Search PubMed for `term` and return the parsed esearch record.

    Args:
        term: PubMed query string.
        retmax: maximum number of IDs requested from esearch.

    Returns:
        The record produced by ``Entrez.read``; this notebook reads its
        'IdList' and 'Count' entries.
    """
    handle = Entrez.esearch(db="pubmed", term=term, retmax=retmax, usehistory="y")
    try:
        return Entrez.read(handle)
    finally:
        # Runs on success and on a parse error alike, so the handle never leaks.
        handle.close()

# Function to fetch articles given a list of PubMed IDs
def fetch_articles(pubmed_ids, batch_size=200):
    """Fetch PubMed records for `pubmed_ids` in batches of `batch_size`.

    Args:
        pubmed_ids: sequence of PubMed ID strings.
        batch_size: number of IDs sent in each efetch request.

    Returns:
        A list of the 'PubmedArticle' entries from all batches, in request
        order; empty when no IDs are given.
    """
    articles = []
    for start in range(0, len(pubmed_ids), batch_size):
        batch_ids = pubmed_ids[start:start + batch_size]
        handle = Entrez.efetch(db="pubmed", id=",".join(batch_ids), retmode="xml")
        try:
            batch_records = Entrez.read(handle)
        finally:
            # Release the handle whether or not this batch parsed cleanly.
            handle.close()
        articles.extend(batch_records['PubmedArticle'])
    return articles

# Define search term: (mood/depression OR psychosis/schizophrenia terms) AND
# (psychedelic/hallucinogen terms), limited to publication years 2014-2024.
# PubMed writes a date range as start:end; "2014/2024" uses the date-part
# separator instead and does not express a range.
search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014:2024[Date - Publication])"

# Fetch PubMed IDs
search_results = fetch_pubmed_ids(search_term, retmax=10000)
pubmed_ids = search_results['IdList']
# Total number of matches reported by PubMed (may exceed len(pubmed_ids)).
total_results = int(search_results['Count'])

# Accumulates one {'title', 'abstract', 'alltext'} dict per fetched article
articleList = []

# spaCy pipeline used later for lemmatization; the parser and NER components
# are switched off because only tagging/lemmas are needed.
# The model must be installed first: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Download the records, 200 IDs per efetch request
batch_size = 200
articles = fetch_articles(pubmed_ids, batch_size=batch_size)

# Turn every record into cleaned title / abstract / combined text
for article in articles:
    citation = article['MedlineCitation']['Article']
    title = citation.get('ArticleTitle', '')
    abstract_list = citation.get('Abstract', {}).get('AbstractText', [])

    # A sectioned abstract is a list: clean each section, then join them
    if isinstance(abstract_list, list):
        abstract = ' '.join(preprocess_text(section) for section in abstract_list)
    else:
        abstract = preprocess_text(abstract_list)

    # Only string titles are cleaned; anything else is kept untouched
    if isinstance(title, str):
        title = preprocess_text(title)

    # "title. abstract" when both exist, otherwise whichever one is non-empty
    if title and abstract:
        alltext = title + '. ' + abstract
    else:
        alltext = title or abstract or ''

    articleDict = {'title': title, 'abstract': abstract, 'alltext': alltext}
    articleList.append(articleDict)

# Convert article list to DataFrame
articlesPD = pd.DataFrame(articleList)

# Print DataFrame
print(articlesPD)

# Define file path for saving the output in Colab
output_file = '/content/corpus.tsv'

# Export DataFrame to .tsv file
articlesPD.to_csv(output_file, sep='\t', index=False)

print(f"Corpus saved to '{output_file}'")

# Handle missing values in 'title' and 'abstract' columns.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form acts on an intermediate object and is
# not guaranteed to update the frame.
articlesPD['title'] = articlesPD['title'].fillna('')
articlesPD['abstract'] = articlesPD['abstract'].fillna('')

# 'alltext' is already built per article when the records are parsed, so it is
# not rebuilt here. Rebuilding it as title + '. ' + abstract gave ". abstract"
# or "title. " for articles missing one part, and made the in-memory column
# differ from the TSV written just above.

# Convert to list
data = articlesPD['alltext'].astype(str).tolist()

pprint(data[:1])

def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences` with gensim's simple_preprocess.

    Every item is converted with str() first and tokenized with deacc=True.
    Yields one token list per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# Materialize the generator: one token list per document in `data`.
data_words = list(sent_to_words(data))
# Peek at the first tokenized document only.
print(data_words[:1])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Args:
        texts: iterable of token lists, one list per document.
        allowed_postags: POS tags (``token.pos_`` values) to keep.

    Returns:
        A list with one space-joined string of lemmas per document. A lemma
        equal to '-PRON-' is replaced by an empty string, not dropped.
    """
    def _lemmatize_one(tokens):
        # Re-join the tokens so the module-level spaCy pipeline can tag them.
        doc = nlp(" ".join(tokens))
        kept = []
        for token in doc:
            if token.pos_ not in allowed_postags:
                continue
            lemma = token.lemma_
            kept.append('' if lemma == '-PRON-' else lemma)
        return " ".join(kept)

    return [_lemmatize_one(sent) for sent in texts]

# Do lemmatization keeping only nouns and verbs (the call narrows the
# function's default of NOUN/ADJ/VERB/ADV to these two tags)
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB']) #select noun and verb
# Show the first two lemmatized documents as a sanity check
print(data_lemmatized[:2])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                                  title  \
0     Seizure due to multiple drugs intoxication: a ...   
1     Efficacy of ranitidine in olanzapine-induced w...   
2     Characterization and evaluation of self-nanoem...   
3     Pharmacokinetic profile after multiple deltoid...   
4     Pregabalin Treatment of a Patient With Complex...   
...                                                 ...   
5036  Limbic system white matter microstructure and ...   
5037  Proposed DSM-5 mixed features are associated w...   
5038  A randomised, placebo-controlled 52-week trial...   
5039  Can an early weight management program (WMP) p...   
5040  A rare case of acute respiratory distress synd...   

                                               abstract  \
0     The mechanism of the antidepressant effect of ...   
1     Weight gain has long been recognized as a side...   
2     The purpose of this work was to develop self-n...   
3     Paliperidone palmitate (PP) is a once-monthly ...

In [None]:
# Bag-of-words counts over the lemmatized documents.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                         # drop terms found in fewer than 10 documents
                             stop_words='english',              # remove English stop words
                             lowercase=True,                    # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',   # tokens of 3 or more alphanumeric chars
                             # max_features=50000,              # (disabled) cap on the number of unique words
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)


# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                      max_iter=10,                   # max learning iterations
                                      learning_method='online',
                                      random_state=100,              # fixed seed for reproducible topics
                                      batch_size=128,                # n docs in each learning iter
                                      evaluate_every = -1,           # compute perplexity every n iters; -1: never
                                      n_jobs = -1,                   # use all available CPUs
                                     )
# Document-topic matrix for the 20-topic baseline model
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
 evaluate_every=-1, learning_decay=0.7,
 learning_method='online', learning_offset=10.0,
 max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
 n_components=10, n_jobs=-1, perp_tol=0.1,
 random_state=100, topic_word_prior=None,
 total_samples=1000000.0, verbose=0)


LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [None]:
# Log likelihood: higher is better (scored on the same matrix the model was fitted on)
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -2587447.56380699
Perplexity:  535.2226929699794
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [None]:
# Define Search Param: 5 topic counts x 3 learning-decay values
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class (no `scoring` given, so candidates are ranked by the
# estimator's own score method)
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
# A GridSearchCV(...) repr pasted below this call was removed: it ran as code,
# built an unused, unfitted estimator, and listed a stale 'n_topics' grid that
# did not match `search_params`.
model.fit(data_vectorized)

In [None]:
# Best Model: the estimator refitted with the winning parameter combination
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score (the grid search's best cross-validated score)
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity, computed on the full document-term matrix
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -545178.2999993508
Model Perplexity:  551.3215595358134


In [None]:
# Create Document - Topic Matrix (rebinds `lda_output`, now from the best model)
lda_output = best_lda_model.transform(data_vectorized)
# column names: Topic0 .. Topic{n_components-1}
topicnames = ['Topic' + str(i) for i in range(best_lda_model.n_components)]
# index names: Doc0 .. Doc{len(data)-1}
docnames = ['Doc' + str(i) for i in range(len(data))]
# Make the pandas dataframe (topic weights rounded to 2 decimals)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document: column position of the largest rounded weight
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
    """Return the CSS colour rule for a cell: green above 0.1, black otherwise."""
    shade = 'black'
    if val > .1:
        shade = 'green'
    return 'color: {col}'.format(col=shade)
def make_bold(val):
    """Return the CSS font-weight rule for a cell: 700 above 0.1, 400 otherwise."""
    heaviness = 400
    if val > .1:
        heaviness = 700
    return 'font-weight: {weight}'.format(weight=heaviness)
# Apply Style to the first 15 documents.
# Both stylers run on every cell, including the integer 'dominant_topic' column.
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
# Last expression of the cell: renders the styled table
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.61,0.0,0.0,0.06,0.06,0.12,0.0,0.14,0.0,1
Doc1,0.0,0.28,0.0,0.0,0.15,0.55,0.0,0.0,0.0,0.0,5
Doc2,0.13,0.0,0.47,0.05,0.11,0.12,0.0,0.0,0.12,0.0,2
Doc3,0.0,0.05,0.03,0.0,0.0,0.85,0.07,0.0,0.0,0.0,5
Doc4,0.04,0.03,0.0,0.45,0.0,0.0,0.17,0.0,0.3,0.0,3
Doc5,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.77,0.03,0.03,7
Doc6,0.0,0.0,0.0,0.25,0.02,0.0,0.39,0.1,0.07,0.17,6
Doc7,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.67,0.03,8
Doc8,0.0,0.0,0.0,0.0,0.0,0.26,0.39,0.0,0.32,0.03,6
Doc9,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.82,0.02,8


In [None]:
# Topic-Keyword Matrix: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index (vocabulary terms as columns, Topic0..N as rows)
df_topic_keywords.columns = vectorizer.get_feature_names_out()
df_topic_keywords.index = topicnames
# View the first five topics
df_topic_keywords.head()

Unnamed: 0,ability,abnormality,abolish,abrogate,absence,absorption,abstinence,abstract,abundance,abuse,...,worsen,worsening,write,year,yield,ymrs,yohimbine,youth,zinc,ziprasidone
Topic0,17.431345,38.125629,30.299268,8.863018,3.157406,2.957689,0.100007,0.100001,0.210276,0.100029,...,0.100006,0.100001,0.100015,0.100019,0.100042,0.100003,22.740817,0.100001,8.141271,0.100005
Topic1,0.100017,0.100028,0.100017,0.1,0.100031,0.100047,73.559054,0.100018,0.100009,67.006494,...,0.303427,0.165947,8.081507,1029.049008,0.100025,0.100003,0.100003,64.781697,0.100079,0.100008
Topic2,0.100015,0.100048,0.100003,0.100001,0.100022,0.100078,0.100003,0.100038,0.10003,0.100022,...,0.100021,0.100001,0.100107,0.100037,0.100034,0.100005,0.1,0.100008,0.100134,160.889876
Topic3,7.002864,0.100033,0.100008,0.100009,13.537193,0.10007,0.100015,33.526328,0.100028,1.578896,...,9.912453,0.100045,0.100053,66.888168,47.281128,0.100003,0.1,38.436744,0.100001,0.100008
Topic4,98.820157,20.05768,34.587016,0.100119,21.730271,23.857755,0.100014,0.100002,20.28201,0.1002,...,0.100029,0.100007,0.100011,0.100029,17.827201,0.100003,0.100053,0.100007,50.87776,0.100006


In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the `n_words` highest-weighted vocabulary terms of every topic.

    Args:
        vectorizer: fitted vectorizer providing ``get_feature_names_out()``.
        lda_model: fitted model providing ``components_`` (one row per topic).
        n_words: number of terms to keep per topic.

    Returns:
        A list with one array of terms per topic, strongest term first.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())

    def _strongest_terms(weights):
        # Sorting the negated weights ascending lists the heaviest terms first.
        order = (-weights).argsort()
        return vocabulary.take(order[:n_words])

    return [_strongest_terms(weights) for weights in lda_model.components_]
# Top 15 keywords per topic of the grid-searched model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
# NOTE: this rebinds `df_topic_keywords`, replacing the topic-by-term weight
# matrix built in the previous cell with a topic-by-rank table of words.
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,effect,rat,antidepressant,mouse,induce,test,increase,treatment,stress,depression,behavior,level,activity,decrease,study
Topic 1,use,risk,year,study,increase,woman,drug,antidepressant,patient,associate,medication,age,rate,child,population
Topic 2,antipsychotic,generation,metabolic,schizophrenia,drug,glucose,attitude,sga,diabetes,act,cyp,ziprasidone,adherence,formulation,body
Topic 3,treatment,study,antidepressant,effect,review,trial,use,disorder,depression,evidence,drug,datum,therapy,include,efficacy
Topic 4,receptor,effect,drug,dopamine,mechanism,activity,induce,brain,antagonist,increase,concentration,protein,cell,activation,action
Topic 5,patient,treatment,week,placebo,group,score,study,scale,trial,day,risperidone,response,randomize,efficacy,baseline
Topic 6,patient,schizophrenia,treatment,study,disorder,symptom,level,control,associate,group,clozapine,subject,use,function,episode
Topic 7,lithium,gene,mood,drug,disorder,expression,induce,response,case,polymorphism,effect,cell,report,genotype,associate
Topic 8,depression,patient,treatment,care,medication,anxiety,symptom,use,health,disorder,antidepressant,study,intervention,adherence,life
Topic 9,sleep,memory,task,effect,network,use,connectivity,cortex,performance,increase,processing,thc,control,reward,condition


In [None]:
# The vocabulary is the same for every topic, so fetch it once instead of
# rebuilding it on each loop iteration.
feature_names = vectorizer.get_feature_names_out()

# NOTE(review): this iterates `lda_model` (the 20-topic baseline), not
# `best_lda_model`, so these topics differ from the keyword table above.
for index, topic in enumerate(lda_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort is ascending: the 15 heaviest words, weakest first.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['risk', 'search', 'pregnancy', 'research', 'datum', 'include', 'depression', 'antidepressant', 'effect', 'evidence', 'study', 'use', 'review', 'drug', 'treatment']


THE TOP 15 WORDS FOR TOPIC #1
['time', 'sertraline', 'phase', 'formulation', 'concentration', 'result', 'analysis', 'state', 'release', 'study', 'model', 'venlafaxine', 'method', 'drug', 'use']


THE TOP 15 WORDS FOR TOPIC #2
['episode', 'illness', 'ect', 'act', 'outcome', 'discontinuation', 'schizophrenia', 'effectiveness', 'relapse', 'cost', 'intervention', 'adherence', 'medication', 'patient', 'treatment']


THE TOP 15 WORDS FOR TOPIC #3
['treatment', 'cognition', 'effect', 'improve', 'patient', 'associate', 'task', 'dysfunction', 'performance', 'impairment', 'memory', 'deficit', 'function', 'symptom', 'schizophrenia']


THE TOP 15 WORDS FOR TOPIC #4
['affect', 'feeling', 'tyrosine', 'cbd', 'ghrelin', 'memantine', 'energy', 'cannabinoid', 'glycine', 'trp', 'peptide', 'met', 'nursing', 'hom

# Checking for the presence of topic words individually in the papers, and of all words of a topic in each paper


In [None]:
# Check for presence of top words in titles and abstracts
for index, row in articlesPD.iterrows():
    title = row['title']
    abstract_text = row['abstract']
    # NOTE(review): the corpus frame built above has no 'publication_date'
    # column, so this falls back to '' for every row.
    pub_date = row.get('publication_date', '')

    # Check if title and abstract are not NaNs
    if isinstance(title, str) and isinstance(abstract_text, str):
        # The topic keywords are lowercase (the vectorizer lowercases), so
        # compare against lowercased text; otherwise capitalized occurrences
        # such as the first word of a title are missed.
        title_lower = title.lower()
        abstract_lower = abstract_text.lower()

        # Check if top 5 words from each topic are mentioned
        # NOTE(review): this is substring matching, so a keyword also matches
        # inside longer words (e.g. 'rat' inside 'rate').
        for topic_idx, top_words in enumerate(topic_keywords):
            for word in top_words[:5]:
                if word in title_lower or word in abstract_lower:
                    # The word is mentioned in the title or abstract
                    print(f"Topic {topic_idx + 1}: Word '{word}' mentioned in the title/abstract of article {index} published on {pub_date}")
    else:
        print(f"Skipping article {index} due to missing title or abstract")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Topic 7: Word 'patient' mentioned in the title/abstract of article 4721 published on 
Topic 7: Word 'schizophrenia' mentioned in the title/abstract of article 4721 published on 
Topic 7: Word 'treatment' mentioned in the title/abstract of article 4721 published on 
Topic 7: Word 'study' mentioned in the title/abstract of article 4721 published on 
Topic 7: Word 'disorder' mentioned in the title/abstract of article 4721 published on 
Topic 8: Word 'disorder' mentioned in the title/abstract of article 4721 published on 
Topic 9: Word 'patient' mentioned in the title/abstract of article 4721 published on 
Topic 9: Word 'treatment' mentioned in the title/abstract of article 4721 published on 
Topic 9: Word 'medication' mentioned in the title/abstract of article 4721 published on 
Topic 10: Word 'effect' mentioned in the title/abstract of article 4721 published on 
Topic 1: Word 'effect' mentioned in the title/abstract of arti

In [None]:
# Check for the presence of all top words in titles and abstracts
for index, row in articlesPD.iterrows():
    title = row['title']
    abstract_text = row['abstract']
    # NOTE(review): the corpus frame built above has no 'publication_date'
    # column, so this falls back to '' for every row.
    pub_date = row.get('publication_date', '')

    # Check if title and abstract are not NaNs
    if isinstance(title, str) and isinstance(abstract_text, str):
        # Combine title and abstract. The topic keywords are lowercase (the
        # vectorizer lowercases), so lowercase the text too; otherwise
        # capitalized occurrences are missed.
        combined_text = f"{title} {abstract_text}".lower()

        # Check if all top 5 words from each topic are mentioned
        # NOTE(review): this is substring matching, so a keyword also matches
        # inside longer words (e.g. 'rat' inside 'rate').
        for topic_idx, top_words in enumerate(topic_keywords):
            if all(word in combined_text for word in top_words[:5]):
                # All words are mentioned in the combined title and abstract
                print(f"Topic {topic_idx + 1}: All top 5 words are mentioned in the title/abstract of article {index} published on {pub_date}")
    else:
        print(f"Skipping article {index} due to missing title or abstract")

Topic 7: All top 5 words are mentioned in the title/abstract of article 30 published on 
Topic 7: All top 5 words are mentioned in the title/abstract of article 54 published on 
Topic 9: All top 5 words are mentioned in the title/abstract of article 55 published on 
Topic 6: All top 5 words are mentioned in the title/abstract of article 56 published on 
Topic 5: All top 5 words are mentioned in the title/abstract of article 57 published on 
Topic 2: All top 5 words are mentioned in the title/abstract of article 58 published on 
Topic 2: All top 5 words are mentioned in the title/abstract of article 67 published on 
Topic 6: All top 5 words are mentioned in the title/abstract of article 81 published on 
Topic 6: All top 5 words are mentioned in the title/abstract of article 90 published on 
Topic 4: All top 5 words are mentioned in the title/abstract of article 115 published on 
Topic 2: All top 5 words are mentioned in the title/abstract of article 123 published on 
Topic 6: All top 5 

In [None]:
# Get the total number of rows in the DataFrame
total_rows = len(articlesPD)

# List to store results
results = []

# Only the last 50 rows are checked. Clamp the start at 0: with fewer than 50
# rows a negative start would make .iloc wrap around and revisit rows under
# negative indices.
first_row = max(0, total_rows - 50)

# Check for the presence of top words in titles and abstracts of those rows
for index in range(first_row, total_rows):
    row = articlesPD.iloc[index]
    title = row['title']
    abstract_text = row['abstract']
    # NOTE(review): the corpus frame built above has no 'publication_date'
    # column, so this falls back to '' for every row.
    pub_date = row.get('publication_date', '')

    # Check if title and abstract are not NaNs
    if isinstance(title, str) and isinstance(abstract_text, str):
        # Combine title and abstract. The topic keywords are lowercase (the
        # vectorizer lowercases), so lowercase the text too; otherwise
        # capitalized occurrences are missed.
        combined_text = f"{title} {abstract_text}".lower()

        # Check if all top 5 words from each topic are mentioned
        # NOTE(review): this is substring matching, so a keyword also matches
        # inside longer words (e.g. 'rat' inside 'rate').
        for topic_idx, top_words in enumerate(topic_keywords):
            if all(word in combined_text for word in top_words[:5]):
                # All words are mentioned in the combined title and abstract
                results.append((topic_idx + 1, top_words[:5], index, pub_date))
    else:
        print(f"Skipping article {index} due to missing title or abstract")

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['Topic', 'Words', 'Article_Index', 'Publication_Date'])

# Save DataFrame to a .tsv file in Google Colab
output_file = '/content/historical_abstracts.tsv'
results_df.to_csv(output_file, sep='\t', index=False)

print(f"Results saved to '{output_file}'")

Results saved to '/content/historical_abstracts.tsv'


In [None]:
from nltk import word_tokenize
import string

# Initialize topic labels
topic_labels = [f"Topic {i+1}" for i in range(len(topic_keywords))]

# One row per paper, one column per topic, pre-filled with 0 so papers that
# are skipped or match nothing are not left as NaN.
binary_data_table = pd.DataFrame(0, index=range(len(articlesPD)), columns=topic_labels)

# Iterate through each paper.
# Everything below stays inside this loop. Previously the tokenizing and
# matching were dedented out of it, so they ran once after the loop, on the
# last paper only.
for index, row in articlesPD.iterrows():
    title = row['title']
    abstract_text = row['abstract']

    # Initialize a list to store topic mentions for this paper
    topic_mentions = [0] * len(topic_keywords)

    # Check if title and abstract are not NaNs
    if isinstance(title, str) and isinstance(abstract_text, str):
        # Tokenize this row's text. The earlier version tokenized the stale
        # `abstract` variable left over from the corpus-building loop.
        # Tokens are lowercased because the topic keywords are lowercase.
        titlewords = set([w.lower() for w in word_tokenize(title) if w not in string.punctuation])
        abstractwords = set([w.lower() for w in word_tokenize(abstract_text) if w not in string.punctuation])
        allwords = titlewords.union(abstractwords)

        # Check if top 5 words from each topic are mentioned
        for topic_idx, top_words in enumerate(topic_keywords):
            if all(e in allwords for e in set(top_words[:5])):
                topic_mentions[topic_idx] = 1

    # Update binary data table with topic mentions for this paper. This is
    # done for every paper, not only when a topic matched.
    binary_data_table.iloc[index] = topic_mentions

# Add labels for rows (papers)
binary_data_table.index.name = 'Paper'

# Print  first 50 rows of the binary data table
print(binary_data_table.head(50))

# Save  binary data table to csv
binary_data_table.to_csv('binary_data_table.csv')

print("Binary data table saved to 'binary_data_table.csv'")


      Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9  \
Paper                                                                           
0         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
6         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
7         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
8         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
9         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
10        NaN     NaN     Na

In [None]:
import pandas as pd
from nltk import word_tokenize
import string

# topic_keywords = [...]
# articlesPD = pd.read_csv('path_to_articlesPD.csv')

# Initialize topic labels
topic_labels = [f"Topic {i+1}" for i in range(len(topic_keywords))]

# Lower-cased top-5 keyword set of every topic, computed once instead of once per paper
top5_by_topic = [{str(word).lower() for word in top_words[:5]} for top_words in topic_keywords]

# One row per paper (0-based row number), one column per topic, all zeros to start with
binary_data_table = pd.DataFrame(0, index=range(len(articlesPD)), columns=topic_labels)

# Iterate through each paper. iterrows() yields index *labels*, which only coincide with row
# positions for a default RangeIndex, so the position is tracked explicitly for .iloc.
for position, (_, row) in enumerate(articlesPD.iterrows()):
    title = row['title']
    abstract_text = row['abstract']

    # Skip papers whose title or abstract is missing (NaN is a float, not a str)
    if not (isinstance(title, str) and isinstance(abstract_text, str)):
        continue

    # Tokenize, drop punctuation tokens and lower-case so matching is case-insensitive
    titlewords = {w.lower() for w in word_tokenize(title) if w not in string.punctuation}
    abstractwords = {w.lower() for w in word_tokenize(abstract_text) if w not in string.punctuation}
    allwords = titlewords | abstractwords

    # A topic counts as mentioned only when all of its top 5 words occur in the paper
    for topic_idx, top5 in enumerate(top5_by_topic):
        if top5 <= allwords:
            binary_data_table.iloc[position, topic_idx] = 1

# Add labels for rows (papers)
binary_data_table.index.name = 'Paper'

# Print first 50 rows of the binary data table
print(binary_data_table.head(50))

# Save the binary data table to csv
binary_data_table.to_csv('binary_data_table.csv')

print("Binary data table saved to 'binary_data_table.csv'")


       Topic 1  Topic 2  Topic 3  Topic 4  Topic 5  Topic 6  Topic 7  Topic 8  \
Paper                                                                           
0            0        0        0        0        0        0        0        0   
1            0        0        0        0        0        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        0        0   
4            0        0        0        0        0        0        0        0   
5            0        0        0        0        0        0        0        0   
6            0        0        0        0        0        0        0        0   
7            0        0        0        0        0        0        0        0   
8            0        0        0        0        0        0        0        0   
9            0        0        0        0        0        0        0        0   
10           0        0     

In [None]:
# Load the per-paper topic indicator table written to disk earlier in the notebook
binary_data_table = pd.read_csv('binary_data_table.csv')

if 'Paper' not in binary_data_table.columns:
    raise KeyError("'Paper' column not found in binary_data_table")

# Fail early with an actionable message instead of a bare KeyError: 'publication_date'
if 'publication_date' not in articlesPD.columns:
    raise KeyError("'publication_date' column not found in articlesPD; "
                   "re-run the PubMed fetch cell that stores publication dates")

articlesPD['title'] = articlesPD['title'].astype(str)

articlesPD['publication_date'] = pd.to_datetime(articlesPD['publication_date'])

# 'Paper' is the 0-based row number of each article (the table is built with
# index=range(len(articlesPD))), not its title, so merging title == Paper can never match.
# Join the two frames on row position instead.
articles_by_position = articlesPD.reset_index(drop=True).rename_axis('Paper')
topics_by_position = binary_data_table.set_index('Paper')
combined_data = articles_by_position.join(topics_by_position, how='inner').reset_index()

combined_data.to_csv('combined_data.csv', index=False)

print("Combined data saved to 'combined_data.csv'")

KeyError: 'publication_date'

In [None]:
# Reload the merged article/topic table and aggregate topic mentions per week, month and year
combined_data = pd.read_csv('combined_data.csv')

topic_labels = [col for col in combined_data.columns if col.startswith('Topic')]

print("First few rows of combined data:")
print(combined_data.head())

print("\nColumn types:")
print(combined_data.dtypes)

# Unparseable dates become NaT so they can be counted and dropped below
combined_data['publication_date'] = pd.to_datetime(combined_data['publication_date'], errors='coerce')

print("\nNumber of NaT values in publication_date:")
print(combined_data['publication_date'].isna().sum())

# .copy() so the column assignments below write to an independent frame, not a view of the old one
combined_data = combined_data.dropna(subset=['publication_date']).copy()

iso_calendar = combined_data['publication_date'].dt.isocalendar()
combined_data['publication_week'] = iso_calendar['week']
combined_data['publication_month'] = combined_data['publication_date'].dt.month
combined_data['publication_year'] = combined_data['publication_date'].dt.year

print("\nFirst few rows with new time period columns:")
print(combined_data[['publication_date', 'publication_week', 'publication_month', 'publication_year']].head())

# ISO week numbers belong to the ISO year, which differs from the calendar year around New Year
# (e.g. 2021-01-01 is ISO week 53 of 2020). Pair the week with its ISO year so those days are not
# grouped into a non-existent (calendar year, week) bucket. The index level names are unchanged.
weekly_keys = [
    iso_calendar['year'].rename('publication_year'),
    iso_calendar['week'].rename('publication_week'),
]
mentions_weekly = combined_data.groupby(weekly_keys)[topic_labels].sum()
mentions_monthly = combined_data.groupby(['publication_year', 'publication_month'])[topic_labels].sum()
mentions_yearly = combined_data.groupby('publication_year')[topic_labels].sum()

# Share of each topic within a period; periods with no mentions at all yield NaN (0 / 0)
proportions_weekly = mentions_weekly.div(mentions_weekly.sum(axis=1), axis=0)
proportions_monthly = mentions_monthly.div(mentions_monthly.sum(axis=1), axis=0)
proportions_yearly = mentions_yearly.div(mentions_yearly.sum(axis=1), axis=0)

print("\nAggregate Counts of Mentions of Topics Over Weeks:")
print(mentions_weekly)
print("\nAggregate Counts of Mentions of Topics Over Months:")
print(mentions_monthly)
print("\nAggregate Counts of Mentions of Topics Over Years:")
print(mentions_yearly)

In [None]:
try:
    corpus = pd.read_csv('/content/corpus_colab.tsv', sep='\t', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print("Error parsing the TSV file:", e)
    # Re-raise: without `corpus` every statement below would only fail with a confusing NameError
    raise

binary_data_table = pd.read_csv('binary_data_table.csv')

corpus['publication_date'] = pd.to_datetime(corpus['publication_date'], errors='coerce')

# Drop rows without a usable date; the original row labels are kept so rows can still be matched
# below. .copy() makes the later column assignment write to an independent frame.
corpus = corpus.dropna(subset=['publication_date']).copy()

corpus['title'] = corpus['title'].astype(str)

print("First few rows of corpus:")
print(corpus.head())

print("\nFirst few rows of binary_data_table:")
print(binary_data_table.head())

topic_labels = [col for col in binary_data_table.columns if col.startswith('Topic')]

# 'Paper' in binary_data_table.csv is a 0-based row number, not a title, so merging
# title == Paper always produces an empty frame. Match on row number instead.
# NOTE(review): assumes the rows of corpus_colab.tsv are in the same order as the articles the
# binary table was built from (and that no lines were skipped on load) - confirm before use.
combined_data = corpus.join(binary_data_table.set_index('Paper')[topic_labels], how='inner')

selected_columns = ['publication_date', 'title', 'abstract'] + topic_labels
combined_data = combined_data[selected_columns]

output_filename = 'combined_publications_with_topic_mentions.csv'
combined_data.to_csv(output_filename, index=False)

print("\nFirst few rows of the final combined data:")
print(combined_data.head())

print(f"\nFile '{output_filename}' has been created with the required information.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/corpus_colab.tsv'

## With PyMed

In [None]:
pip install pymed



In [None]:
# Install pymed and pandas if not already installed
!pip install pymed
!pip install pandas

from pymed import PubMed
import pandas as pd
import datetime

# Initialize PubMed API
pubmed = PubMed(tool="PubMedSearcher", email="avahomiar@gmail.com")

# Define the search term
search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014/2024[Date - Publication])"

# Perform the search
results = pubmed.query(search_term, max_results=50000)

# Materialise every search hit as a plain dict
articleList = [article.toDict() for article in results]

# Extract relevant information from each article
articleInfo = []
for article in articleList:
    # The pubmed_id field can hold several newline-separated IDs; keep only the first one
    pubmedId = article['pubmed_id'].partition('\n')[0]
    pub_date = article.get('publication_date', '')

    # Convert date to YYYY-MM-DD format if it's a datetime.date object
    if isinstance(pub_date, datetime.date):
        pub_date = pub_date.strftime('%Y-%m-%d')

    abstract = article.get('abstract', '')
    if isinstance(abstract, list):
        abstract = ' '.join(abstract)

    # Guard against a missing/None keyword list and None entries inside it; either would make
    # str.join raise TypeError and abort the whole export.
    keywords = [keyword for keyword in (article.get('keywords') or []) if keyword]

    articleInfo.append({
        u'pubmed_id': pubmedId,
        u'title': article.get('title', ''),
        u'abstract': abstract,
        u'publication_date': pub_date,
        u'keywords': ', '.join(keywords)
    })

# Convert list of article details to DataFrame
articlesPD = pd.DataFrame.from_dict(articleInfo)

# Export DataFrame to CSV file
output_csv_path = '/content/export_dataframe.csv'
articlesPD.to_csv(output_csv_path, index=False, header=True)

# Print success message and preview of DataFrame
print(f"Articles DataFrame saved to '{output_csv_path}'")
print(articlesPD.head())

# Optionally, download the CSV file
from google.colab import files
files.download(output_csv_path)


Articles DataFrame saved to '/content/export_dataframe.csv'
  pubmed_id                                              title  \
0  27793241  Seizure due to multiple drugs intoxication: a ...   
1  25529756  Efficacy of ranitidine in olanzapine-induced w...   
2  25148542  Characterization and evaluation of self-nanoem...   
3  27136907  Pharmacokinetic profile after multiple deltoid...   
4  26706484  Pregabalin Treatment of a Patient With Complex...   

                                            abstract publication_date  \
0  The mechanism of the antidepressant effect of ...       2016-10-30   
1  Weight gain has long been recognized as a side...       2016-10-21   
2  The purpose of this work was to develop self-n...       2016-10-18   
3  Paliperidone palmitate (PP) is a once-monthly ...       2016-05-04   
4  Complex regional pain syndrome (CRPS) is a pai...       2015-12-27   

                                            keywords  
0  Bupripion, Bupropiona, Convulsão, Intensive ca

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#PubMed Search string and results ORIGINAL

from pymed import PubMed
import pandas as pd

pubmed = PubMed(tool="PubMedSearcher", email="avahomiar@gmail.com")

search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014/2024[Date - Publication])"
results = pubmed.query(search_term)

# Turn every search hit into a plain dict
articleList = [hit.toDict() for hit in results]

# Keep only the fields of interest for each article
articleInfo = []
for record in articleList:
    summary = {
        # keep the text before the first newline of the pubmed_id field
        u'pubmed_id': record['pubmed_id'].partition('\n')[0],
        u'title': record['title'],
        u'abstract': record['abstract'],
    }
    # keywords are copied over only when the record has them
    if 'keywords' in record:
        summary['keywords'] = record['keywords']
    articleInfo.append(summary)

articlesPD = pd.DataFrame.from_dict(articleInfo)
export_csv = articlesPD.to_csv(r'/content/export_dataframe.csv', index=None, header=True)

print(articlesPD.head())


  pubmed_id                                              title  \
0  27793241  Seizure due to multiple drugs intoxication: a ...   
1  25529756  Efficacy of ranitidine in olanzapine-induced w...   
2  25148542  Characterization and evaluation of self-nanoem...   
3  27136907  Pharmacokinetic profile after multiple deltoid...   
4  26706484  Pregabalin Treatment of a Patient With Complex...   

                                            abstract  \
0  The mechanism of the antidepressant effect of ...   
1  Weight gain has long been recognized as a side...   
2  The purpose of this work was to develop self-n...   
3  Paliperidone palmitate (PP) is a once-monthly ...   
4  Complex regional pain syndrome (CRPS) is a pai...   

                                            keywords  
0  [Bupripion, Bupropiona, Convulsão, Intensive c...  
1              [olanzapine, ranitidine, weight gain]  
2  [Bioavailability enhancement, food effect, sel...  
3  [atypical long-acting injectable, deltoid a

In [None]:
# Install necessary libraries
!pip install pymed
!pip install nltk
!pip install spacy
!pip install gensim

# Import libraries
import re
import nltk
import spacy
import pandas as pd
from pymed import PubMed
from pprint import pprint

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load English stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Load SpaCy model
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Initialize PubMed
pubmed = PubMed(tool="PubMedSearcher", email="avahomiar@gmail.com")

# Define search term
search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014/2024[Date - Publication])"

# Fetch articles
results = pubmed.query(search_term, max_results=5000)
articleList = []

for article in results:
    articleDict = article.toDict()
    pub_date = articleDict.get('publication_date', '')
    # `or ''` also covers fields that are present but None, which would otherwise be
    # formatted into the text as the literal word "None"
    title = articleDict.get('title') or ''
    abstract = articleDict.get('abstract') or ''
    if isinstance(abstract, list):
        abstract = ' '.join(abstract)
    # Skip a None keyword list and None entries inside it; str.join would raise TypeError on them
    keywords = ' '.join(keyword for keyword in (articleDict.get('keywords') or []) if keyword)

    # Combine title, abstract, and keywords into one text
    text = f"{title} {abstract} {keywords}"

    articleDict['text'] = text
    articleDict['publication_date'] = pub_date
    articleList.append(articleDict)

# Convert to DataFrame
articlesPD = pd.DataFrame.from_dict(articleList)

# Save raw data to CSV for reference
export_csv = articlesPD.to_csv('/content/raw_articles.csv', index=None, header=True)
print("Raw articles DataFrame saved to 'raw_articles.csv'")

# Preprocessing function
def preprocess_text(text):
    """Strip e-mail addresses, collapse whitespace, drop single quotes and remove URLs.

    The substitutions are applied in exactly this order; the cleaned string is returned.
    """
    substitutions = (
        (r'\S*@\S*\s?', ''),   # e-mail-like tokens (plus one trailing whitespace char)
        (r'\s+', ' '),         # any whitespace run, newlines included, becomes one space
        (r"\'", ""),           # single quotes
        (r'http\S+', ''),      # URLs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Clean the combined text of every article
articlesPD['text'] = articlesPD['text'].apply(preprocess_text)

# Tokenize the text
def sent_to_words(sentences):
    """Lazily yield one list of lower-cased tokens per input document."""
    # deacc=True strips accent marks from the tokens
    return (gensim.utils.simple_preprocess(str(document), deacc=True) for document in sentences)

data = articlesPD['text'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])  # Print the first tokenized entry for verification

# Define lemmatization function
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Return one space-joined string of lemmas per tokenised document.

    texts: iterable of token lists. Only tokens whose part-of-speech tag is in
    `allowed_postags` contribute a lemma.
    """
    def _lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return " ".join(tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags)

    return [_lemmas_of(tokens) for tokens in texts]

# Lemmatize, keeping only nouns and verbs
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB'])
print(data_lemmatized[:2])  # Print the first two lemmatized entries for verification

# One-column frame holding the lemmatized documents
lemmatized_df = pd.DataFrame({'Text': data_lemmatized})

# Write the lemmatized corpus as a tab-separated file
output_corpus_path = '/content/lemmatized_corpus.tsv'
lemmatized_df.to_csv(output_corpus_path, sep='\t', index=False)

print(f"Lemmatized corpus saved to '{output_corpus_path}'")

# Optionally, download the TSV file
from google.colab import files
files.download(output_corpus_path)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Raw articles DataFrame saved to 'raw_articles.csv'
[['seizure', 'due', 'to', 'multiple', 'drugs', 'intoxication', 'case', 'report', 'the', 'mechanism', 'of', 'the', 'antidepressant', 'effect', 'of', 'bupropion', 'is', 'not', 'fully', 'understood', 'besides', 'using', 'it', 'in', 'the', 'treatment', 'of', 'depression', 'it', 'is', 'found', 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE: this cell relies on `results` and `articleList` left in the kernel by an earlier cell
for article in results:
    articleDict = article.toDict()

    # `or ''` keeps fields that are present but None from being formatted as the word "None"
    title = articleDict['title'] or ''
    abstract = articleDict.get('abstract') or ''
    # Skip a None keyword list and None entries inside it; str.join would raise TypeError on them
    keywords = ' '.join(keyword for keyword in (articleDict.get('keywords') or []) if keyword)

    # Combine title, abstract, and keywords into one text
    text = f"{title}\n{abstract}\n{keywords}"

    articleDict['text'] = text
    articleList.append(articleDict)

articlesPD = pd.DataFrame.from_dict(articleList)
export_csv = articlesPD.to_csv(r'C:\Users\ahomiar\OneDrive - Nexus365\Desktop\export_dataframe.csv', index=None, header=True)

print(articlesPD)

                                              pubmed_id  \
0                                              27793241   
1                                              25529756   
2                                              25148542   
3                                              27136907   
4                                              26706484   
...                                                 ...   
4995                                           23428786   
4996                                           23428785   
4997                                           23428784   
4998                                           23428782   
4999  23420082\n12720301\n18308820\n15056516\n110992...   

                                                  title  \
0     Seizure due to multiple drugs intoxication: a ...   
1     Efficacy of ranitidine in olanzapine-induced w...   
2     Characterization and evaluation of self-nanoem...   
3     Pharmacokinetic profile after multiple deltoid...

In [None]:
import pandas as pd

search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014/2024[Date - Publication])"
results = pubmed.query(search_term)
articleList = []
articleInfo = []
article_texts = []

for article in results:
    articleDict = article.toDict()

    # `or ''` keeps fields that are present but None from being formatted as the word "None"
    title = articleDict['title'] or ''
    abstract = articleDict.get('abstract') or ''
    # Skip a None keyword list and None entries inside it; str.join would raise TypeError on them
    keywords = ' '.join(keyword for keyword in (articleDict.get('keywords') or []) if keyword)

    # Combine title, abstract, and keywords into one text
    text = f"{title}\n{abstract}\n{keywords}"
    article_texts.append(text)

    articleDict['text'] = text
    articleList.append(articleDict)

# Build the corpus in one pass (each article text followed by a newline) instead of growing a
# string with += inside the loop
corpus = "".join(f"{text}\n" for text in article_texts)

articlesPD = pd.DataFrame.from_dict(articleList)
export_csv = articlesPD.to_csv(r'C:\Users\ahomiar\OneDrive - Nexus365\Desktop\export_dataframe.csv', index=None, header=True)

print(corpus)

# Convert corpus to DataFrame, one row per line of the corpus
corpus_df = pd.DataFrame({'Text': corpus.split('\n')})

# Define file path for saving the output in the Google Colab directory
output_file = '/content/corpus.tsv'

# Export the DataFrame to a .tsv file (written once; the duplicated second write was removed)
corpus_df.to_csv(output_file, sep='\t', index=False)

print(f"Corpus saved to '{output_file}'")


Seizure due to multiple drugs intoxication: a case report.
The mechanism of the antidepressant effect of bupropion is not fully understood. Besides, using it in the treatment of depression, it is found to be effective in reducing withdrawal symptoms due to smoking cessation. A 28-year-old female patient with a history of depression was admitted to emergency department an hour after ingestion of bupropion, quetiapine, and levothyroxine in high doses to commit suicide. While accepting her into the Intensive Care Unit, she was awake, alert, disoriented and agitated. After 2h, the patient had a generalized tonic-clonic seizure. The necessary treatment was given and 9h later with hemodynamic improvement, the patients' mental status improved. Bupropion may cause unusual behaviors such as delusions, paranoia, hallucinations, or confusion. The risk of seizure is strongly dose-dependent. We want to emphasize the importance of early gastric lavage and administration of activated charcoal.
Buprip

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

In [None]:
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Fetch the NLTK data packages used by this notebook
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# English stop words as a set
from nltk.corpus import stopwords
stop_words = {*stopwords.words('english')}

# Define preprocessing function
def preprocess_text(text):
    """Strip e-mail addresses, collapse whitespace and drop single quotes, in that order."""
    without_emails = re.sub(r'\S*@\S*\s?', '', text)
    # every whitespace run (newlines included) becomes a single space
    single_spaced = re.sub(r'\s+', ' ', without_emails)
    return re.sub(r"\'", "", single_spaced)

# Search PubMed
search_term = "(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014/2024[Date - Publication])"
results = pubmed.query(search_term)

# spaCy pipeline with the parser and NER components switched off
# (install the model first with: python -m spacy download en_core_web_sm)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def _clean_if_str(value):
    """Run preprocess_text on strings; pass anything else (e.g. None) through untouched."""
    return preprocess_text(value) if isinstance(value, str) else value


# Collect one cleaned dict per search hit
articleList = []
for article in results:
    articleDict = article.toDict()

    title = _clean_if_str(articleDict.get('title', ''))
    abstract = _clean_if_str(articleDict.get('abstract', ''))

    # "title. abstract" when both are non-empty, otherwise whichever one exists (or '')
    if title and abstract:
        alltext = title + '. ' + abstract
    else:
        alltext = title or abstract or ''

    articleDict.update(title=title, abstract=abstract, alltext=alltext)
    articleList.append(articleDict)


# Tabulate the articles
articlesPD = pd.DataFrame(articleList)

# Show the table
print(articlesPD)

# Output location inside the Google Colab directory
output_file = '/content/corpus.tsv'

# Write the table as a tab-separated file
articlesPD.to_csv(output_file, sep='\t', index=False)

print(f"Corpus saved to '{output_file}'")

# Handle missing values in 'title' and 'abstract' columns.
# Assign the result back: fillna(inplace=True) on a column selection is chained assignment,
# which recent pandas deprecates and which does not update the frame under copy-on-write.
articlesPD['title'] = articlesPD['title'].fillna('')
articlesPD['abstract'] = articlesPD['abstract'].fillna('')

# Combine 'title' and 'abstract' into 'alltext' column (overwrites any existing 'alltext')
articlesPD["alltext"] = articlesPD["title"].astype(str) + '. ' + articlesPD["abstract"].astype(str)

# Convert to list
data = articlesPD.alltext.values.tolist()

pprint(data[:1])

def sent_to_words(sentences):
    """Lazily yield one list of lower-cased tokens per input document."""
    # deacc=True strips accent marks from the tokens
    return (gensim.utils.simple_preprocess(str(document), deacc=True) for document in sentences)

data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Return one space-joined string of lemmas per tokenised document.

    texts: iterable of token lists. Only tokens whose part-of-speech tag is in
    `allowed_postags` are kept; a '-PRON-' lemma is replaced by an empty string.
    """
    def _lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        kept = ('' if tok.lemma_ == '-PRON-' else tok.lemma_
                for tok in parsed if tok.pos_ in allowed_postags)
        return " ".join(kept)

    return [_lemmas_of(tokens) for tokens in texts]

# Lemmatize, keeping only nouns and verbs
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB'])
print(data_lemmatized[:2])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                            pubmed_id  \
0                                            27793241   
1                                            25529756   
2                                            25148542   
3                                            27136907   
4                                            26706484   
..                                                ...   
95  25628381\n17981266\n21123312\n21677641\n164699...   
96                                           25626189   
97                                           25624181   
98                                           25619678   
99                                           25619432   

                                                title  \
0   Seizure due to multiple drugs intoxication: a ...   
1   Efficacy of ranitidine in olanzapine-induced w...   
2   Characterization and evaluation of self-nanoem...   
3   Pharmacokinetic profile after multiple deltoid...   
4   Pregabalin Treatment of a 

In [None]:
# Bag-of-words counts for the lemmatized documents
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=10,                         # ignore words appearing in fewer than 10 documents
    stop_words='english',              # remove stop words
    lowercase=True,                    # convert all words to lowercase
    token_pattern='[a-zA-Z0-9]{3,}',   # tokens are runs of 3 or more alphanumeric chars
    # max_features=50000,              # max number of unique words
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)


# Build LDA Model
lda_model = LatentDirichletAllocation(
    n_components=20,           # number of topics
    max_iter=10,               # max learning iterations
    learning_method='online',
    random_state=100,          # fixed seed for reproducibility
    batch_size=128,            # number of docs in each learning iteration
    evaluate_every=-1,         # -1: do not compute perplexity during training
    n_jobs=-1,                 # use all available CPUs
)
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
 evaluate_every=-1, learning_decay=0.7,
 learning_method='online', learning_offset=10.0,
 max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
 n_components=10, n_jobs=-1, perp_tol=0.1,
 random_state=100, topic_word_prior=None,
 total_samples=1000000.0, verbose=0)


LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [None]:
# Log likelihood: the higher the better
log_likelihood = lda_model.score(data_vectorized)
print("Log Likelihood: ", log_likelihood)
# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
model_perplexity = lda_model.perplexity(data_vectorized)
print("Perplexity: ", model_perplexity)
# Show the hyper-parameters the model was built with
pprint(lda_model.get_params())

Log Likelihood:  -12123.160377851218
Perplexity:  110.02651214831117
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [None]:
# Hyper-parameter grid: number of topics x learning decay
search_params = {
    'n_components': [10, 15, 20, 25, 30],
    'learning_decay': [.5, .7, .9],
}
# Base estimator whose parameters the grid search varies
lda = LatentDirichletAllocation(
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
# Grid search over the parameter grid
model = GridSearchCV(lda, param_grid=search_params)
# Fit every parameter combination on the document-term matrix
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
       n_jobs=1,
       param_grid={'n_topics': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
# Estimator selected by the grid search
best_lda_model = model.best_estimator_
# Report its parameters, its best score, and its perplexity on the data
for report_label, report_value in (
    ("Best Model's Params: ", model.best_params_),
    ("Best Log Likelihood Score: ", model.best_score_),
    ("Model Perplexity: ", best_lda_model.perplexity(data_vectorized)),
):
    print(report_label, report_value)

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -3158.044554375084
Model Perplexity:  85.51228493114559


In [None]:
# Document-topic matrix from the best model
lda_output = best_lda_model.transform(data_vectorized)
# Column labels (one per topic) and row labels (one per document)
topicnames = [f'Topic{i}' for i in range(best_lda_model.n_components)]
docnames = [f'Doc{i}' for i in range(len(data))]
# Topic weights rounded to two decimals, as a labelled frame
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Index of the highest-weighted topic for each document
dominant_topic = df_document_topic.values.argmax(axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
    """Return a CSS colour rule: green for values above 0.1, black otherwise."""
    if val > .1:
        return 'color: green'
    return 'color: black'
def make_bold(val):
    """Return a CSS font-weight rule: 700 for values above 0.1, 400 otherwise."""
    if val > .1:
        return 'font-weight: 700'
    return 'font-weight: 400'
# Apply Style
def _style_elementwise(styler, func):
    """Apply `func` to every cell, using Styler.map where available.

    Styler.applymap is deprecated in favour of Styler.map in recent pandas;
    older versions only have applymap, so fall back to it.
    """
    elementwise = getattr(styler, 'map', None) or styler.applymap
    return elementwise(func)

df_document_topics = _style_elementwise(
    _style_elementwise(df_document_topic.head(15).style, color_green),
    make_bold,
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.26,0.0,4
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.97,0.0,8
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.96,0.0,8
Doc3,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.94,0.01,8
Doc4,0.0,0.0,0.0,0.0,0.96,0.0,0.0,0.0,0.0,0.0,4
Doc5,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,4
Doc6,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.79,0.0,8
Doc7,0.01,0.01,0.01,0.01,0.01,0.01,0.6,0.36,0.01,0.01,6
Doc8,0.0,0.0,0.0,0.0,0.0,0.58,0.0,0.41,0.0,0.0,5
Doc9,0.03,0.03,0.03,0.03,0.03,0.03,0.77,0.03,0.03,0.03,6


In [None]:
# Topic-keyword weight matrix: one row per topic, one column per vocabulary word
df_topic_keywords = pd.DataFrame(
    best_lda_model.components_,
    index=topicnames,
    columns=vectorizer.get_feature_names_out(),
)
# Preview the first rows
df_topic_keywords.head()

Unnamed: 0,activity,administration,affect,age,aim,analysis,antidepressant,antipsychotic,assess,base,...,suggest,symptom,test,time,treat,treatment,trial,use,week,year
Topic0,7.097066,0.780863,2.008175,0.948729,0.825351,0.832266,6.774668,0.65333,0.705242,1.343791,...,0.658546,3.9564,1.692817,1.477055,0.533988,12.458249,0.594954,1.43325,0.62742,1.681761
Topic1,0.486029,0.613973,0.877518,0.92991,0.523559,1.17614,2.882595,0.531633,0.504885,2.205681,...,0.461323,0.537301,0.860541,0.903305,0.928248,0.738673,1.006866,0.665004,0.437832,0.711363
Topic2,0.648359,0.605313,0.526726,1.780515,0.499672,0.599043,12.72459,0.976707,1.369489,3.452719,...,0.448796,0.812233,0.626169,0.581511,0.748189,1.178238,1.452608,10.460503,0.604449,4.723486
Topic3,1.09636,2.853238,1.051711,0.532732,0.855433,0.618989,0.759551,0.54156,1.2884,0.528493,...,0.574605,0.593363,1.687179,0.617551,0.466819,0.615695,0.537604,0.526024,0.57208,0.537678
Topic4,0.528649,1.421545,0.701123,1.716883,1.141713,1.191609,0.454568,0.497461,1.393304,0.704442,...,0.717369,7.150447,0.491617,0.660121,0.750109,7.460074,0.898001,1.611438,0.949142,2.90073


In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return, for every topic in `lda_model.components_`, its `n_words` highest-weighted words.

    The words come from `vectorizer.get_feature_names_out()`; the result is a list with one
    array of words per topic, ordered from highest to lowest weight.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    return [
        vocabulary.take(np.argsort(-weights)[:n_words])
        for weights in lda_model.components_
    ]
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_ranks = df_topic_keywords.shape
df_topic_keywords.columns = [f'Word {rank}' for rank in range(n_ranks)]
df_topic_keywords.index = [f'Topic {topic}' for topic in range(n_topics)]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,receptor,depression,treatment,activity,antidepressant,study,illness,change,level,patient,drug,symptom,mechanism,schizophrenia,function
Topic 1,care,effect,patient,antidepressant,base,report,receive,evidence,analysis,datum,group,trial,increase,risk,control
Topic 2,antidepressant,use,follow,day,medication,increase,year,base,observe,study,report,receive,patient,difference,care
Topic 3,effect,administration,observe,study,induce,test,condition,receptor,factor,report,improve,assess,mechanism,decrease,activity
Topic 4,patient,case,treatment,symptom,level,disorder,depression,remission,improvement,cause,measure,year,lead,improve,function
Topic 5,effect,group,depression,treatment,receptor,patient,antidepressant,control,week,test,measure,increase,study,medication,score
Topic 6,treatment,depression,increase,medication,care,antidepressant,receive,datum,follow,improve,behavior,include,need,use,group
Topic 7,response,treatment,disorder,use,patient,identify,study,level,schizophrenia,suggest,evidence,depression,finding,remission,illness
Topic 8,patient,treatment,use,study,effect,risk,drug,medication,schizophrenia,factor,group,disorder,treat,depression,efficacy
Topic 9,induce,treat,level,increase,activity,investigate,test,decrease,change,week,result,administration,suggest,use,affect


In [None]:
# Print the 15 highest-weighted terms of each topic, listed from lowest to
# highest weight (argsort is ascending and the last 15 positions are kept).
# NOTE(review): this reads `lda_model`, whereas the neighbouring cells inspect
# `best_lda_model` - confirm which model is meant here.
feature_names = vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lda_model.components_):
    top_positions = weights.argsort()[-15:]
    print(f'THE TOP 15 WORDS FOR TOPIC #{topic_number}')
    print([feature_names[position] for position in top_positions])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['study', 'patient', 'report', 'symptom', 'control', 'identify', 'score', 'drug', 'medication', 'year', 'age', 'cause', 'depression', 'disease', 'case']


THE TOP 15 WORDS FOR TOPIC #1
['induce', 'difference', 'efficacy', 'base', 'assess', 'schizophrenia', 'effect', 'medication', 'dose', 'patient', 'receive', 'study', 'observe', 'follow', 'day']


THE TOP 15 WORDS FOR TOPIC #2
['study', 'result', 'medication', 'evidence', 'test', 'report', 'increase', 'group', 'control', 'depression', 'receptor', 'antidepressant', 'care', 'patient', 'effect']


THE TOP 15 WORDS FOR TOPIC #3
['time', 'study', 'suggest', 'lead', 'increase', 'schizophrenia', 'evaluate', 'effect', 'decrease', 'dose', 'illness', 'behavior', 'test', 'use', 'induce']


THE TOP 15 WORDS FOR TOPIC #4
['induce', 'dose', 'behavior', 'difference', 'efficacy', 'disease', 'increase', 'result', 'administration', 'suggest', 'include', 'report', 'effect', 'test', 'receptor']


THE TOP 15 WORDS FOR TOPIC #5

In [None]:
# Report, per article, each of a topic's five leading keywords that occurs in
# the title or the abstract. Topics are printed 1-based (topic_idx + 1).
for index, row in articlesPD.iterrows():
    title, abstract_text = row['title'], row['abstract']
    pub_date = row.get('publication_date', '')

    # Only string titles/abstracts can be searched; anything else is skipped.
    if not (isinstance(title, str) and isinstance(abstract_text, str)):
        print(f"Skipping article {index} due to missing title or abstract")
        continue

    for topic_idx, top_words in enumerate(topic_keywords):
        # Plain substring containment: a keyword also matches inside longer
        # words, and the comparison is case-sensitive.
        found = [kw for kw in top_words[:5] if kw in title or kw in abstract_text]
        for word in found:
            print(f"Topic {topic_idx + 1}: Word '{word}' mentioned in the title/abstract of article {index} published on {pub_date}")

Topic 1: Word 'depression' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 1: Word 'treatment' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 1: Word 'antidepressant' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 2: Word 'effect' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 2: Word 'patient' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 2: Word 'antidepressant' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 3: Word 'antidepressant' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 3: Word 'use' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 4: Word 'effect' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 4: Word 'administration' mentioned in the title/abstract of article 0 published on 2016-10-30
Topic 5: Word 'patient' mentioned in the ti

In [None]:
# Report the articles whose title + abstract contain ALL five leading keywords
# of a topic. Topics are printed 1-based (topic_idx + 1).
for index, row in articlesPD.iterrows():
    title = row['title']
    abstract_text = row['abstract']
    pub_date = row.get('publication_date', '')

    # Title and abstract joined into a single searchable string
    combined_text = "{} {}".format(title, abstract_text)

    # Only string titles/abstracts are searched; anything else is skipped.
    searchable = isinstance(title, str) and isinstance(abstract_text, str)
    if not searchable:
        print(f"Skipping article {index} due to missing title or abstract")
        continue

    for topic_idx, top_words in enumerate(topic_keywords):
        # Substring containment; the topic qualifies when no keyword is absent.
        if not any(word not in combined_text for word in top_words[:5]):
            print(f"Topic {topic_idx + 1}: All top 5 words are mentioned in the title/abstract of article {index} published on {pub_date}")

Topic 8: All top 5 words are mentioned in the title/abstract of article 4 published on 2015-12-27
Topic 10: All top 5 words are mentioned in the title/abstract of article 20 published on 2015-05-16
Topic 9: All top 5 words are mentioned in the title/abstract of article 26 published on 2015-05-02
Topic 7: All top 5 words are mentioned in the title/abstract of article 32 published on 2015-04-02
Topic 7: All top 5 words are mentioned in the title/abstract of article 37 published on 2015-03-24
Topic 8: All top 5 words are mentioned in the title/abstract of article 49 published on 2015-03-04
Topic 8: All top 5 words are mentioned in the title/abstract of article 50 published on 2015-03-04
Topic 8: All top 5 words are mentioned in the title/abstract of article 51 published on 2015-03-04
Topic 8: All top 5 words are mentioned in the title/abstract of article 52 published on 2015-03-04
Topic 2: All top 5 words are mentioned in the title/abstract of article 55 published on 2015-03-03
Topic 7: A

In [None]:
# Scan the LAST rows of articlesPD (by position) and record, for each one, the
# topics whose five leading keywords all occur in its title + abstract.
N_TAIL_ROWS = 50  # how many trailing rows to scan

# Get the total number of rows in the DataFrame
total_rows = len(articlesPD)

# Clamp the start at 0: with fewer than N_TAIL_ROWS rows the start would be
# negative and .iloc would wrap around, visiting rows under negative indices.
first_row = max(0, total_rows - N_TAIL_ROWS)

# List to store results as (topic number, keywords, row position, publication date)
results = []

for index in range(first_row, total_rows):
    row = articlesPD.iloc[index]
    title = row['title']
    abstract_text = row['abstract']
    pub_date = row.get('publication_date', '')

    # Only string titles/abstracts can be searched; anything else is skipped.
    if not (isinstance(title, str) and isinstance(abstract_text, str)):
        print(f"Skipping article {index} due to missing title or abstract")
        continue

    # Combine title and abstract
    combined_text = f"{title} {abstract_text}"

    for topic_idx, top_words in enumerate(topic_keywords):
        # Substring containment; topics are stored 1-based (topic_idx + 1).
        if all(word in combined_text for word in top_words[:5]):
            results.append((topic_idx + 1, top_words[:5], index, pub_date))

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['Topic', 'Words', 'Article_Index', 'Publication_Date'])

# Save DataFrame to a .tsv file in Google Colab
output_file = '/content/historical_abstracts.tsv'
results_df.to_csv(output_file, sep='\t', index=False)

print(f"Results saved to '{output_file}'")

Results saved to '/content/historical_abstracts.tsv'


In [None]:
from nltk import word_tokenize
import string

# Initialize topic labels (1-based: "Topic 1" ... "Topic N")
topic_labels = [f"Topic {i+1}" for i in range(len(topic_keywords))]

# Rows are papers (by position in articlesPD), columns are topics.
# Start from 0 so that every cell holds an integer flag rather than NaN.
binary_data_table = pd.DataFrame(0, index=range(len(articlesPD)), columns=topic_labels)

# Iterate through each paper; `position` is the row number, which is what
# .iloc needs (the iterrows label is not guaranteed to be positional).
for position, (_, row) in enumerate(articlesPD.iterrows()):
    title = row['title']
    abstract_text = row['abstract']

    # One flag per topic for this paper; stays all-zero when text is missing
    topic_mentions = [0] * len(topic_keywords)

    # Check if title and abstract are not NaNs
    if isinstance(title, str) and isinstance(abstract_text, str):
        # Lower-cased word tokens of title and abstract, punctuation tokens removed
        titlewords = {w.lower() for w in word_tokenize(title) if w not in string.punctuation}
        abstractwords = {w.lower() for w in word_tokenize(abstract_text) if w not in string.punctuation}
        allwords = titlewords.union(abstractwords)

        # A topic is flagged only when ALL of its top 5 words occur as tokens
        for topic_idx, top_words in enumerate(topic_keywords):
            if all(e.lower() in allwords for e in set(top_words[:5])):
                topic_mentions[topic_idx] = 1

    # Update binary data table with topic mentions for this paper
    binary_data_table.iloc[position] = topic_mentions

# Add labels for rows (papers)
binary_data_table.index.name = 'Paper'

# Print  first 50 rows of the binary data table
print(binary_data_table.head(50))

# Save  binary data table to csv
binary_data_table.to_csv('binary_data_table.csv')

print("Binary data table saved to 'binary_data_table.csv'")


      Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9  \
Paper                                                                           
0         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
1         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
6         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
7         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
8         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
9         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
10        NaN     NaN     Na

In [None]:
import pandas as pd
from nltk import word_tokenize
import string

# Assuming topic_keywords and articlesPD are defined elsewhere
# topic_keywords = [...]
# articlesPD = pd.read_csv('path_to_articlesPD.csv')

# Initialize topic labels (1-based: "Topic 1" ... "Topic N")
topic_labels = [f"Topic {i+1}" for i in range(len(topic_keywords))]

# Rows are papers (by position in articlesPD), columns are topics; 0 = not mentioned
binary_data_table = pd.DataFrame(0, index=range(len(articlesPD)), columns=topic_labels)

# Lower-cased top-5 keyword set of every topic, computed once instead of per paper
topic_term_sets = [{term.lower() for term in top_words[:5]} for top_words in topic_keywords]

# Iterate through each paper; `position` is the row number, which is what
# .iloc needs (the iterrows label is not guaranteed to be positional).
for position, (_, row) in enumerate(articlesPD.iterrows()):
    title = row['title']
    abstract_text = row['abstract']

    # Papers without a string title and abstract keep their all-zero row
    if not (isinstance(title, str) and isinstance(abstract_text, str)):
        continue

    # Tokenize, lower-case and remove punctuation tokens
    titlewords = {w.lower() for w in word_tokenize(title) if w not in string.punctuation}
    abstractwords = {w.lower() for w in word_tokenize(abstract_text) if w not in string.punctuation}
    allwords = titlewords | abstractwords

    # A topic is flagged only when ALL of its top 5 words occur as exact tokens.
    # NOTE(review): exact-token matching will not match inflected forms of the
    # keywords (e.g. plurals) - confirm this strictness is intended.
    for topic_idx, terms in enumerate(topic_term_sets):
        if terms <= allwords:
            binary_data_table.iloc[position, topic_idx] = 1

# Add labels for rows (papers)
binary_data_table.index.name = 'Paper'

# Print first 50 rows of the binary data table
print(binary_data_table.head(50))

# Save the binary data table to csv
binary_data_table.to_csv('binary_data_table.csv')

print("Binary data table saved to 'binary_data_table.csv'")


       Topic 1  Topic 2  Topic 3  Topic 4  Topic 5  Topic 6  Topic 7  Topic 8  \
Paper                                                                           
0            0        0        0        0        0        0        0        0   
1            0        0        0        0        0        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        0        0   
4            0        0        0        0        0        0        0        0   
5            0        0        0        0        0        0        0        0   
6            0        0        0        0        0        0        0        0   
7            0        0        0        0        0        0        0        0   
8            0        0        0        0        0        0        0        0   
9            0        0        0        0        0        0        0        0   
10           0        0     

In [None]:
# Attach the per-paper topic flags to the article metadata and save the result.
binary_data_table = pd.read_csv('binary_data_table.csv')

if 'Paper' not in binary_data_table.columns:
    raise KeyError("'Paper' column not found in binary_data_table")

articlesPD['title'] = articlesPD['title'].astype(str)

articlesPD['publication_date'] = pd.to_datetime(articlesPD['publication_date'])

# 'Paper' is the saved row number of each article (the table was written with a
# 0..len-1 index named 'Paper'), not its title, so joining it against titles
# matches nothing. Give the articles the same positional key and join on that;
# 'Paper' is therefore kept numeric rather than cast to str.
articles_keyed = articlesPD.reset_index(drop=True)
articles_keyed['Paper'] = articles_keyed.index

combined_data = pd.merge(articles_keyed, binary_data_table, on='Paper', how='inner')

combined_data.to_csv('combined_data.csv', index=False)

print("Combined data saved to 'combined_data.csv'")

Combined data saved to 'combined_data.csv'


In [None]:
# Aggregate topic mentions per week, month and year of publication, as raw
# counts and as each topic's share of the period's total mentions.
combined_data = pd.read_csv('combined_data.csv')

topic_labels = [col for col in combined_data.columns if col.startswith('Topic')]

print("First few rows of combined data:")
print(combined_data.head())

print("\nColumn types:")
print(combined_data.dtypes)

# Unparseable dates become NaT instead of raising
combined_data['publication_date'] = pd.to_datetime(combined_data['publication_date'], errors='coerce')

print("\nNumber of NaT values in publication_date:")
print(combined_data['publication_date'].isna().sum())

# .copy() so the column assignments below act on an independent frame
combined_data = combined_data.dropna(subset=['publication_date']).copy()

iso_calendar = combined_data['publication_date'].dt.isocalendar()
combined_data['publication_week'] = iso_calendar.week
# ISO weeks belong to ISO years: the first days of January can fall in week
# 52/53 of the previous ISO year and the last days of December in week 1 of
# the next, so weeks must be grouped by ISO year, not calendar year.
combined_data['publication_iso_year'] = iso_calendar.year
combined_data['publication_month'] = combined_data['publication_date'].dt.month
combined_data['publication_year'] = combined_data['publication_date'].dt.year

print("\nFirst few rows with new time period columns:")
print(combined_data[['publication_date', 'publication_week', 'publication_month', 'publication_year']].head())

# Weekly index levels keep their original names ('publication_year',
# 'publication_week'); the year level now carries the ISO year.
mentions_weekly = (
    combined_data.groupby(['publication_iso_year', 'publication_week'])[topic_labels]
    .sum()
    .rename_axis(['publication_year', 'publication_week'])
)
mentions_monthly = combined_data.groupby(['publication_year', 'publication_month'])[topic_labels].sum()
mentions_yearly = combined_data.groupby('publication_year')[topic_labels].sum()

# Row-wise shares; a period with no mentions at all yields NaN (0 / 0)
proportions_weekly = mentions_weekly.div(mentions_weekly.sum(axis=1), axis=0)
proportions_monthly = mentions_monthly.div(mentions_monthly.sum(axis=1), axis=0)
proportions_yearly = mentions_yearly.div(mentions_yearly.sum(axis=1), axis=0)

print("\nAggregate Counts of Mentions of Topics Over Weeks:")
print(mentions_weekly)
print("\nAggregate Counts of Mentions of Topics Over Months:")
print(mentions_monthly)
print("\nAggregate Counts of Mentions of Topics Over Years:")
print(mentions_yearly)

print("\nRelative Proportions of Mentions of Topics Over Weeks:")
print(proportions_weekly)
print("\nRelative Proportions of Mentions of Topics Over Months:")
print(proportions_monthly)
print("\nRelative Proportions of Mentions of Topics Over Years:")
print(proportions_yearly)


First few rows of combined data:
Empty DataFrame
Columns: [pubmed_id, title, abstract, keywords, journal, publication_date, authors, methods, conclusions, results, copyrights, doi, xml, alltext, Paper, Topic 1, Topic 2, Topic 3, Topic 4, Topic 5, Topic 6, Topic 7, Topic 8, Topic 9, Topic 10]
Index: []

[0 rows x 25 columns]

Column types:
pubmed_id           object
title               object
abstract            object
keywords            object
journal             object
publication_date    object
authors             object
methods             object
conclusions         object
results             object
copyrights          object
doi                 object
xml                 object
alltext             object
Paper               object
Topic 1             object
Topic 2             object
Topic 3             object
Topic 4             object
Topic 5             object
Topic 6             object
Topic 7             object
Topic 8             object
Topic 9             object
Topic 10   

In [None]:
# Join the saved corpus with the per-paper topic flags and export a slim table
# (publication date, title, abstract, topic flags).

# Malformed lines are skipped by on_bad_lines='skip'. Any other load error is
# left to propagate: continuing without `corpus` would only fail further down.
corpus = pd.read_csv('/content/corpus_colab.tsv', sep='\t', on_bad_lines='skip')

binary_data_table = pd.read_csv('binary_data_table.csv')

# 'Paper' holds row numbers, not titles, so a join against titles matches
# nothing; the tables are joined by row position instead.
# NOTE(review): this assumes corpus_colab.tsv lists the same articles, in the
# same order, as the frame the binary table was built from - TODO confirm.
if len(corpus) != len(binary_data_table):
    raise ValueError(
        f"corpus has {len(corpus)} rows but binary_data_table has "
        f"{len(binary_data_table)}; rows cannot be aligned by position"
    )

# Record the row position BEFORE any rows are dropped
corpus = corpus.reset_index(drop=True)
corpus['Paper'] = corpus.index

corpus['publication_date'] = pd.to_datetime(corpus['publication_date'], errors='coerce')

# .copy() so the column assignment below acts on an independent frame
corpus = corpus.dropna(subset=['publication_date']).copy()

corpus['title'] = corpus['title'].astype(str)

print("First few rows of corpus:")
print(corpus.head())

print("\nFirst few rows of binary_data_table:")
print(binary_data_table.head())

combined_data = pd.merge(corpus, binary_data_table, on='Paper', how='inner')

topic_labels = [col for col in binary_data_table.columns if col.startswith('Topic')]

selected_columns = ['publication_date', 'title', 'abstract'] + topic_labels
combined_data = combined_data[selected_columns]

output_filename = 'combined_publications_with_topic_mentions.csv'
combined_data.to_csv(output_filename, index=False)

print("\nFirst few rows of the final combined data:")
print(combined_data.head())

print(f"\nFile '{output_filename}' has been created with the required information.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/corpus_colab.tsv'