In [5]:
import pandas as pd
from nltk.corpus import stopwords
from gensim import corpora, models, similarities, utils, matutils
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import cross_val_score

In [6]:
df_all_topics = pd.read_csv('pubmed_data', usecols= ['pid', 'title', 'abstract', 'topic_id', 'sr_title', 'rel'])

In [7]:
df_all_topics.head()

Unnamed: 0,pid,title,abstract,topic_id,sr_title,rel
0,3903849,[Nonvisualization of the gallbladder lumen by ...,Out of 12 000 sonographic examinations of the ...,CD011548,Ultrasound versus liver function tests for dia...,0
1,3903849,[Nonvisualization of the gallbladder lumen by ...,Out of 12 000 sonographic examinations of the ...,CD011549,Endoscopic ultrasound versus magnetic resonanc...,0
2,3903849,[Nonvisualization of the gallbladder lumen by ...,Out of 12 000 sonographic examinations of the ...,CD010339,Endoscopic retrograde cholangiopancreatography...,0
3,1559484,Cholangioscopy.,,CD011548,Ultrasound versus liver function tests for dia...,0
4,1559484,Cholangioscopy.,,CD011549,Endoscopic ultrasound versus magnetic resonanc...,0


In [8]:
df_all_topics.set_index(["topic_id", "pid"]).count(level="topic_id").sort_values(by = ["title"])

Unnamed: 0_level_0,title,abstract,sr_title,rel
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD008760,64,56,64,64
CD010860,94,92,94,94
CD010705,114,112,114,114
CD010896,169,159,169,169
CD010775,241,221,241,241
CD010772,316,307,316,316
CD010771,318,309,318,318
CD010542,348,338,348,348
CD010386,625,579,625,625
CD009135,791,702,791,791


In [9]:
def generate_corpus(df_topics):
    #primero creamos el dicionario    
    stop_words = stopwords.words('english')
    dictionary = corpora.Dictionary()
    for index, row in df_topics.iterrows():
        if type(row['abstract']) == str:  # revisamos si tenemos el abstract (no estan todos)
            title = list(filter(lambda x : x not in stop_words, utils.tokenize(row['title'], lowercase = True)))
            abstract = list(filter(lambda x : x not in stop_words, utils.tokenize(row['abstract'], lowercase = True)))
            dictionary.add_documents([title,abstract]) 
    #luego transformamos las palabras segun el diccionario
    corpus = []
    for index, row in df_topics.iterrows():
        if type(row['abstract']) == str: # revisamos si tenemos el abstract (no estan todos)
            title = list(filter(lambda x : x  not in stop_words, utils.tokenize(row['title'], lowercase=True))) 
            abstract = list(filter(lambda x : x  not in stop_words, utils.tokenize(row['abstract'], lowercase=True)))
            corpus.append(dictionary.doc2bow(title+abstract))
    
    print("dictionary len: {0}".format(len(dictionary)))
    
    return corpus, dictionary        
    
    
    

In [38]:
 df_specific_topic = df_all_topics[df_all_topics['topic_id'] == "CD009944"]

In [11]:
 df_specific_topic.head()

Unnamed: 0,pid,title,abstract,topic_id,sr_title,rel
1685,4588237,Radiology of the biliary system.,,CD009591,Imaging modalities for the non-invasive diagno...,0
6680,3517700,[Is it still useful to integrate cholecystogra...,There are discordant opinions in literature on...,CD009591,Imaging modalities for the non-invasive diagno...,0
6915,3052732,[Sonographic appearance of adenomyomatosis of ...,,CD009591,Imaging modalities for the non-invasive diagno...,0
19632,21902990,Large abdominal wall endometrioma following la...,Endometriosis is a common condition in women t...,CD009591,Imaging modalities for the non-invasive diagno...,0
24666,2203297,Adenomyomatosis of the gall bladder: the NUH e...,Adenomyomatosis (AD) is a degenerative disorde...,CD009591,Imaging modalities for the non-invasive diagno...,0


In [None]:
corpus , dictionary = generate_corpus( df_all_topics)

In [None]:
dictionary.token2id

In [None]:
corpora.MmCorpus.serialize('./corpus_df_all_topics.mm', corpus)
dictionary.save('./dictionary_df_all_topics.dict') 

In [15]:
corpus = corpora.MmCorpus('./corpus_df_all_topics.mm')
dictionary = corpora.Dictionary.load('./dictionary_df_all_topics.dict')
len(dictionary)

119464

In [16]:
lsi = models.lsimodel.LsiModel(corpus=corpus, id2word=dictionary, num_topics=400)
lsi.print_topics(10)

[(0,
  '0.670*"patients" + 0.172*"disease" + 0.140*"study" + 0.138*"p" + 0.121*"treatment" + 0.118*"cases" + 0.111*"clinical" + 0.101*"cancer" + 0.089*"group" + 0.085*"diagnosis"'),
 (1,
  '0.540*"patients" + -0.342*"cells" + -0.266*"cell" + -0.170*"lymph" + -0.147*"expression" + -0.137*"p" + -0.129*"cases" + -0.126*"cancer" + -0.122*"cd" + -0.111*"tumor"'),
 (2,
  '-0.320*"ad" + -0.292*"disease" + 0.288*"lymph" + -0.229*"cognitive" + -0.215*"alzheimer" + 0.199*"cancer" + 0.191*"node" + -0.181*"dementia" + 0.169*"nodes" + 0.157*"cells"'),
 (3,
  '-0.324*"patients" + 0.296*"duct" + 0.250*"cases" + 0.244*"bile" + -0.234*"lymph" + 0.178*"common" + 0.174*"stones" + -0.159*"node" + 0.154*"diagnosis" + 0.152*"laparoscopic"'),
 (4,
  '0.444*"cells" + -0.313*"cancer" + -0.240*"lymph" + 0.240*"cell" + 0.203*"cd" + -0.194*"node" + 0.170*"p" + 0.165*"patients" + 0.146*"duct" + -0.135*"nodes"'),
 (5,
  '0.397*"disease" + 0.296*"ad" + -0.286*"p" + 0.223*"duct" + 0.209*"alzheimer" + 0.205*"lymph" + 

In [17]:
lsi.save('./lsi_model_400.model')

In [41]:
lsi = models.lsimodel.LsiModel(corpus=corpus, id2word=dictionary, num_topics=600)
lsi.print_topics(10)

[(0,
  '0.670*"patients" + 0.172*"disease" + 0.140*"study" + 0.138*"p" + 0.121*"treatment" + 0.118*"cases" + 0.111*"clinical" + 0.101*"cancer" + 0.089*"group" + 0.085*"diagnosis"'),
 (1,
  '0.540*"patients" + -0.342*"cells" + -0.266*"cell" + -0.170*"lymph" + -0.147*"expression" + -0.137*"p" + -0.128*"cases" + -0.127*"cancer" + -0.122*"cd" + -0.111*"tumor"'),
 (2,
  '-0.320*"ad" + -0.291*"disease" + 0.289*"lymph" + -0.229*"cognitive" + -0.215*"alzheimer" + 0.199*"cancer" + 0.192*"node" + -0.181*"dementia" + 0.169*"nodes" + 0.156*"cells"'),
 (3,
  '-0.324*"patients" + 0.296*"duct" + 0.250*"cases" + 0.244*"bile" + -0.235*"lymph" + 0.179*"common" + 0.174*"stones" + -0.159*"node" + 0.154*"diagnosis" + 0.152*"laparoscopic"'),
 (4,
  '0.444*"cells" + -0.312*"cancer" + 0.241*"cell" + -0.240*"lymph" + 0.203*"cd" + -0.194*"node" + 0.170*"p" + 0.166*"patients" + 0.144*"duct" + -0.135*"nodes"'),
 (5,
  '0.395*"disease" + 0.297*"ad" + -0.280*"p" + 0.225*"duct" + 0.209*"alzheimer" + 0.206*"lymph" + 

In [42]:
lsi.save('./lsi_model_600.model')

In [32]:
def get_reduced_data(df_specific_topic, dictionary, lsi):
    labels = []
    corpus = []
    stop_words = stopwords.words('english')
    for index, row in df_specific_topic.iterrows():
            if type(row['abstract']) == str: # revisamos si tenemos el abstract (no estan todos)
                title = list(filter(lambda x : x  not in stop_words, utils.tokenize(row['title'], lowercase=True))) 
                abstract = list(filter(lambda x : x  not in stop_words, utils.tokenize(row['abstract'], lowercase=True)))
                corpus.append(list(x[1] for x in lsi[dictionary.doc2bow(title+abstract)]))
                labels.append(row['rel'])
    
    
    return labels, corpus

In [43]:
labels, new_corpus = get_reduced_data(df_specific_topic, dictionary, lsi)

In [44]:
print(new_corpus[0])
print(len(labels))
print(type(labels))
print(type(new_corpus[0]))

[2.835565884254884, -0.7801260130393838, 2.338088738909797, 3.744501663012928, -1.1477247056721143, 2.3316716113140936, 0.8892436159019634, 2.006987281369657, -0.5177147445211432, 0.8038350661655081, -0.10872442411494598, -0.08007151210413711, -1.221210918668494, 0.13100386968298258, -1.0698500664229647, -1.3761389657679628, -0.7928385299271155, 1.1984319457578345, -2.1118969232893634, 0.21057790174054317, -1.4354748260852914, 0.849396560319428, -0.4838991008863585, 0.2897066436440419, 0.6115861364089227, 1.5856519293574982, -1.03952927441068, -0.5266480324008895, 1.987232440707245, 1.8976949610200438, 0.1388826048945927, -0.953269941618747, 0.24586480043553946, -0.8873748752284141, -1.0821882305029629, 0.457914503799375, -1.538625695218549, 0.8763333645609368, -0.4725193074448676, -0.6843014453901273, -0.4553317973719866, 1.5381158539273823, 0.154344170862252, -0.7703468418929709, -0.260217303079703, -0.34940869026685406, -0.08291765049693979, -0.5705677849684244, -0.8633780911613059,

In [45]:
#testing with a simple model to see if works
clf = BernoulliNB()
scores1 = cross_val_score(clf, new_corpus, labels, cv=5, scoring=('accuracy')).mean()
print (scores1)

0.8252593269272023
