# Prepare working dataset for search and vis

In [1]:
import json, os
import numpy as np
import pandas as pd
import nltk

## Load json file with parsed information

In [2]:
# paths
path_to_json_file = os.path.abspath('../data/json/')

### json file
# json_file = '/all_parsed_data_json' #name json file  
json_file = '/sample_json' #sample json file for testing  

file = '{}{}.json'.format(path_to_json_file, json_file)

In [3]:
# load json file
df = pd.read_json(file)

In [4]:
# Display full dataframe
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
df.head()

Unnamed: 0,nct_id,study_first_submitted,source,brief_title,overall_status,verification_date,study_type,study_first_posted,last_update_submitted,last_update_posted,phase,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip,sponsors/lead_sponsor/agency,sponsors/lead_sponsor/agency_class,study_design_info/allocation,study_design_info/intervention_model,study_design_info/primary_purpose
0,NCT00058058,"April 7, 2003",American College of Radiology Imaging Network,Magnetic Resonance Imaging in Women Recently D...,Completed,June 2018,Interventional,"April 9, 2003","June 29, 2018","July 27, 2018",,Breast Cancer,Breast Neoplasms,,\n OBJECTIVES:\n\n - Determine th...,\n RATIONALE: Diagnostic procedures such ...,Little Rock,United States,72205,American College of Radiology Imaging Network,Other,,Single Group Assignment,Diagnostic
1,NCT01980602,"September 24, 2013",University of Hull,Effect of Exercise on Patients With Claudicati...,Unknown status,November 2013,Interventional,"November 11, 2013","November 4, 2013","November 11, 2013",,Intermittent Claudication,Peripheral Arterial Disease,,,\n Title: How does exercise improve the c...,Hull,United Kingdom,HU3 2JZ,University of Hull,Other,Randomized,Single Group Assignment,Treatment
10,NCT01277458,"January 14, 2011",Public Health England,Ethnicity Data in HIV Positive Men Who Have Se...,Unknown status,December 2010,Observational,"January 17, 2011","January 14, 2011","January 17, 2011",Phase 4,HIV Infections,HIV Infections,,\n HIV Doctors will be identified at each...,\n Are there differences between the way ...,London,United Kingdom,SE1 7EH,Public Health England,Other,,,
100,NCT01164891,"July 16, 2010",Hoffmann-La Roche,A Pharmacokinetic and Metabolism Study of 14C-...,Completed,September 2017,Interventional,"July 19, 2010","January 11, 2016","February 5, 2016",Phase 3,Malignant Melanoma,Melanoma,,,"\n This open-label, non-randomized study ...",Zürich,Switzerland,8091,Hoffmann-La Roche,Industry,,Single Group Assignment,Treatment
1000,NCT03795727,"January 3, 2019",Cairo University,Clinical Evaluation of Sectional Matrix Versus...,Not yet recruiting,February 2019,Interventional,"January 8, 2019","March 10, 2019","March 12, 2019",,Proximal Contact,,,,\n The aim of this study is to evaluate t...,Cairo,Egypt,11553,Cairo University,Other,Randomized,Parallel Assignment,Treatment


## Clean dataset

In [6]:
# change column type
def data_types(dataframe, cols = [], to_type = ''):
    for col in cols:
        dataframe[col] = df[col].astype(to_type)

In [7]:
columns_dates = ['study_first_submitted', 'last_update_submitted', 'verification_date']
data_types(df, columns_dates, 'datetime64')

In [8]:
# Extract year 
df['year_submitted'] = df['study_first_submitted'].dt.year
df['year_last_updated'] = df['last_update_submitted'].dt.year
df['year_verification'] = df['verification_date'].dt.year

In [9]:
# Remove unnecesary columns
df.drop(columns_dates, axis = 1, inplace=True)

In [10]:
columns_to_drop = ['study_first_posted', 'last_update_posted']
df.drop(columns_to_drop, axis = 1, inplace=True)

In [11]:
# Remove /n in all df
df = df.replace(r'\n',' ', regex=True)

In [12]:
# Create new column for all text
df['all_text'] = df['source'] + ' ' + df['brief_title'] + ' ' + df['condition'] + ' ' + df['condition_browse/mesh_term'] + ' '+ df['intervention_browse/mesh_term'] + ' '+ df['detailed_description/textblock'] + ' ' + df['brief_summary/textblock']

# All_text in lowercase
df['all_text'] = df['all_text'].str.lower()

# remove extra whitespace
df.all_text = df.all_text.replace('\s+', ' ', regex=True)


In [13]:
# Remove extra white space in summary
df['brief_summary/textblock'] = df['brief_summary/textblock'].replace('\s+', ' ', regex=True)
df['brief_summary/textblock'][0]

' RATIONALE: Diagnostic procedures such as magnetic resonance imaging (MRI) may improve the ability to detect cancer in the unaffected breast of women recently diagnosed with unilateral breast cancer. PURPOSE: Diagnostic trial to determine the effectiveness of MRI in evaluating the unaffected breast of women recently diagnosed with unilateral breast cancer. '

In [14]:
# # add url columns
url_string = 'https://clinicaltrials.gov/ct2/show/'
df['col'] = url_string + df['nct_id'].astype(str)

In [15]:
df.sample(3)

Unnamed: 0,nct_id,source,brief_title,overall_status,study_type,phase,condition,condition_browse/mesh_term,intervention_browse/mesh_term,detailed_description/textblock,brief_summary/textblock,location/facility/address/city,location/facility/address/country,location/facility/address/zip,sponsors/lead_sponsor/agency,sponsors/lead_sponsor/agency_class,study_design_info/allocation,study_design_info/intervention_model,study_design_info/primary_purpose,year_submitted,year_last_updated,year_verification,all_text,col
7171,NCT03100630,Hoffmann-La Roche,Compare Bioavailability of RO7239361 After Sub...,Completed,Interventional,,Healthy,,,,Randomized study in healthy men and women. As...,Cypress,United States,90630,Hoffmann-La Roche,Industry,Randomized,Parallel Assignment,Other,2017,2018,2016.0,hoffmann-la roche compare bioavailability of r...,https://clinicaltrials.gov/ct2/show/NCT03100630
6032,NCT01619553,UConn Health,Genetic Analysis of Keloids,Recruiting,Observational,,Keloid,Keloid,,Keloids are scars that keep growing bey...,Keloids have a strong genetic component. The ...,Farmington,United States,06030-3705,UConn Health,Other,,,,2012,2018,2016.0,uconn health genetic analysis of keloids keloi...,https://clinicaltrials.gov/ct2/show/NCT01619553
4651,NCT03649958,"Brain State Technologies, LLC",Neurotechnology Following Traumatic Brain Injury,Recruiting,Interventional,Phase 4,Post-Concussion Symptoms,Brain Injuries,,Persistent symptoms after concussion or...,This study seeks to improve symptoms such as ...,Bethesda,United States,20814,"Brain State Technologies, LLC",Industry,Randomized,Parallel Assignment,Treatment,2018,2018,2010.0,"brain state technologies, llc neurotechnology ...",https://clinicaltrials.gov/ct2/show/NCT03649958


## Preprocess dataset
- Tokenize, Lemmatize / Stem
- Remove stopwords

In [16]:
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

# porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

In [17]:
# Stemming
# def stem_sentences(text):
#     tokens = text.split()
#     stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
#     return ' '.join(stemmed_tokens)

# df['stems'] = df['all_text'].apply(stem_sentences)

In [18]:
# Lemmatizing
def lemm_sentences(text):
    tokens = text.split()
    lemm_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemm_tokens)

df['lemmas'] = df['all_text'].apply(lemm_sentences)

In [19]:
# Remove stop words [not needed with countvectorizer]
# stop_words = stopwords.words('english')
# df['tokens'] = df['lemmas'].apply(lambda x: [item for item in x.split() if item not in stop_words])

In [20]:
df.lemmas[0:5]

0       american college of radiology imaging network ...
1       university of hull effect of exercise on patie...
10      public health england ethnicity data in hiv po...
100     hoffmann-la roche a pharmacokinetic and metabo...
1000    cairo university clinical evaluation of sectio...
Name: lemmas, dtype: object

## Bags of words

In [21]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re

In [22]:
pat_numbers = lambda x: re.sub(r'(\d)+', '', x.lower())

cv = CountVectorizer(stop_words='english',
                     preprocessor = pat_numbers,
                     max_features = 3000,
                     lowercase = True,
                     max_df = 0.8,
                     ngram_range = (1, 3))

In [23]:
docs = list(df['lemmas'])

X = cv.fit_transform(docs)

In [24]:
len(cv.vocabulary_)

3000

In [25]:
word_counts = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

In [26]:
# add nct_id to word_counts df
nct_id = df['nct_id']

df_word_counts = pd.merge(nct_id, word_counts, left_index=True, right_index=True)

In [27]:
df_word_counts.head()

Unnamed: 0,nct_id,abdominal,ability,ablation,able,abnormal,abnormality,absence,absence disease,absence disease progression,absorption,abuse,ac,acceptability,acceptable,accepted,access,accordance,according,account,accrual,accrued,accrued study,accuracy,accurate,acetaminophen,achieve,achieved,achieving,acid,acne,acquired,act,acting,action,activated,activation,active,activities,activity,actual,acuity,acupuncture,acute,acute myeloid,acute myeloid leukemia,ad,adapted,adaptive,add,added,addiction,adding,addition,additional,additionally,address,adenocarcinoma,adenoma,adequate,adhd,adherence,adipose,adjusted,adjustment,adjuvant,administer,administered,administration,admission,admitted,adolescent,adult,adult patient,adults,advance,advanced,advantage,adverse,adverse effect,adverse event,adverse events,aerobic,af,affect,affected,affecting,africa,african,african american,age,age year,age years,aged,aged year,agent,agents,aggressive,aging,agonist,aid,aim,aim determine,aim evaluate,aim study,aim study evaluate,aimed,aims,air,airway,al,alarm,alcohol,alcohol use,alfa,algorithm,allergic,allergy,allocated,allocation,allogeneic,allow,allowed,allowing,allows,alpha,alteration,altered,alternative,alveolar,alzheimer,alzheimer disease,ambulatory,amd,american,amino,amino acid,aml,anaesthesia,analgesia,analgesic,analog,analogue,analysed,analyses,analysis,analysis performed,analyze,analyzed,anderson,androgen,anemia,anesthesia,anesthesiologist,anesthetic,aneurysm,angiography,angle,animal,ankle,annually,answer,antagonist,anterior,anti,anti inflammatory,antibiotic,antibodies,antibody,anticipated,antidepressant,antigen,antioxidant,antipsychotic,antiretroviral,anxiety,aortic,aortic valve,apart,apnea,app,appears,application,applied,apply,appointment,approach,appropriate,approval,approved,approximately,area,arm,arm patient,arm study,arms,art,arterial,artery,artery disease,arthritis,arthroplasty,ascending,asian,ask,asked,aspect,aspiration,aspirin,ass,ass effect,ass efficacy,ass safety,assay,assessed,assessing,assessment,assessments,assigned,assignment,assist,assistance,assisted,associated,associated increased,association,asthma,atherosclerosis,atopic,atopic dermatitis,atrial,atrial fibrillation,attack,attempt,attend,attending,attention,auditory,autism,autoimmune,autologous,automated,autonomic,availability,available,average,avoid,awareness,azd,baby,...,tested,testing,testosterone,tests,th,theory,therapeutic,therapies,therapist,therapy,therapy patient,thickness,thoracic,thought,threatening,threshold,thrombosis,thyroid,time,time day,time point,times,timing,tinnitus,tissue,tm,tnf,tobacco,tolerability,tolerability pharmacokinetics,tolerance,tolerated,tolerated dose,tolerated dose mtd,tomography,tool,tooth,topical,total,total knee,total patient,toxic,toxicity,toxin,track,tract,traditional,trained,training,training program,transcranial,transfer,transfusion,transition,transmission,transplant,transplantation,trauma,traumatic,treat,treated,treating,treating patient,treatment,treatment arm,treatment arms,treatment group,treatment option,treatment patient,treatment patient followed,treatment period,treatment phase,treatment regimen,treatment repeat,treatment study,treatments,trial,trial compare,trial comparing,trial evaluate,trial patient,trial study,trial studying,trials,triple,trust,tube,tuberculosis,tumor,tumor cell,tumor cells,tumors,tumour,twice,twice daily,type,type diabetes,type diabetes mellitus,typically,uc,uk,ulcer,ulcerative,ultimately,ultrasound,umbilical,unable,unacceptable,unacceptable toxicity,unclear,undergo,undergoing,undergone,underlying,understand,understanding,understood,underwent,unique,unit,united,united state,university,university california,university hospital,university medical,unknown,untreated,upper,uptake,urinary,urine,usa,use,used,used chemotherapy,used study,useful,user,using,usual,usual care,usually,uterine,utility,utilization,utilized,va,vaccination,vaccine,vaginal,validate,validated,validation,validity,value,valve,variability,variable,variables,variation,variety,various,vas,vascular,vegf,vein,velocity,venous,ventilation,ventricular,verbal,version,versus,versus placebo,vessel,veteran,video,view,viral,viral load,virus,vision,visit,visit week,visits,visual,vital,vitamin,vitro,vivo,volume,volunteer,volunteers,vomiting,vr,vs,wa,wa conducted,wa designed,wa performed,wa used,walk,walking,wall,want,warfarin,washout,water,wave,way,way stop,wear,web,week,week study,week treatment,week week,week weeks,weekly,weeks,weight,weight gain,weight loss,western,white,wide,widely,widely used,wish,withdrawal,woman,women,work,working,world,worldwide,worse,wound,written,year,year age,year follow,year old,years,yield,york,young,youth,zinc
0,NCT00058058,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,2,0,0,2,0,2,0,0,0,0
1,NCT01980602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,NCT01277458,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100,NCT01164891,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000,NCT03795727,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Export working dataset

In [28]:
# Check all dataframes in space
%who DataFrame

df	 df_word_counts	 word_counts	 


In [37]:
# Export datasets: df, df_word_counts (key: ntc_id)

path_to_working_datasets = os.path.abspath('../data/working_data')

try: 
    os.mkdir(path_to_working_datasets)
except:
    pass
    print('Error')

Error


--------

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
transformer = TfidfTransformer()
tweights = transformer.fit_transform(X)
tweights

In [None]:
# turn weights data into a dataframe
tf = pd.DataFrame(tweights.toarray(), columns=cv.get_feature_names())

In [None]:
# Top terms by average tf-idf weight
weights = np.asarray(tweights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cv.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(10)

In [None]:
# Create new dataframe with nct-id and merge wth tf by index
pd.set_option('display.max_columns', None) 

df_tf = df['nct_id']
df_tf.head()

In [None]:
df_tf = pd.merge(df_tf, tf, left_index=True, right_index=True)

In [None]:
df_tf.head()

## Test merged dataframes with tf-idf results

In [None]:
# import random

# def compare_results():
#     row = random.randint(0,3000)
#     df_tf.drop('nct_id', axis = 1)
#     print('Random row: {}'.format(row))
#     return df_tf.loc[row], tf.loc[row]

In [None]:
# compare_results()

## Calculate correlation between docs

In [None]:
# calculate similary
similarity = tweights * tweights.T

In [None]:
# Save all data in a dataframe
df_docs_similarity = pd.DataFrame(similarity.toarray())

<b>To-do: Give a NTC-ID record, find similar documents
& return dataframe with results</b>

In [None]:
# Find more similar documents of a given record

def find_similar_docs(record):
    trials_id = []
    treshold = 0.40
    similar_index = df_docs_similarity.iloc[record][df_docs_similarity.iloc[record] > treshold].index
    
    for i in similar_index.values:
        trials_id.append(i)
        
    print('{} similar trials with treshold {}'.format(len(trials_id), treshold))
    return df.iloc[trials_id]


In [None]:
find_similar_docs(255)