## Import internal medicine review dataset

In [169]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [268]:
path = "/Users/chongchen/Desktop/19Fall RA/data/generated_data/Internal Medicine Review.csv"
review_im = pd.read_csv(path, index_col = 0)

In [269]:
review_im.head()

Unnamed: 0,Review,hp_id,gender,hasorder,lower,no_digits,no_punctuation,words
0,"satisfactory but would not recommend coldness,...",19713,F,0,"satisfactory but would not recommend coldness,...","satisfactory but would not recommend coldness,...",satisfactory but would not recommend coldness ...,"['satisfactory', 'recommend', 'coldness', 'sol..."
1,Although Dr. Merlo does not appear to have an...,19713,F,0,although dr. merlo does not appear to have an...,although dr. merlo does not appear to have an...,although dr merlo does not appear to have an...,"['although', 'dr', 'merlo', 'appear', 'office'..."
2,"took my daughter for problems, never examined...",19713,F,0,"took my daughter for problems, never examined...","took my daughter for problems, never examined...",took my daughter for problems never examined...,"['took', 'daughter', 'problems', 'never', 'exa..."
3,"Yes, Dr. Merlot can be rude and arrogant, but...",19713,F,0,"yes, dr. merlot can be rude and arrogant, but...","yes, dr. merlot can be rude and arrogant, but...",yes dr merlot can be rude and arrogant but...,"['yes', 'dr', 'merlot', 'rude', 'arrogant', 'o..."
4,I called to make and appointment and a woman ...,19713,F,0,i called to make and appointment and a woman ...,i called to make and appointment and a woman ...,i called to make and appointment and a woman ...,"['called', 'make', 'appointment', 'woman', 'ca..."


In [270]:
# drop records without review
review_im.drop(review_im[review_im['Review'] == ' '].index, inplace = True)
review_im.reset_index(inplace = True, drop = True)

In [271]:
review_im['words'][0]

"['satisfactory', 'recommend', 'coldness', 'solo', 'helpfulness', 'limited', 'office', 'staff', 'isolation', 'diagnosis', 'solo']"

In [272]:
import ast
review_im['words'] = review_im['words'].map(lambda x: ast.literal_eval(x))

In [273]:
review_im['words'][0]

['satisfactory',
 'recommend',
 'coldness',
 'solo',
 'helpfulness',
 'limited',
 'office',
 'staff',
 'isolation',
 'diagnosis',
 'solo']

### 2. Sampling equal numeber of reviews of the two genders, tag documents.

In [274]:
review_im['gender'].value_counts()

M    61922
F    25953
Name: gender, dtype: int64

In [275]:
review_im['hasorder'].value_counts()

0    87301
1      574
Name: hasorder, dtype: int64

In [276]:
review_im.groupby(by= ['gender','hasorder'])[['hp_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,hp_id
gender,hasorder,Unnamed: 2_level_1
F,0,25833
F,1,120
M,0,61468
M,1,454


In [277]:
review = list(review_im['words'])

In [278]:
review[0][0]

'satisfactory'

In [279]:
gender = list(review_im['gender'])

In [280]:
hasorder = list(review_im['hasorder'])

In [281]:
len(review),len(gender),len(hasorder)

(87875, 87875, 87875)

In [282]:
# Tag document
import gensim
docs = [] # each review/row is a tagged document.
for i in range(len(review)):
    docs.append(gensim.models.doc2vec.TaggedDocument(review[i], [hasorder[i]]))

In [283]:
docs[0]

TaggedDocument(words=['satisfactory', 'recommend', 'coldness', 'solo', 'helpfulness', 'limited', 'office', 'staff', 'isolation', 'diagnosis', 'solo'], tags=[0])

In [284]:
docs_for_vocab = [] 
for i in docs:
    docs_for_vocab.append(i)

#### sampling equal number of sanction reviews & not sanction reviews

In [285]:
import random

In [286]:
# 10000 sanction reviews and 10000 not sanction reviews
repeat = {}
indices = []
batch = []
draw = 10000
s_candidates = list(review_im[review_im['hasorder']==1].index)
ns_candidates = list(review_im[review_im['hasorder']==0].index)
for b in range(draw):
    s = random.choice(s_candidates)
    indices.append(s)
    ns = random.choice(ns_candidates)
    indices.append(ns)
for i in indices:
    try:
        repeat[i]+=1
    except:
        repeat[i]=1
    batch.append(docs[i])

In [287]:
batch[:3]

[TaggedDocument(words=['dr', 'scott', 'hiatt', 'life', 'saver', 'guarding', 'angle', 'live', 'saved', 'dieing', 'fought', 'life', 'minutes', 'love', 'heart', 'pregante', 'got', 'sick', 'going', 'baby', 'heart', 'stop', 'heart', 'min', 'fought', 'called', 'dr', 'sean', 'lynn', 'nothing', 'save', 'life', 'dr', 'scott', 'hiatt', 'trust', 'life', 'baby', 'go', 'dr', 'sean', 'lynn', 'trust', 'dr', 'scott', 'hiatt', 'never', 'gave', 'mins', 'love', 'may', 'god', 'alwas', 'bless', 'special', 'spot', 'heaven', 'thank', 'dr', 'scott', 'hiatt', 'thank', 'dr', 'sean', 'lynn'], tags=[1]),
 TaggedDocument(words=['consistently', 'named', 'best', 'dallas', 'rightfully', 'never', 'rush', 'spends', 'much', 'time', 'necessary', 'go', 'concerns', 'answer', 'questions', 'explain', 'everything', 'easy', 'talk', 'truly', 'nice', 'person', 'cares', 'health', 'wife', 'use', 'dr', 'lau', 'pcp', 'several', 'years', 'never', 'disappointed', 'unique', 'ability', 'diagnose', 'problems', 'strongly', 'encourages', '

### 4. Import Gensim, run doc2vec

In [288]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing

In [289]:
cores = multiprocessing.cpu_count()

In [290]:
model = Doc2Vec(dm=0, window=20,vector_size=300,min_count=30,epochs=5,workers=cores,
               hs=0,negative=7,dbow_words=1,dm_concat=1)
model.build_vocab(docs_for_vocab)
model.train(documents = batch,total_examples=model.corpus_count,epochs=model.epochs)

In [291]:
model.save("doc_review_sanction.model")

In [292]:
from gensim.models.doc2vec import Doc2Vec
model= Doc2Vec.load("doc_review_sanction.model")

In [293]:
model.wv.most_similar(positive = [model.docvecs[0]],topn=15)

[('chan', 0.7140065431594849),
 ('hippocratic', 0.7129845023155212),
 ('norm', 0.7090793251991272),
 ('profile', 0.7077124118804932),
 ('joe', 0.7066259384155273),
 ('empathic', 0.7033420205116272),
 ('el', 0.7020017504692078),
 ('bonus', 0.6986516714096069),
 ('william', 0.6985780000686646),
 ('dream', 0.694002628326416),
 ('naturally', 0.6913329362869263),
 ('unreliable', 0.690306544303894),
 ('singh', 0.6895483732223511),
 ('drew', 0.6891516447067261),
 ('stinks', 0.6887184381484985)]

In [294]:
model.wv.most_similar(positive = [model.docvecs[1]],topn=15)

[('focuses', 0.48323020339012146),
 ('dr', 0.4810328185558319),
 ('ford', 0.4373607039451599),
 ('strives', 0.4266654849052429),
 ('doctor', 0.42209669947624207),
 ('humane', 0.4196663498878479),
 ('bonus', 0.41957253217697144),
 ('precious', 0.4179649353027344),
 ('machines', 0.415289044380188),
 ('text', 0.41443008184432983),
 ('smug', 0.40904855728149414),
 ('dream', 0.40693241357803345),
 ('chronically', 0.40690848231315613),
 ('establish', 0.4047136902809143),
 ('generations', 0.4046419858932495)]

### Change hyperparameters

In [295]:
model3 = Doc2Vec(dm=0,vector_size=300,min_count=50,epochs=5,workers=cores,dbow_words=1,dm_concat=1)
model3.build_vocab(docs_for_vocab)
model3.train(documents = batch,total_examples=model.corpus_count,epochs=model.epochs)
model3.save("doc_review3.model")

In [296]:
from gensim.models.doc2vec import Doc2Vec
model3= Doc2Vec.load("doc_review3.model")

In [297]:
model3.wv.most_similar(positive = [model.docvecs[0]],topn=15)

[('assured', 0.6123682260513306),
 ('discomfort', 0.6106825470924377),
 ('rose', 0.6089043021202087),
 ('dementia', 0.6075581312179565),
 ('godsend', 0.6069930791854858),
 ('regimen', 0.6063371896743774),
 ('dc', 0.6044229865074158),
 ('june', 0.603678822517395),
 ('throwing', 0.6030840277671814),
 ('pulmonologist', 0.60286545753479),
 ('neglected', 0.602419376373291),
 ('hematologist', 0.6016801595687866),
 ('onto', 0.6008667349815369),
 ('indeed', 0.5999284386634827),
 ('memorial', 0.5997985601425171)]

In [298]:
model3.wv.most_similar(positive = [model.docvecs[1]],topn=15)

[('yeah', 0.40610766410827637),
 ('brings', 0.39789503812789917),
 ('associate', 0.3929458260536194),
 ('among', 0.39180511236190796),
 ('accepts', 0.3868747353553772),
 ('yesterday', 0.37955841422080994),
 ('dozen', 0.3772449493408203),
 ('fellow', 0.36138030886650085),
 ('un', 0.361300528049469),
 ('teacher', 0.36034536361694336),
 ('gown', 0.3595094680786133),
 ('price', 0.3587324321269989),
 ('fired', 0.35831642150878906),
 ('operation', 0.35628965497016907),
 ('forgetful', 0.3547370433807373)]

#### Try not sampling 

In [299]:
s_candidates = list(review_im[review_im['hasorder']==1].index)
ns_candidates = list(review_im[review_im['hasorder']==0].index)

In [300]:
len(s_candidates)

574

In [301]:
len(ns_candidates)

87301

In [263]:
batch2 = []
for b in s_candidates:
    indices.append(s)
for c in ns_candidates:
    indices.append(ns)
for i in indices:
    batch2.append(docs[i])

In [302]:
model = Doc2Vec(dm=0, window=20,vector_size=300,min_count=30,epochs=5,workers=cores,
               hs=0,negative=7,dbow_words=1,dm_concat=1)
model.build_vocab(docs_for_vocab)
model.train(documents = batch2,total_examples=model.corpus_count,epochs=model.epochs)
model.save("doc_review_sanction_not_sampling.model")

In [303]:
from gensim.models.doc2vec import Doc2Vec
model_ns= Doc2Vec.load("doc_review_sanction_not_sampling.model")

In [304]:
model_ns.wv.most_similar(positive = [model.docvecs[0]],topn=15)

[('appointment', 0.8625890016555786),
 ('min', 0.7255848050117493),
 ('worst', 0.6971302628517151),
 ('early', 0.6963605284690857),
 ('deal', 0.6783980131149292),
 ('kane', 0.6604759693145752),
 ('symptons', 0.6585692763328552),
 ('resourceful', 0.6466319561004639),
 ('yet', 0.6365990042686462),
 ('receptionist', 0.6359545588493347),
 ('two', 0.6245956420898438),
 ('backed', 0.6068121790885925),
 ('professional', 0.5959866642951965),
 ('difficult', 0.5936208963394165),
 ('time', 0.5887964963912964)]

In [305]:
model_ns.wv.most_similar(positive = [model.docvecs[1]],topn=15)

[('copay', 0.6286177635192871),
 ('bother', 0.611724317073822),
 ('suck', 0.6056062579154968),
 ('doctor', 0.6024812459945679),
 ('told', 0.6021660566329956),
 ('oath', 0.6018040776252747),
 ('forget', 0.5853140354156494),
 ('patient', 0.5783458352088928),
 ('dont', 0.577888548374176),
 ('doctors', 0.572316586971283),
 ('hippocratic', 0.5718353986740112),
 ('staff', 0.5713257193565369),
 ('doesnt', 0.570040762424469),
 ('says', 0.5681448578834534),
 ('kind', 0.5652170181274414)]