In [1]:
import pandas as pd
import numpy as np
import datetime 
import re
from urlextract import URLExtract
import spacy
import time
import pickle
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
extractor = URLExtract()
# remove all urls from posts
def replace_urls(x):
    urls = extractor.find_urls(x)
    if urls:
        x_new = replace_urls(x.replace(urls[0],''))
        return x_new
    else:
        return x

In [739]:
# df_full = pd.read_csv('data/anxiety_full.csv', low_memory=False)
depression = pd.read_csv('data/depression_topics_final.csv', low_memory=False, lineterminator='\n')
depression.drop(columns=['Unnamed: 0'], inplace=True)

In [740]:
anxiety = pd.read_csv('data/anxiety_topics_final.csv', low_memory=False, lineterminator='\n')
anxiety.drop(columns=['Unnamed: 0'], inplace=True)

In [742]:
depression['date'] = pd.to_datetime(depression.date)
anxiety['date'] = pd.to_datetime(anxiety.date)

In [744]:
anxiety.max_topic.value_counts()

medication                 17801
heart/chest                13333
panic                      12350
friends/hang/groups        10948
anxious/super/reason        9267
sleep                       9195
work                        9112
school                      8897
occupational                8794
scared/fear/die             7462
sick/eat/stomach            7146
social/media/situations     7134
driving                     5285
Name: max_topic, dtype: int64

In [788]:
anxiety.dropna(subset=['selftext'], inplace=True)
depression.dropna(subset=['selftext'], inplace=True)

In [749]:
7134 + 10948

18082

In [775]:

testing = anxiety
testing['social_label'] = anxiety['friends/hang/groups'] + anxiety['social/media/situations']
anxiety_social = testing.sort_values(by=['social_label'], ascending=False).head(18082)

In [677]:
import gensim
import multiprocessing
from tqdm import tqdm
from sklearn import utils
from gensim.models.callbacks import CallbackAny2Vec

In [756]:


class LossLogger(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_now = loss - self.loss_to_be_subed
        self.loss_to_be_subed = loss
        print('Loss after epoch {}: {}'.format(self.epoch, loss_now))
        self.epoch += 1

loss_logger = LossLogger()
cores = multiprocessing.cpu_count()

documents = anxiety_social.text_title
tokenized_docs = [gensim.utils.simple_preprocess(d) for d in documents]
model = gensim.models.Word2Vec(tokenized_docs,
                               size=300,
                               window=5,
                               min_count=5,
                               sg=1,
                               callbacks=[loss_logger],
                               compute_loss=True,
                               iter=40,
                               workers=cores)

Loss after epoch 0: 2894926.25
Loss after epoch 1: 2627175.75
Loss after epoch 2: 2477546.5
Loss after epoch 3: 2136043.5
Loss after epoch 4: 2119476.0
Loss after epoch 5: 2047649.0
Loss after epoch 6: 2059244.0
Loss after epoch 7: 1945131.0
Loss after epoch 8: 1945882.0
Loss after epoch 9: 1906844.0
Loss after epoch 10: 1923540.0
Loss after epoch 11: 1928678.0
Loss after epoch 12: 1910690.0
Loss after epoch 13: 1920568.0
Loss after epoch 14: 1909798.0
Loss after epoch 15: 1865176.0
Loss after epoch 16: 1115416.0
Loss after epoch 17: 1119476.0
Loss after epoch 18: 1135504.0
Loss after epoch 19: 1130872.0
Loss after epoch 20: 1081280.0
Loss after epoch 21: 1076020.0
Loss after epoch 22: 1083888.0
Loss after epoch 23: 1061964.0
Loss after epoch 24: 1058080.0
Loss after epoch 25: 1053748.0
Loss after epoch 26: 1072932.0
Loss after epoch 27: 1036812.0
Loss after epoch 28: 1033580.0
Loss after epoch 29: 1033564.0
Loss after epoch 30: 1009528.0
Loss after epoch 31: 993332.0
Loss after epoch 

In [758]:
model.wv.most_similar(['anxiety'], topn=10)

[('depression', 0.5161457657814026),
 ('panic', 0.48995161056518555),
 ('dysmorphic', 0.4710979461669922),
 ('gad', 0.46230143308639526),
 ('severe', 0.45712628960609436),
 ('fogginess', 0.4477344751358032),
 ('depressions', 0.4388280212879181),
 ('generalized', 0.4360097050666809),
 ('disabling', 0.4330070912837982),
 ('axiety', 0.4313054084777832)]

In [798]:
anxiety[anxiety.selftext.str.contains(' reddit ')].values[2]

array([1449943401, 'Anxiety', 'determinism101', 'self.Anxiety',
       'https://www.reddit.com/r/Anxiety/comments/3wjfio/why_has_the_idea_of_determinism_sent_me_crazy/',
       2, 3, 3.0, 0.0, 'Why has the idea of determinism sent me crazy?',
       "For some reason after reading about determinism I feel like I have gone absolutely crazy. The idea that are lives are unfolding and we don't have real control over it. Now when I look at people I feel like I'm looking at computers just doing what they do and feel like I have lost all connection with people. This thought has given me horrendous anxiety, I feel like I don't even know where I am any more. My question reddit is this just anxiety making me feel like the thought is crazy or is it the thought? Surely finding out something can't send you crazy... ",
       False, '3wjfio', nan, 0, nan, False, 1454923681, False, 'self',
       't5_2qmij', False, 'venting', nan, False, True, nan,
       '/r/Anxiety/comments/3wjfio/why_has_the_idea_o

In [765]:
model.wv.most_similar(positive=['anxiety', 'media'], topn=30)

[('social', 0.45255523920059204),
 ('vanity', 0.4509308934211731),
 ('facebook', 0.43780356645584106),
 ('medias', 0.436012864112854),
 ('deactivating', 0.4313421845436096),
 ('pmdd', 0.42716991901397705),
 ('anx', 0.42464113235473633),
 ('bdd', 0.4229803681373596),
 ('accounts', 0.4221058785915375),
 ('severe', 0.417378693819046),
 ('predominantly', 0.4134192168712616),
 ('dysmorphic', 0.41323986649513245),
 ('confuse', 0.41209423542022705),
 ('depression', 0.41192111372947693),
 ('ineptitude', 0.4109877645969391),
 ('ebook', 0.41043978929519653),
 ('axiety', 0.41033726930618286),
 ('instagram', 0.40737876296043396),
 ('profiles', 0.4044165015220642),
 ('disabling', 0.3997569680213928),
 ('sites', 0.39877018332481384),
 ('faux', 0.3962881565093994),
 ('deleted', 0.39503341913223267),
 ('generalized', 0.39392900466918945),
 ('it', 0.39080893993377686),
 ('logging', 0.3880566954612732),
 ('dysthymia', 0.3878366947174072),
 ('inattentive', 0.38360199332237244),
 ('cyclothymia', 0.3830176

In [760]:
model.wv.most_similar(positive=['media', 'anxiety'], negative=['social'], topn=30)

[('medias', 0.373568594455719),
 ('instagram', 0.36060455441474915),
 ('facebook', 0.35952162742614746),
 ('accounts', 0.3591877222061157),
 ('deleted', 0.3472468852996826),
 ('deactivated', 0.3239598274230957),
 ('sites', 0.323849618434906),
 ('scrolling', 0.3155139088630676),
 ('faux', 0.3129236400127411),
 ('snapchat', 0.3078605830669403),
 ('ebook', 0.3005869388580322),
 ('delete', 0.2996944487094879),
 ('profiles', 0.2996373474597931),
 ('unfollowed', 0.2954139709472656),
 ('deactivating', 0.2940328121185303),
 ('vanity', 0.2906012535095215),
 ('logging', 0.28902649879455566),
 ('celebrities', 0.28729158639907837),
 ('ineptitude', 0.2864249348640442),
 ('platforms', 0.2862335741519928),
 ('axiety', 0.2834794521331787),
 ('codependency', 0.2822232246398926),
 ('mindlessly', 0.28168582916259766),
 ('posts', 0.2783516049385071),
 ('anx', 0.27813559770584106),
 ('channels', 0.2777961194515228),
 ('screen', 0.2747234106063843),
 ('it', 0.2733500599861145),
 ('scroll', 0.273060649633407

In [None]:
# model.syn0

In [None]:
# model2.wv.most_similar(['motivation'], topn=30)

In [None]:
# model.save("models/anxiety2vec_full.model")

In [682]:
model1 = gensim.models.Word2Vec.load("models/anxiety2vec_full.model")
model2 = gensim.models.Word2Vec.load("models/depression2vec_full.model")

In [None]:
model2.wv.most_similar(['motivation'], topn=30)

In [None]:
model.wv.most_similar(positive=['circadian', 'b'], topn=30)

In [None]:
model2.wv.most_similar(positive=['circadian', 'insomnia'], topn=30)

In [None]:
model.wv.most_similar(['circadian'], topn=30)

In [None]:
model2.wv.most_similar(['circadian'], topn=30)

In [None]:
model2.wv.most_similar(positive=['sleep'], negative=['insomnia'], topn=20)

In [None]:
w2c = dict()
for item in model.wv.vocab:
    w2c[item]=model.wv.vocab[item].count

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(list(w2c.items()),columns = ['word','count']) 
df[['word', 'count']]

In [None]:
# model.wv.similar_by_vector(negative=['psychiatrist', 'prescribed'],topn=20)
# model2.wv.most_similar(positive=['psychiatrist'], negative=['prescribed'], topn=20)
# model2.wv.most_similar(positive=['addict'], negative=['drugs'], topn=20)
model2.wv.most_similar(positive=['addict'], negative=['drugs'], topn=20)



In [None]:
diff = model.wv['sleep'] - model.wv['prescribed']
model.wv.similar_by_vector(diff,topn=20)



In [None]:
# man_wom = model.wv['psychiatrist'] - model.wv['meds'] 
# model.wv.similar_by_vector(man_wom,topn=20)

diff = model.wv['psychiatrist'] - model.wv['meds']
model.wv.similar_by_vector(diff,topn=10)


In [None]:
model.wv.most_similar(positive=['cry', 'frustration'], negative=['sad'], topn=20)
# model2.wv.most_similar(positive=['cry', 'frustration'], negative=['sad'], topn=20)


In [None]:
model2.wv.most_similar(positive=['cry', 'frustration'], negative=['sad'], topn=20)


In [None]:
model2.wv.most_similar(positive=['cry', 'anxious'], negative=['sad'], topn=20)


In [None]:
# model.most_similar(positive=['paris', 'australia'], negative=['france'], topn=10)
model.wv.most_similar(positive=['cry', 'anxious'], negative=['sad'], topn=20)
# print()
# print(model2.wv.most_similar(positive=['cry', 'nervous'], negative=['sad'], topn=20))

# diff = model.wv['psychiatrist'] - model.wv['therapist']
# diff2 = model2.wv['psychiatrist'] - model2.wv['therapist']
# print(model.wv.similar_by_vector(diff,topn=10))
# print()
# print(model2.wv.similar_by_vector(diff2,topn=10))

In [None]:
diff = model2.wv['addict'] - model2.wv['drugs']
model2.wv.similar_by_vector(diff,topn=20)

In [None]:
model2.wv.most_similar(positive=['addict'], negative=['drugs'], topn=20)

In [None]:
model.wv.most_similar(positive=['alcoholic'], negative=['alcohol'], topn=20)

In [None]:
diff = model.wv['alcoholic'] - model.wv['alcohol']
model.wv.similar_by_vector(diff,topn=10)


In [None]:
# man_wom = model.wv['sex'] - model.wv['partner'] 
# model.wv.similar_by_vector(man_wom,topn=20)
man_wom = model2.wv['addict'] - model2.wv['drugs'] 
model2.wv.similar_by_vector(man_wom,topn=20)

In [None]:
man_wom = model.wv['home'] - model.wv['good'] 
model.wv.similar_by_vector(man_wom,topn=20)
# names = []
# vals = []

# for item in simvecs:
#     names.append(item[0])
#     vals.append(item[1])


    
# man_wom2 = model2.wv['addict'] - model2.wv['drugs'] 
# simvecs2 = model2.wv.similar_by_vector(man_wom2,topn=20)
# names2 = []
# vals2 = []

# for item in simvecs2:
#     names2.append(item[0])
#     vals2.append(item[1])
    
    

In [None]:
# fig, ax = plt.subplots(figsize=[10, 8])
# plt.title('addict - drugs', color='white', fontsize=15)
# ax.barh(names2[::-2], vals2[::-2], color = 'orange')

# ax.axes.tick_params(axis="y", colors='white', labelsize=15)
# ax.axes.tick_params(axis="x", colors="white", labelsize=10)
# plt.savefig('graph3.png', transparent=True, dpi=300)

In [None]:
names2

In [None]:

fig, ax = plt.subplots(figsize=[10, 8])
plt.title('addiction - drugs', color='white', fontsize=15)
ax.barh(names[::-2], vals[::-2], color = 'orange')

ax.axes.tick_params(axis="y", colors='white', labelsize=15)
ax.axes.tick_params(axis="x", colors="white", labelsize=10)
plt.savefig('graph2.png', transparent=True, dpi=300)

In [None]:
man_wom = model.wv['need'] - model.wv['want'] 
model.wv.similar_by_vector(man_wom,topn=20)

In [None]:
model.wv.most_similar('dead', topn=20)

In [None]:
man_wom = model.wv['friend'] - model.wv['love']
model.wv.similar_by_vector(man_wom,topn=20)

In [None]:
man_wom = model.wv['gender'] - model.wv['identity'] 
model.wv.similar_by_vector(man_wom,topn=20)

In [None]:
# psych = model.wv['pyschiatrist']
# meds = model.wv['meds']


In [None]:
# doc_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
# doc_model

In [None]:
df_full.text_title

In [None]:
# # anxiety = model.wv['anxious']
# import seaborn as sns

# sns.set_theme(style="whitegrid")
# a = model.wv['psychiatrist']
# b = model.wv['prescribed']

# x = a - b

# x2 = model.wv['therapist']

# # fig = plt.figure(figsize=[15,15])
# # ax = ax.gca(projection='3d')
# fig, ax = plt.subplots(figsize=[10, 8])
# V = np.array([a, b, x2])
# origin = np.array([[0, 0, 0],[0, 0, 0]]) # origin point

# ax.quiver(*origin, V[:,0], V[:,1], color=['r','b','g'], scale=.7)
# # plt.xlim([-.002, .002])
# # plt.show()
# # model.wv.similar_by_vector(x)

# ax.axes.tick_params(axis="y", colors='white', labelsize=10)
# ax.axes.tick_params(axis="x", colors="white", labelsize=10)
# plt.grid(False)
# plt.savefig('vecs.png', transparent=True, dpi=300)


In [None]:
# x, y  = [1, 2, 3], [0.5, 0.5, 0.5]
# u1,v1 = np.random.randn(3), np.random.randn(3)
# u2,v2 = np.random.randn(3), np.random.randn(3)
# u3,v3 = np.random.randn(3), np.random.randn(3)

# QV1 = plt.quiver(x, y, u1, v1, color='r')
# QV2 = plt.quiver(x, y, u2, v2, color='b')
# QV3 = plt.quiver(x, y, u3, v3, color='g')

In [None]:
zero = np.zeros(400,)

In [29]:
df_full.dropna(subset=['selftext'], inplace=True)
df_full.reset_index(inplace=True)
df_full.reset_index(inplace=True)

In [32]:
df_full.drop(columns=['index'], inplace=True)

In [507]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

text = df_full.text_title
tokenized_self_text = [gensim.utils.simple_preprocess(d) for d in text]

# title = df_full.title
# tokenized_title = [gensim.utils.simple_preprocess(d) for d in title]
# data = df_full.text_title

# tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
# tagged_data[1:10]

In [508]:
# filt = df_full.title.value_counts().sort_values(ascending=False)
# plt.hist(filt[filt > 1].values, bins=100);

In [509]:
# df_full.head(2)

In [510]:
# # model= Doc2Vec.load("d2v.model")
# data = (pd.DataFrame({'selftext': [' '.join(word) for word in tokenized_self_text],
#                       'title': [' '.join(word) for word in tokenized_title], 'topic':df_full.max_topic}))
# data['text_title'] = data.title + " " + data.selftext

In [511]:
# df_full.head(3)

In [890]:
# from spacy.lang.en.stop_words import STOP_WORDS
import spacy
nlp = spacy.load("en")
def process_text(nlp, text, pos_list, lemma=False):

    text_full = [] 
    for doc in nlp.pipe(text, disable=['ner']):
        for np in doc.noun_chunks:
            test = np
#         test = [np for np in doc.noun_chunks]
#         print(test)
        # if part of speech list isnt empty return matches for pos
        if pos_list:
            tokens = [(ent.text) for ent in doc if not ent.is_stop and not ent.is_punct and ent.pos_ in pos_list]
            cleaned_text = " ".join(tokens)
            text_full.append(cleaned_text)
        elif lemma == True:
            tokens = [(ent.lemma_) for ent in doc if not ent.is_stop and not ent.is_punct]
            cleaned_text = " ".join(tokens)
            text_full.append(cleaned_text)
        else:
            tokens = [(ent.text) for ent in doc if not ent.is_stop and not ent.is_punct]
#             print(tokens)
#             comps = [j for j in i.children if j.dep_ == "compound"]
            cleaned_text = " ".join(tokens)
#             text_full.append(cleaned_text)
            text_full.append(test)
            
    return text_full

corp = ['social media, hey', 'google chrome']
a = process_text(nlp, corp, pos_list=[], lemma=False)
a
# a

# from spacy.symbols import *

# np_labels = set([nsubj, nsubjpass, dobj, iobj, pobj]) # Probably others too
# def iter_nps(doc):
#     for word in doc:
#         if word.dep in np_labels:
#             yield word.subtree
            
# iter_nps(corp)

[social media, google chrome]

In [879]:
a

[social media, google chrome]

In [856]:
doc =nlp('Bananas are an excellent source of potassium.')
for np in doc.noun_chunks:
    print(np.text)
'''
  Bananas
  an excellent source
  potassium
'''

Bananas
an excellent source
potassium


'\n  Bananas\n  an excellent source\n  potassium\n'

In [802]:
compounds = [token for token in sent if token.dep_ == 'compound']

['media', 'flu symptoms']

In [260]:
# data['title_short'] = a
# data['title_short2'] = data.topic + ' ' + data.title
# data['title'] = data.title.apply(lambda x: np.nan if x == '' else x)
# data_no_na = data.dropna().reset_index()
# data_no_na.drop(columns=['index'], inplace=True)

In [448]:
# temp = data.title.value_counts().sort_values(ascending=False)
# testing = temp[temp.values>5].index.to_list()

# data['is_title_good'] = data.title.apply(lambda x: True if x in testing else False)

In [454]:
# aa = data[data.is_title_good==True].copy().reset_index()
# aa.drop(columns=['index'], inplace=True)

In [514]:
# tokenized_self_text[0]

In [515]:
docs = []
for index, doc in enumerate(tokenized_self_text):
    tagged = TaggedDocument(words=doc, tags=[index])
    docs.append(tagged)

In [516]:
docs[0]

TaggedDocument(words=['just', 'got', 'dismissed', 'from', 'my', 'university', 'had', 'reallllllly', 'bad', 'semester', 'my', 'ocd', 'and', 'anxiety', 'were', 'at', 'their', 'worst', 'and', 'ended', 'up', 'failing', 'couple', 'classes', 'while', 'was', 'already', 'on', 'academic', 'probation', 'so', 'got', 'dismissed', 'now', 'feel', 'worthless', 'and', 'down', 'and', 'can', 'barely', 'get', 'out', 'of', 'bed', 'have', 'no', 'motivation', 'to', 'get', 'job', 'because', 'feel', 'like', 'can', 'even', 'move', 'obviously', 'can', 'move', 'to', 'another', 'city', 'and', 'attend', 'another', 'college', 'but', 'my', 'gpa', 'is', 'pretty', 'bad', 'now', 'sorry', 'for', 'the', 'rant', 'just', 'really', 'wanted', 'to', 'know', 'if', 'their', 'could', 'possibly', 'be', 'any', 'good', 'outcomes', 'out', 'of', 'this', 'thanks', 'guys'], tags=[0])

In [517]:
# define parameters for doc2vec
doc_model = gensim.models.Doc2Vec(
                                  vector_size=300,
                                  window=5,
                                  min_count=5,
                                  workers=cores,
                                  epochs=10,
                                  alpha=.025,
                                  min_alpha = 0.0001)
doc_model.build_vocab(docs)

In [518]:
# train model
doc_model.train(docs, total_examples=doc_model.corpus_count, epochs=doc_model.epochs)

In [594]:
a = '''
doctor told me to go the er if i hurt myself again but i’m too ashamed and i don’t feel worth it.
they’re probably going to tell me it’s not serious and to essentially stop wasting time. I already feel like shit and i really don’t want to feel stupid too. has anyone had any good experiences with this, i just feel so useless and i don’t want to feel that somewhere else as well.
'''.split()

In [600]:
docs[158318]

TaggedDocument(words=['feel', 'useless', 'going', 'to', 'go', 'outside'], tags=[158318])

In [601]:
inferred_vector = doc_model.infer_vector(a)
sims = doc_model.docvecs.most_similar([inferred_vector], topn=len(doc_model.docvecs))
sims[:10]

[(357199, 0.5701062679290771),
 (334925, 0.557730495929718),
 (321864, 0.5497127175331116),
 (239190, 0.5470794439315796),
 (7400, 0.5399435758590698),
 (356810, 0.5391318798065186),
 (281831, 0.5377613306045532),
 (172762, 0.5372531414031982),
 (31219, 0.5338485240936279),
 (77443, 0.5309157371520996)]

In [526]:
testing = '''
i really need advice on my medication i just starting taking zoloft'''

testing = gensim.utils.simple_preprocess(testing)
testing

['really',
 'need',
 'advice',
 'on',
 'my',
 'medication',
 'just',
 'starting',
 'taking',
 'zoloft']

In [625]:
from nltk.cluster import KMeansClusterer, euclidean_distance

# X = doc_model.docvecs.vectors_docs


# NUM_CLUSTERS=3
# kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=5)
# assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

In [527]:
# doc2vec_best = gensim.models.Word2Vec.load("models/doc2vec.model")

In [None]:
# model = gensim.models.Word2Vec.load("models/anxiety2vec_full.model")
# model2 = gensim.models.Word2Vec.load("models/depression2vec_full.model")

# words = []
# counts =[]
# w2v = model2
# for index in range(len(w2v.wv.index2word)):
#     word = w2v.wv.index2word[index]
#     words.append(word)
#     counts.append(w2v.wv.vocab[word].count)
    
# pd.DataFrame({'word':words, 'count':counts}).to_csv('tensor_data/depression/depression_meta.tsv', index=False,sep='\t' )

# import io

# Vector file, `\t` seperated the vectors and `\n` seperate the words
# """
# 0.1\t0.2\t0.5\t0.9
# 0.2\t0.1\t5.0\t0.2
# 0.4\t0.1\t7.0\t0.8
# """

# path2 = '/Users/collinswestnedge/programming/project_05/tensor_data/anxiety/anxiety_'

# out_v = io.open(path2+'vecs.tsv', 'w', encoding='utf-8')

# # Meta data file, `\n` seperated word
# # """
# # token1
# # token2
# # token3
# # """
# # out_m = io.open(path2+'meta.tsv', 'w', encoding='utf-8')

# # Write meta file and vector file
# for index in range(len(w2v.wv.index2word)):
#     word = w2v.wv.index2word[index]
#     vec = w2v.wv.vectors[index]
#     count = w2v.wv.vocab[word].count
# #     out_m.write(word + "\n")
#     out_v.write('\t'.join([str(x) for x in vec]) + "\n")
# out_v.close()
# out_m.close()

In [630]:
doc_vec_df = pd.DataFrame(doc_model.docvecs.vectors_docs)
doc_vec_df['num_comments'] = df_full.num_comments
doc_vec_df['docs'] = df_full.text_title
doc_vec_df.head()

cols= [str(i) for i in range(doc_model.docvecs.vectors_docs.shape[1])]
cols.append('num_comments')
cols.append('docs')

doc_vec_df.columns = cols
doc_vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,num_comments,docs
0,0.078888,0.161811,-0.10713,0.23403,-0.27997,0.053884,-0.106011,0.044602,-0.091572,-0.164289,...,-0.021283,-0.216137,-0.33321,-0.407201,0.085278,-0.081534,0.096892,-0.000399,0,just got dismissed from my university i had a ...
1,0.24304,0.238591,-0.194129,-0.079292,0.036467,-0.082717,0.333168,-0.015197,-0.08283,-0.000292,...,-0.187632,-0.106601,-0.0374,-0.346247,0.198628,0.169815,0.221606,0.299383,4,im a spoiled piece of shit im on a vacation wi...
2,0.08569,0.310152,-0.002498,-0.255833,-0.161582,-0.024896,-0.143368,-0.097935,-0.198977,0.461797,...,0.073579,-0.05336,-0.178241,-0.024364,-0.284419,0.278496,0.339424,-0.068813,0,caring about a person that doesn't i'm a 17 ye...
3,0.537546,0.141156,0.058936,0.040804,0.011771,0.426472,-0.006133,-0.117769,-0.161384,-0.001459,...,-0.03518,-0.111695,-0.498108,-0.473666,0.079978,0.241684,0.214186,0.132251,0,i really screwed up this year i feel like such...
4,0.453357,-0.000604,-0.17521,0.724042,0.47016,-0.321912,0.078653,-0.055731,0.095825,0.242058,...,-0.448972,-0.128222,-0.390948,0.058155,0.182607,-0.240953,-0.224155,0.686321,1,it has been 596 days since i have debated suic...


In [658]:
doc_vec_df.iloc[:,0:-5]



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.078888,0.161811,-0.107130,0.234030,-0.279970,0.053884,-0.106011,0.044602,-0.091572,-0.164289,...,-0.065170,-0.115179,-0.021283,-0.216137,-0.333210,-0.407201,0.085278,-0.081534,0.096892,-0.000399
1,0.243040,0.238591,-0.194129,-0.079292,0.036467,-0.082717,0.333168,-0.015197,-0.082830,-0.000292,...,-0.286022,-0.301353,-0.187632,-0.106601,-0.037400,-0.346247,0.198628,0.169815,0.221606,0.299383
2,0.085690,0.310152,-0.002498,-0.255833,-0.161582,-0.024896,-0.143368,-0.097935,-0.198977,0.461797,...,-0.526285,0.111233,0.073579,-0.053360,-0.178241,-0.024364,-0.284419,0.278496,0.339424,-0.068813
3,0.537546,0.141156,0.058936,0.040804,0.011771,0.426472,-0.006133,-0.117769,-0.161384,-0.001459,...,-0.192475,0.108178,-0.035180,-0.111695,-0.498108,-0.473666,0.079978,0.241684,0.214186,0.132251
4,0.453357,-0.000604,-0.175210,0.724042,0.470160,-0.321912,0.078653,-0.055731,0.095825,0.242058,...,-0.426778,-0.431006,-0.448972,-0.128222,-0.390948,0.058155,0.182607,-0.240953,-0.224155,0.686321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361270,-0.216155,-0.238402,-0.269877,0.023970,0.043256,-0.027265,0.142738,-0.150495,0.092939,0.227519,...,-0.162212,-0.010255,0.116205,0.099246,-0.176593,-0.094444,0.178244,-0.286559,-0.162561,-0.015202
361271,0.420884,-0.194318,0.308946,-0.639862,-0.459557,-0.030438,0.630819,-0.190128,0.462431,0.073191,...,-0.219007,-0.386471,-0.040322,0.097240,0.603163,-0.144438,0.414343,-0.060351,0.137738,-0.867416
361272,0.633937,0.302650,-0.668295,-0.248073,-0.451080,0.059631,-0.598430,-0.498062,-0.077794,-0.231575,...,-0.181393,-0.122025,0.073612,0.107578,-0.449006,0.363394,-0.023087,0.111334,-0.227908,0.246666
361273,0.136512,0.015302,-0.301967,0.014745,0.026121,0.008565,0.186953,-0.193920,-0.214579,0.237474,...,-0.202159,-0.122609,0.174777,0.114380,-0.302098,-0.198829,0.093275,0.047873,0.093695,0.180033


In [659]:
from sklearn import preprocessing
from sklearn.cluster import KMeans

# to normalise existing X
X_Norm = preprocessing.normalize(doc_vec_df.iloc[:,0:-5], axis=1)
X_Norm

kmeans = KMeans(n_clusters=5, random_state=0).fit(X_Norm)

In [660]:
kmeans.labels_

array([3, 1, 4, ..., 0, 3, 2], dtype=int32)

In [661]:
doc_vec_df['labels'] = kmeans.labels_
doc_vec_df[doc_vec_df.labels == 2]['docs'].values[6]

"celexa's sexual side effects not wearing off after discontinuation i've been on celexa/citalopram for about a year at 10mg and got off of it about 2 weeks ago. my emotions are somewhat dulled and i don't get horny or have full erections. i feel like a part of me is missing, like i might as well not have a penis. the stress it causes me isn't doing any good either. before anyone asks, yes, this is as important to me as treating my depression. **what can i do to get my libido back?** i exercise regularly and have a good diet, but there's been no change so far. **can anyone who's been through this give me some information or advice?** tl;dr: how long will celexa's sexual side effects take to wear off after i quit taking them? i'm a 175 pound, 18 year old male who took 10mg of celexa for a year. "

In [662]:
doc_vec_df.max_topic.unique()

array(['job', 'tired/sick', 'friend/best/girl', 'sad',
       'depression/anxiety/meds', 'depressed/suicidal/reason',
       'friends/lonely', 'school', 'sleep', 'kill/suicide'], dtype=object)

In [667]:
# doc_vec_df['max_topic'] = df_full.max_topic
# doc_vec_df[doc_vec_df.max_topic == 'kill/suicide'].labels.value_counts()

In [None]:
inertia = []
sil = []
# changing the number of clusters 
for k in range(5,30):
    
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_Norm)
    y_pred = km.predict(X_Norm)
    inertia.append((k, km.inertia_))
    print(k)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1,2, figsize=(12,4))

# Plotting Elbow Curve
x_iner = [x[0] for x in inertia]
y_iner  = [x[1] for x in inertia]
ax[0].plot(x_iner, y_iner)
ax[0].set_xlabel('Number of Clusters')
ax[0].set_ylabel('Intertia')
ax[0].set_title('Elbow Curve')

