# Initialize Models
This notebook walks you through building and saving the most basic 
models and matrices we used for analyzing our text data. 
It lets you build word2vec models and produce matrices for multiple subreddits.
The model itself is built on a corpus of all the data in the data folder, so clusters are consistent across subreddits.

We first import the libraries and utility files we are going to be using.

In [1]:
# Import useful mathematical libraries
import numpy as np
import pandas as pd

# Import useful Machine learning libraries
import gensim

#Import nltk list of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#import function for normalization
from sklearn.preprocessing import normalize

#import TF function
from sklearn.feature_extraction.text import CountVectorizer


# Import utility files
from utils import read_df, remove_links, clean_sentence, save_object, load_object, make_clustering_objects, weeks_since, stopless_text_generator, months_since

from pprint import pprint

[nltk_data] Downloading package stopwords to /Users/Alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Setup directories

If this is the first time doing this analysis, 
we first will set up all the directories we need
to save and load the models we will be using

In [2]:
import os

# Folders this notebook uses to save and load its models and other outputs.
directories = ['objects', 'models', 'clusters', 'matricies', 'excels']
for dirname in directories:
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test. It also
    # raises if the name is taken by a regular file instead of skipping it.
    os.makedirs(dirname, exist_ok=True)

In [7]:
# NOTE(review): scratch value - `res` is not referenced by any other cell
# visible in this notebook; candidate for removal.
res = [(12,89),(34,25),(36,45)]


12

#### Name Model

Before beginning the rest of our project, we select a name for our model.
This name will be used to save and load the files for this model.

In [3]:
# Base name used when saving and loading the files for this model.
model_name = "SOsAnalysis"

#### Parse and Clean Data

We first parse and clean our data. Our data is assumed to be in CSV format, 
in a directory labeled 'data'.
Files for all subreddits should be included in that directory.

In [4]:
# Get the data from the csv files via the project helper read_df.
# NOTE(review): the first argument repeats the model_name literal instead of
# reusing the variable, and the notebook text says the CSVs live in a 'data'
# directory - confirm what read_df expects here.
df = read_df('SOsAnalysis',extension = "/*.csv")

In [5]:
# Inspect the loaded data to ensure nothing went wrong.
# df.info() gives basic information about the DataFrame: make sure the column
# titles contain all intended fields and check the number of entries.
# The number of entries tells you how many posts you have.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149888 entries, 0 to 125
Data columns (total 15 columns):
title           149888 non-null object
created_utc     149888 non-null int64
author          149888 non-null object
ups             53528 non-null float64
downs           53528 non-null float64
num_comments    149888 non-null int64
id              149888 non-null object
name            45702 non-null object
from            0 non-null float64
from_id         0 non-null float64
selftext        141503 non-null object
subreddit       149888 non-null object
score           149888 non-null int64
url             149888 non-null object
permalink       149888 non-null object
dtypes: float64(4), int64(3), object(8)
memory usage: 18.3+ MB


In [6]:
# df.head() displays the first 5 entries of df
df.head()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,selftext,subreddit,score,url,permalink
0,Counseling,1470031667,ishihtzunot16,2.0,0.0,2,4vl48p,t3_4vl48p,,,I'm calling around to start counseling soon si...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vl48p/counseling/
1,Coming off olanzapine (hopefully!) - worried a...,1470042562,rogue_arrows,5.0,0.0,3,4vlluv,t3_4vlluv,,,"Hi everyone,\n\nI'm due for a doctor review to...",BipolarReddit,5,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vlluv/coming_off_ol...
2,nicotine and mania?,1470046312,sydalmighty,2.0,0.0,4,4vls32,t3_4vls32,,,"Hi, I recently tried snus the Skruf White Star...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vls32/nicotine_and_...
3,Manic Romance,1470055075,scurius,19.0,0.0,32,4vm9qn,t3_4vm9qn,,,I've been pulled away from Reddit very effecti...,BipolarReddit,19,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vm9qn/manic_romance/
4,help with dealing with on again off again boyf...,1470056821,thum96,2.0,0.0,3,4vme78,t3_4vme78,,,"sorry, this is going to be a long one, but i w...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vme78/help_with_dea...


In [7]:
# Clean the text in the dataframe

# Replace all NaN values with the empty string.
# (Columns that contained NaN, e.g. ups/downs, become object dtype afterwards.)
df = df.replace(np.nan, '', regex = True)

# Replace all deleted posts and authors with an empty string.
# The pattern is a raw string: "\[" is not a valid Python escape sequence, so
# the non-raw literal triggers an invalid-escape warning on current Pythons.
# The regex itself is unchanged.
# NOTE(review): posts whose selftext is "[removed]" are not blanked here, so
# the token "removed" survives into the cleaned text - confirm that is intended.
df = df.replace(r"\[deleted\]", '', regex = True)

# Concatenate title and post text
df["rawtext"] = df["title"] + " " + df["selftext"]

# Clean the raw text: remove links, lower-case, and strip unwanted characters
df["cleantext"] = df["rawtext"].apply(remove_links).apply(clean_sentence)

In [8]:
# Check that the cleaning was successful: every column should now be fully
# non-null, and the new rawtext and cleantext columns should be listed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149888 entries, 0 to 125
Data columns (total 17 columns):
title           149888 non-null object
created_utc     149888 non-null int64
author          149888 non-null object
ups             149888 non-null object
downs           149888 non-null object
num_comments    149888 non-null int64
id              149888 non-null object
name            149888 non-null object
from            149888 non-null object
from_id         149888 non-null object
selftext        149888 non-null object
subreddit       149888 non-null object
score           149888 non-null int64
url             149888 non-null object
permalink       149888 non-null object
rawtext         149888 non-null object
cleantext       149888 non-null object
dtypes: int64(3), object(14)
memory usage: 20.6+ MB


In [9]:
# Preview the first rows, including the new rawtext and cleantext columns
df.head()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,selftext,subreddit,score,url,permalink,rawtext,cleantext
0,Counseling,1470031667,ishihtzunot16,2,0,2,4vl48p,t3_4vl48p,,,I'm calling around to start counseling soon si...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vl48p/counseling/,Counseling I'm calling around to start counsel...,counseling im calling around to start counseli...
1,Coming off olanzapine (hopefully!) - worried a...,1470042562,rogue_arrows,5,0,3,4vlluv,t3_4vlluv,,,"Hi everyone,\n\nI'm due for a doctor review to...",BipolarReddit,5,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vlluv/coming_off_ol...,Coming off olanzapine (hopefully!) - worried a...,coming off olanzapine hopefully worried a...
2,nicotine and mania?,1470046312,sydalmighty,2,0,4,4vls32,t3_4vls32,,,"Hi, I recently tried snus the Skruf White Star...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vls32/nicotine_and_...,"nicotine and mania? Hi, I recently tried snus ...",nicotine and mania hi i recently tried snus ...
3,Manic Romance,1470055075,scurius,19,0,32,4vm9qn,t3_4vm9qn,,,I've been pulled away from Reddit very effecti...,BipolarReddit,19,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vm9qn/manic_romance/,Manic Romance I've been pulled away from Reddi...,manic romance ive been pulled away from reddit...
4,help with dealing with on again off again boyf...,1470056821,thum96,2,0,3,4vme78,t3_4vme78,,,"sorry, this is going to be a long one, but i w...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vme78/help_with_dea...,help with dealing with on again off again boyf...,help with dealing with on again off again boyf...


In [10]:
# df.tail() displays the last five entries of df
df.tail()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,selftext,subreddit,score,url,permalink,rawtext,cleantext
121,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,1485770276,Julianruice,,,0,5qzt7a,,,,[removed],BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5qzt7a/му_first_lifе_ех...,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,first lif rin i didnt bliv tht sit n hl t fi...
122,"New Here - Long, kind of a vent.",1485799040,DayDreamingofU,,,8,5r2c43,,,,Hi! Girlfriend of a wonderful man who has bipo...,BipolarSOs,2,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r2c43/new_here_long_ki...,"New Here - Long, kind of a vent. Hi! Girlfrien...",new here long kind of a vent hi girlfrien...
123,Loving someone with BP,1485817796,yesImind,,,2,5r4b5h,,,,https://i.imgur.com/VaeBUuj.gifv\n\n,BipolarSOs,32,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r4b5h/loving_someone_w...,Loving someone with BP https://i.imgur.com/Vae...,loving someone with bp https i imgur com vae...
124,Tell us something good!,1485836665,inlovedelicious,,,10,5r61ki,,,,What's something good that has been going on i...,BipolarSOs,9,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r61ki/tell_us_somethin...,Tell us something good! What's something good ...,tell us something good whats something good t...
125,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,1485858398,Noahrafi,,,0,5r7etw,,,,[removed],BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r7etw/му_rеаl_stоrу_аb...,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,rl str but tims with diffrnt girls fr gus...


### Stop Words Removal

After parsing and cleaning the data, we further preprocess it
by removing common words, known as stop words, with the help of NLTK.

In [11]:
# Get NLTK's list of English stop words
stop_words = stopwords.words('english')

In [12]:
# Display the list of stop words for reference
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# Add a "stoplesstext" column holding the clean text with stop words removed.
# stop_words is forwarded to the helper as its second positional argument.
df["stoplesstext"] = df["cleantext"].apply(stopless_text_generator, args=(stop_words,))

In [14]:
# Confirm the stoplesstext column was added and is fully populated
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149888 entries, 0 to 125
Data columns (total 18 columns):
title           149888 non-null object
created_utc     149888 non-null int64
author          149888 non-null object
ups             149888 non-null object
downs           149888 non-null object
num_comments    149888 non-null int64
id              149888 non-null object
name            149888 non-null object
from            149888 non-null object
from_id         149888 non-null object
selftext        149888 non-null object
subreddit       149888 non-null object
score           149888 non-null int64
url             149888 non-null object
permalink       149888 non-null object
rawtext         149888 non-null object
cleantext       149888 non-null object
stoplesstext    149888 non-null object
dtypes: int64(3), object(15)
memory usage: 21.7+ MB


In [15]:
# Preview the first rows, including the new stoplesstext column
df.head()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,selftext,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext
0,Counseling,1470031667,ishihtzunot16,2,0,2,4vl48p,t3_4vl48p,,,I'm calling around to start counseling soon si...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vl48p/counseling/,Counseling I'm calling around to start counsel...,counseling im calling around to start counseli...,counseling im calling around start counseling ...
1,Coming off olanzapine (hopefully!) - worried a...,1470042562,rogue_arrows,5,0,3,4vlluv,t3_4vlluv,,,"Hi everyone,\n\nI'm due for a doctor review to...",BipolarReddit,5,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vlluv/coming_off_ol...,Coming off olanzapine (hopefully!) - worried a...,coming off olanzapine hopefully worried a...,coming olanzapine hopefully worried withdrawal...
2,nicotine and mania?,1470046312,sydalmighty,2,0,4,4vls32,t3_4vls32,,,"Hi, I recently tried snus the Skruf White Star...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vls32/nicotine_and_...,"nicotine and mania? Hi, I recently tried snus ...",nicotine and mania hi i recently tried snus ...,nicotine mania hi recently tried snus skruf wh...
3,Manic Romance,1470055075,scurius,19,0,32,4vm9qn,t3_4vm9qn,,,I've been pulled away from Reddit very effecti...,BipolarReddit,19,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vm9qn/manic_romance/,Manic Romance I've been pulled away from Reddi...,manic romance ive been pulled away from reddit...,manic romance ive pulled away reddit effective...
4,help with dealing with on again off again boyf...,1470056821,thum96,2,0,3,4vme78,t3_4vme78,,,"sorry, this is going to be a long one, but i w...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vme78/help_with_dea...,help with dealing with on again off again boyf...,help with dealing with on again off again boyf...,help dealing boyfriend suspected bipolar sorry...


### Create Posts Lists

We now need to create a list of posts.
The posts lists will be a list of lists, where each internal list is a list of tokens for each post.
We will create a list of posts for each subreddit as well as a total list.

In [16]:
# Create allPosts: one list of whitespace-separated tokens per row of df
allPosts = [post_text.split() for post_text in df["stoplesstext"]]

In [17]:
# List the subreddits contained in the data.
# IF THERE IS ONLY ONE SUBREDDIT, LEAVE THIS LIST EMPTY!
# Make sure each subreddit name is spelled correctly.
subreddits = ["BipolarReddit","BipolarSOs","BPD","BPDlovedones"]
# Provide a label for each subreddit, in the same order as `subreddits`
# (labels are matched to names by position).
subreddit_abbr = ["Bipolar","BipolarSOs","BPD","BPDSOs"]

In [18]:
# Build a dictionary holding the information for each subreddit.
# Keys are the subreddit names; each value is a dict with:
#   'abbr'  - the label from subreddit_abbr at the same position
#   'df'    - the rows of df belonging to that subreddit
#   'posts' - one list of whitespace-separated tokens per row of that frame
subreddit_info = dict()

# enumerate() pairs each name with its label by position, instead of calling
# subreddits.index() on every iteration (a linear scan that always returns the
# first match, so a repeated name would pick up the wrong label).
for position, subreddit in enumerate(subreddits):
    # Wrap the filtered rows in a new DataFrame object specific to the subreddit
    subreddit_df = pd.DataFrame(df.loc[df['subreddit'] == subreddit])
    subreddit_info[subreddit] = {
        'abbr': subreddit_abbr[position],
        'df': subreddit_df,
        'posts': [text.split() for text in subreddit_df["stoplesstext"]],
    }
    

In [19]:
# Print a quick summary of every entry in subreddit_info:
# name, label, number of rows, number of posts, and the first ten posts.
for name, info in subreddit_info.items():
    print(name)
    print(info['abbr'])
    print(len(info['df']))
    print(len(info['posts']))
    print(info['posts'][:10])
    print()

BipolarReddit
Bipolar
30200
30200
[['counseling', 'im', 'calling', 'around', 'start', 'counseling', 'soon', 'since', 'deductible', 'finally', 'met', 'prepare', 'like', 'nervous', 'know', 'need', 'help'], ['coming', 'olanzapine', 'hopefully', 'worried', 'withdrawal', 'weight', 'come', 'hi', 'everyone', 'im', 'due', 'doctor', 'review', 'tomorrow', 'fairly', 'certain', 'ill', 'coming', 'olanzapine', 'years', 'took', 'mg', 'day', 'majority', 'years', 'except', 'last', 'months', 'ive', 'mg', 'lower', 'dosage', 'affect', 'tangible', 'way', 'wondering', 'anything', 'worried', 'terms', 'withdrawal', 'ive', 'read', 'scary', 'stuff', 'coming', 'olanzapine', 'context', 'im', 'medication', 'prescribed', 'olanzapine', 'psychotic', 'episode', 'induced', 'lack', 'sleep', 'stress', 'religious', 'spiritual', 'personal', 'growth', 'possibly', 'marijuana', 'sensitivity', 'since', 'first', 'episode', 'ive', 'one', 'hiccup', 'relapse', 'wasnt', 'sleeping', 'stressed', 'exposed', 'lot', 'religious', 'ideolo

In [20]:
# Total number of posts in allPosts
len(allPosts)

149888

In [21]:
# Cross-check: the per-subreddit post counts must add up to len(allPosts).
# Skipped entirely when no subreddits are listed.
tot = 0
if subreddits:
    for name in subreddits:
        n_posts = len(subreddit_info[name]['posts'])
        tot += n_posts
        print(name, ":  ", n_posts)
    print("Total:", tot)

    assert tot == len(allPosts)

BipolarReddit :   30200
BipolarSOs :   5520
BPD :   89840
BPDlovedones :   24328
Total: 149888


### Phrase Analysis

After parsing and cleaning the data we run the gensim phraser
tool on our text data to join phrases like "new york city" 
together to form the word "new_york_city"

In [22]:
# Train a phrase detector on all posts to join two-word phrases together
two_word_phrases = gensim.models.Phrases(allPosts)
# Wrap the trained detector in a Phraser; this is the object applied to
# token lists in the cells below.
two_word_phraser = gensim.models.phrases.Phraser(two_word_phrases)

In [23]:
# Train a phrase detector to join three-word phrases together.
# It is trained on the output of the two-word phraser, so already-joined
# pairs are seen as single tokens.
three_word_phrases = gensim.models.Phrases(two_word_phraser[allPosts])
three_word_phraser = gensim.models.phrases.Phraser(three_word_phrases)

In [24]:
# Update allPosts to reflect phrasing: apply the two-word phraser, then the
# three-word phraser, to every post.
# NOTE: allPosts is overwritten with its own phrased version, so re-running
# this cell applies the phrasers again to already-joined tokens. Re-run from
# the cell that creates allPosts instead.
allPosts = list(three_word_phraser[two_word_phraser[allPosts]])

In [25]:
# Apply the same two-step phrasing to the posts list of every subreddit.
# Each 'posts' entry is replaced by its phrased version, so this cell should
# be run only once per freshly built subreddit_info.
for s in subreddits:
    info = subreddit_info[s]
    bigrammed = two_word_phraser[info['posts']]
    info['posts'] = list(three_word_phraser[bigrammed])

In [26]:
# Update the DataFrame: "phrasetext" is stoplesstext with detected phrases
# joined, stored as one space-separated string per post.
def _join_phrases(text):
    """Split text on whitespace, apply both phrasers, and re-join with spaces."""
    return " ".join(three_word_phraser[two_word_phraser[text.split()]])

df["phrasetext"] = df["stoplesstext"].apply(_join_phrases)

In [27]:
# Preview the first rows, including the new phrasetext column
df.head()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,selftext,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext,phrasetext
0,Counseling,1470031667,ishihtzunot16,2,0,2,4vl48p,t3_4vl48p,,,I'm calling around to start counseling soon si...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vl48p/counseling/,Counseling I'm calling around to start counsel...,counseling im calling around to start counseli...,counseling im calling around start counseling ...,counseling im calling around start counseling ...
1,Coming off olanzapine (hopefully!) - worried a...,1470042562,rogue_arrows,5,0,3,4vlluv,t3_4vlluv,,,"Hi everyone,\n\nI'm due for a doctor review to...",BipolarReddit,5,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vlluv/coming_off_ol...,Coming off olanzapine (hopefully!) - worried a...,coming off olanzapine hopefully worried a...,coming olanzapine hopefully worried withdrawal...,coming olanzapine hopefully worried withdrawal...
2,nicotine and mania?,1470046312,sydalmighty,2,0,4,4vls32,t3_4vls32,,,"Hi, I recently tried snus the Skruf White Star...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vls32/nicotine_and_...,"nicotine and mania? Hi, I recently tried snus ...",nicotine and mania hi i recently tried snus ...,nicotine mania hi recently tried snus skruf wh...,nicotine mania hi recently tried snus skruf wh...
3,Manic Romance,1470055075,scurius,19,0,32,4vm9qn,t3_4vm9qn,,,I've been pulled away from Reddit very effecti...,BipolarReddit,19,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vm9qn/manic_romance/,Manic Romance I've been pulled away from Reddi...,manic romance ive been pulled away from reddit...,manic romance ive pulled away reddit effective...,manic romance ive pulled_away reddit effective...
4,help with dealing with on again off again boyf...,1470056821,thum96,2,0,3,4vme78,t3_4vme78,,,"sorry, this is going to be a long one, but i w...",BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vme78/help_with_dea...,help with dealing with on again off again boyf...,help with dealing with on again off again boyf...,help dealing boyfriend suspected bipolar sorry...,help dealing boyfriend suspected_bipolar sorry...


In [28]:
# Preview the last rows, including the new phrasetext column
df.tail()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,selftext,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext,phrasetext
121,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,1485770276,Julianruice,,,0,5qzt7a,,,,[removed],BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5qzt7a/му_first_lifе_ех...,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,first lif rin i didnt bliv tht sit n hl t fi...,first lif rin didnt bliv tht sit n hl find th ...,first lif rin didnt bliv tht sit n hl find th ...
122,"New Here - Long, kind of a vent.",1485799040,DayDreamingofU,,,8,5r2c43,,,,Hi! Girlfriend of a wonderful man who has bipo...,BipolarSOs,2,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r2c43/new_here_long_ki...,"New Here - Long, kind of a vent. Hi! Girlfrien...",new here long kind of a vent hi girlfrien...,new long kind vent hi girlfriend wonderful man...,new long kind vent hi girlfriend wonderful_man...
123,Loving someone with BP,1485817796,yesImind,,,2,5r4b5h,,,,https://i.imgur.com/VaeBUuj.gifv\n\n,BipolarSOs,32,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r4b5h/loving_someone_w...,Loving someone with BP https://i.imgur.com/Vae...,loving someone with bp https i imgur com vae...,loving someone bp https imgur com vaebuuj gifv,loving someone bp https_imgur_com vaebuuj gifv
124,Tell us something good!,1485836665,inlovedelicious,,,10,5r61ki,,,,What's something good that has been going on i...,BipolarSOs,9,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r61ki/tell_us_somethin...,Tell us something good! What's something good ...,tell us something good whats something good t...,tell us something good whats something good go...,tell us something good whats something good go...
125,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,1485858398,Noahrafi,,,0,5r7etw,,,,[removed],BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r7etw/му_rеаl_stоrу_аb...,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,rl str but tims with diffrnt girls fr gus...,rl str tims diffrnt girls fr gus removed,rl str tims diffrnt_girls fr gus removed


In [29]:
# Rebuild the per-subreddit DataFrames so they pick up the columns added to
# df since they were first created (the earlier copies predate phrasetext).
for s in subreddits:
    in_subreddit = df['subreddit'].isin([s])
    subreddit_info[s]['df'] = pd.DataFrame(df.loc[in_subreddit])

In [30]:
# Check that the dataframe was updated correctly
#Can take a long time, prints progress
#for i in range(len(allPosts)):
#    if i % 500 == 0:
#        print(i)
#    if not " ".join(allPosts[i]) == list(df["phrasetext"])[i]:
#        print("index :" + str(i) + " is incorrect")

In [31]:
#for s in subreddits:
#    print(s)
#    for i in range(len(subreddit_info[s]['posts'])):
#        if i % 500 == 0:
#            print(i)
#        if not " ".join(subreddit_info[s]['posts'][i]) == list(subreddit_info[s]['df']["phrasetext"])[i]:
#            print("index :" + str(i) + " is incorrect")

### Time Element

Add a column to the DataFrame that indicates which week each post occurred in,
and a column that indicates which month each post occurred in.


In [32]:
# Ask for the year the data starts in (the recorded run entered 2012), then
# store for every post the whole number of weeks returned by weeks_since
# for its created_utc timestamp and that base year.
baseyear = int(input('What year does the data start in: '))
df["week_no"] = df["created_utc"].map(lambda created: int(weeks_since(created, baseyear)))

What year does the data start in: 2012


In [76]:
# Inspect the first rows after adding the time column.
# NOTE(review): the recorded output already shows month_no, which is only
# assigned in a later cell - the cells were run out of order, so a fresh
# top-to-bottom run will show week_no but not month_no at this point.
df.head(1000)

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,...,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext,phrasetext,week_no,month_no
0,Counseling,1470031667,ishihtzunot16,2,0,2,4vl48p,t3_4vl48p,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vl48p/counseling/,Counseling I'm calling around to start counsel...,counseling im calling around to start counseli...,counseling im calling around start counseling ...,counseling im calling around start counseling ...,240,55
1,Coming off olanzapine (hopefully!) - worried a...,1470042562,rogue_arrows,5,0,3,4vlluv,t3_4vlluv,,,...,BipolarReddit,5,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vlluv/coming_off_ol...,Coming off olanzapine (hopefully!) - worried a...,coming off olanzapine hopefully worried a...,coming olanzapine hopefully worried withdrawal...,coming olanzapine hopefully worried withdrawal...,240,55
2,nicotine and mania?,1470046312,sydalmighty,2,0,4,4vls32,t3_4vls32,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vls32/nicotine_and_...,"nicotine and mania? Hi, I recently tried snus ...",nicotine and mania hi i recently tried snus ...,nicotine mania hi recently tried snus skruf wh...,nicotine mania hi recently tried snus skruf wh...,240,55
3,Manic Romance,1470055075,scurius,19,0,32,4vm9qn,t3_4vm9qn,,,...,BipolarReddit,19,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vm9qn/manic_romance/,Manic Romance I've been pulled away from Reddi...,manic romance ive been pulled away from reddit...,manic romance ive pulled away reddit effective...,manic romance ive pulled_away reddit effective...,240,55
4,help with dealing with on again off again boyf...,1470056821,thum96,2,0,3,4vme78,t3_4vme78,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vme78/help_with_dea...,help with dealing with on again off again boyf...,help with dealing with on again off again boyf...,help dealing boyfriend suspected bipolar sorry...,help dealing boyfriend suspected_bipolar sorry...,240,55
5,Lithium and slurred speech.,1470059322,Skeptic_mama,1,0,3,4vmkp9,t3_4vmkp9,,,...,BipolarReddit,1,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vmkp9/lithium_and_s...,Lithium and slurred speech. Is there any way t...,lithium and slurred speech is there any way t...,lithium slurred speech way help symptom lower ...,lithium slurred_speech way help symptom lower_...,240,55
6,Bipolar + ADHD,1470071516,nbvalkyrie,1,0,15,4vnkqg,t3_4vnkqg,,,...,BipolarReddit,1,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vnkqg/bipolar_adhd/,Bipolar + ADHD Does anyone have experience wit...,bipolar adhd does anyone have experience wit...,bipolar adhd anyone experience combination dia...,bipolar adhd anyone_experience combination dia...,240,55
7,Feeling somewhat confused about diagnosis,1470079222,zhongshiifu,2,0,2,4vo9dc,t3_4vo9dc,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vo9dc/feeling_somew...,Feeling somewhat confused about diagnosis Hi.\...,feeling somewhat confused about diagnosis hi ...,feeling somewhat confused diagnosis hi twentie...,feeling somewhat confused diagnosis hi twentie...,240,55
8,Bipolar mom and stepmom struggling with exhaus...,1470082189,stepmamabear,6,0,2,4voim0,t3_4voim0,,,...,BipolarReddit,6,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4voim0/bipolar_mom_a...,Bipolar mom and stepmom struggling with exhaus...,bipolar mom and stepmom struggling with exhaus...,bipolar mom stepmom struggling exhaustion im t...,bipolar mom stepmom struggling exhaustion im t...,240,55
9,Anxiety/ Stressful thoughts make my blood pres...,1470109622,LickingTheLimeLight,3,0,8,4vqktw,t3_4vqktw,,,...,BipolarReddit,3,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vqktw/anxiety_stres...,Anxiety/ Stressful thoughts make my blood pres...,anxiety stressful thoughts make my blood pres...,anxiety stressful thoughts make blood pressure...,anxiety stressful thoughts make blood_pressure...,240,55


In [77]:
# Inspect the last rows after adding the time column
df.tail()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,...,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext,phrasetext,week_no,month_no
121,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,1485770276,Julianruice,,,0,5qzt7a,,,,...,BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5qzt7a/му_first_lifе_ех...,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,first lif rin i didnt bliv tht sit n hl t fi...,first lif rin didnt bliv tht sit n hl find th ...,first lif rin didnt bliv tht sit n hl find th ...,266,60
122,"New Here - Long, kind of a vent.",1485799040,DayDreamingofU,,,8,5r2c43,,,,...,BipolarSOs,2,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r2c43/new_here_long_ki...,"New Here - Long, kind of a vent. Hi! Girlfrien...",new here long kind of a vent hi girlfrien...,new long kind vent hi girlfriend wonderful man...,new long kind vent hi girlfriend wonderful_man...,266,60
123,Loving someone with BP,1485817796,yesImind,,,2,5r4b5h,,,,...,BipolarSOs,32,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r4b5h/loving_someone_w...,Loving someone with BP https://i.imgur.com/Vae...,loving someone with bp https i imgur com vae...,loving someone bp https imgur com vaebuuj gifv,loving someone bp https_imgur_com vaebuuj gifv,266,60
124,Tell us something good!,1485836665,inlovedelicious,,,10,5r61ki,,,,...,BipolarSOs,9,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r61ki/tell_us_somethin...,Tell us something good! What's something good ...,tell us something good whats something good t...,tell us something good whats something good go...,tell us something good whats something good go...,266,60
125,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,1485858398,Noahrafi,,,0,5r7etw,,,,...,BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r7etw/му_rеаl_stоrу_аb...,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,rl str but tims with diffrnt girls fr gus...,rl str tims diffrnt girls fr gus removed,rl str tims diffrnt_girls fr gus removed,266,60


In [78]:
# Store for every post the whole number of months returned by months_since
# for its created_utc timestamp and the base year entered earlier.
df["month_no"] = df["created_utc"].map(lambda created: int(months_since(created, baseyear)))

In [79]:
# Inspect the first rows now that month_no has been added
df.head(1500)

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,...,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext,phrasetext,week_no,month_no
0,Counseling,1470031667,ishihtzunot16,2,0,2,4vl48p,t3_4vl48p,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vl48p/counseling/,Counseling I'm calling around to start counsel...,counseling im calling around to start counseli...,counseling im calling around start counseling ...,counseling im calling around start counseling ...,240,55
1,Coming off olanzapine (hopefully!) - worried a...,1470042562,rogue_arrows,5,0,3,4vlluv,t3_4vlluv,,,...,BipolarReddit,5,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vlluv/coming_off_ol...,Coming off olanzapine (hopefully!) - worried a...,coming off olanzapine hopefully worried a...,coming olanzapine hopefully worried withdrawal...,coming olanzapine hopefully worried withdrawal...,240,55
2,nicotine and mania?,1470046312,sydalmighty,2,0,4,4vls32,t3_4vls32,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vls32/nicotine_and_...,"nicotine and mania? Hi, I recently tried snus ...",nicotine and mania hi i recently tried snus ...,nicotine mania hi recently tried snus skruf wh...,nicotine mania hi recently tried snus skruf wh...,240,55
3,Manic Romance,1470055075,scurius,19,0,32,4vm9qn,t3_4vm9qn,,,...,BipolarReddit,19,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vm9qn/manic_romance/,Manic Romance I've been pulled away from Reddi...,manic romance ive been pulled away from reddit...,manic romance ive pulled away reddit effective...,manic romance ive pulled_away reddit effective...,240,55
4,help with dealing with on again off again boyf...,1470056821,thum96,2,0,3,4vme78,t3_4vme78,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vme78/help_with_dea...,help with dealing with on again off again boyf...,help with dealing with on again off again boyf...,help dealing boyfriend suspected bipolar sorry...,help dealing boyfriend suspected_bipolar sorry...,240,55
5,Lithium and slurred speech.,1470059322,Skeptic_mama,1,0,3,4vmkp9,t3_4vmkp9,,,...,BipolarReddit,1,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vmkp9/lithium_and_s...,Lithium and slurred speech. Is there any way t...,lithium and slurred speech is there any way t...,lithium slurred speech way help symptom lower ...,lithium slurred_speech way help symptom lower_...,240,55
6,Bipolar + ADHD,1470071516,nbvalkyrie,1,0,15,4vnkqg,t3_4vnkqg,,,...,BipolarReddit,1,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vnkqg/bipolar_adhd/,Bipolar + ADHD Does anyone have experience wit...,bipolar adhd does anyone have experience wit...,bipolar adhd anyone experience combination dia...,bipolar adhd anyone_experience combination dia...,240,55
7,Feeling somewhat confused about diagnosis,1470079222,zhongshiifu,2,0,2,4vo9dc,t3_4vo9dc,,,...,BipolarReddit,2,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vo9dc/feeling_somew...,Feeling somewhat confused about diagnosis Hi.\...,feeling somewhat confused about diagnosis hi ...,feeling somewhat confused diagnosis hi twentie...,feeling somewhat confused diagnosis hi twentie...,240,55
8,Bipolar mom and stepmom struggling with exhaus...,1470082189,stepmamabear,6,0,2,4voim0,t3_4voim0,,,...,BipolarReddit,6,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4voim0/bipolar_mom_a...,Bipolar mom and stepmom struggling with exhaus...,bipolar mom and stepmom struggling with exhaus...,bipolar mom stepmom struggling exhaustion im t...,bipolar mom stepmom struggling exhaustion im t...,240,55
9,Anxiety/ Stressful thoughts make my blood pres...,1470109622,LickingTheLimeLight,3,0,8,4vqktw,t3_4vqktw,,,...,BipolarReddit,3,https://www.reddit.com/r/BipolarReddit/comment...,/r/BipolarReddit/comments/4vqktw/anxiety_stres...,Anxiety/ Stressful thoughts make my blood pres...,anxiety stressful thoughts make my blood pres...,anxiety stressful thoughts make blood pressure...,anxiety stressful thoughts make blood_pressure...,240,55


In [80]:
# Preview the last rows of the frame as well.
df.tail()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,...,subreddit,score,url,permalink,rawtext,cleantext,stoplesstext,phrasetext,week_no,month_no
121,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,1485770276,Julianruice,,,0,5qzt7a,,,,...,BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5qzt7a/му_first_lifе_ех...,Му_first_lifе_ехреriеnсе._I_didn’t_bеliеvе_thа...,first lif rin i didnt bliv tht sit n hl t fi...,first lif rin didnt bliv tht sit n hl find th ...,first lif rin didnt bliv tht sit n hl find th ...,266,60
122,"New Here - Long, kind of a vent.",1485799040,DayDreamingofU,,,8,5r2c43,,,,...,BipolarSOs,2,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r2c43/new_here_long_ki...,"New Here - Long, kind of a vent. Hi! Girlfrien...",new here long kind of a vent hi girlfrien...,new long kind vent hi girlfriend wonderful man...,new long kind vent hi girlfriend wonderful_man...,266,60
123,Loving someone with BP,1485817796,yesImind,,,2,5r4b5h,,,,...,BipolarSOs,32,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r4b5h/loving_someone_w...,Loving someone with BP https://i.imgur.com/Vae...,loving someone with bp https i imgur com vae...,loving someone bp https imgur com vaebuuj gifv,loving someone bp https_imgur_com vaebuuj gifv,266,60
124,Tell us something good!,1485836665,inlovedelicious,,,10,5r61ki,,,,...,BipolarSOs,9,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r61ki/tell_us_somethin...,Tell us something good! What's something good ...,tell us something good whats something good t...,tell us something good whats something good go...,tell us something good whats something good go...,266,60
125,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,1485858398,Noahrafi,,,0,5r7etw,,,,...,BipolarSOs,0,https://www.reddit.com/r/BipolarSOs/comments/5...,/r/BipolarSOs/comments/5r7etw/му_rеаl_stоrу_аb...,Му rеаl stоrу аbоut 3 timеs with 3 diffеrеnt g...,rl str but tims with diffrnt girls fr gus...,rl str tims diffrnt girls fr gus removed,rl str tims diffrnt_girls fr gus removed,266,60


In [81]:
# Give every subreddit its own frame holding only the rows posted in it.
for s in subreddits:
    in_subreddit = df['subreddit'].isin([s])
    subreddit_info[s]['df'] = pd.DataFrame(df.loc[in_subreddit])

### Data Saving

After cleaning and parsing all of our data, we can now
save it, so that we can analyze it later without having
to go through lengthy computations.

In [82]:
# Persist the combined corpus first, then one posts object per subreddit.
save_object(allPosts, 'objects/', model_name + "-allPosts")
for s in subreddits:
    info = subreddit_info[s]
    posts_name = model_name + "-" + info['abbr'] + "Posts"
    save_object(info['posts'], 'objects/', posts_name)


In [83]:
# Persist the combined data frame, then the per-subreddit frames.
save_object(df, 'objects/', model_name + "-df")
for s in subreddits:
    info = subreddit_info[s]
    frame_name = model_name + "-" + info['abbr'] + "_df"
    save_object(info['df'], 'objects/', frame_name)

In [84]:
# Disabled example of reloading the saved objects instead of recomputing them.
# NOTE(review): the save cell above writes the corpus as "-allPosts", not "-posts" — confirm the name before enabling.
#posts = load_object('objects/', model_name + "-posts")
#df    = load_object('objects/', model_name + "-df")

### Initialize Word2Vec Model

After all of our data has been parsed and saved, 
we generate our Word2Vec Model

In [85]:
# Word2Vec hyperparameters, passed to gensim when the model is built.

# --- Vocabulary ---
# Words occurring fewer than 10 times in the data are dropped.
minimum_word_count = 10

# --- Architecture ---
# 1 = skip-gram model (instead of Continuous Bag of Words).
skip_gram = 1
# Hidden layer size, i.e. the length of each word vector.
hidden_layer_size = 300
# Context window size.
window_size = 5

# --- Output layer / training objective ---
# 1 = use hierarchical softmax.
hierarchical_softmax = 1
# Negative sampling set to 20; suits relatively small data sets,
# but becomes harder for larger datasets.
negative_sampling = 20

In [86]:
# Train Word2Vec on the full corpus with the hyperparameters chosen above.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 calls it `vector_size`) — confirm the pinned version.
word2vec_options = {
    "min_count": minimum_word_count,
    "sg": skip_gram,
    "size": hidden_layer_size,
    "window": window_size,
    "hs": hierarchical_softmax,
    "negative": negative_sampling,
}
model = gensim.models.Word2Vec(allPosts, **word2vec_options)

### Basic Model test

After generating our model, we run some basic tests
to ensure that it has captured some semantic information.

In [87]:
# Sanity check: the nearest neighbours of "kitten" should be other pets/animals.
model.wv.most_similar(positive = ["kitten"])

[('dog', 0.5431555509567261),
 ('puppy', 0.5408093333244324),
 ('cat', 0.5299469232559204),
 ('cats', 0.4829835891723633),
 ('cat_food', 0.44943973422050476),
 ('dogs', 0.443569153547287),
 ('pup', 0.4304730296134949),
 ('baby', 0.42699265480041504),
 ('christmas_tree', 0.4180358946323395),
 ('two_dogs', 0.4175592064857483)]

In [88]:
# Analogy check: father - man + woman.
model.wv.most_similar(positive = ["father", "woman"], negative = ["man"])

[('mother', 0.7084722518920898),
 ('mom', 0.6182020902633667),
 ('sister', 0.612905740737915),
 ('dad', 0.5882495045661926),
 ('brother', 0.5774303674697876),
 ('parents', 0.5569106340408325),
 ('grandmother', 0.5566683411598206),
 ('daughter', 0.5366436243057251),
 ('step_father', 0.5353041291236877),
 ('molesting', 0.5307393074035645)]

In [90]:
# Analogy check: king - man + woman.
model.wv.most_similar(positive = ["king", "woman"], negative = ["man"])

[('vincent', 0.4315842390060425),
 ('queen', 0.38722145557403564),
 ('marilyn_monroe', 0.3707948625087738),
 ('eternal_sunshine_spotless_mind', 0.36773523688316345),
 ('ellen_forney', 0.367343932390213),
 ('squirrel', 0.36536705493927),
 ('diamonds', 0.36462223529815674),
 ('netflix_show', 0.36105847358703613),
 ('poet', 0.35972946882247925),
 ('beyonce', 0.3593361973762512)]

### Save Model

After generating our model, and running some basic tests,
we now save it so that we can analyze it later without having
to go through lengthy computations. We also delete and then reload
the model, as an example of how to do so.

In [47]:
# Write the trained model to disk, then drop the in-memory copy.
model_path = 'models/' + model_name + '.model'
model.save(model_path)
del model

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [48]:
# Reload the model from the file written by the save cell.
saved_model_path = 'models/' + model_name + '.model'
model = gensim.models.Word2Vec.load(saved_model_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Generate Matrices

After generating our Word2Vec Model, we generate 
a collection of matrices that will be useful for
analysis. This includes a Words By Features matrix,
and a Posts By Words matrix. Note, we will use CamelCase 
for matrix names, and only matrix names.

In [49]:
# Initialize the list of words used
# Alphabetically sorted vocabulary of the trained model.
vocab_list = sorted(model.wv.vocab)

In [50]:
# Collect each word's vector as a plain Python list, in vocab_list order
vecs = [model.wv[word].tolist() for word in vocab_list]

In [51]:
# Stack the per-word lists into one 2-D numpy array (one row per word)
WordsByFeatures = np.asarray(vecs)
WordsByFeatures.shape

(39180, 300)

In [52]:
def _tokens_as_is(tokens):
    """Analyzer for pre-tokenised posts: return the tokens unchanged as a list."""
    return list(tokens)

countvec = CountVectorizer(vocabulary = vocab_list, analyzer = _tokens_as_is, min_df = 0)

# Make Posts By Words Matrix
allPostsByWords = countvec.fit_transform(allPosts)
allPostsByWords.shape

(149888, 39180)

In [53]:
# Build a Posts By Words count matrix per subreddit over the shared vocabulary.
for s in subreddits:
    info = subreddit_info[s]
    # The posts are already token lists, so the analyzer just hands them back.
    countvec = CountVectorizer(vocabulary = vocab_list, analyzer = (lambda tokens: list(tokens)), min_df = 0)
    info["PostsByWords"] = countvec.fit_transform(info['posts'])
    print(s)
    print(info["PostsByWords"].shape)

BipolarReddit
(30200, 39180)
BipolarSOs
(5520, 39180)
BPD
(89840, 39180)
BPDlovedones
(24328, 39180)


### Basic Matrix tests

After generating our matrices, we run some basic tests
to ensure that they seem reasonable before moving on
to the lengthier computations.

In [54]:
# Each Posts By Words matrix should have exactly one row per post
print(len(allPosts) == allPostsByWords.shape[0])
for s in subreddits:
    info = subreddit_info[s]
    print(len(info['posts']) == info['PostsByWords'].shape[0])

True
True
True
True
True


### Save Matrices

After generating our matrices, we save them so we can 
analyze them later without having to go through lengthy
computations.

In [55]:
# Persist the corpus-wide matrices, then each subreddit's Posts By Words matrix.
save_object(allPostsByWords,'matricies/', model_name + "-allPostsByWords")
save_object(WordsByFeatures,'matricies/', model_name + "-WordsByFeatures")
for s in subreddits:
    info = subreddit_info[s]
    matrix_name = model_name + "-" + info['abbr'] + "PostsByWords"
    save_object(info['PostsByWords'], 'matricies/', matrix_name)


### Generate Word Clusters

Now that we have generated and saved our matrices,
we will proceed to generate word clusters using 
k-means clustering, and save them for later analysis.

In [56]:
from sklearn.cluster import KMeans
# Fit k-means on the word vectors for several cluster counts (can take a long time).
# Each fitted model is saved; its inertia is kept in `fit` to compare values of K.
test_points = [12] + [k for k in range(25, 401, 25)]
fit = []
for n_clusters in test_points:
    print(n_clusters)
    kmeans = KMeans(n_clusters = n_clusters, random_state = 42)
    kmeans.fit(WordsByFeatures)
    save_object(kmeans, 'clusters/', model_name + "-words-cluster_model-" + str(n_clusters))
    fit.append(kmeans.inertia_)

12
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400


In [57]:
# Save the inertia values and the K values they belong to, then free both names.
save_object(fit, 'objects/', model_name + "-words-fit")
save_object(test_points, 'objects/', model_name + "-words-test_points")
del fit, test_points

### Create Months By Posts Matrix

In [58]:
# Allocate all-zero (months x posts) matrices: one row per month index 0..max, one column per post
n_months_all = df["month_no"].max() + 1
MonthsByAllPosts = np.zeros((n_months_all, len(df)))
for s in subreddits:
    sub_df = subreddit_info[s]['df']
    n_months_sub = sub_df['month_no'].max() + 1
    subreddit_info[s]['MonthsByPosts'] = np.zeros((n_months_sub, len(sub_df)))


In [59]:
# Report the shape of each freshly allocated matrix.
print(MonthsByAllPosts.shape)
for s in subreddits:
    months_by_posts = subreddit_info[s]["MonthsByPosts"]
    print(s)
    print(months_by_posts.shape)

(89, 149888)
BipolarReddit
(89, 30200)
BipolarSOs
(89, 5520)
BPD
(89, 89840)
BPDlovedones
(89, 24328)


In [60]:
# One-hot encode month membership: MonthsByAllPosts[m, j] = 1 when the j-th row of df has month_no == m.
# The column is looked up by name (not by tuple position) so the cell keeps working
# if more columns are appended to df, and the assignment is vectorised instead of
# looping over every row in Python.
month_rows = df["month_no"].values
post_cols = np.arange(len(df))
MonthsByAllPosts[month_rows, post_cols] = 1
print(MonthsByAllPosts.shape)

# Every post must belong to exactly one month.
assert (MonthsByAllPosts.sum(axis = 0) == 1).all()
    

(89, 149888)


In [61]:
# One-hot encode month membership per subreddit: entry [m, j] = 1 when the j-th row
# of that subreddit's frame has month_no == m. The column is addressed by name rather
# than by tuple position, and the fill/check are vectorised.
for s in subreddits:
    sub_df = subreddit_info[s]['df']
    months_by_posts = subreddit_info[s]['MonthsByPosts']
    # In-place write into the array stored in subreddit_info.
    months_by_posts[sub_df['month_no'].values, np.arange(len(sub_df))] = 1

    print(months_by_posts.shape)
    # Every post must belong to exactly one month.
    assert (months_by_posts.sum(axis = 0) == 1).all()

(89, 30200)
(89, 5520)
(89, 89840)
(89, 24328)


In [62]:
# Persist the month-membership matrices (combined, then per subreddit).
save_object(MonthsByAllPosts, 'matricies/', model_name + "-MonthsByAllPosts")

for s in subreddits:
    info = subreddit_info[s]
    matrix_name = model_name + "-" + info['abbr'] + "MonthsByPosts"
    save_object(info['MonthsByPosts'], 'matricies/', matrix_name)



### Create Posts By Clusters Matrix 

Now we need to create the PostsByClusters matrix using the below equation:

PostsByWords X WordsByClusters = PostsByClusters

In [63]:
#Pick one of the saved word clusterings
num_word_clusters = 100
cluster_model_name = model_name + '-words-cluster_model-' + str(num_word_clusters)
kmeans = load_object('clusters/', cluster_model_name)

clusters = make_clustering_objects(model, kmeans, vocab_list, WordsByFeatures)

#For each cluster keep only the word from every word_list entry
clusterWords = [[entry[0] for entry in cluster["word_list"]] for cluster in clusters]

countvec = CountVectorizer(vocabulary = vocab_list, analyzer = (lambda tokens: list(tokens)), min_df = 0)

#Create ClustersByWords matrix (each cluster's word list is treated as one "document")
ClusterByWords = countvec.fit_transform(clusterWords)

#Look at Dimensions
print(ClusterByWords.shape)

#Create WordsByClusters as the transpose
WordsByClusters = ClusterByWords.transpose()
print(WordsByClusters.shape)
# NOTE(review): unlike the other saved matrices there is no "-" between model_name and
# "WordsByClusters" in this file name — confirm whether downstream loaders expect that.
words_by_clusters_name = model_name + "WordsByClusters-" + str(num_word_clusters) + 'clusters'
save_object(WordsByClusters, 'matricies/', words_by_clusters_name)



(100, 39180)
(39180, 100)


In [64]:
# Print the index of every cluster whose word list contains 'williams'.
# Iterates over the clusters that actually exist instead of a hard-coded 100,
# so it stays correct when num_word_clusters is changed.
for cluster_index, cluster in enumerate(clusters):
    for word in cluster['word_list']:
        # word_list entries are tuples; element 0 is the word itself
        if word[0] == 'williams':
            print(cluster_index)

39


In [65]:
# Inspect one cluster object (index 45) to see its summary fields and word list.
clusters[45]

{'unique_words': 384,
 'total_freq': 231953,
 'word_list': [('acquaintance', 154),
  ('acquaintances', 255),
  ('acquainted', 26),
  ('active_social_life', 10),
  ('actively_avoid', 21),
  ('actively_avoiding', 12),
  ('afar', 47),
  ('alienate', 90),
  ('alienated', 210),
  ('alienates', 10),
  ('announcing', 28),
  ('approving', 10),
  ('arms_length', 38),
  ('around', 19609),
  ('asocial', 18),
  ('avoided', 509),
  ('avoiding_eye_contact', 10),
  ('backburner', 14),
  ('backstabbed', 14),
  ('bad_influence', 33),
  ('bad_influences', 11),
  ('bad_mouthing', 26),
  ('badmouthed', 16),
  ('badmouthing', 16),
  ('bailing', 68),
  ('barely_speak', 33),
  ('befriend', 86),
  ('befriended', 50),
  ('befriending', 33),
  ('behind_back', 505),
  ('behind_closed_doors', 41),
  ('behind_scenes', 26),
  ('bestest', 11),
  ('bestfriends', 11),
  ('besties', 16),
  ('bore', 93),
  ('boss_coworkers', 12),
  ('bounce_around', 10),
  ('boyfriends', 885),
  ('boyfriends_girlfriends', 19),
  ('bpdfr

In [66]:
#Posts By Clusters = Posts By Words x Words By Clusters
# Densify Words By Clusters once and reuse it for every product below.
dense_words_by_clusters = WordsByClusters.toarray()

allPostsByClusters = np.matrix(allPostsByWords.dot(dense_words_by_clusters))
print(allPostsByClusters.shape)

for s in subreddits:
    info = subreddit_info[s]
    info["PostsByClusters"] = np.matrix(info["PostsByWords"].dot(dense_words_by_clusters))
    print(s)
    print(info["PostsByClusters"].shape)

(149888, 100)
BipolarReddit
(30200, 100)
BipolarSOs
(5520, 100)
BPD
(89840, 100)
BPDlovedones
(24328, 100)


In [67]:
# Persist the Posts By Clusters matrices; file names carry the cluster count.
cluster_suffix = str(num_word_clusters) + 'clusters'
save_object(allPostsByClusters, 'matricies/', model_name + "-allPostsByClusters-" + cluster_suffix)

for s in subreddits:
    info = subreddit_info[s]
    matrix_name = model_name + '-' + info["abbr"] + "PostsByClusters-" + cluster_suffix
    save_object(info["PostsByClusters"], 'matricies/', matrix_name)
   

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same fixed vocabulary and pass-through analyzer as the count matrices, but TF-IDF weighted.
tfidfvec = TfidfVectorizer(vocabulary = vocab_list, analyzer = (lambda tokens: list(tokens)), min_df = 0)

# Make TF-IDF Posts By Words Matrix
TFIDFPostsByWords = tfidfvec.fit_transform(allPosts)
TFIDFPostsByWords.shape

(149888, 39180)

In [69]:
#TFIDF Posts By Clusters = TFIDF Posts By Words x Words By Clusters
TFIDFPostsByClusters = np.matrix(TFIDFPostsByWords.dot(WordsByClusters.toarray()))
print(TFIDFPostsByClusters.shape)

(149888, 100)


In [70]:
# Pull column 36 (one cluster's score for every post) out as a flat 1-D array,
# then sort post positions by ascending score and show the ten highest.
x = np.asarray(TFIDFPostsByClusters[:,36]).ravel()
idx = np.argsort(x)
idx[-10:]

array([129752,  86017, 116568, 102186,  21786, 115207,  97127, 136415,
       115492, 143180])

In [71]:
# Print the position and raw text of the ten highest-scoring posts, lowest of the ten first.
for row_number in idx[-10:]:
    print(row_number)
    print(df.iloc[row_number]['rawtext'])
    print()

129752
Another friendship ruined. 

86017
Is this discard behavior? Or more push/pull? 

116568
Why do the discards/break ups get worse each time? 

102186
Why do the discards/break ups get worse each time? 

21786
Is it possible for an ex pwBPD to discard you and not split you black? I was discarded after she had told me I was her soulmate and her forever. She had suddenly become cold and detached. It was almost if she looked right through me and all of her adoration for me just disappeared.
She said she had just fallen out of love and wanted to be alone. 
She said she would think of me fondly and cherish the moments we spent together. She also said she wasn’t heartbroken and said sorry for making me think she wanted a life with me.

Off she went.

Not too long ago, i found out that she had reconnected with an old friend when she discarded me.
They have been in a relationship since the day she left me.

I confronted her about her betrayal and she responded with cold, cruel words. The 

In [72]:
# Cross-check: positions of the ten largest scores, highest first.
sorted(range(len(x)), key = x.__getitem__, reverse = True)[:10]

[143180, 115492, 136415, 97127, 115207, 21786, 102186, 116568, 86017, 129752]

### Create Months By Clusters Matrix 

Now we need to create the MonthsByClusters matrix using the below equation:

MonthsByPosts X PostsByClusters = MonthsByClusters

In [73]:
#Create MonthsByClusters matrix through matrix multiplication:
# (months x posts) . (posts x clusters) -> (months x clusters)
ALL_MonthsByClusters = MonthsByAllPosts.dot(allPostsByClusters)
ALL_MonthsByClusters.shape

(89, 100)

In [74]:
# Same product per subreddit: Months By Posts x Posts By Clusters.
for s in subreddits:
    info = subreddit_info[s]
    info["MonthsByClusters"] = info["MonthsByPosts"].dot(info["PostsByClusters"])
    print(s)
    print(info["MonthsByClusters"].shape)

BipolarReddit
(89, 100)
BipolarSOs
(89, 100)
BPD
(89, 100)
BPDlovedones
(89, 100)


In [75]:
# Persist the Months By Clusters matrices; file names carry the cluster count.
cluster_suffix = str(num_word_clusters) + 'clusters'
save_object(ALL_MonthsByClusters, 'matricies/', model_name + "-ALL_MonthsByClusters-" + cluster_suffix)
for s in subreddits:
    info = subreddit_info[s]
    matrix_name = model_name + "-" + info["abbr"] + "_MonthsByClusters-" + cluster_suffix
    save_object(info["MonthsByClusters"], 'matricies/', matrix_name)


### Create Months By Clusters Matrix Normalizations

Normalize the Months By Clusters matrix

In [77]:
# Export the un-normalised Months By Clusters matrix to a spreadsheet.
filepath = 'excels/' + model_name + "-ALL_MonthsByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
amXc_df = pd.DataFrame(ALL_MonthsByClusters)
amXc_df.to_excel(filepath, index = False)

In [78]:
# L2-normalise along axis 1 (each month row), then save the result and export it to Excel.
cluster_suffix = str(num_word_clusters) + 'clusters'
ALL_L2A1MonthsByClusters = normalize(ALL_MonthsByClusters, norm='l2', axis = 1)
save_object(ALL_L2A1MonthsByClusters, 'matricies/', model_name + "-ALL_L2A1MonthsByClusters-" + cluster_suffix)

filepath = 'excels/' + model_name + "-ALL_L2A1MonthsByClusters-" + cluster_suffix + '.xlsx'
L2A1amXc_df = pd.DataFrame(ALL_L2A1MonthsByClusters)
L2A1amXc_df.to_excel(filepath, index = False)

In [79]:
# Export each subreddit's un-normalised Months By Clusters matrix to a spreadsheet.
for s in subreddits:
    info = subreddit_info[s]
    filepath = 'excels/' + model_name + '-' + info['abbr'] + "-MonthsByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
    mXc_df = pd.DataFrame(info["MonthsByClusters"])
    mXc_df.to_excel(filepath, index = False)

In [80]:
# L2-normalise each subreddit's Months By Clusters matrix along axis 1, then save and export it.
for s in subreddits:
    abbr = subreddit_info[s]['abbr']
    L2A1Normalized = normalize(subreddit_info[s]["MonthsByClusters"], norm = 'l2', axis = 1)
    save_object(L2A1Normalized, 'matricies/', model_name+'-'+abbr+'-L2A1MonthsByClusters-'+str(num_word_clusters)+'clusters' )
    # NOTE(review): the Excel name has no "-" before the cluster count, unlike the saved
    # object above — confirm whether that is intentional before renaming.
    filepath = 'excels/'+model_name+'-'+abbr+'-L2A1MonthsByClusters'+str(num_word_clusters)+'clusters.xlsx'
    NmXc_df = pd.DataFrame(L2A1Normalized)
    NmXc_df.to_excel(filepath, index = False)

### Create Weeks By Posts Matrix

In [81]:
# Allocate all-zero (weeks x posts) matrices: one row per week index 0..max, one column per post
n_weeks_all = df["week_no"].max() + 1
WeeksByAllPosts = np.zeros((n_weeks_all, len(df)))
for s in subreddits:
    sub_df = subreddit_info[s]['df']
    n_weeks_sub = sub_df['week_no'].max() + 1
    subreddit_info[s]['WeeksByPosts'] = np.zeros((n_weeks_sub, len(sub_df)))


In [82]:
# Report the shape of each freshly allocated matrix.
print(WeeksByAllPosts.shape)
for s in subreddits:
    weeks_by_posts = subreddit_info[s]["WeeksByPosts"]
    print(s)
    print(weeks_by_posts.shape)

(358, 68139)


In [83]:
# One-hot encode week membership: WeeksByAllPosts[w, j] = 1 when the j-th row of df has week_no == w.
# The column is looked up by name (not by tuple position) so the cell keeps working
# if the column order of df changes, and the assignment is vectorised instead of
# looping over every row in Python.
week_rows = df["week_no"].values
post_cols = np.arange(len(df))
WeeksByAllPosts[week_rows, post_cols] = 1

print(WeeksByAllPosts.shape)
# Every post must belong to exactly one week.
assert (WeeksByAllPosts.sum(axis = 0) == 1).all()

(358, 68139)


In [84]:
# One-hot encode week membership per subreddit: entry [w, j] = 1 when the j-th row
# of that subreddit's frame has week_no == w. The column is addressed by name rather
# than by tuple position, and the fill/check are vectorised.
for s in subreddits:
    sub_df = subreddit_info[s]['df']
    weeks_by_posts = subreddit_info[s]['WeeksByPosts']
    # In-place write into the array stored in subreddit_info.
    weeks_by_posts[sub_df['week_no'].values, np.arange(len(sub_df))] = 1

    print(weeks_by_posts.shape)
    # Every post must belong to exactly one week.
    assert (weeks_by_posts.sum(axis = 0) == 1).all()

In [85]:
# Persist the week-membership matrices (combined, then per subreddit).
save_object(WeeksByAllPosts, 'matricies/', model_name + "-WeeksByAllPosts")

for s in subreddits:
    info = subreddit_info[s]
    matrix_name = model_name + "-" + info['abbr'] + "WeeksByPosts"
    save_object(info['WeeksByPosts'], 'matricies/', matrix_name)



### Create Weeks By Clusters Matrix 

Now we need to create the WeeksByClusters matrix using the below equation:

WeeksByPosts X PostsByClusters = WeeksByClusters

In [86]:
#Create WeeksByClusters matrix through matrix multiplication:
# (weeks x posts) . (posts x clusters) -> (weeks x clusters)
ALL_WeeksByClusters = WeeksByAllPosts.dot(allPostsByClusters)
ALL_WeeksByClusters.shape

(358, 100)

In [87]:
# Same product per subreddit: Weeks By Posts x Posts By Clusters.
for s in subreddits:
    info = subreddit_info[s]
    info["WeeksByClusters"] = info["WeeksByPosts"].dot(info["PostsByClusters"])
    print(s)
    print(info["WeeksByClusters"].shape)

In [88]:
# Persist the Weeks By Clusters matrices; file names carry the cluster count.
cluster_suffix = str(num_word_clusters) + 'clusters'
save_object(ALL_WeeksByClusters, 'matricies/', model_name + "-ALL_WeeksByClusters-" + cluster_suffix)
for s in subreddits:
    info = subreddit_info[s]
    matrix_name = model_name + "-" + info["abbr"] + "_WeeksByClusters-" + cluster_suffix
    save_object(info["WeeksByClusters"], 'matricies/', matrix_name)


### Create Weeks By Clusters Matrix Normalizations

Normalize the Weeks By Clusters matrix

In [89]:
# Export the un-normalised Weeks By Clusters matrix to a spreadsheet.
filepath = 'excels/' + model_name + "-ALL_WeeksByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
awXc_df = pd.DataFrame(ALL_WeeksByClusters)
awXc_df.to_excel(filepath, index = False)

In [90]:
# L2-normalise along axis 1 (each week row), then save the result and export it to Excel.
cluster_suffix = str(num_word_clusters) + 'clusters'
ALL_L2A1WeeksByClusters = normalize(ALL_WeeksByClusters, norm='l2', axis = 1)
# NOTE(review): this object name has no "-" after model_name, unlike the Excel path below
# and the matching Months cell — confirm what downstream loaders expect before renaming.
save_object(ALL_L2A1WeeksByClusters, 'matricies/', model_name + "ALL_L2A1WeeksByClusters-" + cluster_suffix)

filepath = 'excels/' + model_name + "-ALL_L2A1WeeksByClusters-" + cluster_suffix + '.xlsx'
L2A1awXc_df = pd.DataFrame(ALL_L2A1WeeksByClusters)
L2A1awXc_df.to_excel(filepath, index = False)

In [91]:
# Export each subreddit's un-normalised Weeks By Clusters matrix to a spreadsheet.
for s in subreddits:
    info = subreddit_info[s]
    filepath = 'excels/' + model_name + '-' + info['abbr'] + "-WeeksByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
    wXc_df = pd.DataFrame(info["WeeksByClusters"])
    wXc_df.to_excel(filepath, index = False)

In [92]:
# L2-normalise each subreddit's Weeks By Clusters matrix along axis 1, then save and export it.
for s in subreddits:
    abbr = subreddit_info[s]['abbr']
    L2A1Normalized = normalize(subreddit_info[s]["WeeksByClusters"], norm = 'l2', axis = 1)
    save_object(L2A1Normalized, 'matricies/', model_name+'-'+abbr+'-L2A1WeeksByClusters-'+str(num_word_clusters)+'clusters' )
    # NOTE(review): the Excel name has no "-" before the cluster count, unlike the saved
    # object above — confirm whether that is intentional before renaming.
    filepath = 'excels/'+model_name+'-'+abbr+'-L2A1WeeksByClusters'+str(num_word_clusters)+'clusters.xlsx'
    NwXc_df = pd.DataFrame(L2A1Normalized)
    NwXc_df.to_excel(filepath, index = False)

In [93]:
# Number of rows in the combined frame.
len(df)

68139

In [94]:
# NOTE(review): dep_df is not defined (this cell raised NameError when last run);
# it looks like a leftover from a different analysis — confirm and remove.
len(dep_df)

NameError: name 'dep_df' is not defined

In [None]:
# NOTE(review): sw_df is not defined in the visible notebook — presumably a leftover; confirm and remove.
len(sw_df)

In [None]:
# NOTE(review): an_df is not defined in the visible notebook — presumably a leftover; confirm and remove.
len(an_df)

In [None]:
# Combined row count of three frames.
# NOTE(review): none of dep_df / sw_df / an_df is defined in the visible notebook — confirm and remove.
len(dep_df) + len(sw_df) + len(an_df)

In [None]:
# NOTE(review): depPostsByWords is not defined in the visible notebook — presumably a leftover; confirm and remove.
depPostsByWords

In [None]:
# Count columns of depPostsByWords whose column sum is >= 10 (in_dep) versus < 10 (not_in_dep),
# printing the running totals every 1000 columns.
# NOTE(review): depPostsByWords is not defined in the visible notebook, and the hard-coded 90335
# in the progress percentage is unrelated to the loop bound (shape[1]) — confirm before reuse.
in_dep = 0
not_in_dep = 0
for i in range(depPostsByWords.shape[1]):
    if i % 1000 == 0:
            print((i / 90335) * 100)
            print("in_dep: ", in_dep)
            print("not_in_dep: ", not_in_dep)
    if depPostsByWords[:,i].sum() < 10:
        not_in_dep += 1
    else:
        in_dep += 1

In [None]:
# Show both tallies and their total (the total equals the number of columns scanned).
print(in_dep)
print(not_in_dep)
print(in_dep + not_in_dep)

In [None]:
# Count columns of swPostsByWords whose column sum is >= 10 (in_sw) versus < 10 (not_in_sw),
# printing the running totals every 1000 columns.
# NOTE(review): swPostsByWords is not defined in the visible notebook, and the hard-coded 90335
# in the progress percentage is unrelated to the loop bound (shape[1]) — confirm before reuse.
in_sw = 0
not_in_sw = 0
for i in range(swPostsByWords.shape[1]):
    if i % 1000 == 0:
            print((i / 90335) * 100)
            print("in_sw: ", in_sw)
            print("not_in_sw: ", not_in_sw)
    if swPostsByWords[:,i].sum() < 10:
        not_in_sw += 1
    else:
        in_sw += 1


In [None]:
# Show both tallies and their total (the total equals the number of columns scanned).
print(in_sw)
print(not_in_sw)
print(in_sw + not_in_sw)

In [None]:
# Count columns of anPostsByWords whose column sum is >= 10 (in_an) versus < 10 (not_in_an),
# printing the running totals every 1000 columns.
# NOTE(review): anPostsByWords is not defined in the visible notebook, and the hard-coded 90335
# in the progress percentage is unrelated to the loop bound (shape[1]) — confirm before reuse.
in_an = 0
not_in_an = 0
for i in range(anPostsByWords.shape[1]):
    if i % 1000 == 0:
            print((i / 90335) * 100)
            print("in_an: ", in_an)
            print("not_in_an: ", not_in_an)
    if anPostsByWords[:,i].sum() < 10:
        not_in_an += 1
    else:
        in_an += 1
        

In [None]:
# Show both tallies and their total (the total equals the number of columns scanned).
print(in_an)
print(not_in_an)
print(in_an + not_in_an)

In [None]:
# Print every vocabulary word whose column sum in depPostsByWords is below 10.
# NOTE(review): depPostsByWords is not defined in the visible notebook, and the progress
# fraction divides by a hard-coded 90335 rather than len(vocab_list) — confirm before reuse.
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
    if depPostsByWords[:,i].sum() < 10:
        print(vocab_list[i])

In [None]:
# Print every vocabulary word whose count recorded by the model is below 10.
for word in vocab_list:
    word_count = model.wv.vocab[word].count
    if word_count < 10:
        print(word)

In [None]:
# Print every vocabulary word that occurs fewer than 10 times across all posts.
# Column totals are computed once (slicing a sparse matrix column by column is slow),
# and progress is reported relative to the real vocabulary size instead of a hard-coded 90335.
word_totals = np.asarray(allPostsByWords.sum(axis = 0)).ravel()
vocab_size = len(vocab_list)
for i, word in enumerate(vocab_list):
    if i % 1000 == 0:
        print("PROGRESS: ", i/vocab_size)
    if word_totals[i] < 10:
        print(word)

In [None]:
# Count vocabulary words whose column sum is below 10, first in depPostsByWords and then
# in swPostsByWords; `cnt` is reset between the two passes, so only the second total survives.
# NOTE(review): neither matrix is defined in the visible notebook, and the progress fraction
# divides by a hard-coded 90335 rather than len(vocab_list) — confirm before reuse.
cnt = 0
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
        print(cnt)
    if depPostsByWords[:,i].sum() < 10:
        cnt += 1
        #print(vocab_list[i])
print("cnt  ", cnt)

cnt = 0
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
        print(cnt)
    if swPostsByWords[:,i].sum() < 10:
        cnt += 1
        #print(vocab_list[i])
print("cnt  ", cnt)

In [None]:
# Print every vocabulary word whose column sum in anPostsByWords is below 10.
# NOTE(review): anPostsByWords is not defined in the visible notebook, and the progress
# fraction divides by a hard-coded 90335 rather than len(vocab_list) — confirm before reuse.
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
    if anPostsByWords[:,i].sum() < 10:
        print(vocab_list[i])

In [None]:
# Count vocabulary words whose column sum in anPostsByWords is below 10.
# NOTE(review): anPostsByWords is not defined in the visible notebook, and the progress
# fraction divides by a hard-coded 90335 rather than len(vocab_list) — confirm before reuse.
cnt = 0
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
        print(cnt)
    if anPostsByWords[:,i].sum() < 10:
        cnt += 1
        #print(vocab_list[i])
print("cnt  ", cnt)

In [None]:
# Vocabulary words remaining after subtracting the count from the most recent counting cell.
len(vocab_list) - cnt