# Initialize Models
This notebook will walk you through building and saving the most basic 
models and matricies we used for analyzing our text data. 
This notebook allows you to build word2vec models and produce the following matrices for multiple subreddits.
The model itself is built on a corpus of all the data in the data folder so clusters are consistent across subreddit.

We first import the libraries and utility files we are going to be using.

In [1]:
# Import useful mathematical libraries
import numpy as np
import pandas as pd

# Import useful Machine learning libraries
import gensim

#Import nltk list of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#import function for normalization
from sklearn.preprocessing import normalize

#import TF function
from sklearn.feature_extraction.text import CountVectorizer


# Import utility files
from utils import read_df, remove_links,remove_comments, clean_sentence, save_object, load_object, make_clustering_objects, weeks_since, stopless_text_generator, months_since, clean_id

from pprint import pprint

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eliseglaser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Setup directories

If this is the first time doing this analysis, 
we first will set up all the directories we need
to save and load the models we will be using

In [2]:
import os
directories = ['objects', 'models', 'clusters', 'matricies', 'excels']
for dirname in directories:
    if not os.path.exists(dirname):
        os.makedirs(dirname)

#### Name Model

Before begining the rest of our project, we select a name for our model.
This name will be used to save and load the files for this model

In [3]:
model_name = "2019_SW_PandC"

In [4]:
#Make a dataframe from just the posts
dfP = read_df('SuicideWatchPosts',extension = "/*.csv")

#Make comment data frame and join to keep all info
dfP["type"] = 'P'
dfC = read_df('SuicideWatchComments',extension = "/*.csv")
dfC["type"] = 'C'
df = dfP.merge(dfC, on=['created_utc','id','author','ups','score','downs','subreddit','name', 'type'], how='outer')


In [None]:
# Do an inspection of our data to ensure nothing went wrong
#df.info gives basic info on the DataFrame, specifically make sure column titles contain 
#all intended fields and the number of entries
#The number of entries will tell you how many posts you have
df.info()

In [None]:
#df.head() will print out the first 5 entries to the df
df.head()

In [5]:
#Replace all NaN values with the empty string
df = df.replace(np.nan, '', regex = True)
# Remove any comments that aren't directly to post
df = remove_comments(df)
#Replace all deleted posts and authors with an empty string
df = df.replace("\[deleted\]", '', regex = True)

#Concatenate title and post text if submission
df["rawtext"] = df["title"] + " " + df["selftext"] + " " +df["body"]

#Clean the raw text, removing links, lower casing, removed characters
df["cleantext"] = df["rawtext"].apply(remove_links).apply(clean_sentence)

In [None]:
# Check that the cleaning was successful
df.info()

In [None]:
df.head()

In [None]:
#df.tail() will print out the last five entries of the df
df.tail()

### Stop Words Removal

After parsing and cleaning the data we further preprocess the data
by removing the common words known as stop words using the help of nltk

In [6]:
#Get list of stop words
stop_words = stopwords.words('english')

In [None]:
#The list of stop words for reference
stop_words

In [7]:
#Add a column to the DataFrame for the clean text with stop words removed
df["stoplesstext"] = df["cleantext"].apply(lambda s: stopless_text_generator(s,stop_words))

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

### Create Posts Lists

We now need to create a list of posts.
The posts lists will be a list of lists, where each internal list is a list of tokens for each post.
We will create a list of posts for each subreddit as well as a total list.

In [8]:
#Split dataframe again into just posts
dfP = df[df["type"] == 'P']

#Create list of post text
posts = dfP["stoplesstext"].apply(lambda str: str.split()).tolist()

#Create list of comment text
dfC = df[df["type"] == 'C']
comments = dfC["stoplesstext"].apply(lambda str: str.split()).tolist()

#Saves scores for later linear regression
scores = dfC['score']

#Create list of both
postsAndComments = posts + comments

## Clean and match posts and responses
The next few steps use the id of the post to get all of its responses together. It cleans the data by removing empty or duplicate comments.

In [9]:
#Get just the cleaned text and post id.
dfPtext = dfP[['id', 'cleantext']]
#Cleans the comment ID to match it to the post ID
dfC["id"] = dfC["link_id"].apply(clean_id)
#Clean out duplicates
dfC = dfC.drop_duplicates('cleantext').sort_index()
#Clean out empty commments
dfC = dfC[dfC['cleantext'].map(len) > 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [10]:
#Run this to get all comments for a post
dfCtext = dfC[['id', 'cleantext']]
dfCtext = dfCtext.groupby(['id'])['cleantext'].apply(', '.join).reset_index()
dfCtext.head()

Unnamed: 0,id,cleantext
0,1002ce,just wanted to let you know that your chance...
1,1002uz,hey im here if you wanna pm me if not we c...
2,10033x,it will not be painless this semester has ...
3,1003yf,glad to hear youre regaining control and get...
4,10054c,you should probably warm it back up first b...


In [None]:
#run this to get top comment for a post (or top comments if equal score)
dfCtoptext = dfC[['id', 'cleantext', 'score']]
dfCtoptext = dfCtext[dfCtext['score'] == dfCtext.groupby('id')['score'].transform('max')]
dfCtoptext = dfCtext.groupby(['id'])['cleantext'].apply(', '.join).reset_index()
dfCtoptext.head()

In [11]:
#Merge the posts and their comments by matching their IDs
#Change dfCtext to dfCtoptext if only looking at highest scored comments
dfMatch = dfPtext.merge(dfCtext, on=['id'], how='inner')
dfMatch.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56469 entries, 0 to 56468
Data columns (total 3 columns):
id             56469 non-null object
cleantext_x    56469 non-null object
cleantext_y    56469 non-null object
dtypes: object(3)
memory usage: 1.7+ MB


In [None]:
dfMatch.head()

### Phrase Analysis

After parsing and cleaning the data we run the gensim phraser
tool on our text data to join phrases like "new york city" 
together to form the word "new_york_city"

In [16]:
# Train a phraseDetector to join two word phrases together
#This is used to generate a model on all text from both posts and comments
two_word_phrases = gensim.models.Phrases(postsAndComments)
two_word_phraser = gensim.models.phrases.Phraser(two_word_phrases)


In [17]:
# If you want one model for each
two_word_phrases_P = gensim.models.Phrases(posts)
two_word_phraser_P = gensim.models.phrases.Phraser(two_word_phrases_P)

two_word_phrases_C = gensim.models.Phrases(comments)
two_word_phraser_C = gensim.models.phrases.Phraser(two_word_phrases_C)

In [18]:
# Train a phraseDetector to join three word phrases together
three_word_phrases = gensim.models.Phrases(two_word_phraser[postsAndComments])
three_word_phraser = gensim.models.phrases.Phraser(three_word_phrases)

In [19]:
#For two models
# If you want one model for each
three_word_phrases_P = gensim.models.Phrases(two_word_phraser[posts])
three_word_phraser_P = gensim.models.phrases.Phraser(three_word_phrases_P)

three_word_phrases_C = gensim.models.Phrases(two_word_phraser[comments])
three_word_phraser_C = gensim.models.phrases.Phraser(three_word_phrases_C)

In [20]:
#Update lists to reflect phrasing
postsAndComments = list(three_word_phraser[two_word_phraser[postsAndComments]])

In [21]:
posts = list(three_word_phraser_P[two_word_phraser_P[posts]])
comments= list(three_word_phraser_C[two_word_phraser_C[comments]])

In [25]:
# Update Data frames
df["phrasetext"] = df["stoplesstext"].apply(lambda str: " ".join(three_word_phraser[two_word_phraser[str.split()]]))
dfP["phrasetext"] = dfP["stoplesstext"].apply(lambda str: " ".join(three_word_phraser_P[two_word_phraser_P[str.split()]]))
dfC["phrasetext"] = dfC["stoplesstext"].apply(lambda str: " ".join(three_word_phraser_C[two_word_phraser_C[str.split()]]))





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
df.head()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,...,url,permalink,type,body,link_id,parent_id,rawtext,cleantext,stoplesstext,phrasetext
0,Give me a reason to not kill myself,1493598118,,,,6,68ivqc,,,,...,https://www.reddit.com/r/SuicideWatch/comments...,/r/SuicideWatch/comments/68ivqc/give_me_a_reas...,P,,,,Give me a reason to not kill myself,give me a reason to not kill myself,give reason kill,give reason kill
1,heartbroken.,1493598528,mareikenebel,,,33,68iwwh,,,,...,https://www.reddit.com/r/SuicideWatch/comments...,/r/SuicideWatch/comments/68iwwh/heartbroken/,P,,,,heartbroken. I'm new to Reddit. my boyfriend u...,heartbroken im new to reddit my boyfriend us...,heartbroken im new reddit boyfriend used app l...,heartbroken im new reddit boyfriend used app l...
2,"I don't see any reason to live, but I still ha...",1493599344,Throwyourbluesaway,,,2,68iz9m,,,,...,https://www.reddit.com/r/SuicideWatch/comments...,/r/SuicideWatch/comments/68iz9m/i_dont_see_any...,P,,,,"I don't see any reason to live, but I still ha...",i dont see any reason to live but i still hav...,dont see reason live still havent killed yet p...,dont see reason live still havent_killed_yet p...
3,All I can think about,1493599645,throwaway10802930,,,0,68j066,,,,...,https://www.reddit.com/r/SuicideWatch/comments...,/r/SuicideWatch/comments/68j066/all_i_can_thin...,P,,,,All I can think about is taking a long drive t...,all i can think about is taking a long drive t...,think taking long drive beach watching waves c...,think taking long drive_beach watching waves c...
4,Idk what to type here,1493599725,helpme11233211,,,17,68j0ed,,,,...,https://www.reddit.com/r/SuicideWatch/comments...,/r/SuicideWatch/comments/68j0ed/idk_what_to_ty...,P,,,,Idk what to type here I tried to kill myself a...,idk what to type here i tried to kill myself a...,idk type tried kill week ago still feel suicid...,idk type tried kill week_ago still feel suicid...


In [27]:
df.tail()

Unnamed: 0,title,created_utc,author,ups,downs,num_comments,id,name,from,from_id,...,url,permalink,type,body,link_id,parent_id,rawtext,cleantext,stoplesstext,phrasetext
1118878,,1393629047,megaflubbie,1,0,,cfrbi68,t1_cfrbi68,,,...,,,C,What is wrong? tell your story please. I am in...,t3_1z7x3r,t3_1z7x3r,What is wrong? tell your story please. I am ...,what is wrong tell your story please i am ...,wrong tell story please interested,wrong tell story please interested
1118879,,1393629181,Beetle559,1,0,,cfrbk69,t1_cfrbk69,,,...,,,C,Admins were messaged and the post has been rem...,t3_1z7spg,t3_1z7spg,Admins were messaged and the post has been r...,admins were messaged and the post has been r...,admins messaged post removed ill deleting thanks,admins messaged post removed ill deleting thanks
1118880,,1393629192,JamjamR,1,0,,cfrbkc7,t1_cfrbkc7,,,...,,,C,"Keep your head up! If possible, try to turn of...",t3_1z7so3,t3_1z7so3,"Keep your head up! If possible, try to turn ...",keep your head up if possible try to turn ...,keep head possible try turn phone computer go ...,keep head possible try turn phone computer go ...
1118882,,1393629209,skyqween,1,0,,cfrbkk7,t1_cfrbkk7,,,...,,,C,Please read the sidebar rules. Advertising you...,t3_1z7giv,t3_1z7giv,Please read the sidebar rules. Advertising y...,please read the sidebar rules advertising y...,please read sidebar rules advertising helper m...,please_read_sidebar rules_advertising helper_m...
1118888,,1393629936,,1,0,,cfrbv31,t1_cfrbv31,,,...,,,C,,t3_1z7x3r,t3_1z7x3r,,,,


In [None]:
# Check that the dataframe was updated correctly
#Can take a long time, prints progress
#for i in range(len(allPosts)):
#   if i % 500 == 0:
#        print(i)
#    if not " ".join(allPosts[i]) == list(df["phrasetext"])[i]:
#       print("index :" + str(i) + " is incorrect")

### Time Element

Add a column to the DataFrame that indicates which week the post occured in
Add a column to the DataFrame that indicates which month the post occured in


In [None]:
baseyear = int(input('What year does the data start in: '))
df["week_no"] = df["created_utc"].apply(lambda utc: int(weeks_since(utc, baseyear)))

In [None]:
df.head(1000)

In [None]:
df.tail()

In [None]:
df["month_no"] = df["created_utc"].apply(lambda utc: int(months_since(utc, baseyear)))

In [None]:
df.head(1500)

In [None]:
df.tail()

### Data Saving

After cleaning and parsing all of our data, we can now
save it, so that we can analysis it later without having
to go through lengthy computations

### Initialize Word2Vec Model

After all of our data has been parsed and saved, 
we generate our Word2Vec Model

In [32]:
# Set the minimum word count to 10. This removes all words that appear less than 10 times in the data
minimum_word_count = 10
# Set skip gram to 1. This sets gensim to use the skip gram model instead of the Continuous Bag of Words model
skip_gram = 1
# Set Hidden layer size to 300.
hidden_layer_size = 300
# Set the window size to 5. 
window_size = 5
# Set hierarchical softmax to 1. This sets gensim to use hierarchical softmax
hierarchical_softmax = 1
# Set negative sampling to 20. This is good for relatively small data sets, but becomes harder for larger datasets
negative_sampling = 20

In [33]:
# Build the model on all the text
model = gensim.models.Word2Vec(postsAndComments, min_count = minimum_word_count, sg = skip_gram, size = hidden_layer_size,
                                   window = window_size, hs = hierarchical_softmax, negative = negative_sampling)

KeyboardInterrupt: 

In [36]:
#If building two models
modelP = gensim.models.Word2Vec(posts, min_count = minimum_word_count, sg = skip_gram, size = hidden_layer_size,
                                   window = window_size, hs = hierarchical_softmax, negative = negative_sampling)
modelC = gensim.models.Word2Vec(comments, min_count = minimum_word_count, sg = skip_gram, size = hidden_layer_size,
                                   window = window_size, hs = hierarchical_softmax, negative = negative_sampling)

KeyboardInterrupt: 

### Basic Model test

After generating our model, we run some basic tests
to ensure that it has captured some semantic information results

In [39]:
model = gensim.models.Word2Vec.load('models/' + model_name + '.model')
modelP = gensim.models.Word2Vec.load('models/' + '2019_SW_P' + '.model')
modelC = gensim.models.Word2Vec.load('models/' + '2019_SW_C' + '.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [40]:
model.wv.most_similar(positive = ["sober"])

[('ive_seen_alcoholics', 0.5500342845916748),
 ('fault_responsibility_ensure', 0.5276337265968323),
 ('clean_sober', 0.505244255065918),
 ('battling_addiction_well', 0.5051607489585876),
 ('happy_fulfilling_lives', 0.4990171790122986),
 ('drunk', 0.47476282715797424),
 ('sobriety', 0.45957720279693604),
 ('worse_benzos_special', 0.4538528323173523),
 ('quit_drinking_start_addressing', 0.44979527592658997),
 ('stay_sober', 0.44713395833969116)]

In [41]:
modelP.wv.most_similar(positive = ["sober"])

[('drunk', 0.5707206130027771),
 ('clean_sober', 0.5328336954116821),
 ('drinking', 0.5269137620925903),
 ('relapsed', 0.5151023268699646),
 ('stayed_sober', 0.5045028924942017),
 ('stay_sober', 0.5033601522445679),
 ('smoking_cigs', 0.49789664149284363),
 ('intoxicated', 0.49661320447921753),
 ('rehab', 0.4857616722583771),
 ('detoxed', 0.4766559898853302)]

In [42]:
modelC.wv.most_similar(positive = ["sober"])

[('happy_fulfilling_lives', 0.5655555129051208),
 ('ive_seen_alcoholics', 0.5632500648498535),
 ('quit_drinking_start_addressing', 0.4905615448951721),
 ('waving', 0.42963454127311707),
 ('well_rested', 0.4177454710006714),
 ('aa_na', 0.4107852578163147),
 ('drug_addicted', 0.3982229232788086),
 ('altered_state', 0.39123648405075073),
 ('recovering', 0.38190215826034546),
 ('relapsed', 0.3588559627532959)]

In [43]:
model.wv.most_similar(positive = ["father", "woman"], negative = ["man"])

[('husband', 0.4344366788864136),
 ('mother', 0.42998647689819336),
 ('okay_mourn_passing', 0.4196780323982239),
 ('sister', 0.41342467069625854),
 ('forced_abortion', 0.4105724096298218),
 ('still_connected_perfectly', 0.4092947244644165),
 ('previous_marriage', 0.40773332118988037),
 ('remarried', 0.39508742094039917),
 ('siblings', 0.38945552706718445),
 ('fiance', 0.3832973837852478)]

In [44]:
model.wv.most_similar(positive = ["family", "angry"], negative = ["love"])

[('parents', 0.3977805972099304),
 ('mom', 0.3689156174659729),
 ('sad_hear_incompetence', 0.3616834282875061),
 ('irate', 0.3553171753883362),
 ('upset', 0.3504619598388672),
 ('unsupportive', 0.34635305404663086),
 ('family_members', 0.3441495895385742),
 ('vital_essential_personnel_supportive', 0.34184131026268005),
 ('angry_frustrated', 0.3396082818508148),
 ('everything_short_sweet', 0.339534193277359)]

### Save Model

After generating our model, and runing some basic tests,
we now save it so that we can analysis it later without having
to go through lengthy computations. We also delete and then reload
the model, as an example of how to do so.

In [45]:
model.save('models/' + model_name + '.model')
modelP.save('models/' + model_name + '-posts' + '.model')
modelC.save('models/' + model_name + '-comments' + '.model')
del model

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [46]:
model = gensim.models.Word2Vec.load('models/' + model_name + '.model')
modelP = gensim.models.Word2Vec.load('models/' + model_name + '-posts' + '.model')
modelC = gensim.models.Word2Vec.load('models/' + model_name + '-comments' + '.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Generate Matricies

After generating our Word2Vec Model, we generate 
a collection of matricies that will be useful for
analysis. This includes a Words By feature matrix,
and a Post By Words Matrix. Note, we will use camelCase 
for matrix names, and only matrix names

In [47]:
# Initialize the list of words used
vocab_list = sorted(list(model.wv.vocab))

In [48]:
vocab_listP = sorted(list(modelP.wv.vocab))
vocab_listC = sorted(list(modelC.wv.vocab))

In [55]:
# Extract the word vectors
vecs = []
for word in vocab_list:
    vecs.append(model.wv[word].tolist())

In [56]:
vecsP = []
for word in vocab_listP:
    vecsP.append(modelP.wv[word].tolist())
    
vecsC = []
for word in vocab_listC:
    vecsC.append(modelC.wv[word].tolist())

In [54]:
len(vocab_listP)

0

In [57]:
# change array format into numpy array
WordsByFeatures = np.array(vecs)
WordsByFeatures.shape

(68601, 300)

In [58]:
#just words from posts
PostWordsByFeatures = np.array(vecsP)
PostWordsByFeatures.shape

(44155, 300)

In [59]:
#just words from comments
CommentWordsByFeatures = np.array(vecsC)
CommentWordsByFeatures.shape

(45621, 300)

In [None]:
#Use this as check to see they match
vocab_list[58355]
print(WordsByFeatures[58355])

In [60]:
countvec = CountVectorizer(vocabulary = vocab_list, analyzer = (lambda lst:list(map((lambda s:s), lst))), min_df = 0)

# Make Posts By Words Matrix
PostsByWords = countvec.fit_transform(posts)
PostsByWords.shape

#Make comments by words matrix
CommentsByWords = countvec.fit_transform(comments)
CommentsByWords.shape


(376604, 68601)

In [61]:
countvecP = CountVectorizer(vocabulary = vocab_listP, analyzer = (lambda lst:list(map((lambda s:s), lst))), min_df = 0)

# Make Posts By Words Matrix- only post words
PostsByPostWords = countvecP.fit_transform(posts)
PostsByPostWords.shape

countvecC = CountVectorizer(vocabulary = vocab_listC, analyzer = (lambda lst:list(map((lambda s:s), lst))), min_df = 0)

#Make comments by words matrix- only comment words
CommentsByCommentWords = countvecC.fit_transform(comments)
CommentsByCommentWords.shape

KeyboardInterrupt: 

In [62]:
countvecP = CountVectorizer(vocabulary = vocab_listP, analyzer = (lambda lst:list(map((lambda s:s), lst))), min_df = 0)
countvecC = CountVectorizer(vocabulary = vocab_listC, analyzer = (lambda lst:list(map((lambda s:s), lst))), min_df = 0)

#Make post mapped to comments matrices
postsMatched = dfMatch["cleantext_x"].apply(lambda str: str.split()).tolist()
commentsMatched = dfMatch["cleantext_y"].apply(lambda str: str.split()).tolist()

#This matrix will be used for association rules, so these will use the two separate models
PostsMatchedByWords = countvecP.fit_transform(postsMatched)
PostsMatchedByWords.shape

CommentsMatchedByWords = countvecC.fit_transform(commentsMatched)
CommentsMatchedByWords.shape

(56469, 45621)

In [None]:
#make for both
PostsAndCommentsByWords = countvec.fit_transform(postsAndComments)
PostsAndCommentsByWords.shape

In [63]:
#Check that both shapes are equal to lists and each other
print(PostsMatchedByWords.shape[0] == len(postsMatched) == CommentsMatchedByWords.shape[0] == len(commentsMatched))


True


### Basic Matrix tests

After generating our matricies, we run some basic tests
to ensure that they seem resaonable later without having
to go through lengthy computations

In [64]:
#Check that PostsByWords is the number of Posts by the number of words
print(PostsByWords.shape[0] == len(posts))
#Same for comments
print(CommentsByWords.shape[0] == len(comments))


True
True


### Save Matricies

After generating our matricies, we save them so we can 
analyze them later without having to go through lengthy
computations.

In [65]:
save_object(PostsByWords,'matricies/', model_name + "-PostsByWords")
save_object(CommentsByWords,'matricies/', model_name + "-CommentsByWords")

In [None]:
save_object(CommentsByWords,'matricies/', model_name + "-CommentsByWords")

In [66]:
save_object(PostsMatchedByWords,'matricies/', model_name + "-PostsMatchedByWords")
save_object(CommentsMatchedByWords,'matricies/', model_name + "-CommentsMatchedByWords")

### Generate Word Clusters

Now that we have generated and saved our matricies,
we will proceed to generate word clusters using 
kmeans clustering, and save them for later analysis.

In [None]:
#fast way if u know num clusters you want
from sklearn.cluster import KMeans
num_clusters = 100
fit = []
kmeans = KMeans(n_clusters = num_clusters, random_state = 42).fit(WordsByFeatures)
save_object(kmeans, 'clusters/', model_name + "-words-cluster_model-" + str(num_clusters))
fit.append(kmeans.inertia_)

In [67]:
from sklearn.cluster import KMeans
# can take a long time
# get the fit for different values of K
test_points = [12] + list(range(25, 101, 25))
fit = []
for point in test_points:
    print(point)
    kmeans = KMeans(n_clusters = point, random_state = 42).fit(WordsByFeatures)
    save_object(kmeans, 'clusters/', model_name + "-words-cluster_model-" + str(point))
    fit.append(kmeans.inertia_)

12
25
50
75
100


In [68]:
#Create clusters for just posts
fitP = []
for point in test_points:
    print(point)
    kmeansP = KMeans(n_clusters = point, random_state = 42).fit(PostWordsByFeatures)
    save_object(kmeans, 'clusters/', model_name + "-post-words-cluster_model-" + str(point))
    fit.append(kmeans.inertia_)

12
25
50
75
100


In [69]:
#Create clusters for just posts
fitC = []
for point in test_points:
    print(point)
    kmeansC = KMeans(n_clusters = point, random_state = 42).fit(CommentWordsByFeatures)
    save_object(kmeans, 'clusters/', model_name + "-comment-words-cluster_model-" + str(point))
    fit.append(kmeans.inertia_)

12
25
50
75
100


In [70]:
save_object(fit, 'objects/', model_name + "-words" + "-fit")
save_object(fitP, 'objects/', model_name + "-post-words" + "-fit")
save_object(fitC, 'objects/', model_name + "-comment-words" + "-fit")
save_object(test_points, 'objects/', model_name + "-words" + "-test_points")
del fit
del test_points

### Create Months By Posts Matrix

Have to add time portion for comments 

In [None]:
#Did not do time element on responses, but it would be easy to implement!

In [None]:
#Create proper shape matrix with all 0s
MonthsByAllPosts = np.zeros( (df["month_no"].max()+1, len(df) ) )
for s in subreddits:
    subreddit_info[s]['MonthsByPosts'] = np.zeros( ( subreddit_info[s]['df']['month_no'].max()+1 , len(subreddit_info[s]['df'] ) ) )


In [None]:
print(MonthsByAllPosts.shape)
for s in subreddits:
    print(s)
    print(subreddit_info[s]["MonthsByPosts"].shape)

In [None]:
i = 0
for post in df.itertuples():
    MonthsByAllPosts[post[-1]][i] = 1
    i += 1
print(MonthsByAllPosts.shape)

for i in range(MonthsByAllPosts.shape[1]):
    assert MonthsByAllPosts[:,i].sum() == 1
    

In [None]:
for s in subreddits:
    i = 0
    for post in subreddit_info[s]['df'].itertuples():
        subreddit_info[s]['MonthsByPosts'][post[-1]][i] = 1
        i += 1

    print(subreddit_info[s]['MonthsByPosts'].shape)
    for i in range(subreddit_info[s]['MonthsByPosts'].shape[1]):
        assert subreddit_info[s]['MonthsByPosts'][:,i].sum() == 1

In [None]:
save_object(MonthsByAllPosts, 'matricies/', model_name + "-MonthsByAllPosts")

for s in subreddits:
    save_object(subreddit_info[s]['MonthsByPosts'], 'matricies/', model_name + "-" + subreddit_info[s]['abbr'] + "MonthsByPosts")



### Create Posts By Clusters Matrix 

Now we need to create the PostsByClusters matrix using the below equation:

PostsByWords X WordsByClusters = PostsByClusters

In [71]:
#Initialize a word clustering to use for all posts and comments
num_word_clusters = 100
kmeans = load_object('clusters/', model_name + '-words-cluster_model-' + str(num_word_clusters))

clusters = make_clustering_objects(model, kmeans, vocab_list, WordsByFeatures)

clusterWords = list(map(lambda x: list(map(lambda y: y[0] , x["word_list"])), clusters))

countvec = CountVectorizer(vocabulary = vocab_list, analyzer = (lambda lst:list(map((lambda s: s), lst))), min_df = 0)

#Create ClustersByWords matrix
ClusterByWords = countvec.fit_transform(clusterWords)

#Look at Dimensions
print(ClusterByWords.shape)

#Create WordsByClusters
WordsByClusters = ClusterByWords.transpose()
print(WordsByClusters.shape)
save_object(WordsByClusters, 'matricies/', model_name + "WordsByClusters-" + str(num_word_clusters) + 'clusters')

#Put into readable excel doc
wordsByC_df = pd.DataFrame(ClusterByWords)
filepath = 'excels/' + model_name + "-ClusterByWords-" + str(num_word_clusters) + 'clusters.xlsx'
wordsByC_df.to_excel(filepath, index = False)


(100, 68601)
(68601, 100)


In [72]:
#Create post words by clusters matrix
kmeansP = load_object('clusters/', model_name + '-post-words-cluster_model-' + str(num_word_clusters))

clustersP = make_clustering_objects(modelP, kmeansP, vocab_listP, PostWordsByFeatures)

PclusterWords = list(map(lambda x: list(map(lambda y: y[0] , x["word_list"])), clustersP))

countvecP = CountVectorizer(vocabulary = vocab_listP, analyzer = (lambda lst:list(map((lambda s: s), lst))), min_df = 0)

#Create ClustersByWords matrix
ClusterByPostWords = countvecP.fit_transform(PclusterWords)

#Look at Dimensions
print(ClusterByPostWords.shape)

#Create WordsByClusters
PostWordsByClusters = ClusterByPostWords.transpose()
print(PostWordsByClusters.shape)
save_object(PostWordsByClusters, 'matricies/', model_name + "PostWordsByClusters-" + str(num_word_clusters) + 'clusters')

#Put into readable excel doc
wordsByC_df = pd.DataFrame(ClusterByPostWords)
filepath = 'excels/' + model_name + "-ClusterByPostWords-" + str(num_word_clusters) + 'clusters.xlsx'
wordsByC_df.to_excel(filepath, index = False)


(100, 44155)
(44155, 100)


In [73]:
#Create clusters for just comments
kmeansC = load_object('clusters/', model_name + '-comment-words-cluster_model-' + str(num_word_clusters))

clustersC = make_clustering_objects(modelC, kmeansC, vocab_listC, CommentWordsByFeatures)

CclusterWords = list(map(lambda x: list(map(lambda y: y[0] , x["word_list"])), clustersC))

countvecC = CountVectorizer(vocabulary = vocab_listC, analyzer = (lambda lst:list(map((lambda s: s), lst))), min_df = 0)

#Create ClustersByWords matrix
ClusterByCommentWords = countvecC.fit_transform(CclusterWords)

#Look at Dimensions
print(ClusterByCommentWords.shape)

#Create WordsByClusters
CommentWordsByClusters = ClusterByCommentWords.transpose()
print(CommentWordsByClusters.shape)
save_object(CommentWordsByClusters, 'matricies/', model_name + "CommentWordsByClusters-" + str(num_word_clusters) + 'clusters')

#Put into readable excel doc
wordsByC_df = pd.DataFrame(ClusterByCommentWords)
filepath = 'excels/' + model_name + "-ClusterByCommentWords-" + str(num_word_clusters) + 'clusters.xlsx'
wordsByC_df.to_excel(filepath, index = False)


(100, 45621)
(45621, 100)


In [None]:
for i in range(100):
    for word in clusters[i]['word_list']:
        if word[0] == 'williams':
            print(i)

In [None]:
clusters[49]

In [74]:
# Create Posts By Clusters and comments by clusters through matrix multiplication
PostsByClusters = PostsByWords.dot(WordsByClusters.toarray())
CommentsByClusters = CommentsByWords.dot(WordsByClusters.toarray())

PostsByClusters = np.matrix(PostsByClusters)
CommentsByClusters = np.matrix(CommentsByClusters)

print(PostsByClusters.shape)
print(CommentsByClusters.shape)


(228962, 100)
(376604, 100)


In [75]:
PostsMatchedByClusters = PostsMatchedByWords.dot(PostWordsByClusters.toarray())
CommentsMatchedByClusters = CommentsMatchedByWords.dot(CommentWordsByClusters.toarray())

PostsMatchedByClusters = np.matrix(PostsMatchedByClusters)
CommentsMatchedByClusters = np.matrix(CommentsMatchedByClusters)

In [76]:
PCmatchedByClusters = np.hstack((PostsMatchedByClusters, CommentsMatchedByClusters))

In [77]:
save_object(PostsByClusters, 'matricies/', model_name + "-PostsByClusters-" + str(num_word_clusters) + 'clusters')
save_object(CommentsByClusters, 'matricies/', model_name + "-CommentsByClusters-" + str(num_word_clusters) + 'clusters')

In [None]:
save_object(PostsAndCommentsByClusters,'matricies/', model_name + "-PostsAndCommentsByClusters-" + str(num_word_clusters) + 'clusters')

for s in subreddits:
    save_object(subreddit_info[s]["PostsByClusters"], 'matricies/', model_name + '-' + subreddit_info[s]["abbr"] + "PostsByClusters-" + str(num_word_clusters) + 'clusters')
    save_object(subreddit_info[s]["CommentsByClusters"], 'matricies/', model_name + '-' + subreddit_info[s]["abbr"] + "CommentsByClusters-" + str(num_word_clusters) + 'clusters')
    save_object(subreddit_info[s]["PostsAndCommentsByClusters"], 'matricies/', model_name + '-' + subreddit_info[s]["abbr"] + "PostsAndCommentsByClusters-" + str(num_word_clusters) + 'clusters')

In [78]:
save_object(PostsMatchedByClusters, 'matricies/', model_name + "-PostMatchedsByClusters-" + str(num_word_clusters) + 'clusters')
save_object(CommentsMatchedByClusters, 'matricies/', model_name + "-CommentsMatchedByClusters-" + str(num_word_clusters) + 'clusters')

In [79]:
save_object(PCmatchedByClusters, 'matricies/', model_name + "-PCMatchedByClusters-" + str(num_word_clusters) + 'clusters')

# Linear Regression of Comment Scores

In [80]:
CommentsByClusters = load_object('matricies/', model_name + "-CommentsByClusters-" + str(num_word_clusters) + 'clusters')

In [81]:
#Make comment by score matrix
CommentsByScore = np.matrix(scores).T
CommentsByScore.shape
save_object(CommentsByScore, 'matricies/', model_name + "-CommentsByScore-" + str(num_word_clusters) + 'clusters')

## TFIDF
We did not use this part or the later trends over time part for the responses.

In [None]:
#Add column on just 1s to CommentByClusters for B0 value
n,m = CommentsByClusters.shape 
X0 = np.ones((n,1))
CommentsByClustersOnes = np.hstack((X0, CommentsByClusters))
CommentsByClustersOnes.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvec = TfidfVectorizer(vocabulary = vocab_list, analyzer = (lambda lst:list(map((lambda s:s), lst))), min_df = 0)

# Make Posts By Words Matrix
TFIDFPostsByWords = tfidfvec.fit_transform(posts)
TFIDFPostsByWords.shape
TFIDFCommentsByWords = tfidfvec.fit_transform(comments)
TFIDFCommentsByWords.shape
TFIDFPostsAndCommentsByWords = tfidfvec.fit_transform(postsAndComments)
TFIDFPostsAndCommentsByWords.shape


In [None]:
#Create TFIDF Posts By Clusters through matrix multiplication
TFIDFPostsByClusters = TFIDFPostsByWords.dot(WordsByClusters.toarray())
TFIDFPostsByClusters = np.matrix(TFIDFPostsByClusters)
TFIDFCommentsByClusters = TFIDFCommentsByWords.dot(WordsByClusters.toarray())
TFIDFCommentsByClusters = np.matrix(TFIDFCommentsByClusters)
TFIDFPostsAndCommentsByClusters = TFIDFPostsAndCommentsByWords.dot(WordsByClusters.toarray())
TFIDFPostsAndCommentsByClusters = np.matrix(TFIDFPostsAndCommentsByClusters)

print(TFIDFPostsByClusters.shape)

In [None]:
x = TFIDFPostsByClusters[:,36]
x = np.asarray(x.transpose())
x = x[0]
idx = np.argsort(x)
idx[-10:]

In [None]:
x[838]

In [None]:
for i in idx[-10:]:
    print(i)
    print(df.iloc[i]['rawtext'])
    print()

In [None]:
sorted( range(len(x)), key = lambda i: x[i], reverse = True)[:10]

### Create Months By Clusters Matrix 

Now we need to create the MonthsByClusters matrix using the below equation:

MonthsByPosts X PostsByClusters = MonthsByClusters

In [None]:
#Create MonthsByClusters matrix through matrix multiplication
ALL_MonthsByClusters = MonthsByAllPosts.dot(PostsByClusters)
ALL_MonthsByClusters.shape

In [None]:
for s in subreddits:
    subreddit_info[s]["MonthsByClusters"] = subreddit_info[s]["MonthsByPosts"].dot(subreddit_info[s]["PostsByClusters"])
    print(s)
    print(subreddit_info[s]["MonthsByClusters"].shape)

In [None]:
save_object(ALL_MonthsByClusters, 'matricies/', model_name + "-ALL_MonthsByClusters-" + str(num_word_clusters) + 'clusters')
for s in subreddits:
    save_object(subreddit_info[s]["MonthsByClusters"], 'matricies/', model_name + "-" + subreddit_info[s]["abbr"] + "_MonthsByClusters-" + str(num_word_clusters) + 'clusters')


### Create Months By Clusters Matrix Normalizations

Normalize the Months By Clusters matrix

In [None]:
amXc_df = pd.DataFrame(ALL_MonthsByClusters)
filepath = 'excels/' + model_name + "-ALL_MonthsByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
amXc_df.to_excel(filepath, index = False)

In [None]:
ALL_L2A1MonthsByClusters = normalize(ALL_MonthsByClusters, norm='l2', axis = 1)
save_object(ALL_L2A1MonthsByClusters, 'matricies/', model_name + "-ALL_L2A1MonthsByClusters-" + str(num_word_clusters) + 'clusters')
L2A1amXc_df = pd.DataFrame(ALL_L2A1MonthsByClusters)
filepath = 'excels/' + model_name + "-ALL_L2A1MonthsByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
L2A1amXc_df.to_excel(filepath, index = False)

In [None]:
for s in subreddits:
    mXc_df = pd.DataFrame(subreddit_info[s]["MonthsByClusters"])
    filepath = 'excels/' + model_name + '-' + subreddit_info[s]['abbr'] + "-MonthsByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
    mXc_df.to_excel(filepath, index = False)

In [None]:
for s in subreddits:
    L2A1Normalized = normalize(subreddit_info[s]["MonthsByClusters"], norm = 'l2', axis = 1)
    save_object(L2A1Normalized, 'matricies/', model_name+'-'+subreddit_info[s]['abbr']+'-L2A1MonthsByClusters-'+str(num_word_clusters)+'clusters' )
    NmXc_df = pd.DataFrame(L2A1Normalized)
    filepath = 'excels/'+model_name+'-'+subreddit_info[s]['abbr']+'-L2A1MonthsByClusters'+str(num_word_clusters)+'clusters.xlsx'
    NmXc_df.to_excel(filepath, index = False)

### Create Weeks By Posts Matrix

In [None]:
#Create proper shape matrix with all 0s
WeeksByAllPosts = np.zeros( (df["week_no"].max()+1, len(df) ) )
for s in subreddits:
    subreddit_info[s]['WeeksByPosts'] = np.zeros( ( subreddit_info[s]['df']['week_no'].max()+1 , len(subreddit_info[s]['df'] ) ) )


In [None]:
print(WeeksByAllPosts.shape)
for s in subreddits:
    print(s)
    print(subreddit_info[s]["WeeksByPosts"].shape)

In [None]:
i = 0
for post in df.itertuples():
    WeeksByAllPosts[post[-2]][i] = 1
    i += 1

print(WeeksByAllPosts.shape)
for i in range(WeeksByAllPosts.shape[1]):
    assert WeeksByAllPosts[:,i].sum() == 1

In [None]:
for s in subreddits:
    i = 0
    for post in subreddit_info[s]['df'].itertuples():
        subreddit_info[s]['WeeksByPosts'][post[-2]][i] = 1
        i += 1

    print(subreddit_info[s]['WeeksByPosts'].shape)
    for i in range(subreddit_info[s]['WeeksByPosts'].shape[1]):
        assert subreddit_info[s]['WeeksByPosts'][:,i].sum() == 1

In [None]:
save_object(WeeksByAllPosts, 'matricies/', model_name + "-WeeksByAllPosts")

for s in subreddits:
    save_object(subreddit_info[s]['WeeksByPosts'], 'matricies/', model_name + "-" + subreddit_info[s]['abbr'] + "WeeksByPosts")



### Create Weeks By Clusters Matrix 

Now we need to create the WeeksByClusters matrix using the below equation:

WeeksByPosts X PostsByClusters = WeeksByClusters

In [None]:
#Create MonthsByClusters matrix through matrix multiplication
ALL_WeeksByClusters = WeeksByAllPosts.dot(allPostsByClusters)
ALL_WeeksByClusters.shape

In [None]:
for s in subreddits:
    subreddit_info[s]["WeeksByClusters"] = subreddit_info[s]["WeeksByPosts"].dot(subreddit_info[s]["PostsByClusters"])
    print(s)
    print(subreddit_info[s]["WeeksByClusters"].shape)

In [None]:
save_object(ALL_WeeksByClusters, 'matricies/', model_name + "-ALL_WeeksByClusters-" + str(num_word_clusters) + 'clusters')
for s in subreddits:
    save_object(subreddit_info[s]["WeeksByClusters"], 'matricies/', model_name + "-" + subreddit_info[s]["abbr"] + "_WeeksByClusters-" + str(num_word_clusters) + 'clusters')


### Create Weeks By Clusters Matrix Normalizations

Normalize the Weeks By Clusters matrix

In [None]:
awXc_df = pd.DataFrame(ALL_WeeksByClusters)
filepath = 'excels/' + model_name + "-ALL_WeeksByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
awXc_df.to_excel(filepath, index = False)

In [None]:
ALL_L2A1WeeksByClusters = normalize(ALL_WeeksByClusters, norm='l2', axis = 1)
save_object(ALL_L2A1WeeksByClusters, 'matricies/', model_name + "ALL_L2A1WeeksByClusters-" + str(num_word_clusters) + 'clusters')
L2A1awXc_df = pd.DataFrame(ALL_L2A1WeeksByClusters)
filepath = 'excels/' + model_name + "-ALL_L2A1WeeksByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
L2A1awXc_df.to_excel(filepath, index = False)

In [None]:
for s in subreddits:
    wXc_df = pd.DataFrame(subreddit_info[s]["WeeksByClusters"])
    filepath = 'excels/' + model_name + '-' + subreddit_info[s]['abbr'] + "-WeeksByClusters-" + str(num_word_clusters) + 'clusters.xlsx'
    wXc_df.to_excel(filepath, index = False)

In [None]:
for s in subreddits:
    L2A1Normalized = normalize(subreddit_info[s]["WeeksByClusters"], norm = 'l2', axis = 1)
    save_object(L2A1Normalized, 'matricies/', model_name+'-'+subreddit_info[s]['abbr']+'-L2A1WeeksByClusters-'+str(num_word_clusters)+'clusters' )
    NwXc_df = pd.DataFrame(L2A1Normalized)
    filepath = 'excels/'+model_name+'-'+subreddit_info[s]['abbr']+'-L2A1WeeksByClusters'+str(num_word_clusters)+'clusters.xlsx'
    NwXc_df.to_excel(filepath, index = False)

In [None]:
len(df)

In [None]:
len(dep_df)

In [None]:
len(sw_df)

In [None]:
len(an_df)

In [None]:
len(dep_df) + len(sw_df) + len(an_df)

In [None]:
depPostsByWords

In [None]:
in_dep = 0
not_in_dep = 0
for i in range(depPostsByWords.shape[1]):
    if i % 1000 == 0:
            print((i / 90335) * 100)
            print("in_dep: ", in_dep)
            print("not_in_dep: ", not_in_dep)
    if depPostsByWords[:,i].sum() < 10:
        not_in_dep += 1
    else:
        in_dep += 1

In [None]:
print(in_dep)
print(not_in_dep)
print(in_dep + not_in_dep)

In [None]:
in_sw = 0
not_in_sw = 0
for i in range(swPostsByWords.shape[1]):
    if i % 1000 == 0:
            print((i / 90335) * 100)
            print("in_sw: ", in_sw)
            print("not_in_sw: ", not_in_sw)
    if swPostsByWords[:,i].sum() < 10:
        not_in_sw += 1
    else:
        in_sw += 1


In [None]:
print(in_sw)
print(not_in_sw)
print(in_sw + not_in_sw)

In [None]:
in_an = 0
not_in_an = 0
for i in range(anPostsByWords.shape[1]):
    if i % 1000 == 0:
            print((i / 90335) * 100)
            print("in_an: ", in_an)
            print("not_in_an: ", not_in_an)
    if anPostsByWords[:,i].sum() < 10:
        not_in_an += 1
    else:
        in_an += 1
        

In [None]:
print(in_an)
print(not_in_an)
print(in_an + not_in_an)

In [None]:
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
    if depPostsByWords[:,i].sum() < 10:
        print(vocab_list[i])

In [None]:
for word in vocab_list:
    if model.wv.vocab[word].count < 10:
        print(word)

In [None]:
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
    if allPostsByWords[:,i].sum() < 10:
        print(vocab_list[i])

In [None]:
cnt = 0
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
        print(cnt)
    if depPostsByWords[:,i].sum() < 10:
        cnt += 1
        #print(vocab_list[i])
print("cnt  ", cnt)

cnt = 0
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
        print(cnt)
    if swPostsByWords[:,i].sum() < 10:
        cnt += 1
        #print(vocab_list[i])
print("cnt  ", cnt)

In [None]:
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
    if anPostsByWords[:,i].sum() < 10:
        print(vocab_list[i])

In [None]:
cnt = 0
for i in range(len(vocab_list)):
    if i % 1000 == 0:
        print("PROGRESS: ", i/90335)
        print(cnt)
    if anPostsByWords[:,i].sum() < 10:
        cnt += 1
        #print(vocab_list[i])
print("cnt  ", cnt)

In [None]:
len(vocab_list) - cnt