# Create a Natural Language Processing model for the topics in a given subreddit

## Load packages

In [211]:
import os.path
from os import path
import sys
import ciso8601
import time
import datetime 
import requests
import json
import csv
import praw
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import gensim
import gensim.corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import common_corpus
from gensim.test.utils import datapath
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import sklearn
import sklearn.model_selection as skmodsel
import sklearn.linear_model as sklinmod
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import multiprocessing as mp
import pickle
import warnings; warnings.simplefilter('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dmartizzi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data from a pickle file (temporary)

In [212]:
#subrname = "politicaldiscussion"
#subrname = "history"
subrname = "quotes"
#subrname = "changemyview"

#inpath = "./"
#fname = "dump_r-"+subrname+"_2019-09-12.pkl"
inpath = "./subreddit_data/"
fname = inpath+"r-"+subrname+"-export.pkl"
dfraw = pd.read_pickle(fname)

# Filter out removed posts
sel = dfraw["selftext"].str.strip() == "[removed]"
dfraw["selftext"][sel] = ""
dfraw["title+selftext"] = dfraw["title"]+" "+dfraw["selftext"]

# Filter out posts with less than 3 words
dfraw["n_words"] = dfraw["title+selftext"][:].str.split()
for i in range(0,len(dfraw["n_words"])):
    dfraw["n_words"][i] = len(dfraw["n_words"][i])

sel = dfraw["n_words"] > 3
df = dfraw[sel]    

df.head(10)


Unnamed: 0,subreddit,sub_id,title,author,created,url,permalink,score,numComms,flair,selftext,upvote_ratio,title+selftext,n_words
0,quotes,abh541,“May All Your Troubles Last As Long As Your Ne...,WildeAquarius,2019-01-01 02:15:10,https://www.reddit.com/r/quotes/comments/abh54...,/r/quotes/comments/abh541/may_all_your_trouble...,1,0,,,0.88,“May All Your Troubles Last As Long As Your Ne...,15
1,quotes,abh956,"""The penalty for laughing in a courtroom is si...",ScrambledShow,2019-01-01 02:37:19,https://www.reddit.com/r/quotes/comments/abh95...,/r/quotes/comments/abh956/the_penalty_for_laug...,1,0,,,0.94,"""The penalty for laughing in a courtroom is si...",30
2,quotes,abh957,"""Man is a clever animal who behaves like an im...",ScrambledShow,2019-01-01 02:37:19,https://www.reddit.com/r/quotes/comments/abh95...,/r/quotes/comments/abh957/man_is_a_clever_anim...,1,0,,,0.96,"""Man is a clever animal who behaves like an im...",13
3,quotes,abh95a,"""Beauty is in the eye of the beholder and it m...",ScrambledShow,2019-01-01 02:37:19,https://www.reddit.com/r/quotes/comments/abh95...,/r/quotes/comments/abh95a/beauty_is_in_the_eye...,1,0,,,0.98,"""Beauty is in the eye of the beholder and it m...",30
4,quotes,abh95c,"""To say that a man is made up of certain chemi...",ScrambledShow,2019-01-01 02:37:20,https://www.reddit.com/r/quotes/comments/abh95...,/r/quotes/comments/abh95c/to_say_that_a_man_is...,1,0,,,0.93,"""To say that a man is made up of certain chemi...",31
5,quotes,abhl67,"""If we confine something to one place, it will...",i-Wayfarer,2019-01-01 03:39:51,https://www.reddit.com/r/quotes/comments/abhl6...,/r/quotes/comments/abhl67/if_we_confine_someth...,1,0,,,0.93,"""If we confine something to one place, it will...",39
6,quotes,abhwat,"""We become what we think about."" - Earl Nighti...",finneganishome,2019-01-01 04:37:14,https://www.reddit.com/r/quotes/comments/abhwa...,/r/quotes/comments/abhwat/we_become_what_we_th...,1,1,,,0.96,"""We become what we think about."" - Earl Nighti...",9
7,quotes,abhyke,"If you don't like something, change it. If you...",osamanasim,2019-01-01 04:48:54,https://www.reddit.com/r/quotes/comments/abhyk...,/r/quotes/comments/abhyke/if_you_dont_like_som...,1,0,,,0.82,"If you don't like something, change it. If you...",19
8,quotes,abhzmc,"""All straight men are into traps - some are ju...",DiSyndra,2019-01-01 04:54:28,https://www.reddit.com/r/quotes/comments/abhzm...,/r/quotes/comments/abhzmc/all_straight_men_are...,1,0,,,0.57,"""All straight men are into traps - some are ju...",15
9,quotes,abi7yh,"Hold fast to dreams, for if dreams die, life i...",Letitgo23607,2019-01-01 05:34:15,https://www.reddit.com/r/quotes/comments/abi7y...,/r/quotes/comments/abi7yh/hold_fast_to_dreams_...,1,0,,,0.9,"Hold fast to dreams, for if dreams die, life i...",19


## Preprocess the submissions 

In [213]:
def lemmatize_stemming(text):
    '''Function to lemmatize text'''
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [214]:
def preprocess(text):
    '''Function to pre-process text'''
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [215]:
# Preprocess titles 
processed_subm = df["title+selftext"].map(preprocess)

In [216]:
# Create Bag of Words 
dictionary = gensim.corpora.Dictionary(processed_subm)

# Filter out irrelevant words 
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=5000)

# For each tweet, create a dictionary reporting how many
# words and how many times those words appear.
bow_corpus = [dictionary.doc2bow(subm) for subm in processed_subm]

# Experimental: load LDA model or create it if it doesn't exist

In [217]:
if path.exists("lda_models"):
    print("Directory lda_models exists!")
else:
    !mkdir lda_models
    print("Directory lda_models created!")

Directory lda_models exists!


In [218]:
# Choose number of topics for LDA
num_topics_lda = 10

In [219]:
outpath = "./lda_models/"
ldaoutfile = outpath+"lda_model_"+str(num_topics_lda)+"topics_r-"+subrname
if path.exists(ldaoutfile):
    print(ldaoutfile+"model exists!")
    lda_model = gensim.models.LdaModel.load(ldaoutfile)
else :
    # Create Latent Dirichlet Allocation model with a given number of topics
    ncores = mp.cpu_count()
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=num_topics_lda, id2word=dictionary, \
                passes=10, workers=ncores)
    # Save LDA model to disc (it's expensive to regenerate)
    lda_model.save(ldaoutfile)

## Perform exploratory data analysis with the LDA model

In [220]:
def get_lda_topics(model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20)
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)

In [221]:
lda_topics_df = get_lda_topics(lda_model, num_topics_lda)
lda_topics_df.head(10)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,quot,good,love,life,live,think,world,peopl,life,love
1,know,face,good,want,life,great,peopl,time,lose,heart
2,http,life,thing,http,quot,time,happi,go,fear,peopl
3,time,best,like,quot,year,mean,http,like,know,know
4,come,peopl,peopl,html,creat,peopl,want,think,mean,chang
5,think,better,look,blogspot,great,make,thing,thing,thing,life
6,feel,unknown,hate,lifehealthrelax,experi,control,youtu,know,hand,world
7,best,idea,person,chang,time,want,know,happi,peopl,fall
8,human,light,time,thing,world,world,better,teach,think,dream
9,kind,shoot,quot,like,understand,like,quot,truth,robert,mind


In [222]:
# Test LDA model
test_strings = {"test_strings" : list(df["title+selftext"].values)}
df_test_strings = pd.DataFrame(data = test_strings)

# Same procedure as above
processed_test_strings = df_test_strings["test_strings"].map(preprocess)
test_corpus = [dictionary.doc2bow(text) for text in processed_test_strings]
raw = lda_model[test_corpus]
shape = (len(raw),num_topics_lda)
predicted_topic_lda = np.zeros(shape)
for i,tups in enumerate(raw):
    for j in range(0,len(tups)):
        predicted_topic_lda[i,j] = tups[j][1]
    #print(df_test_strings["test_strings"].iloc[i],predicted_topic_lda[i],"\n")

# Experimental: load NMF model or create it if it doesn't exist

In [223]:
if path.exists("nmf_models"):
    print("Directory nmf_models exists!")
else:
    !mkdir nmf_models
    print("Directory nmf_models created!")

Directory nmf_models exists!


In [224]:
# Number of topics for NMF
num_topics_nmf = 10

In [225]:
# Create sentences
processed_subm_sentences = [' '.join(text) for text in processed_subm]

# Word counts
vectorizer = CountVectorizer(analyzer='word', max_features=5000)
x_counts = vectorizer.fit_transform(processed_subm_sentences)

# TF-IDF transform
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)

# Normalize to unit length
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [226]:
outpath = "./nmf_models/"
nmfoutfile = outpath+"nmf_model_"+str(num_topics_lda)+"topics_r-"+subrname
if path.exists(nmfoutfile):
    print(nmfoutfile+"model exists!")
    nmf_model = pickle.load(open(nmfoutfile, 'rb'))
else :
    # Create NMF model.
    nmf_model = NMF(n_components=num_topics_nmf,init='nndsvd')
    # Fit the model
    nmf_model.fit(xtfidf_norm)
    # Save NMF model to disc (it's expensive to regenerate)
    pickle.dump(nmf_model, open(nmfoutfile, 'wb'))


In [227]:
def get_nmf_topics(model, n_top_words, num_topics):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words
    
    return pd.DataFrame(word_dict)

In [228]:
nmf_topics_df = get_nmf_topics(nmf_model, 10, num_topics_nmf)
nmf_topics_df.head(10)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,want,quot,life,love,know,think,thing,live,happi,peopl
1,time,best,chang,fall,go,chang,good,right,make,unknown
2,need,inspir,death,hate,say,care,mean,learn,money,like
3,georg,motiv,short,heart,person,start,beauti,dream,depend,world
4,world,help,go,william,wisdom,littl,best,west,need,feel
5,chang,http,creat,croft,truth,time,happen,bear,unhappi,tell
6,heaven,famou,experi,fear,true,wrong,chang,die,pursuit,time
7,person,say,easi,need,knowledg,world,better,tomorrow,birthday,believ
8,everybodi,look,like,save,come,cooley,littl,forev,famili,chang
9,help,rememb,time,feel,understand,see,great,nietzsch,sagan,care


In [229]:
from scipy import sparse

# Test NMF model
test_strings = {"test_strings" : df["title+selftext"].values}
df_test_strings = pd.DataFrame(data = test_strings)

# Same procedure as above
processed_test_strings = df_test_strings["test_strings"].map(preprocess)
test_sentences = [' '.join(text) for text in processed_test_strings]
x_test_counts = vectorizer.transform(test_sentences)
x_test_tfidf = transformer.transform(x_test_counts)
xtfidf_test_norm = normalize(x_test_tfidf, norm='l1', axis=1)

y = nmf_model.transform(xtfidf_test_norm)
predicted_topic_nmf = normalize(y, norm='l1', axis=1)
#for i in range(0,len(predicted_topic_nmf)):
#    print(df_test_strings["test_strings"].iloc[i],predicted_topic_nmf[i])

# Experimental: perform regression to predict popularity and controversiality

## Setup the Popularity and Controversiality 

In [230]:
# Build labels and features

features = predicted_topic_nmf

# Popularity of a post
p = [0] * len(df["numComms"].values)
popular = np.array(p); del p
cutoff_pop = np.quantile(popularity,q=0.5)
selp0 = df["numComms"].values > cutoff_pop
popular[selp0] = 1

# Controversiality of a post
c = [0] * len(df["upvote_ratio"].values)
controversial = np.array(c); del c
selc0 = np.logical_and(df["upvote_ratio"].values.astype(float) > 0.25, 
                    df["upvote_ratio"].values.astype(float) < 0.75)
controversial[selc0] = 1
selc1 = np.logical_or(df["upvote_ratio"].values.astype(float) < 0.25, 
                    df["upvote_ratio"].values.astype(float) > 0.75)


print("Fraction of Popular Posts = ",float(len(popular[selp0])/len(popular)))
print("Fraction of Controversial Posts = ",float(len(controversial[selc0])/len(controversial)))

Fraction of Popular Posts =  0.11686968512985521
Fraction of Controversial Posts =  0.19834520799816135


## Split in training, testing and validation sets

In [231]:
test_size = 0.2
validation_size = 0.2
train_size = 1.0 - test_size - validation_size

X, featurep_test, Y, popular_test = \
    skmodsel.train_test_split(features,popular,test_size=test_size)
featurep_train, featurep_validation, popular_train, popular_validation = \
    skmodsel.train_test_split(features,popular,test_size=validation_size/(1.0-test_size))

X, featurec_test, Y, controversial_test = \
    skmodsel.train_test_split(features,controversial,test_size=test_size)
featurec_train, featurec_validation, controversial_train, controversial_validation = \
    skmodsel.train_test_split(features,controversial,test_size=validation_size/(1.0-test_size))

del X,Y

## Create and train logistic regression models

In [232]:
popular_logregmodel = popular_logregmodel.fit(featurep_train,popular_train)
controversial_logregmodel = controversial_logregmodel.fit(featurec_train,controversial_train)

print("Logistic regression for Popularity. Score on Training Set = ", \
      popular_logregmodel.score(featurep_train,popular_train))
print("Logistic regression for Popularity. Score on Test Set = ", \
      popular_logregmodel.score(featurep_test,popular_test))
print("Logistic regression for Popularity. Score on Validation Set = ", \
      popular_logregmodel.score(featurep_validation,popular_validation),"\n")

print("Logistic regression for Controversiality. Score  on Training Set = ", \
      controversial_logregmodel.score(featurec_train,controversial_train))
print("Logistic regression for Controversiality. Score  on Training Set = ", \
      controversial_logregmodel.score(featurec_test,controversial_test))
print("Logistic regression for Controversiality. Score  on Training Set = ", \
      controversial_logregmodel.score(featurec_validation,controversial_validation),"\n")

Logistic regression for Popularity. Score on Training Set =  0.8844710028345975
Logistic regression for Popularity. Score on Test Set =  0.8822177535191037
Logistic regression for Popularity. Score on Validation Set =  0.8791082509767869 

Logistic regression for Controversiality. Score  on Training Set =  0.8015015705201869
Logistic regression for Controversiality. Score  on Training Set =  0.7997701809824763
Logistic regression for Controversiality. Score  on Training Set =  0.8021144564467938 



# Production model creation and optimization 