# Create a Natural Language Processing model for the topics in a given subreddit

## Load packages

In [10]:
import csv
import datetime
import json
import multiprocessing as mp
import os
import time

import ciso8601
import gensim
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import praw
import requests
import seaborn as sns
from gensim.parsing.preprocessing import STOPWORDS
from gensim.test.utils import common_corpus
from gensim.test.utils import datapath
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy import stats

nltk.download('wordnet')
import warnings; warnings.simplefilter('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dmartizzi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data from a pickle file (temporary)

In [119]:
# Subreddit to analyse; switch by uncommenting one of the alternatives
#subrname = "politicaldiscussion"
#subrname = "history"
#subrname = "quotes"
subrname = "changemyview"

#inpath = "./"
#fname = "dump_r-"+subrname+"_2019-09-12.pkl"
inpath = "./subreddit_data/"
fname = inpath+"r-"+subrname+"-export.pkl"
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files
df = pd.read_pickle(fname)

# Blank out bodies that consist only of the "[removed]" placeholder.
# Use a single .loc assignment: the chained form df["selftext"][sel] = ""
# may write to a temporary copy and leave df unchanged.
sel = df["selftext"].str.strip() == "[removed]"
df.loc[sel, "selftext"] = ""

# Text fed to the topic model: title followed by the (possibly empty) body
df["title+selftext"] = df["title"]+" "+df["selftext"]
display(df)


Unnamed: 0,subreddit,sub_id,title,author,created,url,permalink,score,numComms,flair,selftext,upvote_ratio,title+selftext
0,changemyview,abghnz,CMV: Asian hetero men should die out alone sin...,stupidattraction,2019-01-01 00:12:10,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abghnz/cmv_asian_hete...,1,0,,,1,CMV: Asian hetero men should die out alone sin...
1,changemyview,abgy7x,"CMV: Affirmative action makes sense in theory,...",showershortz,2019-01-01 01:37:48,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abgy7x/cmv_affirmativ...,1,8,,I preface this by saying I am from the United ...,0.77,"CMV: Affirmative action makes sense in theory,..."
2,changemyview,abgyqc,CMV: Scientology is a cult and helps no one.,SightWithoutEyes,2019-01-01 01:40:31,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abgyqc/cmv_scientolog...,1,0,,,0.75,CMV: Scientology is a cult and helps no one.
3,changemyview,abh0k4,"At 41, I’m finally old enough to appreciate da...",ciciohme2,2019-01-01 01:50:12,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abh0k4/at_41_im_final...,1,0,,,1,"At 41, I’m finally old enough to appreciate da..."
4,changemyview,abh6d6,"CMV: Men not only can live without women, but ...",Bad_Company173,2019-01-01 02:22:27,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abh6d6/cmv_men_not_on...,1,15,,"In this day and age, being single or celibate-...",0.24,"CMV: Men not only can live without women, but ..."
5,changemyview,abhfsc,CMV: People who earn more should pay less taxe...,eagleye101,2019-01-01 03:11:09,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abhfsc/cmv_people_who...,1,43,,*Disclosure: I don' consider my self rich alth...,0.27,CMV: People who earn more should pay less taxe...
6,changemyview,abhglk,CMV: The Left in its present form can't be rec...,VirileMember,2019-01-01 03:15:07,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abhglk/cmv_the_left_i...,1,60,,As long as the Left defines policies as racist...,Nan,CMV: The Left in its present form can't be rec...
7,changemyview,abhqys,"CMV: In this post, I am going to offer a funda...",adamski4554,2019-01-01 04:10:21,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abhqys/cmv_in_this_po...,1,4,,So.\nYou've all heard about the four elements ...,0.5,"CMV: In this post, I am going to offer a funda..."
8,changemyview,abhss0,CMV: You should not have a right to bear arms,blindpacemaker,2019-01-01 04:18:56,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abhss0/cmv_you_should...,1,394,,\n\nConvince me that you have a good enough r...,0.57,CMV: You should not have a right to bear arms ...
9,changemyview,abi3yg,I should be about to make suicide joke because...,Bung-Motor,2019-01-01 05:14:59,https://www.reddit.com/r/changemyview/comments...,/r/changemyview/comments/abi3yg/i_should_be_ab...,1,0,,,1,I should be about to make suicide joke because...


## Preprocess the submissions 

In [120]:
# Build the stemmer and the lemmatizer once instead of on every call:
# re-creating both objects for each token is wasted work.
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    '''Lemmatize a token as a verb, then reduce the lemma to its Porter stem.

    Parameters
    ----------
    text : str
        The token to normalize.

    Returns
    -------
    str
        Porter stem of the verb lemma of `text`.
    '''
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

In [121]:
def preprocess(text):
    '''Tokenize `text` and return the normalized tokens worth keeping.

    Tokens produced by gensim's simple_preprocess are dropped when they are
    gensim stopwords or are 3 characters or shorter; every remaining token is
    passed through lemmatize_stemming. Returns a list, in token order.
    '''
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(tok)
        for tok in tokens
        if not (tok in stopwords or len(tok) <= 3)
    ]

In [122]:
# Preprocess every submission: the "title+selftext" column (title plus body)
# is mapped through preprocess, giving one list of tokens per row.
processed_subm = df["title+selftext"].map(preprocess)

In [123]:
# Build the vocabulary (gensim Dictionary) from the tokenized submissions
dictionary = gensim.corpora.Dictionary(processed_subm)

# Prune the vocabulary with gensim's filter_extremes thresholds
# (minimum document count, maximum document fraction, vocabulary size cap)
dictionary.filter_extremes(
    no_below=10,
    no_above=0.5,
    keep_n=50000,
)

# Bag of words: convert each submission's token list with doc2bow,
# one entry per submission, in the same order as processed_subm.
bow_corpus = list(map(dictionary.doc2bow, processed_subm))

In [124]:
# Create Latent Dirichlet Allocation model with a given number of topics
ncores = mp.cpu_count()
# gensim recommends using one worker fewer than the number of cores,
# while never going below a single worker.
nworkers = max(1, ncores - 1)
ntopics = 10
# Fixed seed so that re-running the notebook starts training from the same
# random state. NOTE(review): multi-process training may still not be
# bit-for-bit reproducible -- confirm if exact topics matter.
lda_seed = 42
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=ntopics,
    id2word=dictionary,
    passes=10,
    workers=nworkers,
    random_state=lda_seed,
)

In [125]:
# Show each topic of the model (-1 requests all of them) with its weighted words
all_topics = lda_model.print_topics(-1)
for idx, topic in all_topics:
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"peopl" + 0.010*"like" + 0.008*"think" + 0.008*"know" + 0.007*"time" + 0.007*"religion" + 0.007*"thing" + 0.006*"sport" + 0.006*"christian" + 0.005*"good"
Topic: 1 
Words: 0.022*"reddit" + 0.018*"peopl" + 0.017*"http" + 0.017*"like" + 0.012*"changemyview" + 0.012*"comment" + 0.010*"rule" + 0.010*"messag" + 0.010*"thing" + 0.010*"movi"
Topic: 2 
Words: 0.021*"game" + 0.018*"like" + 0.009*"music" + 0.008*"play" + 0.008*"time" + 0.007*"video" + 0.006*"peopl" + 0.006*"think" + 0.006*"good" + 0.006*"watch"
Topic: 3 
Words: 0.013*"peopl" + 0.010*"money" + 0.008*"compani" + 0.008*"work" + 0.007*"govern" + 0.006*"product" + 0.006*"need" + 0.005*"chang" + 0.005*"cost" + 0.005*"like"
Topic: 4 
Words: 0.015*"vote" + 0.015*"trump" + 0.013*"http" + 0.010*"democrat" + 0.008*"polit" + 0.008*"elect" + 0.008*"peopl" + 0.008*"parti" + 0.008*"like" + 0.007*"presid"
Topic: 5 
Words: 0.028*"peopl" + 0.021*"women" + 0.016*"gender" + 0.016*"white" + 0.011*"black" + 0.011*"like" + 0.009

In [126]:
# Save LDA model to disk (it's expensive to regenerate)
outpath = "./lda_models/"
# Create the output folder up front so the save cannot fail on a missing
# directory after the expensive training step.
os.makedirs(outpath, exist_ok=True)
# os.path.join avoids the doubled "/" that plain concatenation produced
ldaoutfile = os.path.join(outpath, "lda_model_"+str(ntopics)+"topics_r-"+subrname)
lda_model.save(ldaoutfile)

## Perform EDA

In [127]:
# Load LDA model from disk (it's expensive to regenerate)
try:
    lda_model
except NameError:
    # Only an undefined lda_model triggers the fallback; any other error
    # is allowed to surface instead of being silently swallowed.
    # NOTE(review): the fallback reads a "_test" model for another subreddit,
    # not the file written by the save cell -- confirm this is intended.
    subrname = "politicaldiscussion"
    ntopics = 10
    outpath = "./lda_models/"
    ldaoutfile = outpath+"lda_model_"+str(ntopics)+"topics_r-"+subrname+"_test"
    lda_model = gensim.models.LdaModel.load(ldaoutfile)
    # Use the vocabulary stored with the loaded model (when present) so later
    # doc2bow calls produce word ids that match this model.
    if getattr(lda_model, "id2word", None) is not None:
        dictionary = lda_model.id2word

In [128]:
# Unseen text used to probe the model, held in a one-column DataFrame
test_strings = {"test_strings": ["I like ice cream, it is my life"]}
dtest = pd.DataFrame(test_strings)

display(dtest)

# Run the test strings through preprocess and convert them with the
# existing dictionary, as was done for the submissions
processed_test_strings = dtest["test_strings"].map(preprocess)
test_corpus = list(map(dictionary.doc2bow, processed_test_strings))

# Print topics again, as a reference for the scores printed below
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(idx, topic))
print("\n\n")

# Print each test string next to the result of indexing the model with its
# bag of words
for raw_text, unseen_string in zip(dtest["test_strings"], test_corpus):
    vector = lda_model[unseen_string]
    print(raw_text, vector, "\n")


Unnamed: 0,test_strings
0,"I like ice cream, it is my life"


Topic: 0 
Words: 0.011*"peopl" + 0.010*"like" + 0.008*"think" + 0.008*"know" + 0.007*"time" + 0.007*"religion" + 0.007*"thing" + 0.006*"sport" + 0.006*"christian" + 0.005*"good"

Topic: 1 
Words: 0.022*"reddit" + 0.018*"peopl" + 0.017*"http" + 0.017*"like" + 0.012*"changemyview" + 0.012*"comment" + 0.010*"rule" + 0.010*"messag" + 0.010*"thing" + 0.010*"movi"

Topic: 2 
Words: 0.021*"game" + 0.018*"like" + 0.009*"music" + 0.008*"play" + 0.008*"time" + 0.007*"video" + 0.006*"peopl" + 0.006*"think" + 0.006*"good" + 0.006*"watch"

Topic: 3 
Words: 0.013*"peopl" + 0.010*"money" + 0.008*"compani" + 0.008*"work" + 0.007*"govern" + 0.006*"product" + 0.006*"need" + 0.005*"chang" + 0.005*"cost" + 0.005*"like"

Topic: 4 
Words: 0.015*"vote" + 0.015*"trump" + 0.013*"http" + 0.010*"democrat" + 0.008*"polit" + 0.008*"elect" + 0.008*"peopl" + 0.008*"parti" + 0.008*"like" + 0.007*"presid"

Topic: 5 
Words: 0.028*"peopl" + 0.021*"women" + 0.016*"gender" + 0.016*"white" + 0.011*"black" + 0.011*"like" + 