In [2]:
import nltk
import pandas as pd
import numpy as np
import utils
import os


In [3]:
# Load the raw comment dump from static/data into a DataFrame.
raw_comments_df = pd.read_csv(
    os.path.join("static", "data", "raw_comments_239858.csv")
)

In [13]:
# Inspect the loaded frame (pandas elides the middle rows of a large frame).
# NOTE(review): the saved output already contains a "body_tokens" column, so this
# cell was last executed after the tokenisation cell further down — the execution
# order of this notebook is not top-to-bottom.
raw_comments_df

Unnamed: 0,author,subreddit,body,score,body_tokens
0,depressedbutimlit,AskReddit,"Haven't been in one, I'm 19",1,[]
1,Anonstarr,AskReddit,Male sea horses are the ones who actually get ...,1,"[male, sea, horse, actually, pregnant, give, b..."
2,div_tiw,AskReddit,Learnt polar form calculus,1,"[learn, polar, form, calculus]"
3,RAMI_XXL,AskReddit,"I would say the fried chicken, ham, steak and ...",1,"[fry, chicken, ham, steak, chocolate, fountain..."
4,JesusChristSuperDerp,AskReddit,Wonder Boy,1,"[wonder, boy]"
...,...,...,...,...,...
239853,Bensemus,technology,It also doesn’t take hundreds of millions or b...,13,"[doe, hundred, million, billion, dollar, brand..."
239854,besselfunctions,technology,Thank you for a serious and thoughtful response.,4,"[thank, serious, thoughtful, response]"
239855,the_monkey_knows,technology,It’ll be dead by a thousand cuts compared to t...,2,"[dead, thousand, cut, compare, swift, effect, ..."
239856,I_am_the_night,technology,"Ah yes, the party of small government",2,"[yes, party, small, government]"


In [4]:
# Run the project's text preprocessing over every comment body and store the
# result in a new "body_tokens" column (this mutates raw_comments_df in place).
# NOTE(review): utils.preprocess_text is not visible here; the saved frame output
# suggests it yields a list of cleaned tokens per comment — confirm.
raw_comments_df["body_tokens"] = raw_comments_df["body"].apply(utils.preprocess_text)

In [5]:
# Sanity check: (rows, columns) of the raw frame.
raw_comments_df.shape

(239858, 5)

In [109]:
# Keep only the comments whose token list is non-empty, i.e. comments that
# contain at least one non-stopword. The check is made on the string form of
# the column, so every value that does not render as "[]" is retained.
cleaned_comments_df = (
    raw_comments_df.loc[raw_comments_df["body_tokens"].astype(str).ne("[]"), :]
    .reset_index(drop=True)
)
# Force the comment text to plain strings.
cleaned_comments_df["body"] = cleaned_comments_df["body"].astype(str)

In [116]:
# Report, per column, whether any missing values remain.
cleaned_comments_df.isna().any()

author          True
subreddit      False
body           False
score          False
body_tokens     True
dtype: bool

In [126]:
# removing empty comments. maybe these comments really were "NaN"
cleaned_comments_df = cleaned_comments_df.dropna(subset = ["body_tokens"]).reset_index(drop=True)
cleaned_comments_df.shape

(235575, 5)

In [130]:
# Show the stop-word collection exposed by the utils module.
print(utils.stop_words)

{'yourself', 'not', 'look', 'time', 'being', 'very', 'thing', 'no', 'ain', 'our', 'from', "weren't", 'theirs', 'mightn', "mustn't", 'say', 'while', 'she', 'how', "hadn't", 'having', 'my', 'during', 'reddit.', 'any', 'with', 'same', "shan't", 'hasn', 'itself', 'his', 'to', 'okay', 'on', '...', 'said', 'people', 'want', 'should', 'yours', 'y', 'against', 'down', ' could', 'him', 'her', 'but', 'its', "that'll", "doesn't", 'at', 'we', 't', 'com', 'yeah', 'few', 'ourselves', 'are', 'he', 'before', 'which', 'into', 'such', 'name', 'http', 'can', 'll', 'doesn', 'until', 'as', 'their', 'become', 'between', 'www', 'shan', "you've", 'in', 've', 'hers', 'wasn', 'what', 'or', 'make', 'did', 'subreddit', 'know', 'for', 'above', 'hadn', 'reddit', 'too', 'needn', 'www.', 'an', "won't", 'over', 'won', 'they', 'do', 'own', 'them', 'further', "wasn't", 'wouldn', "didn't", 'when', "wouldn't", "don't", 'the', 'mustn', 'think', 'then', 'subject', 'of', 'again', 'your', 'now', 'after', 'it', 'these', "shoul

In [127]:
# Persist the cleaned comments without the row index.
# NOTE(review): this writes under "data/", whereas the other read/write cells in
# this notebook use "static/data/" — confirm the intended directory.
cleaned_comments_df.to_csv(os.path.join("data","cleaned_comments_235575.csv"), index=False)

In [None]:
# Vocabulary statistics for each entry of subreddit_tokens: total token count and
# number of distinct tokens.
# NOTE(review): subreddit_tokens is not defined in any cell visible here — confirm
# where it is built before relying on a fresh-kernel run.
explore_df = pd.DataFrame(
    {
        "Total Number of Tokens": subreddit_tokens.apply(len),
        "Number of Unique Tokens": subreddit_tokens.apply(lambda tokens: len(set(tokens))),
    }
)

# Lexical diversity = distinct tokens / total tokens.
explore_df["Lexical Diversity"] = explore_df["Number of Unique Tokens"].div(
    explore_df["Total Number of Tokens"]
)

explore_df

In [None]:
from collections import Counter
import operator

def wordListToFreqList(wordlist, top_n=10):
    """Return the most frequent words of ``wordlist`` with their counts.

    Args:
        wordlist: iterable of hashable tokens.
        top_n: slice bound applied to the ranked list; ``None`` keeps every entry.

    Returns:
        A list of ``(word, count)`` tuples ordered by count, highest first.
        Words with equal counts keep the order in which they were first seen.
    """
    # most_common() without an argument yields every (word, count) pair sorted by
    # count in descending order; the slice then trims the ranking to top_n.
    ranked_counts = Counter(wordlist).most_common()
    return ranked_counts[:top_n]

In [None]:
# Rank the most frequent terms (default top_n) for every entry of subreddit_tokens.
freqs = subreddit_tokens.apply(wordListToFreqList)
freqs

In [None]:
# Peek at the label of the second entry in freqs.
freqs.index[1]

In [None]:
import math
import matplotlib.pyplot as plt
from matplotlib import gridspec

In [None]:
# Using gridspec allows us to dynamically add subplots in grid
N = len(freqs.keys())
cols = 2
rows = int(math.ceil(N / cols))
gs = gridspec.GridSpec(rows, cols)

# define the figure space for the plots
fig = plt.figure()
fig.set_figheight(N*2)
fig.set_figwidth(20)

# iterate over number of categories to plot each one's top terms
for i in range(N):
    
    # add a plot to the figure
    ax = fig.add_subplot(gs[i])
    ax.set_title(f"Most Frequent Words for: {freqs.index[i]}", fontsize=14, fontweight='bold')
    
    # break the terms and term counts into two lists/tuples
    x,y = zip(*freqs[i])
    #plot the data
    ax.bar(x,y)
    # increase x-label font size
    plt.xticks(fontsize=14)
    # place numeric label on the bar
    for j, v in enumerate(y):
        ax.text(j, v/2, str(v), color='white', fontweight='bold', ha='center')
    

fig.tight_layout()

In [None]:
# Join each comment's tokens into one space-separated string.
cleaned_comments_df["body_tokens_spaced"] = cleaned_comments_df["body_tokens"].apply(" ".join)

In [None]:
# Persist the cleaned comments (including body_tokens_spaced).
# index=False matches the earlier export in this notebook and keeps the row index
# from coming back as a spurious "Unnamed: 0" column when the file is reloaded.
cleaned_comments_df.to_csv(
    os.path.join("static", "data", "cleaned_comments.csv"),
    index=False,
)

In [None]:
# Reload the cleaned comments from disk.
# keep_default_na=False stops pandas from converting literal text such as "null",
# "nan" or "NA" (plausible usernames or single-token comments) into missing values;
# a NaN in body_tokens_spaced is not a valid text document for the vectoriser
# used further down.
cleaned_comments_df = pd.read_csv(
    os.path.join("static", "data", "cleaned_comments.csv"),
    keep_default_na=False,
)

In [None]:
# Build the document-term count matrix from the space-joined tokens.
# NOTE(review): CountVectorizer is not imported in any cell visible here — confirm
# it is imported before this cell on a fresh kernel.
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(cleaned_comments_df["body_tokens_spaced"])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer its replacement and fall back only on older installations. The result
# is kept as a plain list in both cases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [None]:
# Display the vocabulary extracted by the vectoriser.
# NOTE(review): this prints every feature name; consider slicing if the
# vocabulary is large.
tf_feature_names

In [None]:
# Fit an LDA topic model on the term-count matrix, with a fixed seed for repeatability.
# NOTE(review): the choice of 249 topics is not explained in the visible cells — confirm.
n_topics = 249
lda = LatentDirichletAllocation(n_components=n_topics, random_state=4)
lda = lda.fit(tf)

In [None]:
def display_topics(model, feature_names, n_top_words):
    """Print every topic's index followed by its highest-weighted feature names.

    Args:
        model: object exposing ``components_``, an iterable with one weight
            array per topic.
        feature_names: sequence mapping a feature index to its name.
        n_top_words: how many of the highest-weighted features to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(f"Topic: {topic_idx}")
        print(" ".join(top_terms))

# Print the ten highest-weighted terms of every fitted topic.
n_top_words = 10
display_topics(lda, tf_feature_names, n_top_words)