In [1]:
import os
import re
from textblob import TextBlob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk import word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords
import string
stop = set(stopwords.words('english')+list(string.punctuation))

os.chdir('C:/Users/AGB/Desktop/WeCloud_Materials/Project/Subreddit_Comments')
print(os.getcwd())

C:\Users\AGB\Desktop\WeCloud_Materials\Project\Subreddit_Comments


## Data Cleaning

In [2]:
# Read comments dataframe 
subreddit = 'Anarchism'

comment_title = subreddit+'_comments.csv'
comments = pd.read_csv(comment_title,
                          dtype = {'author':object,
                                   'body':str,
                                   'score':float,
                                   'created_utc':object,
                                   'id':object,
                                   'link_id':object,
                                   'parent_id':object,
                                   'hour':float,
                                   'day':float,
                                   'month':float,
                                   'year':float})

In [3]:
# Drop rows without values in year or comment body
comments.dropna(subset = ['year','body'],inplace=True, axis=0)
comments['created_utc'] = comments['created_utc'].astype(float)

In [4]:
# Rename columns and copy dataframe for easier processing
df = comments.copy()
df.rename(columns={'body': 'comment'}, inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11603 entries, 0 to 11602
Data columns (total 11 columns):
author         11603 non-null object
comment        11603 non-null object
score          11603 non-null float64
created_utc    11603 non-null float64
id             11603 non-null object
link_id        11603 non-null object
parent_id      11603 non-null object
hour           11603 non-null float64
day            11603 non-null float64
month          11603 non-null float64
year           11603 non-null float64
dtypes: float64(6), object(5)
memory usage: 1.1+ MB


In [6]:
df.head()

Unnamed: 0,author,comment,score,created_utc,id,link_id,parent_id,hour,day,month,year
0,[deleted],I just realised you are right and that im an i...,2.0,1514765000.0,ds0n5o3,t3_7mxfem,t1_ds0mznv,0.0,1.0,1.0,2018.0
1,jphuffinstuff,No problem! It happens to the best of us!,1.0,1514765000.0,ds0ncnp,t3_7mxfem,t1_ds0n5o3,0.0,1.0,1.0,2018.0
2,thrashgoat555,What missing information would that be.,3.0,1514765000.0,ds0nfc8,t3_7n88zr,t1_ds0ki36,0.0,1.0,1.0,2018.0
3,Adventure_Inc,"As someone who lives in Chicago, this pisses m...",17.0,1514765000.0,ds0nfj5,t3_7nbjsc,t3_7nbjsc,0.0,1.0,1.0,2018.0
4,[deleted],Happy new year btw :),1.0,1514765000.0,ds0ng9h,t3_7mxfem,t1_ds0ncnp,0.0,1.0,1.0,2018.0


In [7]:
# Replace author NaNs and drop [removed] comments
df['author'].fillna('None',inplace=True)
df = df[df['comment']!='[removed]']
df = df[df['comment'].apply(lambda x: str(x).isdigit() == False)]

df = df.reset_index(drop=True)
print('Remaining comments:',len(df))

Remaining comments: 11599


In [8]:
# Check to ensure there are no errors
counta = 0
countb = 0

for i in df['comment']:
    if isinstance(i,float):
        counta+=1
    elif i.isdigit() == True:
        countb+=1
        
print(counta)
countb

0


0

In [9]:
# Define functions to process the comments
def processText(text):
    
    # Convert to lower case
    text = text.lower()
    
    # Convert www.* or https?://* to URL - removed as may eliminate information
    #text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text)
    
    # Remove additional white spaces
    text = re.sub('[\s]+', ' ', text)
    
    # Replace #word with word
    text = re.sub(r'#([^\s]+)', r'\1', text)
      
    # Trim
    text = text.strip('\'"')
    
    return text

def replaceTwoOrMore(s):
    # Look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

In [10]:
# Attempt at a spellchecker using NLTK and textblob although runs EXTREMELY slowly and is still ineffective (i.e. corrects Obama to drama)
from nltk.corpus import words
from textblob import Word

correction_threshold = 0.95

def correct(sentence):
    sentence = replaceTwoOrMore(sentence)
    tokens = nltk.word_tokenize(sentence)
    corrected_tokens = []
    for i in tokens:
        if i in words.words():
            corrected_tokens.append(i)
        else:
            w = Word(i)
            if w.spellcheck()[0][1] >= correction_threshold:
                corrected_tokens.append(w.spellcheck()[0][0])
            else:
                corrected_tokens.append(i)

    clean_sentence = " ".join(corrected_tokens)
    return clean_sentence

In [11]:
# NLTK Processing - must manually select how to stem / lemmatize
porter = nltk.PorterStemmer() #porter.stem
snowball = nltk.SnowballStemmer('english') #snowball.stem
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() #wordnet_lemmatizer.lemmatize

# Add additional stopwords
add_stop = ["'s","n't","''","'m",'http','.com',"--","gt"]

def NLTKprocess(text):
    tokens = nltk.word_tokenize(text)
    stemmed_list = []
    for i in tokens:
        if(i in stop or re.search("^[a-zA-Z0-9\-']*$", i) is None):
            continue
        else:
            lemma = wordnet_lemmatizer.lemmatize(i)
            if lemma not in add_stop:
                stemmed_list.append(lemma)
    return stemmed_list

# Ngram with no stop words or stemming
def NLTKngram(text,n):
    return list(ngrams(text.split(), n))


In [12]:
# Create new feature with cleaned comments
clean_text = []

for i in df['comment']:
    clean_text.append(processText(i))

df['new_comment'] = clean_text

In [13]:
# Tokenize cleaned comments. ngram functionality has been turned off as is not currently being used
tokens = []
ngram = []
nwords = 2

for i in df['new_comment']:
    tokens.append(NLTKprocess(i))
    #ngram.append(NLTKngram(i,nwords))  

df['tokens'] = tokens
#df['ngram'] = ngram

In [14]:
#Remove comments below minimum length
minimum_len = 1

df = df[df['tokens'].map(len) > minimum_len]

In [15]:
df = df.reset_index(drop=True)
df[:20]

Unnamed: 0,author,comment,score,created_utc,id,link_id,parent_id,hour,day,month,year,new_comment,tokens
0,[deleted],I just realised you are right and that im an i...,2.0,1514765000.0,ds0n5o3,t3_7mxfem,t1_ds0mznv,0.0,1.0,1.0,2018.0,i just realised you are right and that im an i...,"[realised, right, im, idiot, bc, forgot, im, s..."
1,jphuffinstuff,No problem! It happens to the best of us!,1.0,1514765000.0,ds0ncnp,t3_7mxfem,t1_ds0n5o3,0.0,1.0,1.0,2018.0,no problem! it happens to the best of us!,"[problem, happens, best, u]"
2,thrashgoat555,What missing information would that be.,3.0,1514765000.0,ds0nfc8,t3_7n88zr,t1_ds0ki36,0.0,1.0,1.0,2018.0,what missing information would that be.,"[missing, information, would]"
3,Adventure_Inc,"As someone who lives in Chicago, this pisses m...",17.0,1514765000.0,ds0nfj5,t3_7nbjsc,t3_7nbjsc,0.0,1.0,1.0,2018.0,"as someone who lives in chicago, this pisses m...","[someone, life, chicago, piss]"
4,[deleted],Happy new year btw :),1.0,1514765000.0,ds0ng9h,t3_7mxfem,t1_ds0ncnp,0.0,1.0,1.0,2018.0,happy new year btw :),"[happy, new, year, btw]"
5,doomsdayprophecy,You're acting like a naive foreigner who's hea...,-2.0,1514765000.0,ds0nhqi,t3_7n2nkh,t1_drzsa2c,0.0,1.0,1.0,2018.0,you're acting like a naive foreigner who's hea...,"['re, acting, like, naive, foreigner, heard, m..."
6,doomsdayprophecy,Foreign agents... And domestic reactionaries. ...,0.0,1514766000.0,ds0o4su,t3_7n6dyr,t1_drzvvew,0.0,1.0,1.0,2018.0,foreign agents... and domestic reactionaries. ...,"[foreign, agent, domestic, reactionary, classi..."
7,the_undine,Very telling how they do this stuff but never ...,0.0,1514767000.0,ds0p9h1,t3_7n88zr,t1_drzwdqu,0.0,1.0,1.0,2018.0,very telling how they do this stuff but never ...,"[telling, stuff, never, resign]"
8,[deleted],There’s that fucking blood and soil group agai...,17.0,1514768000.0,ds0puie,t3_7nbjsc,t3_7nbjsc,0.0,1.0,1.0,2018.0,there’s that fucking blood and soil group agai...,"[fucking, blood, soil, group, put, flyer, town]"
9,blinkysmurf,"Yea, cause it's, like, the same thing and a di...",0.0,1514769000.0,ds0qim6,t3_7n88zr,t3_7n88zr,1.0,1.0,1.0,2018.0,"yea, cause it's, like, the same thing and a di...","[yea, cause, like, thing, direct, comparison, ..."


In [16]:
# Top words by occurence
flatten = [item for sublist in df['tokens'] for item in sublist]
word_dist = nltk.FreqDist(flatten)

top_N = 15

rslt = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
rslt[:50]

Unnamed: 0,Word,Frequency
0,people,3439
1,like,2144
2,would,1837
3,think,1613
4,anarchist,1585
5,'re,1486
6,one,1476
7,get,1252
8,state,1206
9,thing,1185


## Sentiment Analysis

In [17]:
# Textblob sentiment approach
sentiment_polarity = []
sentiment_subjectivity = []

for i in range(len(df['new_comment'])):
    zen = TextBlob(df['new_comment'].iloc[i])
    sentiment_polarity.append(zen.sentiment.polarity)
    sentiment_subjectivity.append(zen.sentiment.subjectivity)
    
df['blob_polarity'] = np.around(sentiment_polarity,2)
df['blob_subj'] = np.around(sentiment_subjectivity,2)

In [18]:
# Vader sentiment approach
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

vader_compound = []
vader_neg = []
vader_neu = []
vader_pos = []

for i in range(len(df['new_comment'])):
    vader = analyser.polarity_scores(df['new_comment'][i])
    vader_compound.append(vader['compound'])
    vader_neg.append(vader['neg'])
    vader_neu.append(vader['neu'])
    vader_pos.append(vader['pos'])
    
df['v_compound'] = np.around(vader_compound,2)
df['v_negative'] = np.around(vader_neg,2)
df['v_neutral'] = np.around(vader_neu,2)
df['v_positive'] = np.around(vader_pos,2)

## Reading Level

In [19]:
from textstat.textstat import textstat as ts

flesch = [] #http://www.readabilityformulas.com/flesch-grade-level-readability-formula.php
gunning_fog = [] #https://en.wikipedia.org/wiki/Gunning_fog_index
avg_syllables = []
difficult_words = []
num_words = []

for i in range(len(df['new_comment'])):
    flesch.append(ts.flesch_kincaid_grade(df['new_comment'][i]))
    gunning_fog.append(ts.gunning_fog(df['new_comment'][i]))
    avg_syllables.append(ts.syllable_count(df['new_comment'][i]) / ts.lexicon_count(df['new_comment'][i]))
    difficult_words.append(ts.difficult_words(df['new_comment'][i]) / ts.lexicon_count(df['new_comment'][i]))
    num_words.append(ts.lexicon_count(df['new_comment'][i]))
    
# Create new columns
df['flesch'] = np.around(flesch,2)
df['gunning_fog'] = np.around(gunning_fog,2)
df['avg_syllables'] = np.around(avg_syllables,2)
df['difficult_words'] = np.around(difficult_words,2)
df['word_count'] = np.around(num_words,2)

# Remove negative values
df['flesch'] = df['flesch'].clip(lower=0)
df['gunning_fog'] = df['gunning_fog'].clip(lower=0)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11083 entries, 0 to 11082
Data columns (total 24 columns):
author             11083 non-null object
comment            11083 non-null object
score              11083 non-null float64
created_utc        11083 non-null float64
id                 11083 non-null object
link_id            11083 non-null object
parent_id          11083 non-null object
hour               11083 non-null float64
day                11083 non-null float64
month              11083 non-null float64
year               11083 non-null float64
new_comment        11083 non-null object
tokens             11083 non-null object
blob_polarity      11083 non-null float64
blob_subj          11083 non-null float64
v_compound         11083 non-null float64
v_negative         11083 non-null float64
v_neutral          11083 non-null float64
v_positive         11083 non-null float64
flesch             11083 non-null float64
gunning_fog        11083 non-null float64
avg_syllables   

In [21]:
# Create summary report to ensure values make sense (no errors)
print('Summary Output (Based on {} Comments in Top)'.format(len(df)))
print()

print("Sentiment Analysis:")
avg_sentiment = df['v_compound'].mean()
# Polarity represents the average of the sentiment when it is non-zero
avg_negative = df['v_negative'][df['v_negative']>0].mean()
avg_neutral = df['v_neutral'][df['v_neutral']>0].mean()
avg_positive = df['v_positive'][df['v_positive']>0].mean()
                  
print("Overall: {:.2f}".format(avg_sentiment))
print("Negative Polarity: {:.2f}".format(avg_negative))
print("Neutral Polarity: {:.2f}".format(avg_neutral))
print("Positive Polarity: {:.2f}".format(avg_positive))
print()

print('Writing Statistics:')
avg_flesch = df['flesch'].mean()
avg_syllables = df['avg_syllables'].mean()
avg_difficult_words = df['difficult_words'].mean()

print("Flesch–Kincaid Grade: {:.1f}".format(avg_flesch))
print("Avg Number of Syllables per Word: {:.2f}".format(avg_syllables))
print("Proportion of Difficult Words: {:.2f}".format(avg_difficult_words))
print()

print('Most Common Words:')
print(rslt[:5])
print()

print("Note: All values represent averages")

Summary Output (Based on 11083 Comments in Top)

Sentiment Analysis:
Overall: 0.00
Negative Polarity: 0.17
Neutral Polarity: 0.77
Positive Polarity: 0.18

Writing Statistics:
Flesch–Kincaid Grade: 8.0
Avg Number of Syllables per Word: 1.53
Proportion of Difficult Words: 0.22

Most Common Words:
        Word  Frequency
0     people       3439
1       like       2144
2      would       1837
3      think       1613
4  anarchist       1585

Note: All values represent averages


In [22]:
# Export processed comments to CSV
comm_title = subreddit+'_processed_comments.csv'
df.to_csv(comm_title,index=False)