In [220]:
import os
os.chdir('C:/Users/AGB/Desktop/WeCloud_Materials/Project/Subreddit_Comments')
print(os.getcwd())

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
import re

C:\Users\AGB\Desktop\WeCloud_Materials\Project\Subreddit_Comments


## Text Classification

In [243]:
# Load processed comments from desired subreddits 
subreddits = ['Anarchism','Conservative','LateStageCapitalism','Libertarian']

df = pd.DataFrame([])

for i in subreddits: 
    tempdf = pd.read_csv(i+'_processed_comments.csv')
    tempdf['subreddit'] = i
    df = pd.concat([df,tempdf])

df = df.sample(frac=0.3, random_state=10)
df.reset_index(drop=True,inplace=True)
df['subreddit'].value_counts()/len(df)

Libertarian            0.473725
LateStageCapitalism    0.239021
Conservative           0.228165
Anarchism              0.059089
Name: subreddit, dtype: float64

In [252]:
# Optionally aggregate subreddits into conservative / liberal in order to highlight terms which differ
political_map = {'Libertarian':'Conservative','Conservative':'Conservative',
                'LateStageCapitalism':'Liberal','Anarchism':'Liberal'}

df['subreddit'] = df['subreddit'].map(political_map) 

In [245]:
from sklearn import preprocessing
from ast import literal_eval

# Label encode target subreddit / assigned political orientation
le = preprocessing.LabelEncoder()

df['subreddit'] = le.fit_transform(df['subreddit'])

tokens = [] 

for i in range(len(df['tokens'])):
    tokens.append(literal_eval(df['tokens'][i]))

df['tokens'] = tokens

In [246]:
from sklearn.model_selection import train_test_split

# Split into train / test sets 
y = df['subreddit']
X = df['new_comment']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20)

In [247]:
# Import NLP packages and set stop words 
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import string

stop_words = set(stopwords.words('english')+list(string.punctuation))

In [248]:
# Process text further to break into tokens
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer() #lemma = wordnet_lemmatizer.lemmatize(i)
porter = nltk.PorterStemmer() #lemma = porter.stem(i)
snowball = nltk.SnowballStemmer('english') #lemma = snowball.stem(i)

add_stop = ["'s","n't","''","'m",'http','.com',"--","gt"]

def NLTKprocess(text):
    tokens = nltk.word_tokenize(text)
    stemmed_list = []
    for i in tokens:
        if(i in stop_words or re.search("^[a-zA-Z0-9\-']*$", i) is None):
            continue
        else:
            lemma = wordnet_lemmatizer.lemmatize(i)
            if lemma not in add_stop:
                stemmed_list.append(lemma)
    return stemmed_list


vectorizer = TfidfVectorizer(tokenizer=NLTKprocess)
# Default w/o lemmatizing: 
#vectorizer = TfidfVectorizer(stop_words = stop_words,tokenizer=word_tokenize)

# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(X_train)
vectorised_test_documents = vectorizer.transform(X_test)

In [249]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Use LinearSVC as classifier
classifier = LinearSVC(C=1)
classifier.fit(vectorised_train_documents, y_train)

SVM_train_predictions = classifier.predict(vectorised_train_documents)
SVM_test_predictions = classifier.predict(vectorised_test_documents)

In [250]:
# Define functions to output key terms for subreddit

def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-15:]
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))

def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

if len(np.sort(y_train.unique())) <= 2:
    show_most_informative_features(vectorizer, classifier, n=20)
else:
    print_top10(vectorizer, classifier, np.sort(y_train.unique()))

0: hijab kennedy assimilate pakistan bombshell dems brigade conceal conservative unborn hollywood reparation pelosi amnesty bannon
1: wic imperialism rehabilitation thriving anarchism revolutionary bourgeoisie 1937 imperialist solidarity boa bourgeois lobbying fash praxis
2: ultra-right populace discord cuck che statist threatened houston censor tariff lp nap nigger libertarianism libertarian


### Evaluation

The main metrics for Text Classification are:

**Precision**: Number of documents correctly assigned to a category out of the total number of documents predicted.  
**Recall**: Number of documents correctly assigned to a category out of the total number of documents in such category.  
**F1**: Metric that combines precision and recall using the harmonic mean.  

If the evaluation is being done in multi-class or multi-label environments, the method becomes slightly more complicated because the quality metrics have to be either shown per category, or globally aggregated. There are two main aggregation approaches:  

**Micro-average**: Every assignment (document, label) has the same importance. Common categories have more effect over the aggregate quality than smaller ones.  
**Macro-average**: The quality for each category is calculated independently and their average is reported. All the categories are equally important.

In [251]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 
test_predictions = SVM_test_predictions

accuracy = accuracy_score(y_test, test_predictions)
precision = precision_score(y_test, test_predictions, average='micro')
recall = recall_score(y_test, test_predictions, average='micro')
f1 = f1_score(y_test, test_predictions, average='micro')
 
print("Micro-average quality numbers")
print("Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(accuracy,precision, recall, f1))
 
precision = precision_score(y_test, test_predictions, average='macro')
recall = recall_score(y_test, test_predictions,average='macro')
f1 = f1_score(y_test, test_predictions, average='macro')

print()
print("Macro-average quality numbers")
print("Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(accuracy,precision, recall, f1))

Micro-average quality numbers
Accuracy: 0.6115, Precision: 0.6115, Recall: 0.6115, F1-measure: 0.6115

Macro-average quality numbers
Accuracy: 0.6115, Precision: 0.5929, Recall: 0.5753, F1-measure: 0.5810


In [231]:
# Calculate accuracy
counter = []

for i in range(len(test_predictions)):
    counter.append((test_predictions[i]==y_test.values[i]).sum())

counter = pd.DataFrame(counter)
counter[0].value_counts()/len(counter)

1    0.783309
0    0.216691
Name: 0, dtype: float64

In [232]:
print('Combined SVM Confusion Matrix:')
pd.crosstab(y_test, test_predictions, rownames=['True'], colnames=['Predicted'], margins=True)

Combined Logistic Regression Confusion Matrix:


Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23774,1764,25538
1,6101,4657,10758
All,29875,6421,36296


In [233]:
print(le.inverse_transform(np.sort(y_train.unique())))

['Conservative' 'Liberal']


# SciKit Naive Bayes

In [128]:
# Can instead use multinomial NB
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(vectorised_train_documents, y_train)

NB_train_predictions = clf.predict(vectorised_train_documents)
NB_test_predictions = clf.predict(vectorised_test_documents)

In [129]:
test_predictions = NB_test_predictions

accuracy = accuracy_score(y_test, test_predictions)
precision = precision_score(y_test, test_predictions, average='micro')
recall = recall_score(y_test, test_predictions, average='micro')
f1 = f1_score(y_test, test_predictions, average='micro')
 
print("Micro-average quality numbers")
print("Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(accuracy,precision, recall, f1))
 
precision = precision_score(y_test, test_predictions, average='macro')
recall = recall_score(y_test, test_predictions,average='macro')
f1 = f1_score(y_test, test_predictions, average='macro')

print()
print("Macro-average quality numbers")
print("Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(accuracy,precision, recall, f1))

Micro-average quality numbers
Accuracy: 0.5453, Precision: 0.5453, Recall: 0.5453, F1-measure: 0.5453

Macro-average quality numbers
Accuracy: 0.5453, Precision: 0.6917, Recall: 0.3401, F1-measure: 0.3254


In [130]:
print('Combined Logistic Regression Confusion Matrix:')
pd.crosstab(y_test, test_predictions, rownames=['True'], colnames=['Predicted'], margins=True)

Combined Logistic Regression Confusion Matrix:


Predicted,0,1,2,3,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,11,51,190,1984,2236
1,0,1603,188,6531,8322
2,1,187,1723,6763,8674
3,2,337,270,16455,17064
All,14,2178,2371,31733,36296


# Model Stacking and Prediction

In [131]:
X_train_out = df.iloc[X_train.index].copy()
X_test_out = df.iloc[X_test.index].copy()

In [132]:
X_train_out.drop(['author','comment','created_utc','id','link_id','parent_id','new_comment','tokens','subreddit'], axis=1, inplace=True)
X_test_out.drop(['author','comment','created_utc','id','link_id','parent_id','new_comment','tokens','subreddit'], axis=1, inplace=True)

X_train_out.drop(['year','month'], axis=1, inplace=True)
X_test_out.drop(['year','month'], axis=1, inplace=True)

In [134]:
X_train_out['SVM_pred'] = SVM_train_predictions

dummy_CODE = X_train_out['SVM_pred']
dummy_CODE = pd.get_dummies(dummy_CODE)
dummy_CODE = dummy_CODE.add_suffix('_SVM')
X_train_out = X_train_out.join(dummy_CODE,how='outer')
X_train_out.drop(['SVM_pred',], axis=1, inplace=True)

X_test_out['SVM_pred'] = SVM_test_predictions

dummy_CODE = X_test_out['SVM_pred']
dummy_CODE = pd.get_dummies(dummy_CODE)
dummy_CODE = dummy_CODE.add_suffix('_SVM')
X_test_out = X_test_out.join(dummy_CODE,how='outer')
X_test_out.drop(['SVM_pred',], axis=1, inplace=True)

In [135]:
X_train_out['NB_pred'] = NB_train_predictions

dummy_CODE = X_train_out['NB_pred']
dummy_CODE = pd.get_dummies(dummy_CODE)
dummy_CODE = dummy_CODE.add_suffix('_NB')
X_train_out = X_train_out.join(dummy_CODE,how='outer')
X_train_out.drop(['NB_pred',], axis=1, inplace=True)

X_test_out['NB_pred'] = NB_test_predictions

dummy_CODE = X_test_out['NB_pred']
dummy_CODE = pd.get_dummies(dummy_CODE)
dummy_CODE = dummy_CODE.add_suffix('_NB')
X_test_out = X_test_out.join(dummy_CODE,how='outer')
X_test_out.drop(['NB_pred',], axis=1, inplace=True)

In [137]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
        n_estimators = 100, # number of trees in the forest, default is 10
        max_features = 'sqrt', # number of features to be split on each node
        n_jobs = 4, # the number of jobs is set to the number of cores
        random_state = 7)
    
rf_clf.fit(X_train_out, y_train)

# check the variable importance
importance = rf_clf.feature_importances_
importance = pd.DataFrame(importance, 
                          columns=["importance"],
                          index = X_train_out.columns
                         )
importance.sort_values(by='importance',ascending=False,inplace=True)

importance[:]

Unnamed: 0,importance
3_SVM,0.079716
2_SVM,0.064136
score,0.063476
day,0.062385
gunning_fog,0.058288
hour,0.055401
flesch,0.055156
1_SVM,0.055051
word_count,0.054562
avg_syllables,0.054035


In [138]:
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

gbm = xgb.XGBClassifier(max_depth=3, n_estimators=150, learning_rate=0.05).fit(X_train_out, y_train.ravel())
gbm_predictedValue = gbm.predict_proba(X_test_out)

y_pred = gbm_predictedValue[:,1]
gbm_train_pred = gbm.predict(X_train_out)
gbm_test_pred = gbm.predict(X_test_out)

#print('AUC using XGBoost is {:.4f}'.format(roc_auc_score(y_test_out, y_pred)))
print('Accuracy of XGBoost classifier on training set: {:.3f}'
     .format(gbm.score(X_train_out, y_train)))
print('Accuracy of XGBoost classifier on test set: {:.3f}'
     .format(gbm.score(X_test_out, y_test)))

Accuracy of XGBoost classifier on training set: 0.694
Accuracy of XGBoost classifier on test set: 0.623


In [139]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Random forest model
rf_clf = RandomForestClassifier(
        n_estimators = 100, # number of trees in the forest, default is 10
        max_depth = 7, #max depth of each tree
        max_features = 'sqrt', # number of features to be split on each node
        n_jobs = 4, # the number of jobs is set to the number of cores
        random_state = 7)

rf_clf.fit(X_train_out, y_train.ravel())
rf_clf_predictedValue = rf_clf.predict_proba(X_test_out)

y_pred = rf_clf_predictedValue[:,1]
rf_train_pred = rf_clf.predict(X_train_out)
rf_test_pred = rf_clf.predict(X_test_out)

#print('AUC using Random Forest is {:.4f}'.format(roc_auc_score(y_test, y_pred)))
print('Accuracy of Random Forest classifier on training set: {:.3f}'
     .format(rf_clf.score(X_train_out, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.3f}'
     .format(rf_clf.score(X_test_out, y_test)))

Accuracy of Random Forest classifier on training set: 0.693
Accuracy of Random Forest classifier on test set: 0.621


In [141]:
from sklearn.linear_model import LogisticRegression

logr_clf = LogisticRegression(C=1).fit(X_train_out, y_train)
logr_predictedValue = logr_clf.predict_proba(X_test_out)

y_pred = logr_predictedValue[:,1]
logr_train_pred = logr_clf.predict(X_train_out)
logr_test_pred = logr_clf.predict(X_test_out)

#print('AUC using Logistic Regression is {:.4f}'.format(roc_auc_score(y_test, y_pred)))
print('Accuracy of Logistic Regression classifier on training set: {:.3f}'
     .format(logr_clf.score(X_train_out, y_train)))
print('Accuracy of Logistic Regression classifier on test set: {:.3f}'
     .format(logr_clf.score(X_test_out, y_test)))

Accuracy of Logistic Regression classifier on training set: 0.692
Accuracy of Logistic Regression classifier on test set: 0.621


# NLTK Bayesian Classifier - Returns Feature Importance

In [182]:
import random

documents = [(list(df['tokens'].iloc[i]), df['subreddit'].iloc[i])
             for i in range(len(df['tokens']))]

flatten = [item for sublist in df['tokens'] for item in sublist]
all_words = nltk.FreqDist(flatten)

random.shuffle(documents)

word_features = list(all_words.keys())[:3000]

In [183]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [185]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [187]:
# set that we'll train our classifier with
test_cutoff = int(np.around(.8 * (len(featuresets)),0))

training_set = featuresets[:test_cutoff]

# set that we'll test against.
testing_set = featuresets[test_cutoff:]

In [188]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
#print("Classifier training accuracy percent:",(nltk.classify.accuracy(classifier, training_set))*100)
print("Classifier test accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier training accuracy percent: 53.4600725536116
Classifier test accuracy percent: 50.61989163375884


In [193]:
print(le.inverse_transform(np.sort(y_train.unique())))
inform_features = classifier.show_most_informative_features(30)

['Anarchism' 'Conservative' 'LateStageCapitalism' 'Libertarian']
Most Informative Features
                 anarchy = True                0 : 1      =     94.3 : 1.0
                 manning = True                0 : 3      =     72.9 : 1.0
                comrades = True                0 : 3      =     63.2 : 1.0
                 mueller = True                1 : 2      =     56.9 : 1.0
               anarchist = True                0 : 2      =     54.3 : 1.0
              indigenous = True                0 : 3      =     51.3 : 1.0
                 chelsea = True                0 : 3      =     51.3 : 1.0
          libertarianism = True                3 : 2      =     49.1 : 1.0
                     dsa = True                0 : 3      =     45.9 : 1.0
                  ancaps = True                0 : 1      =     42.6 : 1.0
             imperialist = True                0 : 3      =     38.2 : 1.0
             imperialism = True                0 : 1      =     37.5 : 1.0
         

  if diff:


# LDA Analysis - Topic Modelling

In [14]:
# Importing Gensim
import gensim
from gensim import corpora

subreddit = 0

sub_df = df[df['subreddit']== subreddit]

# Creating the term dictionary of our corpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(sub_df['tokens'])
dictionary.save('dictionary.dict')
print(dictionary)

Dictionary(24183 unique tokens: ['bc', 'dumb', 'forgot', 'idiot', 'im']...)


In [10]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in sub_df['tokens']]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)

print(len(doc_term_matrix))
print(doc_term_matrix[100])

11086
[(79, 1), (321, 1), (1086, 1), (1087, 1), (1088, 1), (1089, 1)]


In [98]:
from time import time 

start = time()
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)
print('used: {:.2f}s'.format(time()-start))


used: 127.05s


In [100]:
ldamodel.save('topic.model')

In [11]:
from gensim.models import LdaModel
loading = LdaModel.load('topic.model')
print(loading.print_topics(num_topics=2, num_words=4))

[(2, '0.031*"amp" + 0.017*"--" + 0.008*"animals" + 0.008*"eat"'), (4, '0.018*"people" + 0.010*"like" + 0.009*"think" + 0.007*"get"')]


#pyLDAvis visualization

import pyLDAvis.gensim
#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

pyLDAvis.enable_notebook()

d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('topic.model')

data = pyLDAvis.gensim.prepare(lda, c, d)
data