## Data Preparation

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import time
import xgboost as xgb
sns.set_style('darkgrid')

font = {'size': 18}
matplotlib.rc('font', **font)

Load in the training data

In [4]:
# Load the pickled training data frame.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("data/train.pkl")
# Binary target: a post is labelled viral when its score is at least 50.
df['viral'] = df['score'] >= 50

In [5]:
df.head()

Unnamed: 0,id,author,title,selftext,time,date,score,num_comments,viral
54442,n97ehm,weremanthing,Refinance my home to free up VA loan or wait?,First let me say thank you for looking at my p...,11:20:52,2021-05-10,1,2,False
41531,nzf89i,b1ackcat,Thank you for being such a great resource; you...,[removed],01:08:32,2021-06-14,1,2,False
61126,mxntnt,runnerup,"401k vs 457b, not sure which to max first",My work has both the 401k and 457b plans. They...,12:50:32,2021-04-24,3,7,False
87222,lg8t6y,Bunburier,"Student Loans, Interest Rate, and Payment Stra...",I'll be attending graduate school soon. Tuitio...,12:44:56,2021-02-09,2,2,False
34549,obzx08,Mxnchkinz_,What do I put under Gross Income when applying...,I'm applying for a Discover Secured Credit Car...,21:27:28,2021-07-01,0,15,False


### Adding Document Vectors

Load in the GoogleNews word vector data frame (downloaded in "nlp_modeling_v2"), with words restricted to those appearing in the titles of the posts. Each row is indexed by a word and has 300 numerical columns holding the components of that word's vector.

In [6]:
# Word-vector data frame: one row per word (the index), one column per vector component.
word_vecs = pd.read_pickle("data/word_vec_train.pkl")

In [7]:
vocab = list(word_vecs.index) # get list of words in the word vector data frame

For each title, compute the document vector, which is merely the mean of the word vectors for the words in the title, if they exist.

In [8]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) # had to run "import nltk & nltk.download('stopwords')" before this worked
def get_doc_vec(text):
    """Return the document vector for ``text``: the mean of its word vectors.

    The text is lower-cased and tokenized with ``word_tokenize``. English stop
    words, non-alphabetic tokens, and tokens that have no row in ``word_vecs``
    are discarded before averaging.

    Parameters
    ----------
    text : str
        Raw text to embed (here, a post title).

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per column of ``word_vecs``. It is all zeros
        when no token of ``text`` has a word vector.
    """
    # Membership tests against a pandas Index use a hash table, whereas testing
    # against the `vocab` list costs a linear scan for every token.
    known_words = word_vecs.index
    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words and token.isalpha() and token in known_words
    ]
    if not tokens:
        # Size the fallback from the data instead of hard-coding the dimension,
        # so every returned vector has the same length.
        return np.zeros(word_vecs.shape[1])
    return np.mean([word_vecs.loc[token] for token in tokens], axis=0)

In [9]:
# Build one document vector per post title and report how long it takes.
tic = time.perf_counter()
x = list(map(get_doc_vec, df.title))
toc = time.perf_counter()
print(f"Took {toc - tic:0.2f} seconds")

Took 100.44 seconds


Use PCA to reduce the dimension of the document vectors from 300 to 15, making it more tractable for running machine learning algorithms

In [10]:
from sklearn.decomposition import PCA

# Start the clock after the import so only the PCA fit/transform is timed.
tic = time.perf_counter()

# Project the document vectors onto their first 15 principal components.
pca = PCA(n_components = 15, random_state=10)

reduced_vecs = pca.fit_transform(x)
toc = time.perf_counter()
# The elapsed time was previously measured but never reported.
print(f"Took {toc - tic:0.2f} seconds")

## Document Vector Visualization

Apply t-distributed Stochastic Neighbor Embedding (t-SNE) to reduce the 15-dimensional document vectors to 2 dimensions, and visualize the viral and nonviral posts.

In [None]:
from sklearn.manifold import TSNE

# Only the first `num_rows` posts are embedded and plotted.
num_rows = 1000
viral_list = list(df.viral)[:num_rows]
# Positions (within the first `num_rows` posts) of the viral / nonviral posts.
viral_mask = np.asarray(viral_list, dtype=bool)
viral_indices = np.flatnonzero(viral_mask)
nonviral_indices = np.flatnonzero(~viral_mask)

# One t-SNE embedding and one scatter plot per perplexity value.
for perp in range(5,100):
    tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = perp)
    squished_vecs = tsne.fit_transform(reduced_vecs[:num_rows])
    # Create the figure inside the loop: a figure made once up front is
    # disposed of by the first plt.show(), so later plots would fall back to
    # an implicitly created default-size figure.
    fig, ax = plt.subplots(figsize=(10,8))
    ax.scatter(squished_vecs[nonviral_indices, 0], squished_vecs[nonviral_indices, 1],
               alpha=0.1, color='red', label='nonviral')
    ax.scatter(squished_vecs[viral_indices, 0], squished_vecs[viral_indices, 1],
               alpha=0.5, color='blue', label='viral')
    ax.set_title(f'Perplexity: {perp}')
    ax.set_xlabel('t-SNE dimension 1')
    ax.set_ylabel('t-SNE dimension 2')
    ax.legend()
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)

## Prediction Modeling

Make a copy of the original data frame and append the 15 dimensional document vectors

In [119]:
# Reduced document vectors as a frame (one integer-named column per component),
# with the matching post title alongside.
doc_vec_df = pd.DataFrame(reduced_vecs)
doc_vec_df['title'] = df.title.tolist()

# Work on a deep copy so the original frame is left untouched.
df2 = df.copy(deep=True)
for component in range(15):
    # Assign by position (plain list), not by index label.
    df2[f'docvec_{component}'] = doc_vec_df[component].tolist()
df2.head()

Unnamed: 0,id,author,title,selftext,time,date,score,num_comments,viral,docvec_0,...,docvec_5,docvec_6,docvec_7,docvec_8,docvec_9,docvec_10,docvec_11,docvec_12,docvec_13,docvec_14
54442,n97ehm,weremanthing,Refinance my home to free up VA loan or wait?,First let me say thank you for looking at my p...,11:20:52,2021-05-10,1,2,False,0.453273,...,0.154068,0.24729,-0.254865,-0.074706,0.012286,-0.209845,-0.142833,-0.121779,0.034462,-0.057537
41531,nzf89i,b1ackcat,Thank you for being such a great resource; you...,[removed],01:08:32,2021-06-14,1,2,False,-0.328374,...,-0.117939,0.17325,-0.135397,0.072369,-0.076752,0.038594,0.045877,-0.039308,0.005682,0.057338
61126,mxntnt,runnerup,"401k vs 457b, not sure which to max first",My work has both the 401k and 457b plans. They...,12:50:32,2021-04-24,3,7,False,-0.441687,...,-0.149564,0.233118,-0.383484,-0.023185,-0.091743,-0.216232,0.122335,0.018077,-0.125292,-0.116175
87222,lg8t6y,Bunburier,"Student Loans, Interest Rate, and Payment Stra...",I'll be attending graduate school soon. Tuitio...,12:44:56,2021-02-09,2,2,False,0.519265,...,0.090096,-0.137941,-0.072417,-0.186384,0.125709,-0.043674,-0.138492,0.330588,-0.011789,-0.202053
34549,obzx08,Mxnchkinz_,What do I put under Gross Income when applying...,I'm applying for a Discover Secured Credit Car...,21:27:28,2021-07-01,0,15,False,0.347807,...,-0.323602,0.068687,0.033118,0.019221,-0.153715,0.081396,0.043083,-0.25949,-0.02528,-0.085683


Extract more data from columns:

In [120]:
# Calendar features derived from each post's time and date values.
df2['hour'] = df2['time'].map(lambda posted_time: posted_time.hour)
df2['weekday'] = df2['date'].map(lambda posted_date: posted_date.weekday())
df2['ismorning'] = df2['hour'].isin([6, 7, 8, 9])
df2['isweekend'] = df2['weekday'].isin([5, 6])

# Length features for the title and the body text (body is coerced to str first).
df2['chars_in_title'] = df2['title'].map(len)
df2['words_in_title'] = df2['title'].map(lambda title: len(title.split()))
df2['chars_in_selftext'] = df2['selftext'].map(lambda body: len(str(body)))
df2['words_in_selftext'] = df2['selftext'].map(lambda body: len(str(body).split()))

In [88]:
# df2 = pd.get_dummies(df2, columns=['hour', 'weekday']) 

In [121]:
from sklearn.model_selection import train_test_split

# Columns excluded from the modelling frame.
cols_to_drop = ['id', 'author', 'title', 'selftext', 'time', 'date', 'score', 'num_comments']
df3 = df2.drop(columns=cols_to_drop)

# Features are everything except the 'viral' column, which is the response.
y = df3['viral']
X = df3.drop(columns=['viral'])

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [122]:
list(X_train.columns)

['docvec_0',
 'docvec_1',
 'docvec_2',
 'docvec_3',
 'docvec_4',
 'docvec_5',
 'docvec_6',
 'docvec_7',
 'docvec_8',
 'docvec_9',
 'docvec_10',
 'docvec_11',
 'docvec_12',
 'docvec_13',
 'docvec_14',
 'hour',
 'weekday',
 'ismorning',
 'isweekend',
 'chars_in_title',
 'words_in_title',
 'chars_in_selftext',
 'words_in_selftext']

In [152]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [137]:
def powerset(s):
    """Return every non-empty subset of ``s`` as a list of lists.

    Subsets are enumerated by bitmask: bit ``k`` of the mask decides whether
    ``s[k]`` is included, and masks run from 1 upward, so the empty subset is
    never produced. Elements keep their original relative order.
    """
    n_items = len(s)
    subsets = []
    for mask in range(1, 1 << n_items):
        members = []
        for position in range(n_items):
            if (mask >> position) & 1:
                members.append(s[position])
        subsets.append(members)
    return subsets


In [156]:
# The 15 document-vector columns are included in every candidate feature set.
doc_vec = ['docvec_'+str(i) for i in range(15)]
# Every non-empty combination of the hand-crafted features is tried on top of them.
feature_list = powerset(['ismorning', 
                         'isweekend', 'chars_in_title', 'words_in_title', 
                         'chars_in_selftext', 'words_in_selftext'])
#feature_list = [['isweekend', 'chars_in_selftext', 'ismorning', 'words_in_title']]
feature_list = [feature + doc_vec for feature in feature_list]

knn = KNeighborsClassifier()

max_f1 = 0
max_f1_features = []

# NOTE(review): feature sets are compared on the test split, so the best f1
# reported here is not an unbiased estimate of held-out performance.
for features in feature_list:
    # Hand-crafted features of this candidate, in a deterministic order
    # (a set difference would give an arbitrary order).
    extra_features = [name for name in features if name not in doc_vec]
    # Fit the classifier instantiated above; the loop previously called an
    # undefined `model`, which fails on a fresh kernel.
    knn.fit(X_train[features], y_train)
    preds = knn.predict(X_test[features])
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    if f1 > max_f1:
        max_f1 = f1
        max_f1_features = extra_features
    print("*************")
    print("Features:", extra_features)
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"f1: {f1:.3f}")
# After the loop, `preds` holds the predictions of the last feature set tried,
# which is not necessarily the best-scoring one.
print("Best f1:", max_f1)
print("Best f1 features:", max_f1_features)

*************
Features: ['isweekend', 'chars_in_selftext', 'ismorning', 'words_in_title']
Precision: 0.167
Recall: 0.002
f1: 0.004
Best f1: 0.004024144869215291
Best f1 features: ['isweekend', 'chars_in_selftext', 'ismorning', 'words_in_title']


In [157]:
confusion_matrix(y_test, preds)

array([[24493,     5],
       [  490,     1]])