# Machine Learning Experimentation

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

import time

sns.set_style('darkgrid')

font = {'size': 18}
matplotlib.rc('font', **font)
df = pd.read_pickle("data/train_full.pkl")

## Adding document vector columns

In [5]:
# load in GoogleNews word vector data frame (associates each word with a 300-dimensional vector)
word_vecs = pd.read_pickle("data/word_vec_train.pkl")
vocab = list(word_vecs.index) # list of words in the word vector data frame
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) # had to run "import nltk & nltk.download('stopwords')" before this worked

### For each row, create the document vector for the title,
### which is simply the mean of the word vectors for each 
### word in the row
def get_doc_vec(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    doc = [word for word in doc if word in vocab]
    word_vectors = [word_vecs.loc[word] for word in doc]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(300)

Gets 15-dimensional document vectors for each title.

In [6]:
tic = time.perf_counter()
title_vecs = []
for title in df.title:
    title_vecs.append(get_doc_vec(title))
toc = time.perf_counter()
print(f"Generating document vectors took {toc - tic:0.2f} seconds")

# dimensionality reduction to make them 15 dimensional
tic = time.perf_counter()
from sklearn.decomposition import PCA
pca = PCA(n_components = 15, random_state=10)
reduced_title_vecs = pca.fit_transform(title_vecs)
toc = time.perf_counter()
print(f"PCA took {toc - tic:0.2f} seconds for the titles")

### Add new doc vecs and df together to make df2
title_vec_df = pd.DataFrame(reduced_title_vecs)
title_vec_df['title'] = list(df.title)
df2 = df.copy(deep=True)
for i in range(15):
    df2['title_vec_'+str(i)] = list(title_vec_df[i])
df2.head()

Generating document vectors took 151.00 seconds
PCA took 5.50 seconds for the titles


Unnamed: 0,title,selftext,num_user_comments,median_user_comment_score,max_user_comment_score,median_user_submission_score,max_user_submission_score,title_length,selftext_length,account_age_in_days,...,title_vec_5,title_vec_6,title_vec_7,title_vec_8,title_vec_9,title_vec_10,title_vec_11,title_vec_12,title_vec_13,title_vec_14
0,Refinance my home to free up VA loan or wait?,First let me say thank you for looking at my p...,94,1.0,1871.0,3.0,25906.0,45,1119,1711,...,0.154077,0.246773,-0.255453,-0.07498,0.011749,-0.209472,-0.142993,-0.121136,0.034912,-0.05743
1,Thank you for being such a great resource; you...,[removed],0,0.0,0.0,6.0,4888.0,82,9,4030,...,-0.118272,0.172936,-0.136579,0.07182,-0.076568,0.038487,0.046794,-0.039418,0.005441,0.0572
2,"401k vs 457b, not sure which to max first",My work has both the 401k and 457b plans. They...,0,0.0,0.0,0.0,0.0,41,576,4209,...,-0.149487,0.231898,-0.384127,-0.023007,-0.091016,-0.21616,0.12211,0.016297,-0.124946,-0.117779
3,"Student Loans, Interest Rate, and Payment Stra...",I'll be attending graduate school soon. Tuitio...,0,0.0,0.0,2.0,490.0,59,381,1300,...,0.090344,-0.138411,-0.071522,-0.186286,0.125739,-0.043341,-0.138247,0.330284,-0.012551,-0.202465
4,What do I put under Gross Income when applying...,I'm applying for a Discover Secured Credit Car...,29,1.0,3.0,1.5,42.0,64,335,309,...,-0.3235,0.068177,0.032419,0.019061,-0.153625,0.081621,0.042312,-0.260291,-0.022442,-0.085601


Gets 15-dimensional document vectors for each selftext

In [7]:
tic = time.perf_counter()
selftext_vecs = []
for selftext in df.selftext:
    selftext_vecs.append(get_doc_vec(selftext))
toc = time.perf_counter()
print(f"Generating document vectors took {toc - tic:0.2f} seconds")

# dimensionality reduction to make them 15 dimensional
tic = time.perf_counter()
from sklearn.decomposition import PCA
pca = PCA(n_components = 15, random_state=10)
reduced_selftext_vecs = pca.fit_transform(selftext_vecs)
toc = time.perf_counter()
print(f"PCA took {toc - tic:0.2f} seconds")

### Add new doc vecs and df together to make df2
selftext_vec_df = pd.DataFrame(reduced_selftext_vecs)
selftext_vec_df['self_text'] = list(df.selftext)
df3 = df2.copy(deep=True)
for i in range(15):
    df3['selftext_vec'+str(i)] = list(selftext_vec_df[i])
df3.head()

Generating document vectors took 1526.37 seconds
PCA took 6.30 seconds


Unnamed: 0,title,selftext,num_user_comments,median_user_comment_score,max_user_comment_score,median_user_submission_score,max_user_submission_score,title_length,selftext_length,account_age_in_days,...,selftext_vec5,selftext_vec6,selftext_vec7,selftext_vec8,selftext_vec9,selftext_vec10,selftext_vec11,selftext_vec12,selftext_vec13,selftext_vec14
0,Refinance my home to free up VA loan or wait?,First let me say thank you for looking at my p...,94,1.0,1871.0,3.0,25906.0,45,1119,1711,...,0.085906,-0.068474,0.030933,-0.010447,0.054885,-0.041621,0.018199,-0.005874,0.110491,-0.094413
1,Thank you for being such a great resource; you...,[removed],0,0.0,0.0,6.0,4888.0,82,9,4030,...,0.004308,0.002728,0.000512,0.001219,0.002182,-0.000929,-0.000618,0.000131,-4e-05,0.000902
2,"401k vs 457b, not sure which to max first",My work has both the 401k and 457b plans. They...,0,0.0,0.0,0.0,0.0,41,576,4209,...,0.063874,0.056882,-0.151622,0.020153,-0.105026,-0.044397,-0.029719,-0.05688,0.100621,-0.05955
3,"Student Loans, Interest Rate, and Payment Stra...",I'll be attending graduate school soon. Tuitio...,0,0.0,0.0,2.0,490.0,59,381,1300,...,-0.066218,-0.044475,-0.069004,0.099427,0.029773,-0.096716,-0.015093,0.110866,-0.004543,-0.018326
4,What do I put under Gross Income when applying...,I'm applying for a Discover Secured Credit Car...,29,1.0,3.0,1.5,42.0,64,335,309,...,-0.05434,-0.085841,-0.165055,0.166259,-0.024915,0.022362,0.075793,0.088651,-0.02097,0.13057


## Machine Learning

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [15]:
df4 = df3.drop(['title', 'selftext', 'score', 'num_comments', 'virality_score'], axis=1)
#df4 = pd.get_dummies(df4, prefix='weekday', columns=['weekday']) (did not help)
X = df4.drop('is_viral', axis=1)
y = df4['is_viral'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
def powerset(s):
    power_set = []
    x = len(s)
    for i in range(1 << x):
        power_set.append([s[j] for j in range(x) if (i & (1 << j))])
    return power_set[1:]

doc_vec = ['docvec_'+str(i) for i in range(15)]
no_nlp = [f for f in X.columns if not (f.startswith('title_vec') or f.startswith('selftext_vec')) ]
no_user = [f for f in X.columns if not (f in ['num_user_comments', 'median_user_comment_score',
       'max_user_comment_score', 'median_user_submission_score',
       'max_user_submission_score', 'account_age_in_days']) ]
no_nlp_no_user = [f for f in X.columns if f in no_nlp and f in no_user]

In [19]:
values = [i for i in np.arange(120,130,10)]
precisions = []
recalls = []
f1s = []
for value in values:
    xgb_clf = xgb.XGBClassifier(use_label_encoder=False, 
                                n_estimators=value, 
                                max_depth=9,
                                eval_metric='logloss') 
    feature_list = [no_nlp_no_user, no_user, no_nlp, X.columns] # all columns
    max_f1 = 0
    max_f1_features = []
    for features in feature_list:
        tic = time.perf_counter()
        model = clone(xgb_clf)
        model.fit(X_train[features], y_train)
        preds = model.predict(X_test[features])
        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        if f1 > max_f1:
            max_f1 = f1
            max_f1_features = list(set(features) - set(doc_vec))
        toc = time.perf_counter()
        print("*************")
        #print("Features:", list(set(features) - set(doc_vec)))
        print(f"Features: {features}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall: {recall:.3f}")
        print(f"f1: {f1:.3f}")
        print(confusion_matrix(y_test, preds))
        print(f"Runtime: {toc-tic:.3f}")

*************
Features: ['title_length', 'selftext_length', 'minutes_into_day', 'weekday', 'ismorning', 'isweekend']
Precision: 0.360
Recall: 0.012
f1: 0.024
[[22750    48]
 [ 2164    27]]
Runtime: 11.375
*************
Features: ['title_length', 'selftext_length', 'minutes_into_day', 'weekday', 'ismorning', 'isweekend', 'title_vec_0', 'title_vec_1', 'title_vec_2', 'title_vec_3', 'title_vec_4', 'title_vec_5', 'title_vec_6', 'title_vec_7', 'title_vec_8', 'title_vec_9', 'title_vec_10', 'title_vec_11', 'title_vec_12', 'title_vec_13', 'title_vec_14', 'selftext_vec0', 'selftext_vec1', 'selftext_vec2', 'selftext_vec3', 'selftext_vec4', 'selftext_vec5', 'selftext_vec6', 'selftext_vec7', 'selftext_vec8', 'selftext_vec9', 'selftext_vec10', 'selftext_vec11', 'selftext_vec12', 'selftext_vec13', 'selftext_vec14']
Precision: 0.385
Recall: 0.037
f1: 0.067
[[22670   128]
 [ 2111    80]]
Runtime: 103.987
*************
Features: ['num_user_comments', 'median_user_comment_score', 'max_user_comment_score'