In [134]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# Plot styling: seaborn's dark-grid theme, then a larger base font size
# applied through matplotlib's rc settings.
sns.set_style('darkgrid')

font = dict(size=18)
matplotlib.rc('font', **font)

Read in the training data, and add a categorical column indicating whether a post is viral or not.

In [135]:
# Posts whose score is at least this value are labelled as viral.
VIRAL_SCORE_THRESHOLD = 50

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("data/train.pkl")

# Boolean target column: True when the post's score reaches the threshold.
df['viral'] = df['score'] >= VIRAL_SCORE_THRESHOLD

In [136]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. The whole frame is split, so X_train / X_test carry both the
# title text and the `viral` label.
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

Following along with the scikit-learn tutorial for working with text data: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [137]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training titles and encode every title as a
# vector of token counts (bag of words).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train.title)

In [138]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term frequencies only: use_idf=False switches off the
# inverse-document-frequency weighting.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(99952, 19206)

In [139]:
# Full tf-idf weighting (the transformer's default settings), fitted on the
# same training counts and then applied to them.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(99952, 19206)

In [140]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier: multinomial Naive Bayes trained on the tf-idf features,
# with the boolean `viral` column as the target.
nb = MultinomialNB()
nb.fit(X_train_tfidf, X_train['viral'])

MultinomialNB()

Putting the above into a pipeline:

In [141]:
from sklearn.pipeline import Pipeline

# The same three stages as the cells above (counts -> tf-idf -> Naive Bayes),
# chained so that raw titles can be passed straight to fit/predict.
nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

nb_clf.fit(X_train['title'], X_train['viral'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

This baseline model categorizes virtually all as nonviral; the one post it classifies as going viral did not go viral.

In [142]:
# `confusion_matrix` is not imported by any of the cells above, so bring it
# into scope here; without this the cell raises NameError on a fresh kernel.
from sklearn.metrics import confusion_matrix

# Evaluate the Naive Bayes pipeline on the held-out titles.
# Rows are the true labels, columns the predicted labels.
confusion_matrix(X_test.viral, nb_clf.predict(X_test.title))

array([[24497,     1],
       [  491,     0]])

Instead of using Naive Bayes, we can try out a SGD classifier.

In [143]:
from sklearn.linear_model import SGDClassifier

# Same count -> tf-idf front end as the Naive Bayes pipeline, but with a
# linear classifier trained by stochastic gradient descent on hinge loss
# with an L2 penalty. The seed is fixed for reproducibility.
sgd_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

In [144]:
# Fit the SGD pipeline on the training titles, then compare its predictions
# for the held-out titles against the true labels.
sgd_clf.fit(X_train.title, X_train.viral)
confusion_matrix(X_test['viral'], sgd_clf.predict(X_test.title))

array([[24498,     0],
       [  491,     0]])

No true positives and, unlike the Naive Bayes baseline, no false positives either; all 491 viral posts in the test set are false negatives. Our classifier doesn't think anything goes viral.

In [145]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the SGD pipeline, keyed as <step name>__<parameter>.
parameters = {
    # single words only, or single words plus bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # with or without inverse-document-frequency weighting
    'tfidf__use_idf': (True, False),
    # penalty strength of the linear SVM: 0.01 or 0.001
    'clf__alpha': (1e-2, 1e-3),
}

In [146]:
# Exhaustive search over `parameters` with 5-fold cross-validation on the
# training set, using all available cores (n_jobs=-1).
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# pipeline's default score (accuracy for a classifier), which is dominated by
# the non-viral majority here -- consider a class-aware metric.
gs_clf = GridSearchCV(
    estimator=sgd_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
).fit(X_train['title'], X_train['viral'])

In [147]:
# Report the winning value for every searched hyper-parameter, in
# alphabetical order of parameter name.
best_params = gs_clf.best_params_
for key in sorted(parameters):
    print(f'{key}: {best_params[key]}')

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [148]:
# Predict the held-out titles with the fitted grid search and compare the
# predictions against the true labels.
predicted = gs_clf.predict(X_test.title)
confusion_matrix(X_test['viral'], predicted)

array([[24498,     0],
       [  491,     0]])

Even using the more sophisticated algorithm along with a grid search did not improve the model. However, we only used a bag-of-words approach to generate predictors; there exist more sophisticated ways to represent text data. In addition, we expect to also use temporal data, as well as data about the users themselves.