In [2]:
import pandas as pd
import pickle
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, SCORERS, mean_squared_error, accuracy_score, f1_score
from imblearn.datasets import make_imbalance
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import RandomOverSampler

In [3]:
posts_df = pd.read_csv('./Data/posts_scores_dates.csv')

In [4]:
X = posts_df['title']
y = posts_df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

## Regression Models to predict score

#### Baselines

In [7]:
baseline = np.full_like(y_test, y_train.median())

print(mean_squared_error(y_test, baseline, squared = False))

print(accuracy_score(y_test, baseline))

#### Basic KNN Regressor

In [19]:
pipe_knn = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsRegressor())
])

In [20]:
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()), ('knn', KNeighborsRegressor())])

In [21]:
print(pipe_knn.score(X_train, y_train))
print(pipe_knn.score(X_test, y_test))

0.1883731025654961
-0.2697590813831401


In [22]:
mean_squared_error(y_train, pipe_knn.predict(X_train), squared = False)

1075.0297667838522

In [23]:
mean_squared_error(y_test, pipe_knn.predict(X_test), squared = False)

992.9478033165451

These numbers are both a bit worse than baseline, but the model is overfit and still has a large error

#### Basic Logistic Regressor

In [24]:
pipe_lr = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000))
])

In [25]:
pipe_lr.fit(X_train, y_train)
pipe_lr.score(X_train, y_train), pipe_lr.score(X_test, y_test)

(0.4967885375494071, 0.18303075213041867)

In [26]:
print(mean_squared_error(y_train, pipe_lr.predict(X_train), squared = False))
print(mean_squared_error(y_test, pipe_lr.predict(X_test), squared = False))

1170.4576042977799
890.5628898833294


The model is underperforming, but exceeds the baseline accuracy

#### Basic Random Forest Regressor

In [27]:
pipe_rf = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestRegressor())
])

In [28]:
pipe_rf.fit(X_train, y_train)
pipe_rf.score(X_train, y_train), pipe_rf.score(X_test, y_test)

(0.7474788061926598, -0.65358637870589)

In [29]:
print(mean_squared_error(y_train, pipe_rf.predict(X_train), squared = False))
print(mean_squared_error(y_test, pipe_rf.predict(X_test), squared = False))

599.6408222711941
1133.1285932891997


The model is outperforming the baseline, but is overfit

#### Basic Extra Trees Regressor

In [30]:
pipe_et = Pipeline([
    ('cv', CountVectorizer()),
    ('et', ExtraTreesRegressor(n_jobs=-1))

])

In [31]:
pipe_et.fit(X_train, y_train)
pipe_et.score(X_train, y_train), pipe_et.score(X_test, y_test)

(0.9247702493419087, -1.3190814827337491)

## Classification Models to predict if viral (top 2% of scores)

First create a function to pull all metrics

In [15]:
def get_metrics(function):
    print(f'The training balanced accuracy score is {balanced_accuracy_score(y_train, function.predict(X_train))}')
    print(f'The testing balanced accuracy score is {balanced_accuracy_score(y_test, function.predict(X_test))}')
    print(f'The training f1 score is {f1_score(y_train, function.predict(X_train))}')
    print(f'The testing f1 score is {f1_score(y_test, function.predict(X_test))}')
    print(f'The training accuracy score is {accuracy_score(y_train, function.predict(X_train))}')
    print(f'The testing accuracy score is {accuracy_score(y_test, function.predict(X_test))}')

Train Test Split

In [9]:
X = posts_df['title']
y = posts_df['viral']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20, stratify=y)

#### baseline

In [10]:
posts_df['viral'].value_counts(normalize=True)

0    0.979991
1    0.020009
Name: viral, dtype: float64

In [11]:
baseline_preds = np.full_like(y_test, 0)

In [12]:
print(f'The training balanced accuracy score is {balanced_accuracy_score(y_test, baseline_preds)}')
print(f'The training f1 score is {f1_score(y_test, baseline_preds)}')
print(f'The training accuracy score is {accuracy_score(y_test, baseline_preds)}')

The training balanced accuracy score is 0.5
The training f1 score is 0.0
The training accuracy score is 0.9799925898480919


#### Basic Logistic Regression

In [13]:
pipe_lrc = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000, random_state=20))
])

pipe_lrc.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('lr', LogisticRegression(max_iter=1000, random_state=20))])

In [16]:
get_metrics(pipe_lrc)

The training balanced accuracy score is 0.5216049382716049
The testing balanced accuracy score is 0.5090702233424351
The training f1 score is 0.08284023668639053
The testing f1 score is 0.03571428571428571
The training accuracy score is 0.9808547430830039
The testing accuracy score is 0.9799925898480919


The model is slightly out performing the baseline on all metrics

#### Logistic Regression with balanced class weights

In [17]:
pipe_lrcb = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter= 1000, class_weight='balanced', random_state=20))
])

pipe_lrcb.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('lr',
                 LogisticRegression(class_weight='balanced', max_iter=1000,
                                    random_state=20))])

In [18]:
get_metrics(pipe_lrcb)

The training balanced accuracy score is 0.9608078374988719
The testing balanced accuracy score is 0.6103409647833089
The training f1 score is 0.42359249329758714
The testing f1 score is 0.11851851851851852
The training accuracy score is 0.9468873517786561
The testing accuracy score is 0.911819192293442


The model's accuracy is underforming baseline, but it is significantly outperforming baseline on balanced accuracy and f1

#### Basic KNN Classification

In [38]:
pipe_knnc = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [128]:
get_metrics(pipe_knnc)

The training balanced accuracy score is 0.5184554986042256
The testing balanced accuracy score is 0.5092592592592593
The training f1 score is 0.07100591715976332
The testing f1 score is 0.03636363636363636
The training accuracy score is 0.9806077075098815
The testing accuracy score is 0.9803630974434976


The model performs very similarly to the logistic regression model and is slightly outperforming the baseline

#### Basic Random Forest Classifier

In [19]:
pipe_rfc = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=20))
])

pipe_rfc.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('rf', RandomForestClassifier(random_state=20))])

In [20]:
get_metrics(pipe_rfc)

The training balanced accuracy score is 0.9133911871707014
The testing balanced accuracy score is 0.5905131975075264
The training f1 score is 0.8963210702341138
The testing f1 score is 0.26666666666666666
The training accuracy score is 0.9961709486166008
The testing accuracy score is 0.9796220822526862


The model is performing well on the training data, but is overfit. However the testing data still outperforms the baseline on balanced accuracy score and f1 score

### Grid Search

#### grid search on random forest

In [21]:
params = {
    'cv__ngram_range': [(1,1),(1,2),(2,2)],
    'cv__stop_words' :[None, 'english'],
    'cv__min_df':[1, 2, 4],
    'cv__lowercase':[True, False],
    'rf__n_jobs':[-1],
    'rf__max_depth': [None, 2, 4]
}

gs = GridSearchCV(
    pipe_rfc,
    param_grid= params,
    n_jobs=-1,
    scoring='balanced_accuracy'
)

gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('rf',
                                        RandomForestClassifier(random_state=20))]),
             n_jobs=-1,
             param_grid={'cv__lowercase': [True, False],
                         'cv__min_df': [1, 2, 4],
                         'cv__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cv__stop_words': [None, 'english'],
                         'rf__max_depth': [None, 2, 4], 'rf__n_jobs': [-1]},
             scoring='balanced_accuracy')

In [22]:
gs.best_params_

{'cv__lowercase': False,
 'cv__min_df': 1,
 'cv__ngram_range': (2, 2),
 'cv__stop_words': None,
 'rf__max_depth': None,
 'rf__n_jobs': -1}

In [23]:
gs.best_score_

0.5700145357163315

In [24]:
get_metrics(gs)

The training balanced accuracy score is 0.9598135232955836
The testing balanced accuracy score is 0.5918364489252959
The training f1 score is 0.9551282051282051
The testing f1 score is 0.2941176470588235
The training accuracy score is 0.9982707509881423
The testing accuracy score is 0.9822156354205261


In [140]:
pipe_gs = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2), min_df=2)),
    ('rf', RandomForestClassifier(n_jobs=-1, random_state=20))
])
pipe_gs.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer(min_df=2, ngram_range=(1, 2))),
                ('rf', RandomForestClassifier(n_jobs=-1))])

In [141]:
get_metrics(pipe_gs)

The training balanced accuracy score is 0.9011715479869417
The testing balanced accuracy score is 0.590135125673878
The training f1 score is 0.8873720136518771
The testing f1 score is 0.2597402597402597
The training accuracy score is 0.9959239130434783
The testing accuracy score is 0.9788810670618747


### Bring in other word processing

In [86]:
import nltk
nltk.download('wordnet') 
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emilyfuller/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emilyfuller/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/emilyfuller/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [87]:
lemmatizer = WordNetLemmatizer()

In [88]:
class LemmaTokenizer():
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [89]:
def lemma_tokenizer(doc):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w) for w in word_tokenize(doc)]random_state=

In [90]:
cvl = CountVectorizer(tokenizer=lemma_tokenizer)

In [91]:
cvl.fit(X_train)

CountVectorizer(tokenizer=<function lemma_tokenizer at 0x7fcb7a3d9ee0>)

In [93]:
X_train_cvl = cvl.transform(X_train)

In [95]:
pd.DataFrame(X_train_cvl.A, columns = cvl.get_feature_names_out()).head()

Unnamed: 0,sponge,!,#,$,%,&,','','all-powerful,'baked,...,🥰✌🏻,🥲,🦀,🦄💕,🧁🍪,🧇,🧑🏻‍🍳🥸,🫐,🫐🍰,🫖
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Gride search over LR (balanced) with the new tokenizer

In [None]:
pipe_lrcb = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter= 1000, class_weight='balanced'))
])

In [148]:
params_lrcb = {
    'cv__ngram_range': [(1,1),(1,2),(2,2)],
   # 'cv__stop_words' :[None, 'english'],
    'cv__min_df':[1, 2, 4],
    'cv__lowercase':[True, False],
    'cv__tokenizer' :[None, lemma_tokenizer],
    'lr__n_jobs':[-1]
}

In [150]:
gs_lrcb = GridSearchCV(
    pipe_lrcb,
    param_grid= params_lrcb,
    n_jobs=-1,
    scoring='balanced_accuracy'
)

In [151]:
gs_lrcb.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(class_weight='balanced',
                                                           max_iter=1000))]),
             n_jobs=-1,
             param_grid={'cv__lowercase': [True, False],
                         'cv__min_df': [1, 2, 4],
                         'cv__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cv__tokenizer': [None,
                                           <function lemma_tokenizer at 0x7fcb7a3d9ee0>],
                         'lr__n_jobs': [-1]},
             scoring='balanced_accuracy')

In [105]:
gs.best_estimator_

Pipeline(steps=[('cv', CountVectorizer(min_df=4)),
                ('lr',
                 LogisticRegression(class_weight='balanced', max_iter=1000,
                                    n_jobs=-1))])

In [152]:
get_metrics(gs_lrcb)

The training balanced accuracy score is 0.9212406676065192
The testing balanced accuracy score is 0.6197822586291395
The training f1 score is 0.28305400372439476
The testing f1 score is 0.10270270270270271
The training accuracy score is 0.904891304347826
The testing accuracy score is 0.8769914783253057


In [153]:
There is no substantive improvement with the grid searched lr balanced model compared to the 

The training balanced accuracy score is 0.9608078374988719
The testing balanced accuracy score is 0.6103409647833089
The training f1 score is 0.42359249329758714
The testing f1 score is 0.11851851851851852
The training accuracy score is 0.9468873517786561
The testing accuracy score is 0.911819192293442


#### gridsearch over rfc with the new tokenizer

In [110]:
pipe_rfc

Pipeline(steps=[('cv', CountVectorizer()), ('rf', RandomForestClassifier())])

In [115]:
params_rfc  = {
    'cv__ngram_range': [(1,1),(1,2),(2,2)],
    'cv__min_df':[1, 2, 4],
    'cv__lowercase':[True, False],
    'cv__tokenizer' :[None, lemma_tokenizer],
    'rf__n_jobs':[-1],
}

In [116]:
gs_rfc = GridSearchCV(
    pipe_rfc,
    param_grid= params_rfc,
    n_jobs=-1,
    scoring='balanced_accuracy'
)

In [117]:
gs_rfc.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'cv__lowercase': [True, False],
                         'cv__min_df': [1, 2, 4],
                         'cv__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cv__tokenizer': [None,
                                           <function lemma_tokenizer at 0x7fcb7a3d9ee0>],
                         'rf__n_jobs': [-1]},
             scoring='balanced_accuracy')

In [118]:
gs_rfc.best_estimator_

Pipeline(steps=[('cv',
                 CountVectorizer(min_df=2, ngram_range=(1, 2),
                                 tokenizer=<function lemma_tokenizer at 0x7fcb7a3d9ee0>)),
                ('rf', RandomForestClassifier(n_jobs=-1))])

In [119]:
gs_rfc.score(X_train, y_train), gs_rfc.score(X_test, y_test)

(0.9104938271604939, 0.5899460897570539)

In [154]:
get_metrics(gs_rfc)

The training balanced accuracy score is 0.9104938271604939
The testing balanced accuracy score is 0.5899460897570539
The training f1 score is 0.9016949152542372
The testing f1 score is 0.2564102564102564
The training accuracy score is 0.9964179841897233
The testing accuracy score is 0.978510559466469


There is no substantive improvement from the original rfc pipe model

### Try strategies to overcome the imbalanced data

In [160]:
pipe_ros = Pipeline([
    ('cv', CountVectorizer()),
    ('ros', RandomOverSampler()),
    ('rfc', RandomForestClassifier())
])

pipe_ros.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()), ('ros', RandomOverSampler()),
                ('rfc', RandomForestClassifier())])

In [161]:
get_metrics(pipe_ros)

The training balanced accuracy score is 0.9938240483992942
The testing balanced accuracy score is 0.6001470279353077
The training f1 score is 0.7677725118483413
The testing f1 score is 0.1935483870967742
The training accuracy score is 0.9878952569169961
The testing accuracy score is 0.9629492404594294


Adding the basic random oversampling slightly improves the RFC metrics. We may be able to further improve by tuning the parameters

In [166]:
params_ros  = {
    'cv__ngram_range': [(1,1),(1,2),(2,2)],
    'cv__min_df':[1, 2, 4],
    'cv__lowercase':[True, False],
    'cv__tokenizer' :[None, lemma_tokenizer],
    'ros__sampling_strategy':[0, .25, .5, .75, 1],
    'rfc__n_jobs':[-1],
}

In [167]:
gs_ros = GridSearchCV(
    pipe_ros,
    param_grid= params_ros,
    n_jobs=-1,
    scoring='balanced_accuracy'
)

gs_ros.fit(X_train, y_train)

180 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/emilyfuller/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/emilyfuller/opt/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 268, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Users/emilyfuller/opt/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 226, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/Users/emilyfuller/opt/anaconda3/lib/python3.9/site-packages/jo

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('ros', RandomOverSampler()),
                                       ('rfc', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'cv__lowercase': [True, False],
                         'cv__min_df': [1, 2, 4],
                         'cv__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cv__tokenizer': [None,
                                           <function lemma_tokenizer at 0x7fcb7a3d9ee0>],
                         'rfc__n_jobs': [-1],
                         'ros__sampling_strategy': [0.0, 0.25, 0.5, 0.75, 1]},
             scoring='balanced_accuracy')

In [168]:
get_metrics(gs_ros)

The training balanced accuracy score is 0.9025081925888581
The testing balanced accuracy score is 0.5827137156059652
The training f1 score is 0.1731694281133084
The testing f1 score is 0.058823529411764705
The training accuracy score is 0.8089179841897233
The testing accuracy score is 0.7154501667284179


In [169]:
gs_ros.best_estimator_

Pipeline(steps=[('cv', CountVectorizer(min_df=4, ngram_range=(2, 2))),
                ('ros', RandomOverSampler(sampling_strategy=1)),
                ('rfc', RandomForestClassifier(n_jobs=-1))])

In [170]:
gs_ros.best_params_

{'cv__lowercase': True,
 'cv__min_df': 4,
 'cv__ngram_range': (2, 2),
 'cv__tokenizer': None,
 'rfc__n_jobs': -1,
 'ros__sampling_strategy': 1}