In [82]:
import pandas as pd
import pickle
import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, SCORERS, mean_squared_error, accuracy_score

In [122]:
posts_df = pd.read_csv('./Data/posts_scores_dates_viral.csv')

In [123]:
X = posts_df['title']
y = posts_df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

#### Baselines

In [72]:
baseline = np.full_like(y_test, y_train.median())

In [78]:
mean_squared_error(y_test, baseline, squared = False)

1248.1081690577198

In [83]:
accuracy_score(y_test, baseline)

0.015351243116969799

## Regression Models to predict score

#### Basic KNN Regressor

In [60]:
pipe_knn = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsRegressor())
])

In [124]:
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()), ('knn', KNeighborsRegressor())])

In [125]:
pipe_knn.score(X_train, y_train)

0.3631191779913039

In [126]:
pipe_knn.score(X_test, y_test)

0.30848071528913534

In [127]:
mean_squared_error(y_train, pipe_knn.predict(X_train), squared = False)

864.8012738490858

In [128]:
mean_squared_error(y_test, pipe_knn.predict(X_test), squared = False)

1028.596439806195

These numbers are both a bit better than baseline, but the model is overfit and still has a large error

#### Basic Logistic Regressor

In [64]:
pipe_lr = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000))
])

In [65]:
pipe_lr.fit(X_train, y_train)
pipe_lr.score(X_train, y_train), pipe_lr.score(X_test, y_test)

(0.34957721406319536, 0.2089103954613716)

In [81]:
print(mean_squared_error(y_train, pipe_lr.predict(X_train), squared = False))
print(mean_squared_error(y_test, pipe_lr.predict(X_test), squared = False))

1078.4824752551972
1266.5016032479132


The model is underperforming, but exceeds the baseline accuracy

#### Basic Random Forest Regressor

In [66]:
pipe_rf = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestRegressor())
])

In [67]:
pipe_rf.fit(X_train, y_train)
pipe_rf.score(X_train, y_train), pipe_rf.score(X_test, y_test)

(0.49267580734014027, 0.3162584174127826)

In [84]:
print(mean_squared_error(y_train, pipe_rf.predict(X_train), squared = False))
print(mean_squared_error(y_test, pipe_rf.predict(X_test), squared = False))

771.8448689218682
1022.7956334127475


The model is outperforming the baseline, but is overfit

#### Basic Extra Trees Regressor

In [68]:
pipe_et = Pipeline([
    ('cv', CountVectorizer()),
    ('et', ExtraTreesRegressor(n_jobs=-1))

])

In [69]:
pipe_et.fit(X_train, y_train)
pipe_et.score(X_train, y_train), pipe_et.score(X_test, y_test)

(0.5235932501442125, 0.22721633355926607)

## Classification Models to predict if viral (top 2% of scores)

#### baseline

In [87]:
posts_df['viral'].value_counts(normalize=True)

0    0.979974
1    0.020026
Name: viral, dtype: float64

In [88]:
X = posts_df['title']
y = posts_df['viral']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20, stratify=y)

#### Basic Logistic Regression

In [89]:
pipe_lrc = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000))
])

In [90]:
pipe_lrc.fit(X_train, y_train)
pipe_lrc.score(X_train, y_train), pipe_lrc.score(X_test, y_test)

(0.9803627058299955, 0.9796429167361922)

In [91]:
balanced_accuracy_score(y_train, pipe_lrc.predict(X_train))

0.5165247502270663

In [92]:
balanced_accuracy_score(y_test, pipe_lrc.predict(X_test))

0.5039112605709746

The model is underperforming the baseline

#### Basic KNN Classification

In [93]:
pipe_knnc = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [94]:
pipe_knnc.fit(X_train, y_train)
pipe_knnc.score(X_train, y_train), pipe_knnc.score(X_test, y_test)

(0.982254116599911, 0.9784748873685967)

In [95]:
print(balanced_accuracy_score(y_train, pipe_knnc.predict(X_train)))
print(balanced_accuracy_score(y_test, pipe_knnc.predict(X_test)))

0.6440168029064487
0.5971905329473863


The model performs very similarly to the logistic regression model and is underforming the baseline

#### Basic Random Forest Classifier

In [96]:
pipe_rfc = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [97]:
pipe_rfc.fit(X_train, y_train)
pipe_rfc.score(X_train, y_train), pipe_rfc.score(X_test, y_test)

(0.9843680462839342, 0.9763056899716336)

In [98]:
print(balanced_accuracy_score(y_train, pipe_rfc.predict(X_train)))
print(balanced_accuracy_score(y_test, pipe_rfc.predict(X_test)))

0.7253651983045716
0.6124098984051308


In [99]:
pipe_rfc.get_params(deep=True)

{'memory': None,
 'steps': [('cv', CountVectorizer()), ('rf', RandomForestClassifier())],
 'verbose': False,
 'cv': CountVectorizer(),
 'rf': RandomForestClassifier(),
 'cv__analyzer': 'word',
 'cv__binary': False,
 'cv__decode_error': 'strict',
 'cv__dtype': numpy.int64,
 'cv__encoding': 'utf-8',
 'cv__input': 'content',
 'cv__lowercase': True,
 'cv__max_df': 1.0,
 'cv__max_features': None,
 'cv__min_df': 1,
 'cv__ngram_range': (1, 1),
 'cv__preprocessor': None,
 'cv__stop_words': None,
 'cv__strip_accents': None,
 'cv__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cv__tokenizer': None,
 'cv__vocabulary': None,
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'auto',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': None,
 '

### Grid Search

#### grid search on random forest

In [110]:
params = {
    'cv__ngram_range': [(1,1),(1,2),(2,2)],
    'cv__stop_words' :[None, 'english'],
    'cv__min_df':[1, 2, 4],
    'cv__lowercase':[True, False],
    'rf__n_jobs':[-1],
    'rf__max_depth': [None, 2, 4]
}

In [111]:
gs = GridSearchCV(
    pipe_rfc,
    param_grid= params,
    n_jobs=-1,
    scoring='balanced_accuracy'
)

In [112]:
gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'cv__lowercase': [True, False],
                         'cv__min_df': [1, 2, 4],
                         'cv__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'cv__stop_words': [None, 'english'],
                         'rf__max_depth': [None, 2, 4], 'rf__n_jobs': [-1]},
             scoring='balanced_accuracy')

In [113]:
gs.best_params_

{'cv__lowercase': False,
 'cv__min_df': 1,
 'cv__ngram_range': (2, 2),
 'cv__stop_words': 'english',
 'rf__max_depth': None,
 'rf__n_jobs': -1}

In [114]:
gs.best_estimator_

Pipeline(steps=[('cv',
                 CountVectorizer(lowercase=False, ngram_range=(2, 2),
                                 stop_words='english')),
                ('rf', RandomForestClassifier(n_jobs=-1))])

In [115]:
gs.best_score_

0.6251929740876908

In [116]:
print(balanced_accuracy_score(y_train, gs.predict(X_train)))
print(balanced_accuracy_score(y_test, gs.predict(X_test)))

0.7472752043596731
0.6038211589761053


In [117]:
pipe_gs = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2), min_df=2)),
    ('rf', RandomForestClassifier(n_jobs=-1))
])

In [118]:
pipe_gs.fit(X_train, y_train)
pipe_gs.score(X_train, y_train), pipe_gs.score(X_test, y_test)

(0.9843680462839342, 0.9761388286334056)

In [119]:
print(balanced_accuracy_score(y_train, pipe_rfc.predict(X_train)))
print(balanced_accuracy_score(y_test, pipe_rfc.predict(X_test)))

0.7253651983045716
0.6124098984051308


Basic SciKit learn classification models do not perform well. Next steps are amplifying the minority class and trying neural networks