In [49]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import f1_score, confusion_matrix, plot_confusion_matrix

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords



# Initialization

In [7]:
df = pd.read_csv('../data/reddit_chessbeginners_full.csv')
y = df['comm_greater_1']
X = df['title']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

# Baseline model

In [6]:
target.value_counts(normalize=True)

0    0.510359
1    0.489641
Name: comm_greater_1, dtype: float64

### If the model predicted majority class (less than or equal to 1 comment), the model would be right 51% of the time. The goal is to improve upon this baseline accuracy.

## Definte lemmatization and stemming functions

In [50]:
def lemma_tokenizer(doc):
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(doc)
    return [wnl.lemmatize(t) for t in tokens]

def stemmer_tokenizer(doc):
    ps = PorterStemmer()
    tokens = word_tokenize(doc)
    return [ps.stem(t) for t in tokens]

# Logistic Regression

### Count Vectorizer

In [53]:
pipe = Pipeline([    
    ('cv', CountVectorizer(tokenizer = lemma_tokenizer, min_df=3)),
    ('lr', LogisticRegression(max_iter=1000)),
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1),(1,2), (2,2)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best Params:  {'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}
Best Estimator Score Train:  0.6695815955872461
Best Estimator Score Test:  0.5558993676846495
[CV] END .....cv__ngram_range=(1, 1), cv__stop_words=english; total time=   0.7s
[CV] END ........cv__ngram_range=(1, 2), cv__stop_words=None; total time=   1.6s
[CV] END .....cv__ngram_range=(2, 2), cv__stop_words=english; total time=   0.8s
[CV] END .....cv__ngram_range=(1, 1), cv__stop_words=english; total time=   5.6s
[CV] END ........cv__ngram_range=(1, 2), cv__stop_words=None; total time=   8.3s
[CV] END ........cv__ngram_range=(2, 2), cv__stop_words=None; total time=  10.0s
[CV] END ........cv__ngram_range=(1, 1), cv__stop_words=None; total time=   0.9s
[CV] END .....cv__ngram_range=(1, 2), cv__stop_words=english; total time=   0.9s
[CV] END ........cv__ngram_range=(2, 2), cv__stop_words=None; total time=   1.2s
[CV] END ........cv__ngram_range=(1, 1), cv__stop_words=None; total time=   6.2s
[CV] END .....cv__ngram_

### TFID 

In [13]:
pipe = Pipeline([    
    ('tfid', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000)),
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1),(1,2), (2,2)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 2), 'tfid__stop_words': None}
Best Estimator Score Train:  0.8443876407013767
Best Estimator Score Test:  0.5748688281985739


# Naive Bayes (Bernouilli)

### Count Vectorizer

In [22]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('nb', BernoulliNB())
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'cv__ngram_range': (1, 2), 'cv__stop_words': None}
Best Estimator Score Train:  0.8642540024216333
Best Estimator Score Test:  0.5639714785416386


### TFID

In [19]:
pipe = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('nb', BernoulliNB())
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 2), 'tfid__stop_words': None}
Best Estimator Score Train:  0.8642540024216333
Best Estimator Score Test:  0.5639714785416386


# Naive Bayes (Multinomial)

### Count Vectorizer

In [33]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('nbm', MultinomialNB())
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1), (1,2)],
}

gs = GridSearchCV(pipe, param_grid={}, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params:  {}
Best Estimator Score Train:  0.696578321897843
Best Estimator Score Test:  0.5596663527512444


### TFID

In [34]:
pipe = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('nbm', MultinomialNB())
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 3), 'tfid__stop_words': None}
Best Estimator Score Train:  0.9365890847123189
Best Estimator Score Test:  0.5828064038746132


# Random Forest Classifier

### Count Vectorizer

In [35]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'cv__ngram_range': (1, 2), 'cv__stop_words': None}
Best Estimator Score Train:  0.9819274406924078
Best Estimator Score Test:  0.564509619265438


### TFID

In [37]:
pipe = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 2), 'tfid__stop_words': None}
Best Estimator Score Train:  0.9819274406924078
Best Estimator Score Test:  0.5635678729987892
[CV] END ....tfid__ngram_range=(1, 1), tfid__stop_words=None; total time=  36.5s
[CV] END ....tfid__ngram_range=(1, 2), tfid__stop_words=None; total time= 2.4min
[CV] END .tfid__ngram_range=(1, 3), tfid__stop_words=english; total time= 4.4min
[CV] END ....tfid__ngram_range=(1, 1), tfid__stop_words=None; total time=  36.4s
[CV] END ....tfid__ngram_range=(1, 2), tfid__stop_words=None; total time= 2.4min
[CV] END .tfid__ngram_range=(1, 3), tfid__stop_words=english; total time= 4.5min
[CV] END ....tfid__ngram_range=(1, 1), tfid__stop_words=None; total time=  36.3s
[CV] END ....tfid__ngram_range=(1, 2), tfid__stop_words=None; total time= 2.4min
[CV] END .tfid__ngram_range=(1, 3), tfid__stop_words=english; total time= 4.6min
[CV] END ....tfid__ngram_range=(1, 1), tfid__s

# Gradient Boosting Classifier

### Count Vectorizer

In [38]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('gb', GradientBoostingClassifier())
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'cv__ngram_range': (1, 1), 'cv__stop_words': None}
Best Estimator Score Train:  0.59262747208395
Best Estimator Score Test:  0.5595318175702947


### TFID

In [40]:
pipe = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier())
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 1), 'tfid__stop_words': None}
Best Estimator Score Train:  0.6080990178931791
Best Estimator Score Test:  0.5575137898560474


# Ada BoostGradientBoostingClassifier

### Count Vectorizer

In [41]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('ab', AdaBoostClassifier())
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'cv__ngram_range': (1, 2), 'cv__stop_words': None}
Best Estimator Score Train:  0.572133279519261
Best Estimator Score Test:  0.5505179604466568


### TFID

In [42]:
pipe = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('ab', AdaBoostClassifier())
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 2), 'tfid__stop_words': None}
Best Estimator Score Train:  0.5749136732588905
Best Estimator Score Test:  0.5490380734562088


# KNN

### Count Vectorizer

In [43]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

params = {
    'cv__stop_words' : [None, 'english'],
    'cv__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'cv__ngram_range': (1, 1), 'cv__stop_words': None}
Best Estimator Score Train:  0.7046504327548321
Best Estimator Score Test:  0.5292614018565855


### TFID

In [44]:
pipe = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

params = {
    'tfid__stop_words' : [None, 'english'],
    'tfid__ngram_range' : [(1,1), (1,2), (1,3)],
}

gs = GridSearchCV(pipe, params, cv=5, verbose=2, n_jobs=-1)

gs.fit(X_train, y_train)

print('Best Params: ', gs.best_params_)
print('Best Estimator Score Train: ', gs.best_estimator_.score(X_train, y_train))
print('Best Estimator Score Test: ', gs.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params:  {'tfid__ngram_range': (1, 1), 'tfid__stop_words': 'english'}
Best Estimator Score Train:  0.7016458137136194
Best Estimator Score Test:  0.5315484999327325
