- Transformers: PorterStemmer, SnowballStemmer('english'), LancastStemmer, WordNetLemmatizer
- Vectorizers: CountVectorizer (stopwords, mindf, maxdf, max_features, ngram_range), TFIDF (ditto + smooth, sublinear)  (consider waiting till here for lowercasing)
- Dim reduction: PCA

- feature engineering - not really possible
- scaling - shouldn't really be necessary, but i guess we could....
- log transform of y - doesn't make sense here

Class options
- Unsup clusters: Kmeans, DBScan
- Sup: logreg (w/regularization), dt, bagged, random forests, extra trees, adaboost/gradientboost/xgboost, knn, multinomialnb
- aggregated sup: votingclassifier, stacking

- searchers: gridsearch, randomizedsearch

- time permitting - spacy, remove words, multiclass, sentiment?

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# infrastructure
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# transformers
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# dimensionality reduction
from sklearn.decomposition import PCA

# supervised
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

# heterogeneous-model supervised
from sklearn.ensemble import StackingClassifier

# unsupervised
from sklearn.cluster import KMeans, DBSCAN

# evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score, f1_score
from sklearn.metrics import RocCurveDisplay, roc_auc_score, silhouette_score

In [24]:
df = pd.read_csv('../data/eda_posts.csv')

In [25]:
df.head()

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title,period,all_text,all_text_len,all_text_word_len
0,discord,1221835135,72cy3,16,5,,jazz,Anyone know of any dark / evil sounding jazz?,1,Anyone know of any dark evil sounding jazz,42,8
1,smacfarl,1222039404,72p6u,5,0,,jazz,In honor of Soil &amp; Pimp showing us how it'...,1,In honor of Soil Pimp showing us how it's done...,114,21
2,[deleted],1223963408,76zmj,2,2,,jazz,anybody have that record of Charlie Christian ...,1,anybody have that record of Charlie Christian ...,145,26
3,smacfarl,1224683862,78nrr,4,5,,jazz,New Topline Image for Jazz subreddit. Suggesti...,1,New Topline Image for Jazz subreddit Suggestio...,73,11
4,ChefEspeff,1235783273,80wtq,10,2,,jazz,"Teaching yourself Jazz Piano, is it possible?",1,Teaching yourself Jazz Piano is it possible,43,7


In [26]:
# let's encode our classes
# https://benalexkeen.com/mapping-categorical-data-in-pandas/
df['subreddit_code'] = df['subreddit'].astype('category').cat.codes

In [27]:
df.head()

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title,period,all_text,all_text_len,all_text_word_len,subreddit_code
0,discord,1221835135,72cy3,16,5,,jazz,Anyone know of any dark / evil sounding jazz?,1,Anyone know of any dark evil sounding jazz,42,8,1
1,smacfarl,1222039404,72p6u,5,0,,jazz,In honor of Soil &amp; Pimp showing us how it'...,1,In honor of Soil Pimp showing us how it's done...,114,21,1
2,[deleted],1223963408,76zmj,2,2,,jazz,anybody have that record of Charlie Christian ...,1,anybody have that record of Charlie Christian ...,145,26,1
3,smacfarl,1224683862,78nrr,4,5,,jazz,New Topline Image for Jazz subreddit. Suggesti...,1,New Topline Image for Jazz subreddit Suggestio...,73,11,1
4,ChefEspeff,1235783273,80wtq,10,2,,jazz,"Teaching yourself Jazz Piano, is it possible?",1,Teaching yourself Jazz Piano is it possible,43,7,1


In [29]:
X = df['all_text']
y = df['subreddit_code']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14954,), (4985,), (14954,), (4985,))

In [31]:
y_train.value_counts(normalize = True)

0    0.500401
1    0.499599
Name: subreddit_code, dtype: float64

In [32]:
y_test.value_counts(normalize = True)

0    0.500502
1    0.499498
Name: subreddit_code, dtype: float64

In [33]:
def stemmer(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [34]:
def lemmatizer(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [47]:
pipe = Pipeline(steps = [('cvec', CountVectorizer()), ('logreg', LogisticRegression(max_iter = 10_000))])

pipe_params = {
    'cvec__tokenizer': [None, stemmer, lemmatizer],
    'cvec__stop_words': [None, 'english'],
    'cvec__max_features': [3000, 4000],
    'cvec__min_df': [1, 2],
    'cvec__max_df': [0.8, 0.9],
    'cvec__ngram_range': [(1, 1), (1, 2)]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [48]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.972850073558914, 0.9295887662988966)

In [49]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('logreg', LogisticRegression(max_iter = 10_000))])

pipe_params = {
    'tfidf__tokenizer': [None, stemmer, lemmatizer],
    'tfidf__stop_words': [None, 'english'],
    'tfidf__max_features': [3000, 4000],
    'tfidf__min_df': [1, 2],
    'tfidf__max_df': [0.8, 0.9],
    'tfidf__ngram_range': [(1, 1), (1, 2)]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [50]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.9588738798983549, 0.9344032096288867)

In [54]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('logreg', LogisticRegression(max_iter = 10_000))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'logreg__penalty': ['None', 'l1', 'l2', 'elasticnet'],
    'logreg__C': [0.1, 1, 10],
    'logreg__solver': ['lbfgs', 'liblinear', 'saga'],
    'logreg__l1_ratio': [0.1, 0.5, 1.0]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

270 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Gabe\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Gabe\anaconda3\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\Gabe\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Gabe\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 55, 

In [55]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.9590076233783603, 0.9344032096288867)

In [58]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('knn', KNeighborsClassifier())])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'knn__n_neighbors': [3, 5, 8],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['minkowski', 'manhattan', 'euclidean']
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [57]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.994650260799786, 0.6256770310932799)

In [59]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'mnb__alpha': [0.0, 0.4, 0.8, 1.0]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [60]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.9412197405376488, 0.925777331995988)

In [61]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('dt', DecisionTreeClassifier(random_state = 42))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'dt__max_features': [3000, 4000],
    'dt__max_depth': [2, 5, 10],
    'dt__min_samples_split': [2, 4],
    'dt__min_samples_leaf': [1, 2]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [62]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.8176407650127057, 0.8106318956870612)

In [64]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('bag', BaggingClassifier(random_state = 42))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'bag__n_estimators': [10, 20]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [65]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.9927109803397084, 0.9047141424272819)

In [66]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('rf', RandomForestClassifier(random_state = 42))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'rf__n_estimators': [100, 150],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__max_depth': [2, 5, 10],
    'rf__min_samples_split': [2, 4],
    'rf__min_samples_leaf': [1, 2]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [67]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.904774642236191, 0.8868605817452357)

In [69]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('et', ExtraTreesClassifier(random_state = 42))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'et__n_estimators': [100, 150],
    'et__max_features': ['sqrt', 'log2'],
    'et__max_depth': [2, 5, 10],
    'et__min_samples_split': [2, 4],
    'et__min_samples_leaf': [1, 2]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [70]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.8978868530159154, 0.8850551654964894)

In [73]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('ada', AdaBoostClassifier(random_state = 42))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'ada__n_estimators': [50, 100],
    'ada__learning_rate': [1, 5, 10]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [75]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.8968169051758726, 0.8890672016048144)

In [76]:
pipe = Pipeline(steps = [('tfidf', TfidfVectorizer()), ('grad', GradientBoostingClassifier(random_state = 42))])

pipe_params = {
    'tfidf__tokenizer': [None],
    'tfidf__stop_words': ['english'],
    'tfidf__max_features': [4000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [0.8],
    'tfidf__ngram_range': [(1, 1)],
    'grad__n_estimators': [100, 150],
    'grad__subsample': [0.7, 1.0],
    'grad__learning_rate': [0.1, 1, 10],
    'grad__max_features': ['sqrt', 'log2'],
    'grad__max_depth': [3, 5, 10],
    'grad__min_samples_split': [2, 4],
    'grad__min_samples_leaf': [1, 2]
}

gs = GridSearchCV(estimator = pipe, param_grid = pipe_params, n_jobs = -1)
gs.fit(X_train, y_train)
gs.best_estimator_

In [77]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.9339975926173599, 0.919358074222668)

In [87]:
tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
X_vec = tfidf.fit_transform(X)
km = KMeans(n_clusters = 2, random_state = 42)
km.fit(X_vec)
silhouette_score(X_vec, km.labels_)

0.006186818235387876

In [89]:
tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
X_vec = tfidf.fit_transform(X)
db = DBSCAN(eps = 0.5, min_samples = 5)
db.fit(X_vec)
silhouette_score(X_vec, db.labels_)

-0.2670341449045659

In [95]:
tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
X_vec_train = tfidf.fit_transform(X_train)
X_vec_test = tfidf.transform(X_test)

logreg = LogisticRegression(max_iter = 10_000)
mnb = MultinomialNB()
grad = GradientBoostingClassifier(max_depth=10, max_features='log2',
                           min_samples_leaf=2, n_estimators=150,
                           random_state=42)

l1_estimators = [('logreg', logreg), ('mnb', mnb), ('grad', grad)]

stack = StackingClassifier(l1_estimators)
stack.fit(X_vec_train, y_train)
stack.score(X_vec_train, y_train), stack.score(X_vec_test, y_test)

(0.9582051624983282, 0.9390170511534603)

In [96]:
tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
X_vec_train = tfidf.fit_transform(X_train)
X_vec_test = tfidf.transform(X_test)

logreg = LogisticRegression(max_iter = 10_000)
mnb = MultinomialNB()
grad = GradientBoostingClassifier(max_depth=10, max_features='log2',
                           min_samples_leaf=2, n_estimators=150,
                           random_state=42)

l1_estimators = [('mnb', mnb), ('grad', grad)]

stack = StackingClassifier(l1_estimators)
stack.fit(X_vec_train, y_train)
stack.score(X_vec_train, y_train), stack.score(X_vec_test, y_test)

(0.951451116758058, 0.9368104312938816)

In [97]:
tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
X_vec_train = tfidf.fit_transform(X_train)
X_vec_test = tfidf.transform(X_test)

logreg = LogisticRegression(max_iter = 10_000)
mnb = MultinomialNB()
grad = GradientBoostingClassifier(max_depth=10, max_features='log2',
                           min_samples_leaf=2, n_estimators=150,
                           random_state=42)

l1_estimators = [('logreg', logreg), ('grad', grad)]

stack = StackingClassifier(l1_estimators)
stack.fit(X_vec_train, y_train)
stack.score(X_vec_train, y_train), stack.score(X_vec_test, y_test)

(0.9598100842583924, 0.9362086258776329)

In [98]:
tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
X_vec_train = tfidf.fit_transform(X_train)
X_vec_test = tfidf.transform(X_test)

logreg = LogisticRegression(max_iter = 10_000)
mnb = MultinomialNB()
grad = GradientBoostingClassifier(max_depth=10, max_features='log2',
                           min_samples_leaf=2, n_estimators=150,
                           random_state=42)

l1_estimators = [('logreg', logreg), ('mnb', mnb)]

stack = StackingClassifier(l1_estimators)
stack.fit(X_vec_train, y_train)
stack.score(X_vec_train, y_train), stack.score(X_vec_test, y_test)

(0.9573358298782935, 0.9380140421263792)

In [110]:
# trying our logreg with PCA

# https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
from sklearn.pipeline import TransformerMixin
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return np.asarray(X.todense())

tfidf = TfidfVectorizer(max_df = 0.8, max_features = 4000, stop_words = 'english')
dense = DenseTransformer()
pca = PCA(n_components = 500, random_state = 42)
logreg = LogisticRegression(max_iter = 10_000)

pipe = Pipeline([('tfidf', tfidf), ('dense', dense), ('pca', pca), ('logreg', logreg)])
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)
