In [6]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC


In [7]:
data = pd.read_csv('reddit_model.csv')

In [8]:
data.head()

Unnamed: 0,subreddit,title,selftext,created_utc,pro_title,pro_selftext,lem_title,lem_selftext
0,1,$TK &amp; $TNK,Thoughts on Teekay (TK) &amp; Teekay Tankers (...,1587476965,TK amp TNK,Thoughts on Teekay TK amp Teekay Tankers TNK T...,TK amp TNK,Thoughts on Teekay TK amp Teekay Tankers TNK T...
1,1,Investing Frameworks,Are there any frameworks/ concepts that you ha...,1587474494,Investing Frameworks,Are there any frameworks concepts that you hav...,Investing Frameworks,Are there any framework concept that you have ...
2,1,Long term growth calculation,"Hi there, \n\n&amp;#x200B;\n\nI want to valuat...",1587474287,Long term growth calculation,Hi there \n\nampx200B\n\nI want to valuate a c...,Long term growth calculation,Hi there ampx200B I want to valuate a company ...
3,1,Is this the perfect time for NEW companies to ...,This pandemic has demonstrated how badly thing...,1587473864,Is this the perfect time for NEW companies to ...,This pandemic has demonstrated how badly thing...,Is this the perfect time for NEW company to ta...,This pandemic ha demonstrated how badly thing ...
4,1,Is anyone else losing faith in the stock market?,"Maybe stocks will crash again, maybe stocks wi...",1587473142,Is anyone else losing faith in the stock market,Maybe stocks will crash again maybe stocks wil...,Is anyone else losing faith in the stock market,Maybe stock will crash again maybe stock will ...


In [9]:
data.isnull().sum()

subreddit        0
title            0
selftext         0
created_utc      0
pro_title       27
pro_selftext    67
lem_title       29
lem_selftext    67
dtype: int64

In [10]:
data.dropna(inplace=True)

In [11]:
data.shape

(24400, 8)

In [12]:
data.isnull().sum()

subreddit       0
title           0
selftext        0
created_utc     0
pro_title       0
pro_selftext    0
lem_title       0
lem_selftext    0
dtype: int64

In [13]:
X = data['lem_title']
y = data['subreddit']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size = 0.3)

In [15]:
print(y_train.shape)
X_train.shape

(17080,)


(17080,)

In [28]:
cvec = CountVectorizer(stop_words='english')

In [29]:
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

# Multinomal NB

In [30]:
mnb = MultinomialNB()

In [31]:
mnb.fit(X_train_cvec, y_train)
mnb.score(X_train_cvec, y_train)

0.8803864168618267

In [32]:
mnb.score(X_test_cvec, y_test)

0.8016393442622951

## Increase N Gram

In [33]:
cvec_mnb = Pipeline([
    ('cvec', CountVectorizer(stop_words=True)),
    ('mnb', MultinomialNB())
])

tvec_gnb = make_pipeline(
    TfidfVectorizer(stop_words=True),
    FunctionTransformer(lambda x: x.todense(), accept_sparse=True),
    GaussianNB()
)

In [34]:
cvec_params = {
    'cvec__max_features': np.arange(500, 5_000, 250),
      'cvec__stop_words': ['english'],
     'cvec__ngram_range': [(1,1), (1,2), (1,3)],
            'mnb__alpha': [.75, 1.0, 1.25, 1.5, 2.0]
}

tvec_params = {
    'tvec__max_features': np.arange(1_000, 8_000, 1_000),
    'tvec__stop_words': ['english'],
    'tvec__ngram_range': [(1,1), (1,2), (1,3)]
}

In [35]:
gs_cvec_mnb = GridSearchCV(cvec_mnb, cvec_params, cv=10, n_jobs=-1)

In [36]:
gs_cvec_mnb.fit(X_train,
                y_train)



GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [37]:
print(gs_cvec_mnb.score(X_train, y_train))
gs_cvec_mnb.best_params_

0.8498829039812646


{'cvec__max_features': 4500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'mnb__alpha': 1.25}

In [38]:
gs_cvec_mnb.score(X_test, y_test)

0.7952185792349726

In [39]:
cvec_lr = Pipeline([
    ('cvec', CountVectorizer(stop_words=True)),
    ('lr', LogisticRegression())
])

In [40]:
cvec_params = {
    'cvec__max_features': np.arange(500, 5_000, 250),
      'cvec__stop_words': ['english'],
     'cvec__ngram_range': [(1,1), (1,2)],
           'lr__penalty': ['l1', 'l2'],
                 'lr__C': np.linspace(.001, 2, 10),
            'lr__solver': ['liblinear']
}


In [41]:
%%time

gs_lr = GridSearchCV(cvec_lr, cvec_params, cv=10, n_jobs=-1)

gs_lr.fit(X_train, y_train)

print(gs_lr.best_params_)
gs_lr.score(X_train, y_train)



{'cvec__max_features': 4750, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'lr__C': 0.44522222222222224, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
CPU times: user 1min 18s, sys: 10.9 s, total: 1min 29s
Wall time: 4min 42s


0.8568501170960188

In [42]:
gs_lr.score(X_test, y_test)

0.7896174863387978

In [16]:
tv = TfidfVectorizer(ngram_range = (1,2), min_df = 2, 
                     stop_words='english')
X_train_tvec = tv.fit_transform(X_train)
X_test_tvec = tv.transform(X_test)

In [44]:
lr = LogisticRegression()

lr_params = {
    'lr__penalty': ['l1', 'l2'],
          'lr__C': np.linspace(.001, 2, 10),
     'lr__solver': ['liblinear']    
}

In [45]:
lr = LogisticRegression()

In [46]:
gs_lr = GridSearchCV(LogisticRegression(), 
                     param_grid = {
                    'penalty': ['l1', 'l2'],
                     'C': np.linspace(.001, 2, 10),
                     'solver': ['liblinear']},
                     cv=10, n_jobs=-1)

In [47]:
gs_lr.fit(X_train_tvec, y_train)



GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-03, 2.23111111e-01, 4.45222222e-01, 6.67333333e-01,
       8.89444444e-01, 1.11155556e+00, 1.33366667e+00, 1.55577778e+00,
       1.77788889e+00, 2.00000000e+00]),
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_s

In [48]:
gs_lr.score(X_train_tvec, y_train)

0.8842505854800937

In [49]:
gs_lr.score(X_test_tvec, y_test)

0.796311475409836

In [50]:
gs_lr.best_params_

{'C': 1.3336666666666666, 'penalty': 'l2', 'solver': 'liblinear'}

In [51]:
cvec_params = {
    'cvec__max_features': np.arange(500, 5_000, 250),
      'cvec__stop_words': ['english'],
     'cvec__ngram_range': [(1,1), (1,2), (1, 3)],
}


In [52]:
cvec_bagcl = Pipeline([
    ('cvec', CountVectorizer(stop_words=True)),
    ('bagcl', BaggingClassifier())
])

In [53]:
gs_bagcl = GridSearchCV(cvec_bagcl, cvec_params, cv=5, n_jobs=-1)
gs_bagcl.fit(X_train, y_train)
print(gs_bagcl.score(X_train, y_train))
gs_bagcl.best_params_



0.9565573770491803


{'cvec__max_features': 3750,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [54]:
print(gs_bagcl.score(X_test, y_test))

0.7487704918032787


In [55]:
cvec_params = {
    'cvec__max_features': np.arange(500, 5_000, 250),
      'cvec__stop_words': ['english'],
     'cvec__ngram_range': [(1,1), (1,2), (1,3)],
           'svc__degree': [2]
}
cvec_svc = Pipeline([
    ('cvec', CountVectorizer(stop_words=True)),
    ('svc', SVC())
])

In [56]:
gs_svm_cvec = GridSearchCV(cvec_svc, cvec_params, cv=5, n_jobs=-1)

In [57]:
gs_svm_cvec.fit(X_train, y_train)
print(gs_svm_cvec.score(X_train, y_train))
gs_svm_cvec.best_params_



0.921135831381733


{'cvec__max_features': 4750,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'svc__degree': 2}

In [59]:
gs_svm_cvec.score(X_test, y_test)

0.7897540983606557

In [61]:

gs_svm_tvec = GridSearchCV(SVC(), param_grid = {'degree': [2]},
                                            cv=5,
                                            n_jobs=-1)

In [64]:
%%time
gs_svm_tvec.fit(X_train_tvec, y_train)
gs_svm_tvec.score(X_train_tvec, y_train)

CPU times: user 32.5 s, sys: 379 ms, total: 32.9 s
Wall time: 52.3 s


0.9522248243559719

In [66]:
gs_svm_tvec.score(X_test_tvec, y_test)

0.7974043715846995

In [24]:
cvec_params = {
    'cvec__max_features': np.arange(500, 3_000, 500),
      'cvec__stop_words': ['english'],
     'cvec__ngram_range': [(1,1), (1,2), (1,3)],
     'rfc__n_estimators': [50, 100],
'rfc__min_samples_split': [3],
 'rfc__min_samples_leaf': [2]
}

In [25]:
cvec_rfc = Pipeline([
    ('cvec', CountVectorizer(stop_words=True)),
    ('rfc', RandomForestClassifier())
])

In [26]:


gs_rfc_cvec = GridSearchCV(cvec_rfc, 
                           param_grid=cvec_params,
                           cv=5,
                           n_jobs=-1)

CPU times: user 29 µs, sys: 0 ns, total: 29 µs
Wall time: 32.9 µs


In [29]:
%%time

gs_rfc_cvec.fit(X_train, y_train)
print(gs_rfc_cvec.score(X_train, y_train))
gs_rfc_cvec.best_params_




0.8371779859484777
CPU times: user 5.67 s, sys: 500 ms, total: 6.17 s
Wall time: 48.9 s


{'cvec__max_features': 2500,
 'cvec__ngram_range': (1, 3),
 'cvec__stop_words': 'english',
 'rfc__min_samples_leaf': 2,
 'rfc__min_samples_split': 3,
 'rfc__n_estimators': 50}

In [30]:
gs_rfc_cvec.score(X_test, y_test)

0.7886612021857924

In [33]:

%%time

gs_rfc_tvec = GridSearchCV(RandomForestClassifier(), 
                      param_grid={'n_estimators': [50, 100],
                      'n_jobs': [-1],
                      'min_samples_split': [3],
                      'min_samples_leaf': [2]})

gs_rfc_tvec.fit(X_train_tvec, y_train)
print(gs_rfc_tvec.score(X_train_tvec, y_train))
gs_rfc_tvec.best_params_

0.8730093676814988
CPU times: user 14.1 s, sys: 249 ms, total: 14.3 s
Wall time: 9.19 s


{'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 100,
 'n_jobs': -1}

In [34]:
gs_rfc_tvec.score(X_test_tvec, y_test)

0.7882513661202186