# Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Train/Test Split

In [2]:
df = pd.read_csv('data/posts_2.csv')
df

Unnamed: 0,subreddit,selftext,title
0,PS5,,"Frogwares: Sherlock Holmes The Awakened, a ful..."
1,PS5,,Kao the Kangaroo - Summer Drip Free DLC
2,PS5,,can't decide which game to get in the sale?
3,PS5,,New beta software!
4,PS5,,Beta PS5 software update supports 1440p HDMI v...
...,...,...,...
7489,XboxSeriesX,So my partner has bought me a Series X for my ...,My Series X is here 1 week before my birthday
7490,XboxSeriesX,I bought this new console and playing Halo dur...,Is the fact that the series x Is warm enough t...
7491,XboxSeriesX,Hello sorry for my English I use google transl...,samsung tv setting
7492,XboxSeriesX,Does anyone have a recommended video guide of ...,video recommendation


In [3]:
df = df[['subreddit', 'selftext']]
df

Unnamed: 0,subreddit,selftext
0,PS5,
1,PS5,
2,PS5,
3,PS5,
4,PS5,
...,...,...
7489,XboxSeriesX,So my partner has bought me a Series X for my ...
7490,XboxSeriesX,I bought this new console and playing Halo dur...
7491,XboxSeriesX,Hello sorry for my English I use google transl...
7492,XboxSeriesX,Does anyone have a recommended video guide of ...


In [4]:
df['subreddit'] = df['subreddit'].map({'PS5': 1, 'XboxSeriesX': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subreddit'] = df['subreddit'].map({'PS5': 1, 'XboxSeriesX': 0})


In [5]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [6]:
df

Unnamed: 0,subreddit,selftext
7,1,So yeah pretty sad situation. Fan stopped work...
25,1,I just got my ps5 and I'm trying to sign in on...
28,1,I just started installing PS4 disc version on ...
37,1,I saw that there's a PS4 and also a native PS5...
57,1,"Hey, Sony - wanna make it so I can make my bac..."
...,...,...
7489,0,So my partner has bought me a Series X for my ...
7490,0,I bought this new console and playing Halo dur...
7491,0,Hello sorry for my English I use google transl...
7492,0,Does anyone have a recommended video guide of ...


In [7]:
X = df['selftext']
y = df['subreddit']

In [8]:
y.value_counts(normalize=True)

0    0.531177
1    0.468823
Name: subreddit, dtype: float64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

## Count Vectorizer

In [10]:
cvec = CountVectorizer(max_features=7000, stop_words='english', min_df=2, max_df=0.7, ngram_range=(1,2))

In [11]:
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [12]:
pd.DataFrame(X_train_cvec.toarray(), columns=cvec.get_feature_names())



Unnamed: 0,000,000 000,000 lg,000 nits,000 option,0017,01,02,03,04,...,youtuber,yzi_75tboxrkbrnofoxym,yzi_75tboxrkbrnofoxym edit,z_oyx5gj4o,z_oyx5gj4o yzi_75tboxrkbrnofoxym,zelda,zero,zero dawn,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1596,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
print(cvec.get_feature_names())





In [13]:
ss = StandardScaler()
X_train_cvec = pd.DataFrame(X_train_cvec.toarray(), columns=cvec.get_feature_names())
X_test_cvec = pd.DataFrame(X_test_cvec.toarray(), columns=cvec.get_feature_names())

X_train_cvec = ss.fit_transform(X_train_cvec)
X_test_cvec = ss.transform(X_test_cvec)



## Logistic Regression

In [149]:
from sklearn.linear_model import LogisticRegression

In [150]:
logreg_cvec = LogisticRegression(max_iter=1000) 

In [151]:
lr.fit(X_train_cvec, y_train)

LogisticRegression(max_iter=1000)

In [154]:
lr.score(X_train_cvec, y_train)

0.9962476547842402

In [155]:
lr.score(X_test_cvec, y_test)

0.6573033707865169

#### Model tuning

In [129]:
pipe = Pipeline([('cvec', CountVectorizer()),('logreg', LogisticRegression())])

In [130]:
pipe_params = {
    'cvec__max_features' : [6000, 7000, 8000],
    'cvec__min_df' : [1, 2],
    'cvec__max_df' : [0.7, 0.8],
    'cvec__ngram_range' : [(1,1), (1,2)],
    'logreg__max_iter' : [500, 1000]
}

In [131]:
gs = GridSearchCV(pipe, param_grid=pipe_params, cv = 5)

In [132]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.7, 0.8],
                         'cvec__max_features': [6000, 7000, 8000],
                         'cvec__min_df': [1, 2],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'logreg__max_iter': [500, 1000]})

In [133]:
gs.best_score_

0.8161363636363637

In [134]:
gs.best_params_

{'cvec__max_df': 0.7,
 'cvec__max_features': 8000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'logreg__max_iter': 500}

## Random Forest

In [20]:
rf = RandomForestClassifier()

In [25]:
cross_val_score(rf, X_train_tvec, y_train, cv = 5).mean()

0.8323922413793102

In [156]:
rf_params = {
    'n_estimators': [50, 75, 100],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [3, 5, 8]
}

In [158]:
rf_gs = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=rf_params, cv = 5, verbose=1, n_jobs=-1)
rf_gs.fit(X_train_cvec, y_train)
print(rf_gs.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
0.8305211598746082


In [159]:
rf_gs.best_params_

{'max_depth': 20,
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 100}

In [170]:
best_rf = rf_gs.best_estimator_

In [171]:
best_rf.fit(X_train_cvec, y_train)

RandomForestClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=10,
                       random_state=42)

In [172]:
best_rf.score(X_train_cvec, y_train)

0.866166353971232

In [174]:
best_rf.score(X_test_cvec, y_test)

0.8370786516853933

## KNN

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
# pd.DataFrame(X_train_cvec.toarray(), columns=cvec.get_feature_names())

In [39]:
X_train_cvec = pd.DataFrame(X_train_cvec.toarray(), columns=cvec.get_feature_names())
X_test_cvec = pd.DataFrame(X_test_cvec.toarray(), columns=cvec.get_feature_names())




In [41]:
knn = KNeighborsClassifier()
cross_val_score(knn, X_train_cvec, y_train, cv = 5).mean()

0.5465869905956113

In [160]:
knn_params = {
    'n_neighbors' : [3, 5, 10],
    'weights': ['uniform', 'distance']
}

In [161]:
knn_gs = GridSearchCV(KNeighborsClassifier(), param_grid=knn_params, cv=5, verbose =1, n_jobs=-1)
knn_gs.fit(X_train_cvec, y_train)
print(knn_gs.best_score_)
print(knn_gs.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
0.5515967868338558
{'n_neighbors': 3, 'weights': 'distance'}


In [180]:
best_knn = knn_gs.best_estimator_
best_knn.fit(X_train_cvec, y_train)
print(best_knn.score(X_train_cvec, y_train))
print(best_knn.score(X_test_cvec, y_test))

0.9993746091307066
0.5561797752808989


## SVM

In [14]:
from sklearn.svm import SVC

In [27]:
svc = SVC(kernel='poly', degree=3)
cross_val_score(svc, X_train_cvec, y_train, cv =6).mean()

0.5384669539016079

In [16]:
svm_params = {
    'kernel' : ['rbf', 'linear', 'poly'],
    'C': [1, 3, 5],
    'degree': [2, 3]
}

In [18]:
svm_gs = GridSearchCV(SVC(), param_grid=svm_params, cv=5, verbose =1, n_jobs=-1)
svm_gs.fit(X_train_cvec, y_train)
print(svm_gs.best_score_)
print(svm_gs.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

In [175]:
best_svm = svm_gs.best_estimator_

In [176]:
best_svm.fit(X_train_cvec, y_train)
print(best_svm.score(X_train_cvec, y_train))
print(best_svm.score(X_test_cvec, y_test))

0.6722846441947565

In [177]:
print(best_svm.score(X_train_cvec, y_train))
print(best_svm.score(X_test_cvec, y_test))

0.991869918699187
0.6722846441947565


# TF-IDF Vectorizer

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tvec = TfidfVectorizer(max_features=6000, stop_words='english', min_df=2, max_df=0.98, ngram_range=(1,2))
X_train_tvec = tvec.fit_transform(X_train)

X_test_tvec = tvec.transform(X_test)

In [24]:
cross_val_score(lr, X_train_tvec, y_train, cv = 5).mean()

NameError: name 'lr' is not defined

In [89]:
logreg_tvec = LogisticRegression()

In [90]:
logreg_tvec.fit(X_train_tvec, y_train)

LogisticRegression()

In [91]:
logreg_tvec.score(X_train_tvec, y_train)

0.9631019387116948

In [93]:
logreg_tvec.score(X_test_tvec, y_test)

0.8539325842696629

In [164]:
tf = Pipeline([('tf', TfidfVectorizer()),('logreg', LogisticRegression())])

In [165]:
tf_params = {
    'tf__max_features' : [6000, 7000, 8000],
    'tf__min_df' : [1, 2],
    'tf__max_df' : [0.7, 0.8],
    'tf__ngram_range' : [(1,1), (1,2)],
    'logreg__max_iter' : [500, 1000]
}

In [166]:
tf_gs = GridSearchCV(pipe, param_grid=pipe_params, cv = 5)
tf_gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.7, 0.8],
                         'cvec__max_features': [6000, 7000, 8000],
                         'cvec__min_df': [1, 2],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'logreg__max_iter': [500, 1000]})

In [168]:
tf_gs.best_score_

0.8161363636363637

In [169]:
tf_gs.best_params_

{'cvec__max_df': 0.7,
 'cvec__max_features': 8000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'logreg__max_iter': 500}