In [1]:
#standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
   
#Sklearn imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

pd.set_option('display.max_colwidth' ,999)

In [2]:
preg_parent_df = pd.read_csv('./data/preg_parent_downsample.csv')
preg_parent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  50000 non-null  int64  
 1   lems        50000 non-null  object 
 2   selftext    50000 non-null  object 
 3   title       50000 non-null  object 
 4   title_lems  50000 non-null  object 
 5   author      50000 non-null  object 
 6   subreddit   50000 non-null  int64  
 7   all_lems    50000 non-null  object 
 8   word_count  50000 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 3.4+ MB


In [3]:
preg_parent_df.drop(columns='Unnamed: 0', inplace= True)

In [6]:
#Train Test Split
X = preg_parent_df['all_lems']
y = preg_parent_df['subreddit']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .33, random_state= 37, stratify= y)

In [8]:
cvec = CountVectorizer(max_features= 5_000, ngram_range= (1,2))

In [9]:
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train).todense(), 
                           columns= cvec.get_feature_names())

X_test_cvec = pd.DataFrame(cvec.fit_transform(X_test).todense(),
                          columns= cvec.get_feature_names())

In [11]:
#instantiate models 
rf = RandomForestClassifier(n_estimators= 100, random_state= 37)
et = ExtraTreesClassifier(n_estimators= 100, random_state= 37)

#### Random Forests with Basic Parameters 

In [12]:
%time
rf.fit(X_train_cvec, y_train)

Wall time: 0 ns


RandomForestClassifier(random_state=37)

In [13]:
rf.score(X_train_cvec, y_train)

0.999910447761194

In [14]:
rf.score(X_test_cvec, y_test)

0.6308484848484849

Random Forest Interpretation:
* This is an incredibly overfit model! 
* So much so that I think I need to revisit it again with different parameters to check whether it was so over fit due to the model or just bad parameters

#### Extra Trees with basic parameters

In [15]:
#extra trees any better?
et.fit(X_train_cvec, y_train)

ExtraTreesClassifier(random_state=37)

In [16]:
print(round(et.score(X_train_cvec, y_train), 4))
print(round(et.score(X_test_cvec, y_test), 4))

0.9999
0.6238


Extra Trees Interpretation:
* This is also an incredibly overfit model! 
* Again is it just that a decision tree classifier is not very good on this data or am I not using adequate parameters?

### Grid Search Attempt

In [37]:
#rf_params = {
 #   'n_estimators' : [100, 200, 300],
    'max_depth'    : [1, 2, 3, 4, 5],
#}
#gs = GridSearchCV(rf, param_grid= rf_params, cv= 5)

In [38]:
#gs.fit(X_train_cvec, y_train)

Wall time: 0 ns


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=37),
             param_grid={'max_depth': [1, 2, 3, 4, 5],
                         'n_estimators': [100, 200, 300]})

In [39]:
#gs.best_score_

0.9050666666666667

In [40]:
#gs.score(X_train_cvec, y_train)

0.90856

In [41]:
#gs.score(X_test_cvec, y_test)

0.50888

In [42]:
#gs.best_params_

{'max_depth': 5, 'n_estimators': 200}

Apparently the basic parameters were better than the parameters for this grid search

If I were to further attempt to improve the random forests mondel further looking at how to determine the leaf split would be the place to start.