In [13]:
import os
import pandas as pd
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [14]:
list_of_dfs = []

try: 
    print('Lets try and pull data from the files in this folder')
    for file in os.listdir('../data')[1:]:
        d = pd.read_csv('../data/' + file)
        list_of_dfs.append(d)
    print('Huzzah! Mission Complete')
except:
    print("Welp, that didn't work")

df = pd.concat(list_of_dfs, ignore_index=True).drop_duplicates(subset = 'selftext')
df.shape

Lets try and pull data from the files in this folder
Huzzah! Mission Complete


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


(2135, 104)

In [15]:
df.columns

Index(['all_awardings', 'allow_live_comments', 'approved_at_utc',
       'approved_by', 'archived', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext',
       ...
       'thumbnail_width', 'title', 'total_awards_received', 'ups', 'url',
       'user_reports', 'view_count', 'visited', 'whitelist_status', 'wls'],
      dtype='object', length=104)

In [16]:
df.head(3)

Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail_width,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],False,,,False,pittman66,,,MAL,[],...,,"Wiki Overhaul Month, Week 2: Watch Order Wiki",0,67,https://www.reddit.com/r/anime/comments/c822ra...,[],,False,all_ads,6
1,[],False,,,False,AnimeMod,,,,[],...,,Recommendation Tuesdays Megathread - Week of J...,0,61,https://www.reddit.com/r/anime/comments/c823rj...,[],,False,all_ads,6
2,[],True,,,False,MinecrafterPH,,#2e51a2,MAL,[],...,,My Hero Academia Season 4 is reportedly listed...,0,5778,https://www.reddit.com/r/anime/comments/c8o432...,[],,False,all_ads,6


In [17]:
df = df[['selftext', 'subreddit']]
df.head()

Unnamed: 0,selftext,subreddit
0,"Hi everyone, welcome to the Wiki Overhaul Mont...",anime
1,\nNeed a recommendation or have one to share? ...,anime
2,According to [this post](https://i.imgur.com/b...,anime
3,"*Dumbbell Nan Kilo Moteru?*, episode 1\n\nAlte...",anime
4,###[Vote here](https://animebracket.com/vote/b...,anime


In [18]:
df['selftext'].isnull().sum()

1

I have 2152 Image/Video/Gif/Meme Subreddit posts. I'm going to drop them because I do not think they will be a good indicator of whether or not a post belongs to the anime or kdrama subreddit

In [19]:
df.dropna(inplace = True)

In [20]:
X = df['selftext']
y = df['subreddit'].map(lambda cell: 1 if cell == 'anime' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

***
# Model Comparison Using NLP, Pipelines, and GridSearch

## Baseline 

This is the baseline for my models. If a model does not at least do better than this score, then the model is not worth using.

In [21]:
y.value_counts(normalize = True).max()

0.7582005623242737

## Logistic Regression

This is a model that excels on data with a binarized target variable. This model is also widely used for its simplicity and readability

### Count Vectorizer

This is an NLP method that converts my title strings into column names and counts how many time those words appear in all of the titles I'm looking at. It's basically like `.value_counts`.

In [22]:
#Creates pipeline for CountVectorizer and Logistic Regression
pipe_cv_lr = Pipeline([('cv', CountVectorizer(stop_words = 'english', max_df = .9)),  
                ('lr', LogisticRegression(random_state = 42))])

>* stop_words are words like 'the', 'an', 'to'. I don't want to include those as columns
>* max_df means the min percentage of documents something needs to be in order to be excluded

In [23]:
#Hyperparameters for GridSearch to use to find best possible CountVectorizer, Log Regression hyperparameters

pipe_cv_lr_params = {
    'cv__max_features': [1500, 2000, 2500],   #max number of columns
    
    'cv__min_df': [3, 5],  #minimum number of docments something needs to be in in order to be included
    
    'cv__ngram_range': [(1,1), (1,2)],   #Accounts for context of up to two words i.e  'not good' vs 'not' or 'good'
    
    'lr__C': [.5, 1]  #penalty on coefficients increases as C decreases
    
}

In [24]:
#GridSearching to find best combination of hyperparameters for CountVectorizer
gs_cv_lr = GridSearchCV(pipe_cv_lr, param_grid=pipe_cv_lr_params, cv = 3)
gs_cv_lr.fit(X_train, y_train)
print(f'The CountVectorizer, Logistic Regression train score is {gs_cv_lr.best_score_}')
print(f'The CountVectorizer, Logistic Regression test score is {gs_cv_lr.score(X_test, y_test)}')
gs_cv_lr.best_params_



The CountVectorizer, Logistic Regression train score is 0.93375
The CountVectorizer, Logistic Regression test score is 0.9344569288389513


{'cv__max_features': 1500,
 'cv__min_df': 3,
 'cv__ngram_range': (1, 2),
 'lr__C': 1}

In [25]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_cv_lr.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

Unnamed: 0,Actual Negative,Actual Positive
Predicted Negative,105,24
Predicted Positive,11,394


A confusion matrix lets us know how well our model is adapting to new data. The positive or majority class is Anime and the negative or minority class is KDrama. Because I stratified my data before I split it, the inequality in the number of true positive (*bottom right*) and true negative (*top left*) values lets you know that there is an imbalance in the amount of Anime and KDrama posts in my data. 
> * I have signigicantly more true positives and negatives than false positives (*bottom left*) and false negatives (*top right*), meaning that this model does a great job of correctly assigning new posts to the correct subreddit.
* I have a good number of false negatives though, which lets me know that we are often assigning something to the KDrama class when it actually belongs to the Anime class.
* The low number of false positives means that the model rarely assigns something to the KDrama class when it is really an anime. 

The last two cases are probably due to the fact that our data is imbalanced, causing our model to be less prepared to handle new KDrama data because it needs to learn a bit more about it before it can accurately be sorted.

### *TFIDF Vectorizer*

Similar to Count Vectorizer, this NLP method converts my titles into strings. Unlike CountVectorizer, TFIDFVectorizer assigns a float score to each of the words in the title based on how often they appear in all of my documents.
> * words that appear more often in one document but rarely in the rest of them will score higher (i.e names)
* words that appear often in one document and show up in every document will score lower(i.e the)

In [12]:
#Creates pipeline for TFIDFVectorizer and Logistic Regression
pipe_tf_lr = Pipeline([('tf', TfidfVectorizer(stop_words = 'english', max_df = .9)),
                ('lr', LogisticRegression(random_state = 42))])

#Hyperparameters for GridSearch to use to find best possible tfVectorizer, Log Regression hyperparameters

pipe_tf_lr_params = {
    'tf__max_features': [3000, 3500, 4000],
    'tf__min_df': [3, 5],
    'tf__ngram_range': [(1,1), (1,2)],  
    'lr__C': [.5, 1]
}

In [13]:
#GridSearching to find best combination of hyperparameters for tfVectorizer
gs_tf_lr = GridSearchCV(pipe_tf_lr, param_grid=pipe_tf_lr_params, cv = 3)
gs_tf_lr.fit(X_train, y_train)
print(f'The TFIDFVectorizer, Logistic Regression train score is {gs_tf_lr.best_score_}')
print(f'The TFIDFVectorizer, Logistic Regression test score is {gs_tf_lr.score(X_test, y_test)}')
gs_tf_lr.best_params_



The TFIDFVectorizer, Logistic Regression train score is 0.9413439635535308
The TFIDFVectorizer, Logistic Regression test score is 0.9538855678906917


{'lr__C': 1,
 'tf__max_features': 3000,
 'tf__min_df': 5,
 'tf__ngram_range': (1, 2)}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_tf_lr.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

**It turns out that both Logistic Regression Models perform really strongly and adapt to new data really well.** The Count Vectorizer model is pulling ahead of the TFIDF model, but this could be changed by modifying the gridsearch hyperparameters. 
> Due to it's performance, I will be comparing the **Logistic Regression Count Vectorizer model (LRCV)** to the remaining models. 

***
## Multinomial Naive Bayes

This is a modeling technique that relies on Bayes Theorem to make classification. It also models with the assumption that all of our features are independent of one another, which is rarely met. Although that assumption is naive, this model performs amazingly well regardless of that fact. We will use the Multinomial Version because our column values are positive integers after CountVectorizering our data.

> This model should give us a much better train and test score than the logistic regression model.

In [14]:
#Creates pipeline for TFIDFVectorizer and Gaussian Naive Bayes
pipe_mnb = Pipeline([('cv', CountVectorizer(stop_words = 'english', max_df = .9)), 
                ('mnb', MultinomialNB())])

#Hyperparameters for GridSearch to use to find best possible tfVec, Gaussian Naive Bayes hyperparameters

pipe_mnb_params = {
    'cv__max_features': [1500, 2000, 2500],
    'cv__min_df': [3, 5],
    'cv__ngram_range': [(1,1), (1,2)]
    
}

In [15]:
#GridSearching to find best combination of hyperparameters for tfVectorizer
gs_mnb = GridSearchCV(pipe_mnb, param_grid=pipe_mnb_params, cv = 3)
gs_mnb.fit(X_train, y_train)
print(f'The CountVectorizer, Multnomial Naive Bayes train score is {gs_mnb.best_score_}')
print(f'The CountVectorizer, Multinomial Naive Bayes test score is {gs_mnb.score(X_test, y_test)}')
gs_mnb.best_params_

The CountVectorizer, Multnomial Naive Bayes train score is 0.9496013667425968
The CountVectorizer, Multinomial Naive Bayes test score is 0.9598633646456021


{'cv__max_features': 2500, 'cv__min_df': 3, 'cv__ngram_range': (1, 1)}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_mnb.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

The Count Vectorized Multinomial Bayes model performed very strongly with an extremely high train score. It also sports high adaptability to new data with the test score being higher than the train score. However, our model is less accurate and a weaker performer when adapting to new data when compared to the LRCV model. This may be because the naive assumption that's trademark to the Naive Bayes model is causing some error. It may also be because of the hyperparameters currently set. 

*As we continue, one major factor in the performance of our models will be the hyperparameters that each model is tested on. The nature of grid searching is a guess and check, so a good estimate can lead to amazing results for one model, and poor results for another.*

***
## Gaussian Naive Bayes

This is a modeling technique that relies on Bayes Theorem to make classification. It also models with the assumption that all of our features are independent of one another, which is rarely met. Although that assumption is naive, this model performs amazingly well regardless of that fact. We will use th Gaussian Version because the requirements for the alternative version are violated when we use the TFIDFVectorizor on our corpora. 

> This model should give us a much better train and test score than the logistic regression model.

In [16]:
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [17]:
#Creates pipeline for TFIDFVectorizer and Gaussian Naive Bayes
pipe_gnb = Pipeline([('tf', TfidfVectorizer(stop_words = 'english', max_df = .9)),
                ('to_dense', DenseTransformer()), 
                ('gnb', GaussianNB())])

#Hyperparameters for GridSearch to use to find best possible tfVec, Gaussian Naive Bayes hyperparameters

pipe_gnb_params = {
    'tf__max_features': [3500, 4000, 4500],
    'tf__min_df': [3, 5],
    'tf__ngram_range': [(1,1), (1,2)]
    
}

In [18]:
#GridSearching to find best combination of hyperparameters for tfVectorizer
gs_gnb = GridSearchCV(pipe_gnb, param_grid=pipe_gnb_params, cv = 3)
gs_gnb.fit(X_train, y_train)
print(f'The TFIDFVectorizer, Gaussian Naive Bayes train score is {gs_gnb.best_score_}')
print(f'The TFIDFVectorizer, Gaussian Naive Bayes test score is {gs_gnb.score(X_test, y_test)}')
gs_gnb.best_params_

The TFIDFVectorizer, Gaussian Naive Bayes train score is 0.941628701594533
The TFIDFVectorizer, Gaussian Naive Bayes test score is 0.9350982066609735


{'tf__max_features': 4000, 'tf__min_df': 5, 'tf__ngram_range': (1, 2)}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_gnb.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

As we can see, the Gaussian Naive Bayes Model did very well, but doesn't match up to the LRCV. Although our data is slightly overfit, we can see that this is still a very strong model that adapts to new data with high accuracy. 

***
## Decision Tree

Decision Trees are very prone to overfitting, so we implemented some hyperparamters when fitting to alleviate that. Decision trees work by attempting to reduce a Gini score to 0. A gini score is basically how pure a group of sample data is. A model with a gini score of 0 has final samples that consist of the same thing (i.e. all red birds). Decision trees want to get 1 sample per final sample group, which is what leads to overfitting. 

### *Count Vectorizer*

In [19]:
pipe_dt_cv = Pipeline([('cv', CountVectorizer(stop_words = 'english', max_df = .9)),
                ('dt', DecisionTreeClassifier(max_depth = 5, random_state = 42))])

#max_depth is the number of levels/questions asked by the decision tree

pipe_dt_cv_params = {
    'cv__max_features': [1500, 2000, 2500],
    'cv__min_df': [3, 5],
    'cv__ngram_range': [(1,1), (1,2)],
    'dt__min_samples_leaf': [3, 5],   #Minimum number of samples required before splitting a group/node
    'dt__min_samples_split': [7, 10]   #Minimum number of samples required to be in a group/node
}

In [20]:
#GridSearching to find best combination of hyperparameters for cvVectorizer
gs_dt_cv = GridSearchCV(pipe_dt_cv, param_grid=pipe_dt_cv_params, cv = 3)
gs_dt_cv.fit(X_train, y_train)
print(f'The CountVectorizer, Decision Tree train score is {gs_dt_cv.best_score_}')
print(f'The CountVectorizer, Decision Tree test score is {gs_dt_cv.score(X_test, y_test)}')
gs_dt_cv.best_params_

The CountVectorizer, Decision Tree train score is 0.9068906605922551
The CountVectorizer, Decision Tree test score is 0.9222886421861657


{'cv__max_features': 2500,
 'cv__min_df': 5,
 'cv__ngram_range': (1, 1),
 'dt__min_samples_leaf': 3,
 'dt__min_samples_split': 7}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_dt_cv.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

### *TFIDF Vectorizer*

In [21]:
pipe_dt_tf = Pipeline([('tf', TfidfVectorizer(stop_words = 'english', max_df = .9)),
                ('dt', DecisionTreeClassifier(max_depth = 5, random_state = 42))])

#max_depth is the number of levels/questions asked by the decision tree

pipe_dt_tf_params = {
    'tf__max_features': [3500, 4000, 4500],
    'tf__min_df': [3, 5],
    'tf__ngram_range': [(1,1), (1,2)],
    'dt__min_samples_leaf': [3, 5],   #Minimum number of samples required before splitting a group/node
    'dt__min_samples_split': [7, 10]   #Minimum number of samples required to be in a group/node
}

In [23]:
#GridSearching to find best combination of hyperparameters for tfVectorizer
gs_dt_tf = GridSearchCV(pipe_dt_tf, param_grid=pipe_dt_tf_params, cv = 3)
gs_dt_tf.fit(X_train, y_train)
print(f'The TFIDFVectorizer, Decision Tree train score is {gs_dt_tf.best_score_}')
print(f'The TFIDFVectorizer, Decision Tree test score is {gs_dt_tf.score(X_test, y_test)}')
gs_dt_tf.best_params_

The TFIDFVectorizer, Decision Tree train score is 0.9094533029612756
The TFIDFVectorizer, Decision Tree test score is 0.9231426131511529


{'dt__min_samples_leaf': 3,
 'dt__min_samples_split': 7,
 'tf__max_features': 4500,
 'tf__min_df': 5,
 'tf__ngram_range': (1, 1)}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_dt_tf.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

Here we can see that the parameters I have inserted to lower the error due to variance have worked considerably well. In fact, these decision tree models adapt extremely well to new data. This shows the strengths of grid searching and hyperparameter tuning. The right combination can make an ok model a great one, but it is still a far cry from the LRCV model. This may be due to the imbalance in my data.

*** 
## Random Forest 

Random Forests are aggregated decision trees that use a concept called bagging to make them very accurate. Bagging is basically the central limit theorem; we split a population (our data) into multiple samples and run decision tree models over each sample. Then we take the average of each model. This is an ensemble model, and these models typically have the highest accuracy scores for three reasons. 
> * First, ensemble models take the average of multiple models, which usually results in canceling out the error in each of the individuals models. 
* Second, taking the average scores of multiple models tends to result in reaching scores one model may not have been able to reach alone. This means we may be able to hone in on a global best. 
* Finally, one model will most likely be not perfect since they all have their shortcomings. Aggregating the results of multiple models could create a model that exceeds the limitations of all the components combined. 

### *Count Vectorizer*

In [24]:
pipe_rf_cv = Pipeline([('cv', CountVectorizer(stop_words = 'english', max_df = .9)),
                ('rf', RandomForestClassifier(max_depth = 5, random_state = 42))])

pipe_rf_cv_params = {
    'cv__max_features': [2500, 3500, 4000, 4500],
    'cv__min_df': [3, 5],
    'cv__ngram_range': [(1,1), (1,2)],
    'rf__n_estimators': [50 ,75],
    'rf__max_depth': [5, 6]
}

In [25]:
#GridSearching to find best combination of hyperparameters for tfVectorizer
gs_rf_cv = GridSearchCV(pipe_rf_cv, param_grid=pipe_rf_cv_params, cv = 3)
gs_rf_cv.fit(X_train, y_train)
print(f'The CountVectorizer, Random Forest train score is {gs_rf_cv.best_score_}')
print(f'The CountVectorizer, Random Forest test score is {gs_rf_cv.score(X_test, y_test)}')
gs_rf_cv.best_params_

The CountVectorizer, Random Forest train score is 0.8345671981776766
The CountVectorizer, Random Forest test score is 0.8317677198975235


{'cv__max_features': 2500,
 'cv__min_df': 5,
 'cv__ngram_range': (1, 2),
 'rf__max_depth': 6,
 'rf__n_estimators': 50}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_rf_cv.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

### *TFIDF Vectorizer*

In [26]:
pipe_rf_tf = Pipeline([('tf', TfidfVectorizer(stop_words = 'english', max_df = .9)),
                ('rf', RandomForestClassifier(max_depth = 5, random_state = 42))])

pipe_rf_tf_params = {
    'tf__max_features': [2500, 3500, 4000, 4500],
    'tf__min_df': [3, 5],
    'tf__ngram_range': [(1,1), (1,2)],
    'rf__n_estimators': [50 ,75],
    'rf__max_depth': [5, 6]
}

In [27]:
#GridSearching to find best combination of hyperparameters for tfVectorizer
gs_rf_tf = GridSearchCV(pipe_rf_tf, param_grid=pipe_rf_tf_params, cv = 3)
gs_rf_tf.fit(X_train, y_train)
print(f'The TFIDFVectorizer, Random Forest train score is {gs_rf_tf.best_score_}')
print(f'The TFIDFVectorizer, Random Forest test score is {gs_rf_tf.score(X_test, y_test)}')
gs_rf_tf.best_params_

The TFIDFVectorizer, Random Forest train score is 0.8354214123006833
The TFIDFVectorizer, Random Forest test score is 0.8317677198975235


{'rf__max_depth': 6,
 'rf__n_estimators': 50,
 'tf__max_features': 2500,
 'tf__min_df': 5,
 'tf__ngram_range': (1, 2)}

In [None]:
conf_matrix = pd.DataFrame(confusion_matrix(y_true=y_test,   #Actuals
                                            y_pred=gs_rt_tf.predict(X_test)),   #Generate Predictions
             columns=['Actual Negative', 'Actual Positive'],
             index=['Predicted Negative','Predicted Positive'])

conf_matrix

Shockingly, the random forest model actually yielded the worst results. Ensemble models are typically the most accurate of all the models and adapts to new data remarkably well. Since this model is built using an aggregation of decision trees, it should have at least performed better than them. I'm going to attribute this error to the tradeoff with gridsearching. I could pass in all the available hyperparameters for random forests, but then it may take a whole day for my model to run. Grid searching is also a guess and check method, and I may have picked a poor range of possible hyperparameters. That is definitely something I would want to change in the future. 

***
# Model Comparison

In [28]:
#Accuracy score
print(f'Baseline model accuracy score is {y.value_counts(normalize = True).max()}.')
print(f'The CountVectorizer, Logistic Regression train score is {gs_cv_lr.best_score_}')
print(f'The TFIDFVectorizer, Logistic Regression train score is {gs_tf_lr.best_score_}')
print(f'The CountVectorizer, Multnomial Naive Bayes train score is {gs_mnb.best_score_}')
print(f'The TFIDFVectorizer, Gaussian Naive Bayes train score is {gs_gnb.best_score_}')
print(f'The CountVectorizer, Decision Tree train score is {gs_dt_cv.best_score_}')
print(f'The TFIDFVectorizer, Decision Tree train score is {gs_dt_tf.best_score_}')
print(f'The CountVectorizer, Random Forest train score is {gs_rf_cv.best_score_}')
print(f'The TFIDFVectorizer, Random Forest train score is {gs_rf_tf.best_score_}')

Baseline model accuracy score is 0.7157804825966261.
The CountVectorizer, Logistic Regression train score is 0.9689635535307517
The TFIDFVectorizer, Logistic Regression train score is 0.9413439635535308
The CountVectorizer, Multnomial Naive Bayes train score is 0.9496013667425968
The TFIDFVectorizer, Gaussian Naive Bayes train score is 0.941628701594533
The CountVectorizer, Decision Tree train score is 0.9068906605922551
The TFIDFVectorizer, Decision Tree train score is 0.9094533029612756
The CountVectorizer, Random Forest train score is 0.8345671981776766
The TFIDFVectorizer, Random Forest train score is 0.8354214123006833


# Summary

Although all of our models performed better than the baseline, I would go with the CountVectorized Logisitic Regression model to sort between the Anime and KDrama subreddit. It sported the best train and test scores while not being overfit. In the future, I could definitely increase the accuracy of my models by tweaking the hyperparameters each model gridsearches over. Our worst performing models were decision trees and random forests, which is probably due to our dataset imbalance. Our strongest performing models were the Logisitic Regression models, which is without a doubt due to the fact that we have an easily binarized target variable. The Naive Bayes models also performed really strongly and would be solid alternatives to the Logistic Regression models. 