## Import Libraries
---
The following libraries are needed for modeling:
- `confusion_matrix`, `plot_confusion_matrix` from sklearn.metrics
- `MultinomialNB` from sklearn.naive_bayes
- `Pipeline` from sklearn.pipeline
- `train_test_split`, `GridSearchCV` from sklearn.model_selection
- `CountVectorizer`, `TfidfVectorizer` from sklearn.feature_extraction.text

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from html import unescape

## Load Data
---
Reload the data that was cleaned in `Part 2: Data Cleaning and EDA` into a DataFrame.
The binary subreddits:
- 0 for StarTrek
- 1 for StarWars

In [161]:
# Load the cleaned posts written out by the previous notebook and preview
# the first few rows.
subreddits = pd.read_csv("../data/startrek_starwars.csv")
subreddits.head()

Unnamed: 0,subreddit,selftext
0,1,"So, ship between Rey and Ben is called reylo. ..."
1,1,"May people have critizized the Prequels, hate ..."
2,0,First contact with an alien species such as th...
3,0,[I played this game for hours.](https://youtu....
4,0,The first two episodes seemed really rushed. L...


In [162]:
# Features (X) are the post text; the target (y) is the binary subreddit label.
X, y = subreddits['selftext'], subreddits['subreddit']

In [163]:
# Class balance: normalize=True reports each label's share of the rows
# rather than its raw count.
y.value_counts(normalize=True)

0    0.524798
1    0.475202
Name: subreddit, dtype: float64

#### *Comment*:
The two classes are nearly balanced (about 52% StarTrek vs. 48% StarWars), so we do not stratify the train/test split.

In [152]:
# Hold out 33% of the posts as the test set; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [153]:
def subreddit_preprocessor(str_input):
    """Normalise one raw post into a space-joined string of Porter stems.

    Steps, in order: decode HTML entities, lower-case, blank out unwanted
    substrings, tokenize on runs of word characters, stem every token.

    Parameters
    ----------
    str_input : str
        Raw text of a single post.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # clear html entities, then lower-case
    text = unescape(str_input).lower()

    # "words" that we don't want. The text is already lower-case here, so
    # the patterns must be lower-case as well: the previous '#x200B'
    # (capital B) could never match, so any '#x200b' residue in the text
    # survived into the tokens.
    for unwanted in ('www', '#x200b'):
        text = text.replace(unwanted, ' ')

    # tokenize with regex: keep only runs of word characters
    tokens = RegexpTokenizer(r"[\w]+").tokenize(text)

    # stem words
    porter_stemmer = PorterStemmer()
    return ' '.join(porter_stemmer.stem(word) for word in tokens)
    

In [154]:
# Instantiate a CountVectorizer that uses our custom preprocessor; every
# other hyperparameter keeps its default.
# NOTE: per the scikit-learn docs, a custom preprocessor overrides the
# built-in strip_accents/lowercase stage, so lower-casing happens inside
# subreddit_preprocessor instead.
cvec = CountVectorizer(preprocessor=subreddit_preprocessor)

In [155]:
# Fit the vectorizer on the training split only, so the vocabulary is
# learned from X_train.
cvec.fit(X_train)

CountVectorizer(preprocessor=<function subreddit_preprocessor at 0x000001786D70C160>)

In [156]:
# Convert the training text into its count matrix using the fitted vocabulary.
# NOTE(review): this rebinds X_train from raw text to the transformed
# output, so re-running this cell would feed already-vectorized data back
# into transform -- consider a separate name for the transformed data.
X_train = cvec.transform(X_train)

In [157]:
# Inspect the learned vocabulary.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); switch when the
# environment is upgraded.
cvec.get_feature_names()

['00',
 '000',
 '000li',
 '000th',
 '01',
 '019loc8f0gqf',
 '02',
 '03',
 '04',
 '040f80fddac7e4106f586c12a746b4dbf023b6e2',
 '05',
 '050523',
 '0580a0a28c91e4fa77aea2b4771824b677619780',
 '06',
 '08',
 '08e553817539ef8998712b1ceca96665592f62d7',
 '09',
 '0cauqkfycahckewjggocftyjvahuaaaaahqaaaaaqcq',
 '0g',
 '0uq53v',
 '0x32jdn',
 '0xqcb0hlb9m61',
 '0y5ksep',
 '10',
 '100',
 '1000',
 '1000434208458',
 '1001',
 '100x',
 '101',
 '102',
 '1024',
 '1026328',
 '103',
 '1031',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '10min',
 '10n1',
 '10p',
 '10th',
 '10x',
 '10year',
 '11',
 '110',
 '1100',
 '11000',
 '111',
 '112',
 '113',
 '114',
 '115',
 '115338053',
 '116',
 '117',
 '1172',
 '118',
 '119',
 '11h',
 '11th',
 '12',
 '120',
 '1200',
 '12000',
 '121',
 '122',
 '1234929230',
 '1250',
 '1280',
 '12am',
 '12th',
 '13',
 '130000808',
 '1300f',
 '1309239516666892288',
 '13560',
 '1366',
 '1369332166308417536',
 '1369332524288212997',
 '1378860745677414402',
 '1380328656996868105',
 '13

In [158]:
# Sanity check on the preprocessor: collect every 'www' fragment that still
# appears in a vocabulary term. An empty list means none survived.
input_text = cvec.get_feature_names()
output_list = [
    match
    for feature in input_text
    for match in re.findall('www', feature)
]
output_list

[]