In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline


import warnings 
# Ignore every warning of category UserWarning from here on. The filter is
# process-wide, so it also hides UserWarnings raised inside the libraries
# imported above, not only those raised by notebook code.
warnings.simplefilter("ignore", UserWarning)

In [None]:
# Relative paths of the three text files that the loading step opens.
dem, gop, stop = 'data/dem.txt', 'data/gop.txt', 'stop_words.txt'

In [None]:
# Read each file into a list holding one whitespace-stripped entry per line.
# NOTE: `stop` enters this cell as a path string and leaves it rebound to the
# list of stop words, so the cell cannot be re-run on its own; re-run the cell
# that defines the paths first.
with open(dem) as dem_handle, open(gop) as gop_handle, open(stop) as stop_handle:
    text_dem = [line.strip() for line in dem_handle]
    text_gop = [line.strip() for line in gop_handle]
    stop = [line.strip() for line in stop_handle]

In [None]:
# Build the bag-of-words matrices for both corpora.
#
# `input` is left at its default ('content'): it describes what kind of object
# is handed to fit/transform (raw strings here), not the corpus itself.
vectorizer = CountVectorizer(stop_words=stop, max_features=1200)

# Learn ONE vocabulary from the combined corpus so that column j refers to the
# same word in both matrices. Fitting separately per corpus would give each
# matrix its own vocabulary, and stacking them row-wise would then mix
# unrelated columns.
vectorizer.fit(text_dem + text_gop)
dem_bow = vectorizer.transform(text_dem)
gop_bow = vectorizer.transform(text_gop)

In [None]:
# Display the shape of each bag-of-words matrix as a pair.
dem_bow.shape, gop_bow.shape

In [None]:
# Stack the two matrices row-wise into one feature matrix: the rows of
# `dem_bow` come first, followed by the rows of `gop_bow`.
predictor = sparse.vstack((dem_bow, gop_bow))

# Build the label vector in the same row order: 1.0 for every `dem_bow` row,
# 0.0 for every `gop_bow` row. The counts are taken from the matrices
# themselves so labels and rows stay aligned whatever the size of the files.
ones = np.ones(dem_bow.shape[0])
zeros = np.zeros(gop_bow.shape[0])
response = np.hstack((ones, zeros))

In [None]:
# Split the stacked feature matrix and its labels, holding out 25% for testing.
# The fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    predictor,
    response,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Train a Bernoulli naive Bayes classifier (default settings) on the training split.
naive_bayes = BernoulliNB()
# `model` holds whatever fit() returns for the estimator created above.
model = naive_bayes.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows and report the fraction predicted correctly.
predictions = model.predict(x_test)
test_accuracy = accuracy_score(y_test, predictions)
print(f"Prediction accuracy: {test_accuracy}")

In [None]:
# Split again, this time on the raw text lines rather than the vectorized
# matrix. NOTE: this rebinds x_train / x_test / y_train / y_test from the
# earlier split, so the earlier values are no longer available afterwards.
x_train, x_test, y_train, y_test = train_test_split(
    text_dem + text_gop,
    response,
    random_state=5,
    test_size=0.25,
)

In [41]:
# Chain vectorization and classification into one estimator, so the vocabulary
# is learned from the training text only and applied unchanged to the test text.
#
# The pipeline gets its own estimator instances, configured like the ones used
# earlier. Pipeline.fit fits its steps in place, so passing the shared
# `vectorizer` and `naive_bayes` objects would silently refit them and change
# what `model` and the earlier bag-of-words matrices refer to.
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stop, max_features=1200)),
    ('nb', BernoulliNB()),
])
pipeline_model = pipeline.fit(x_train, y_train)
y_predictions = pipeline_model.predict(x_test)

# Last expression of the cell: displays the accuracy on the held-out text.
accuracy_score(y_test, y_predictions)

0.92