In [1]:
import math
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier


In [2]:

df = pd.read_csv('federalist.csv')

# STEP 1
print(df.head())
authors = {}
for row in df['author']:
    if authors.get(row) != None:
        authors[row] += 1
    else:
        authors[row] = 0
print(authors)


     author                                               text
0  HAMILTON  FEDERALIST. No. 1 General Introduction For the...
1       JAY  FEDERALIST No. 2 Concerning Dangers from Forei...
2       JAY  FEDERALIST No. 3 The Same Subject Continued (C...
3       JAY  FEDERALIST No. 4 The Same Subject Continued (C...
4       JAY  FEDERALIST No. 5 The Same Subject Continued (C...
{'HAMILTON': 48, 'JAY': 4, 'MADISON': 14, 'HAMILTON AND MADISON': 2, 'HAMILTON OR MADISON': 10}


In [3]:

# STEP 2
stopwords = stopwords.words('english')

df['text_nostop'] = df['text'].apply(lambda words: ' '.join([word for word in words.split() if word not in stopwords]))
X_train, X_test, y_train, y_test = train_test_split(df['text_nostop'], df['author'], test_size=0.2, random_state=1234)
print(X_train.shape)
print(X_test.shape)


(66,)
(17,)


In [4]:

# STEP 3
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train) # fit the training data
X_test = tfidf.transform(X_test)
print(X_train.shape)
print(X_test.shape)


(66, 7991)
(17, 7991)


In [5]:

# STEP 4
bern = BernoulliNB()
bern.fit(X_train,y_train)
pred = bern.predict(X_test)
print("bernoulli: ", bern.score(X_test,y_test))


bernoulli:  0.5882352941176471


In [6]:

# STEP 5
X_train, X_test, y_train, y_test = train_test_split(df['text_nostop'], df['author'], test_size=0.2, random_state=1234)
tfidf = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.5, ngram_range=(1, 2))
X_train = tfidf.fit_transform(X_train) # fit the training data
X_test = tfidf.transform(X_test)


In [7]:

bern = BernoulliNB()
bern.fit(X_train,y_train)
pred = bern.predict(X_test)
print("bernoulli: ", bern.score(X_test,y_test)) # MUCH BETTER RESULTS!


bernoulli:  0.9411764705882353


In [8]:

# STEP 6
lr = LogisticRegression(solver='liblinear') # CHANGED A TON OF PARAMETERS, COULD GET NO CHANGE
lr.fit(X_train,y_train)
pred = lr.predict(X_test)
print("logisitic regression: ", lr.score(X_test,y_test))


logisitic regression:  0.5882352941176471


In [9]:

# STEP 7
mlp = MLPClassifier(max_iter=10000, hidden_layer_sizes=(7,9)) # about 82% accuracy
mlp.fit(X_train,y_train)
pred = mlp.predict(X_test)
print("MLP: ", mlp.score(X_test,y_test))

MLP:  0.8235294117647058
