In [5]:
# read in csv file
import pandas as pd
federalist = pd.read_csv('federalist.csv')

# convert the author column to categorical data
# (the converted Series is written back to the frame; building it without
# assigning it leaves federalist.author as plain strings)
s = pd.Series(federalist.author, dtype="category")
federalist['author'] = s

# display the first few rows and counts by author
# size() counts the rows in each group directly; observed=True restricts the
# result to categories that actually occur in the data
gb = federalist.groupby('author', observed=True).size()
print(gb)
federalist.head()

author
HAMILTON                49
HAMILTON AND MADISON     3
HAMILTON OR MADISON     11
JAY                      5
MADISON                 15
dtype: int64


Unnamed: 0,author,text
0,HAMILTON,FEDERALIST. No. 1 General Introduction For the...
1,JAY,FEDERALIST No. 2 Concerning Dangers from Forei...
2,JAY,FEDERALIST No. 3 The Same Subject Continued (C...
3,JAY,FEDERALIST No. 4 The Same Subject Continued (C...
4,JAY,FEDERALIST No. 5 The Same Subject Continued (C...


In [6]:
# divide into train and test
from sklearn.model_selection import train_test_split

# features are the essay texts, the target is the author label
X = federalist['text']
y = federalist['author']

# 80/20 split; the fixed random_state keeps the partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1234,
)

# display the shape of train and test
print(X_train.shape, X_test.shape, sep='\n')

(66,)
(17,)


In [7]:
import nltk
nltk.download('stopwords')
# import the corpus reader under an alias so that the name `stopwords` below
# refers only to the set of English stop words
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# process the text by removing stop words and performing tf-idf vectorization
stopwords = set(nltk_stopwords.words('english'))
# stop_words must be 'english', a list, or None; recent scikit-learn releases
# validate the parameter type and reject a set, so pass a list
vectorizer = TfidfVectorizer(stop_words=list(stopwords))
# fit the vocabulary on the training texts only, then reuse it for the test texts
Xtrain = vectorizer.fit_transform(X_train)
Xtest = vectorizer.transform(X_test)

# output the training set shape and the test set shape
print(Xtrain.shape)
print(Xtest.shape)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


(66, 7876)
(17, 7876)


In [8]:
# Bernoulli Naïve Bayes model
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# fit() returns the estimator itself, so construction and training are chained
naive_bayes = BernoulliNB().fit(Xtrain, y_train)

# get accuracy score on the test set
pred = naive_bayes.predict(Xtest)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

0.5882352941176471


The accuracy on the test set with the Naïve Bayes model is 0.5882352941176471

In [9]:
# redo the vectorization with max_features and bigrams
# stop_words must be 'english', a list, or None; recent scikit-learn releases
# validate the parameter type and reject a set, so pass a list
vectorizer2 = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, ngram_range=(1, 2))
Xtrain2 = vectorizer2.fit_transform(X_train)
Xtest2 = vectorizer2.transform(X_test)

# try Naïve Bayes again on the new train/test vectors
naive_bayes2 = BernoulliNB()
naive_bayes2.fit(Xtrain2, y_train)
pred = naive_bayes2.predict(Xtest2)
print(accuracy_score(y_test, pred))

0.9411764705882353


Comparing the accuracy_score from the two implementations of the Bernoulli Naïve Bayes, the second implementation with max_features set to 1000 and bigrams as a feature performed better with an accuracy score of 0.9411764705882353, which is far greater than the previous accuracy score of 0.5882352941176471.

In [28]:
# logistic regression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

# stop_words must be 'english', a list, or None; recent scikit-learn releases
# validate the parameter type and reject a set, so pass a list
vectorizer3 = TfidfVectorizer(stop_words=list(stopwords))
Xtrain3 = vectorizer3.fit_transform(X_train)
Xtest3 = vectorizer3.transform(X_test)

# baseline: class_weight='balanced' because the author classes are very
# unevenly represented (49 HAMILTON essays vs. 3 HAMILTON AND MADISON)
classifier = LogisticRegression(solver='lbfgs', class_weight='balanced')
classifier.fit(Xtrain3, y_train)
pred = classifier.predict(Xtest3)
print('accuracy score: ', accuracy_score(y_test, pred))

# adjust at least one parameter
# same model on the same vectors, but without an intercept term
classifier = LogisticRegression(class_weight='balanced',
                                    fit_intercept=False, 
                                    solver='lbfgs')
classifier.fit(Xtrain3, y_train)
pred = classifier.predict(Xtest3)
print('accuracy score with fit_intercept=False: ', accuracy_score(y_test, pred))

accuracy score:  0.7058823529411765
accuracy score with fit_intecept=False:  0.8823529411764706


By changing the parameter fit_intercept from its default value of True to False, the accuracy score on the test set increased from 0.7058823529411765 to 0.8823529411764706.

In [47]:
# neural network

# text preprocessing
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss


# stop_words must be 'english', a list, or None; recent scikit-learn releases
# validate the parameter type and reject a set, so pass a list
vectorizer4 = TfidfVectorizer(stop_words=list(stopwords))
Xtrain4 = vectorizer4.fit_transform(X_train)
Xtest4 = vectorizer4.transform(X_test)


def fit_and_score_mlp(solver, hidden_layer_sizes):
    """Train one MLPClassifier on Xtrain4/y_train and print its test accuracy.

    Every run shares alpha=1e-5 and random_state=1234; only the solver and the
    topology vary, so the printed scores are directly comparable.

    Parameters
    ----------
    solver : str
        Passed to MLPClassifier as ``solver``.
    hidden_layer_sizes : tuple of int
        Passed to MLPClassifier as ``hidden_layer_sizes``.

    Returns
    -------
    tuple
        ``(model, predictions)``: the fitted classifier and its predictions
        for Xtest4.
    """
    model = MLPClassifier(solver=solver, alpha=1e-5,
                          hidden_layer_sizes=hidden_layer_sizes, random_state=1234)
    model.fit(Xtrain4, y_train)
    predictions = model.predict(Xtest4)
    # the configuration is appended to each line so the scores can be told
    # apart; with identical labels the results are easy to misattribute
    print('accuracy score: ', accuracy_score(y_test, predictions),
          '(solver={}, hidden_layer_sizes={})'.format(solver, hidden_layer_sizes))
    return model, predictions


# same five configurations, in the same order, bound to the same names as before
classifier, pred = fit_and_score_mlp('lbfgs', (15, 7))
classifier2, pred2 = fit_and_score_mlp('lbfgs', (150, 100))
classifier3, pred3 = fit_and_score_mlp('adam', (15, 7))
classifier4, pred4 = fit_and_score_mlp('adam', (150, 100))
classifier5, pred5 = fit_and_score_mlp('adam', (20, 1000))

accuracy score:  0.7058823529411765
accuracy score:  0.7647058823529411




accuracy score:  0.6470588235294118
accuracy score:  0.7058823529411765
accuracy score:  0.7058823529411765


After trying several topologies, the one with the highest accuracy was an MLPClassifier using the lbfgs solver with (150, 100) hidden layer sizes. The highest accuracy was 0.7647058823529411; none of the three adam configurations exceeded 0.7058823529411765.