# 1. Imports

In [18]:
import os
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix


# 2. Load all the data in the .txt files (text obtained from https://www.presidency.ucsb.edu/)

In [2]:
import os

data_folder = '../data'  
presidents = ['teddy', 'washington', 'adams', 'jefferson', 'madison', 'lincoln', 'FDR']

data = []

# Loop through each president folder and read the text files
for president in presidents:
    president_folder = os.path.join(data_folder, president)  
    for filename in os.listdir(president_folder):
        if filename.endswith('.txt'):  
            file_path = os.path.join(president_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                data.append((text, president)) 

print(f"Total examples: {len(data)}")


Total examples: 169


# Number of files and words per president

In [3]:
file_counts = {}
word_counts = {}

for president in presidents:
    president_folder = os.path.join(data_folder, president)  
    file_counts[president] = 0
    word_counts[president] = 0

    for filename in os.listdir(president_folder):
        if filename.endswith('.txt'):
            file_counts[president] += 1

            file_path = os.path.join(president_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                word_counts[president] += len(text.split())

print("\nNumber of files per president:")
for president, count in file_counts.items():
    print(f"{president}: {count}")

print("\nNumber of words per president:")
for president, count in word_counts.items():
    print(f"{president}: {count}")



Number of files per president:
teddy: 23
washington: 29
adams: 15
jefferson: 11
madison: 29
lincoln: 43
FDR: 19

Number of words per president:
teddy: 47453
washington: 29253
adams: 15264
jefferson: 18589
madison: 65994
lincoln: 50596
FDR: 45277


# 3. Clean the text using regex

In [4]:
def clean_text(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [5]:
texts, labels = zip(*data)  
cleaned_texts = [clean_text(text) for text in texts]

In [6]:


X_train, X_test, y_train, y_test = train_test_split(cleaned_texts, labels, test_size=0.2, random_state=42)

print(f"Training data: {len(X_train)} examples")
print(f"Testing data: {len(X_test)} examples")


Training data: 135 examples
Testing data: 34 examples


# MultinomialMB

In [7]:
vectorizer = TfidfVectorizer(max_features=5000) 

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_test_tfidf shape: {X_test_tfidf.shape}")


X_train_tfidf shape: (135, 5000)
X_test_tfidf shape: (34, 5000)


In [8]:
modelMMB = MultinomialNB()

modelMMB.fit(X_train_tfidf, y_train)

y_pred = modelMMB.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         FDR       0.00      0.00      0.00         4
       adams       0.00      0.00      0.00         3
   jefferson       0.00      0.00      0.00         3
     lincoln       0.21      1.00      0.35         7
     madison       0.00      0.00      0.00         4
       teddy       1.00      0.17      0.29         6
  washington       0.00      0.00      0.00         7

    accuracy                           0.24        34
   macro avg       0.17      0.17      0.09        34
weighted avg       0.22      0.24      0.12        34



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Logisitc Regression

In [11]:
vectorizerLogisticRegression = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))  
X_tfidf2 = vectorizerLogisticRegression.fit_transform(X_train) 
y2 = y_train 

In [12]:
modelLogisticRegression = LogisticRegression(max_iter=1000)

cvLogisticRegression = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoresLogisticRegression = cross_val_score(modelLogisticRegression, X_tfidf2, y2, cv=cvLogisticRegression, scoring='accuracy')

print(f"Cross-validation scores: {scoresLogisticRegression}")
print(f"Average accuracy: {scoresLogisticRegression.mean()}")

Cross-validation scores: [0.55555556 0.59259259 0.40740741 0.62962963 0.33333333]
Average accuracy: 0.5037037037037038


# f1_macro

In [13]:
f1_scores = cross_val_score(modelLogisticRegression, X_tfidf2, y2, cv=cvLogisticRegression, scoring='f1_macro')
print(f"Cross-validation F1-macro scores: {f1_scores}")
print(f"Average F1-macro: {f1_scores.mean()}")

Cross-validation F1-macro scores: [0.37698413 0.40912183 0.17216117 0.40926899 0.15229885]
Average F1-macro: 0.3039669944296203


# Hyperparameter Tuning

In [None]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid = GridSearchCV(LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs'),
                    param_grid, cv=cvLogisticRegression, scoring='f1_macro')
grid.fit(X_tfidf2, y2)

print(f"Best params: {grid.best_params_}")
print(f"Best F1-macro: {grid.best_score_}")


# Confusion Matrix

In [17]:
y_pred = cross_val_predict(LogisticRegression(C=10), X_tfidf2, y2, cv=5)
print(confusion_matrix(y2, y_pred))
print(classification_report(y2, y_pred))

[[12  0  0  3  0  0  0]
 [ 0  2  0  6  1  0  3]
 [ 0  0  0  3  3  0  2]
 [ 0  0  0 35  0  0  1]
 [ 0  0  0  0 22  0  3]
 [ 0  0  0  3  0 13  1]
 [ 0  0  0  3  3  0 16]]
              precision    recall  f1-score   support

         FDR       1.00      0.80      0.89        15
       adams       1.00      0.17      0.29        12
   jefferson       0.00      0.00      0.00         8
     lincoln       0.66      0.97      0.79        36
     madison       0.76      0.88      0.81        25
       teddy       1.00      0.76      0.87        17
  washington       0.62      0.73      0.67        22

    accuracy                           0.74       135
   macro avg       0.72      0.62      0.62       135
weighted avg       0.74      0.74      0.70       135



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Results from confusion matrix:

### 1. 0% accuracy for Jefferson (likely because he has very little data)

### 2. 100% of the time the model predicted FDR, Adams, or TR it was correct (1.00 precision)

### 3. High F1 score for FDR, Lincoln, and TR, but low for Adams, Washington (kind of), Jefferson

### 4. Average precision: 72%

### 5. Average recall: 62%

### 6. Average F1-Score: 62%