# 1. Imports

In [29]:
import os
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score


# 2. Load all the data in the .txt files (text obtained from https://www.presidency.ucsb.edu/)

In [21]:
import os

data_folder = '../data'  
presidents = ['teddy', 'washington', 'adams', 'jefferson', 'madison', 'lincoln', 'FDR']

data = []

# Loop through each president folder and read the text files
for president in presidents:
    president_folder = os.path.join(data_folder, president)  
    for filename in os.listdir(president_folder):
        if filename.endswith('.txt'):  
            file_path = os.path.join(president_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                data.append((text, president)) 

print(f"Total examples: {len(data)}")


Total examples: 74


# 3. Clean the text using regex

In [22]:
def clean_text(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [23]:
texts, labels = zip(*data)  
cleaned_texts = [clean_text(text) for text in texts]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(cleaned_texts, labels, test_size=0.2, random_state=42)

print(f"Training data: {len(X_train)} examples")
print(f"Testing data: {len(X_test)} examples")


Training data: 59 examples
Testing data: 15 examples


In [25]:
vectorizer = TfidfVectorizer(max_features=5000) 

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_test_tfidf shape: {X_test_tfidf.shape}")


X_train_tfidf shape: (59, 5000)
X_test_tfidf shape: (15, 5000)


In [27]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))  
X_tfidf = vectorizer.fit_transform(texts) 
y = labels  

In [30]:
model = LogisticRegression(max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')

print(f"Cross-validation scores: {scores}")
print(f"Average accuracy: {scores.mean()}")

Cross-validation scores: [0.33333333 0.33333333 0.4        0.33333333 0.21428571]
Average accuracy: 0.32285714285714284


In [26]:
model = MultinomialNB()

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         FDR       0.27      1.00      0.42         4
   jefferson       0.00      0.00      0.00         2
     madison       0.00      0.00      0.00         2
       teddy       0.00      0.00      0.00         3
  washington       0.00      0.00      0.00         4

    accuracy                           0.27        15
   macro avg       0.05      0.20      0.08        15
weighted avg       0.07      0.27      0.11        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
