# Now trying logistic regression  but with more data

In [10]:
import os
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix


In [11]:
import os

data_folder = '../data'  
presidents = ['teddy', 'washington', 'adams', 'jefferson', 'madison', 'lincoln', 'FDR']

data = []

# Loop through each president folder and read the text files
for president in presidents:
    president_folder = os.path.join(data_folder, president)  
    for filename in os.listdir(president_folder):
        if filename.endswith('.txt'):  
            file_path = os.path.join(president_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                data.append((text, president)) 

print(f"Total examples: {len(data)}")


Total examples: 224


In [12]:
file_counts = {}
word_counts = {}

for president in presidents:
    president_folder = os.path.join(data_folder, president)  
    file_counts[president] = 0
    word_counts[president] = 0

    for filename in os.listdir(president_folder):
        if filename.endswith('.txt'):
            file_counts[president] += 1

            file_path = os.path.join(president_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                word_counts[president] += len(text.split())

print("\nNumber of files per president:")
for president, count in file_counts.items():
    print(f"{president}: {count}")

print("\nNumber of words per president:")
for president, count in word_counts.items():
    print(f"{president}: {count}")



Number of files per president:
teddy: 23
washington: 29
adams: 16
jefferson: 56
madison: 29
lincoln: 43
FDR: 28

Number of words per president:
teddy: 47453
washington: 29253
adams: 15552
jefferson: 171449
madison: 65994
lincoln: 50596
FDR: 55354


In [13]:
def clean_text(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

In [14]:
texts, labels = zip(*data)  
cleaned_texts = [clean_text(text) for text in texts]

In [15]:


X_train, X_test, y_train, y_test = train_test_split(cleaned_texts, labels, test_size=0.2, random_state=42)

print(f"Training data: {len(X_train)} examples")
print(f"Testing data: {len(X_test)} examples")


Training data: 179 examples
Testing data: 45 examples


In [16]:
vectorizerLogisticRegression = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))  
X_tfidf2 = vectorizerLogisticRegression.fit_transform(X_train) 
y2 = y_train 

In [17]:
modelLogisticRegression = LogisticRegression(max_iter=1000)

cvLogisticRegression = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoresLogisticRegression = cross_val_score(modelLogisticRegression, X_tfidf2, y2, cv=cvLogisticRegression, scoring='accuracy')

print(f"Cross-validation scores: {scoresLogisticRegression}")
print(f"Average accuracy: {scoresLogisticRegression.mean()}")

Cross-validation scores: [0.63888889 0.52777778 0.52777778 0.47222222 0.45714286]
Average accuracy: 0.5247619047619048


In [18]:
f1_scores = cross_val_score(modelLogisticRegression, X_tfidf2, y2, cv=cvLogisticRegression, scoring='f1_macro')
print(f"Cross-validation F1-macro scores: {f1_scores}")
print(f"Average F1-macro: {f1_scores.mean()}")

Cross-validation F1-macro scores: [0.61128527 0.48163265 0.4622449  0.4223356  0.39447279]
Average F1-macro: 0.4743942415001528


In [19]:
y_pred = cross_val_predict(LogisticRegression(C=10), X_tfidf2, y2, cv=5)
print(confusion_matrix(y2, y_pred))
print(classification_report(y2, y_pred))

[[23  0  0  0  0  0  0]
 [ 1  2  4  0  1  0  5]
 [ 0  0 39  0  0  0  2]
 [ 2  0  7 23  1  0  0]
 [ 0  0  2  0 19  0  3]
 [ 3  0  0  1  0 14  1]
 [ 2  0  4  0  2  0 18]]
              precision    recall  f1-score   support

         FDR       0.74      1.00      0.85        23
       adams       1.00      0.15      0.27        13
   jefferson       0.70      0.95      0.80        41
     lincoln       0.96      0.70      0.81        33
     madison       0.83      0.79      0.81        24
       teddy       1.00      0.74      0.85        19
  washington       0.62      0.69      0.65        26

    accuracy                           0.77       179
   macro avg       0.83      0.72      0.72       179
weighted avg       0.81      0.77      0.76       179

