# Machine Learning

Our  Model

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

In [2]:
head_df = pd.read_csv("state_headlines_for_model.csv")

In [3]:
head_df["headline_raw"] = head_df["headline.main"].str.lower().replace(
    "[^A-Za-z\s]", "").str.replace(";", "").replace(",", "")

head_df["head_split"] = head_df["headline_raw"].str.split(",").str.join("")

In [4]:
vec = TfidfVectorizer(norm=False)
vec.fit(head_df["head_split"])
X_train = vec.transform(head_df["head_split"])
y_train = head_df["state"]

scaler = Normalizer()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

model = KNeighborsClassifier(metric="euclidean")
model.fit(X_train_scaled, y_train)

pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])

In [5]:
# calculates estimate of test error based on 5-fold cross validation
def get_cv_error(k):
    model = KNeighborsClassifier(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    f1 = np.mean(-cross_val_score(
        pipeline, head_df["head_split"], y_train == "Colorado", 
        cv=10, scoring="f1"
    ))
    return f1
    
ks = pd.Series(range(1, 25))
ks.index = range(1, 25)
test_errs = ks.apply(get_cv_error)
best_k = test_errs.sort_values(ascending=False).idxmax()
print("Optimal K: ", best_k)

Optimal K:  2


In [6]:
test_errs.plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1b55fbba90>

In [7]:
#update model to get optimal K

model = KNeighborsClassifier(n_neighbors=best_k, metric="euclidean")
model.fit(X_train_scaled, y_train)

pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])

In [8]:
# Calculating accuracy:
my_accuracy = cross_val_score(pipeline, head_df["head_split"], y_train, cv=10, scoring="accuracy")

recall_list = []
precision_list = []
f1_list = []

for state in y_train.unique():
    recall_list.append((cross_val_score(pipeline, head_df["head_split"], y_train == state, cv=10, scoring="recall").mean()))
    precision_list.append(cross_val_score(pipeline, head_df["head_split"], y_train == state, cv=10, scoring="precision").mean())
    f1_list.append(cross_val_score(pipeline, head_df["head_split"], y_train == state, cv=10, scoring="f1").mean())

In [9]:
import numpy as np
accuracy=np.mean(my_accuracy)

In [10]:
combo_list = [precision_list, recall_list, f1_list]

In [11]:
score_df = pd.DataFrame(combo_list).rename({0:head_df["state"].unique()[0],
                                1: head_df["state"].unique()[1],
                                2: head_df["state"].unique()[2],
                                3: head_df["state"].unique()[3],
                                4: head_df["state"].unique()[4]},
                               axis=1).rename(index={0:"Precision", 1:"Recall", 2:"F1_score"})

In [12]:
score_df.T

Unnamed: 0,Precision,Recall,F1_score
Colorado,0.822381,0.34,0.470368
New York,0.844048,0.54,0.644873
Washington,0.8975,0.53,0.658067
Mississippi,0.863095,0.42,0.551543
Texas,0.783333,0.21,0.329487


In [14]:
accuracy

0.59599999999999997

It's fair to say that our model is pretty high-performing when it comes to finding out which state a headline was written about. The fact that our model was able to distinguish between which headlines were written with respect with each state may be evidence that the five groups of headlines were significantly different, lending some credibility to the point of view that NYT has been writing differently about some states than others in terms of word choice and, by proxy, tone.

(For two more unrelated but high quality machine learning models we made throughout the course of this project, please check out our notebook titled "Extra_Work")