In [1]:
%matplotlib inline
import csv, requests, os
import pandas as pd
import numpy as np

## Get data from Google sheets

In [2]:
def make_regular_gsheet_url(doc_id, sheet_id):
    return f"https://docs.google.com/spreadsheets/d/{doc_id}/edit#gid={sheet_id}"

def make_csv_gsheet_url(doc_id, sheet_id):
    return f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&id={doc_id}&gid={sheet_id}"

GOOGLE_SHEET_ID = '1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o'
print("Querying Doc:", make_regular_gsheet_url(GOOGLE_SHEET_ID, "0"))
response = requests.get(make_csv_gsheet_url(GOOGLE_SHEET_ID, "0"))
reader = csv.reader(response.text.splitlines())
header = next(reader)
df = pd.DataFrame(list(reader), columns=header)

# Remove rows when N/A is a filename
df = df[df['Filename'] != 'N/A']
df['filepath'] = 'speeches/' + df.Filename
df['file_exists'] = df['filepath'].apply(lambda x: os.path.isfile(x))
df.head()

Querying Doc: https://docs.google.com/spreadsheets/d/1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o/edit#gid=0


Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists
0,Alabama_Inaugural.txt,Alabama,Kay Ivey,Female,R,Inaugural,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,https://www.al.com/news/2019/01/the-full-text-...,,speeches/Alabama_Inaugural.txt,True
1,Alabama_SOTS.txt,Alabama,Kay Ivey,Female,R,State of the state,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,,https://governor.alabama.gov/remarks-speeches/...,speeches/Alabama_SOTS.txt,True
3,Alaska_SOTS.txt,Alaska,Mike Dunleavy,Male,R,State of the state,Yes,No,West,Divided government,Divided,https://gov.alaska.gov/newsroom/2019/01/22/201...,,,https://www.adn.com/politics/2019/01/23/watch-...,https://gov.alaska.gov/newsroom/2019/01/22/201...,speeches/Alaska_SOTS.txt,True
4,Arizona_Inaugural.txt,Arizona,Doug Ducey,Male,R,Inaugural,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,,speeches/Arizona_Inaugural.txt,True
5,Arizona_SOTS.txt,Arizona,Doug Ducey,Male,R,State of the state,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,https://azgovernor.gov/governor/news/2019/01/g...,speeches/Arizona_SOTS.txt,True


## Filter Data

In [3]:
df = df[df['Type of Speech'].isin(['State of the state','Both'])]
f"Dataset is {len(df)} speeches"

'Dataset is 50 speeches'

In [4]:
STATES_TO_WITHOLD = ['Washington', 'Utah']
df_witheld = df.query("State.isin(@STATES_TO_WITHOLD)")
df = df.query("~State.isin(@STATES_TO_WITHOLD)")
print(f"Dataset is {len(df)} speeches")

Dataset is 48 speeches


## Read Speeches

In [5]:
def get_speeches(df):
    speeches = []
    for path in df['filepath']:
        with open(path) as f:
            text = f.read()
            speeches.append(text)
    return speeches

speeches = get_speeches(df)

## Tokenize and Vectorize

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## YOU CAN EDIT THESE
y_columns = ['Party', 'Trifecta']
BINARY=False
NGRAMS=(1,1)
MIN_DF=3

def ngram_vectorizer(n, binary, min_df):
    return CountVectorizer(
        stop_words='english', # 'english' if not custom list
        ngram_range=NGRAMS,
        binary=binary,
        min_df=min_df
    )

vectorizer = ngram_vectorizer(NGRAMS, binary=BINARY, min_df=MIN_DF)
X = vectorizer.fit_transform(speeches)
y = np.array(df['Party'] == "R").astype('int')

In [7]:
# Display X
# turning it back into a dataframe for visibility
word_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
x_display = df[['State', 'Governor', 'Party', 'Trifecta Status']].reset_index().\
    merge(word_vectors, left_index=True, right_index=True)\
    .head()

print("X is the vector of words shown in the dataframe below")
display(x_display)
print("1 is Republican, 0 is Democrat")
print("y=", y)


X is the vector of words shown in the dataframe below


Unnamed: 0,index,State,Governor,Party,Trifecta Status,000,10,100,100th,105,...,yield,york,young,younger,youngest,youth,zero,zip,zone,zones
0,1,Alabama,Kay Ivey,R,R trifecta,10,2,0,0,0,...,0,0,4,0,1,0,0,0,0,0
1,3,Alaska,Mike Dunleavy,R,Divided government,2,0,3,0,0,...,0,2,2,0,0,0,0,0,0,0
2,5,Arizona,Doug Ducey,R,R trifecta,5,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,Arkansas,Asa Hutchinson,R,R trifecta,12,0,0,0,0,...,0,0,2,0,0,1,0,0,0,0
4,9,California,Gavin Newsom,D,D trifecta,3,1,2,0,0,...,0,0,2,0,0,0,0,0,0,1


1 is Republican, 0 is Democrat
y= [1 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0
 0 0 1 1 1 1 1 0 1 0 1]


## Train a Classifier

In [8]:
clf = MultinomialNB(alpha=1.0e-10, class_prior=None, fit_prior=True)
clf.fit(X, y)

## Test the Classifier

In [9]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

# 5-fold cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
display(pd.DataFrame(scores).round(2))

pd.DataFrame(scores)[
    ['test_accuracy','test_precision','test_recall','test_f1']]\
    .mean().round(2)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.0,0.0,0.75,0.7,1.0,0.82
1,0.0,0.0,0.42,0.5,0.71,0.59
2,0.0,0.0,0.5,0.5,1.0,0.67
3,0.0,0.0,0.75,0.67,1.0,0.8


test_accuracy     0.60
test_precision    0.59
test_recall       0.93
test_f1           0.72
dtype: float64

## Make a prediction

In [10]:
print(STATES_TO_WITHOLD)

['Washington', 'Utah']


In [11]:
df_witheld

Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists
72,Utah_SOTS.txt,Utah,Gary Herbert,Male,R,State of the state,No,No,West,R trifecta,Trifecta,https://governor.utah.gov/2019/01/31/governor-...,,,,https://governor.utah.gov/2019/01/31/governor-...,speeches/Utah_SOTS.txt,True
75,Washington_SOTS.txt,Washington,Jay Inslee,Male,D,State of the state,No,Yes,West,D trifecta,Trifecta,https://www.governor.wa.gov/news-media/news-me...,,,,https://www.governor.wa.gov/news-media/news-me...,speeches/Washington_SOTS.txt,True


In [12]:
witheld_speeches = get_speeches(df_witheld)
witheld_speeches

['President Adams, Speaker Wilson, members of the Legislature, justices of the Supreme Court, Utah’s First Lady, Lieutenant Governor and Mrs. Cox, other constitutional officers, and my fellow Utahns: It is an honor to address you tonight.\nPresident Adams and Speaker Wilson, congratulations for being elected to your significant leadership roles. I have appreciated working with you to prepare for this session and I look forward to continued collaboration as we address this year’s challenges.\nI extend a special welcome to the 19 new representatives and the 7 new senators here this evening. I thank you for stepping into the political arena to serve the citizens of the great state of Utah.\nOur separation of constitutional powers can, from time-to-time, create some tensions. And given current national trends, can I tell you just how grateful I am that the Speaker didn’t disinvite me from delivering this year’s State of the State? Because of your kindness, I promise I won’t cancel your sta

In [13]:
X_witheld_states = vectorizer.transform(witheld_speeches)
X_witheld_states

<2x4001 sparse matrix of type '<class 'numpy.int64'>'
	with 1355 stored elements in Compressed Sparse Row format>

In [14]:
witheld_states_word_vectors = pd.DataFrame(X_witheld_states.toarray(), 
                            columns=vectorizer.get_feature_names_out(),
                            index=STATES_TO_WITHOLD)


washington_top_words = witheld_states_word_vectors.T.sort_values(by='Washington', ascending=False).head(25)
utah_top_words = witheld_states_word_vectors.T.sort_values(by='Utah', ascending=False).head(25)

washington_top_words.reset_index().merge(utah_top_words.reset_index(), left_index=True, right_index=True).head(10)

Unnamed: 0,index_x,Washington_x,Utah_x,index_y,Washington_y,Utah_y
0,state,27,34,state,27,34
1,tax,22,0,washington,1,19
2,let,16,3,people,3,17
3,year,14,5,story,0,16
4,education,10,8,new,10,15
5,new,10,15,health,2,12
6,base,9,0,ve,0,11
7,challenges,9,3,chapter,0,9
8,day,9,3,today,5,8
9,growth,9,0,education,10,8


In [15]:
clf.predict_log_proba(X_witheld_states)

array([[-298.05152943,    0.        ],
       [   0.        , -404.74674482]])

In [16]:
clf.predict_proba(X_witheld_states)

array([[3.61297907e-130, 1.00000000e+000],
       [1.00000000e+000, 1.66234881e-176]])

In [17]:
clf.predict(X_witheld_states)

array([1, 0])

## The Piece
https://fivethirtyeight.com/features/what-americas-governors-are-talking-about/