In [95]:
%matplotlib inline
import csv, requests, os
import pandas as pd
import numpy as np

## Get data from Google sheets

In [167]:
def make_regular_gsheet_url(doc_id, sheet_id):
    return f"https://docs.google.com/spreadsheets/d/{doc_id}/edit#gid={sheet_id}"

def make_csv_gsheet_url(doc_id, sheet_id):
    return f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&id={doc_id}&gid={sheet_id}"


GOOGLE_SHEET_ID = '1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o'
print("Querying Doc:", make_regular_gsheet_url(GOOGLE_SHEET_ID, "0"))
response = requests.get(make_csv_gsheet_url(GOOGLE_SHEET_ID, "0"))
reader = csv.reader(response.text.splitlines())
header = next(reader)
df = pd.DataFrame(list(reader), columns=header, dtype=int)

# Remove rows when N/A is a filename
df = df[df['Filename'] != 'N/A']
df['filepath'] = 'speeches/' + df.Filename
df['file_exists'] = df['filepath'].apply(lambda x: os.path.isfile(x))
df.head()

Querying Doc: https://docs.google.com/spreadsheets/d/1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o/edit#gid=0


  df = pd.DataFrame(list(reader), columns=header, dtype=int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['filepath'] = 'speeches/' + df.Filename
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['file_exists'] = df['filepath'].apply(lambda x: os.path.isfile(x))


Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists
0,Alabama_Inaugural.txt,Alabama,Kay Ivey,Female,R,Inaugural,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,https://www.al.com/news/2019/01/the-full-text-...,,speeches/Alabama_Inaugural.txt,True
1,Alabama_SOTS.txt,Alabama,Kay Ivey,Female,R,State of the state,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,,https://governor.alabama.gov/remarks-speeches/...,speeches/Alabama_SOTS.txt,True
3,Alaska_SOTS.txt,Alaska,Mike Dunleavy,Male,R,State of the state,Yes,No,West,Divided government,Divided,https://gov.alaska.gov/newsroom/2019/01/22/201...,,,https://www.adn.com/politics/2019/01/23/watch-...,https://gov.alaska.gov/newsroom/2019/01/22/201...,speeches/Alaska_SOTS.txt,True
4,Arizona_Inaugural.txt,Arizona,Doug Ducey,Male,R,Inaugural,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,,speeches/Arizona_Inaugural.txt,True
5,Arizona_SOTS.txt,Arizona,Doug Ducey,Male,R,State of the state,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,https://azgovernor.gov/governor/news/2019/01/g...,speeches/Arizona_SOTS.txt,True


## Filter Data

In [168]:
df = df[df['Type of Speech'].isin(['State of the state','Both'])]
f"Dataset is {len(df)} speeches"

'Dataset is 50 speeches'

In [169]:
STATES_TO_WITHOLD = ['Washington', 'Utah']
df_witheld = df.query("State.isin(@STATES_TO_WITHOLD)")
df = df.query("~State.isin(@STATES_TO_WITHOLD)")
print(f"Dataset is {len(df)} speeches")

Dataset is 48 speeches


## Read Speeches

In [170]:
def get_speeches(df):
    speeches = []
    for path in df['filepath']:
        with open(path) as f:
            text = f.read()
            speeches.append(text)
    return speeches

speeches = get_speeches(df)

## Tokenize and Vectorize

In [171]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## YOU CAN EDIT THESE
y_columns = ['Party', 'Trifecta']
BINARY=False
NGRAMS=1
MIN_DF=3

def ngram_vectorizer(n, binary, min_df):
    return CountVectorizer(
        stop_words='english', # 'english' if not custom list
        ngram_range=(1,n),
        binary=binary,
        min_df=min_df
    )

vectorizer = ngram_vectorizer(NGRAMS, binary=BINARY, min_df=MIN_DF)
X = vectorizer.fit_transform(speeches)
y = np.array(df['Party'] == "R").astype('int')

In [166]:
# Display X
# turning it back into a dataframe for visibility
word_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
x_display = df[['State', 'Governor', 'Party', 'Trifecta Status']].reset_index().\
    merge(word_vectors, left_index=True, right_index=True)\
    .drop(columns=['index_x', 'index_y']).head()

print("X is the vector of words shown in the dataframe below")
display(x_display)
print("1 is Republican, 0 is Democrat")
print("y=", y)


KeyError: "['index_x', 'index_y'] not found in axis"

## Train a Classifier

In [156]:
clf = MultinomialNB(alpha=1.0e-10, class_prior=None, fit_prior=True)
clf.fit(X, y)

MultinomialNB(alpha=1e-10)

## Test the Classifier

In [157]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

# 5-fold cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
display(pd.DataFrame(scores).round(2))

pd.DataFrame(scores)[
    ['test_accuracy','test_precision','test_recall','test_f1']]\
    .mean().round(2)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.0,0.0,0.75,0.75,0.86,0.8
1,0.0,0.0,0.58,0.58,1.0,0.74
2,0.0,0.0,0.42,0.45,0.83,0.59
3,0.0,0.0,0.67,0.6,1.0,0.75


test_accuracy     0.60
test_precision    0.60
test_recall       0.92
test_f1           0.72
dtype: float64

## Make a prediction

In [158]:
print(STATES_TO_WITHOLD)

['Washington', 'Utah']


In [159]:
witheld_speeches = get_speeches(df_witheld)
# print(witheld_speeches[0])
# print(witheld_speeches[1])

In [160]:
X_witheld_states = vectorizer.transform(witheld_speeches)
X_witheld_states

<2x11118 sparse matrix of type '<class 'numpy.int64'>'
	with 1574 stored elements in Compressed Sparse Row format>

In [161]:
witheld_states_word_vectors = pd.DataFrame(X_witheld_states.toarray(), 
                            columns=vectorizer.get_feature_names_out(),
                            index=STATES_TO_WITHOLD)


washington_top_words = witheld_states_word_vectors.T.sort_values(by='Washington', ascending=False).head(25)
utah_top_words = witheld_states_word_vectors.T.sort_values(by='Utah', ascending=False).head(25)

washington_top_words.reset_index().merge(utah_top_words.reset_index(), left_index=True, right_index=True).head(10)

Unnamed: 0,index_x,Washington_x,Utah_x,index_y,Washington_y,Utah_y
0,state,27,34,state,27,34
1,tax,22,0,washington,1,19
2,utah,22,0,people,3,17
3,let,16,3,story,0,16
4,year,14,5,new,10,15
5,education,10,8,health,2,12
6,new,10,15,ve,0,11
7,economy,9,1,chapter,0,9
8,day,9,3,education,10,8
9,challenges,9,3,today,5,8


In [162]:
clf.predict(X_witheld_states)

array([1, 0])

In [163]:
clf.predict_proba(X_witheld_states)

array([[9.56696827e-029, 1.00000000e+000],
       [1.00000000e+000, 3.82858217e-147]])

In [164]:
clf.predict_log_proba(X_witheld_states)

array([[ -64.51665134,    0.        ],
       [   0.        , -337.13751413]])