In [1]:
%matplotlib inline
import csv, requests, os
import pandas as pd
import numpy as np

## Get data from Google Sheets

In [2]:
def make_regular_gsheet_url(doc_id, sheet_id):
    """Build the browser ("edit") URL for one tab of a Google Sheets document.

    ``doc_id`` is the document identifier and ``sheet_id`` the tab's ``gid``;
    both are interpolated into the URL as-is (no escaping is applied).
    """
    url_template = "https://docs.google.com/spreadsheets/d/{}/edit#gid={}"
    return url_template.format(doc_id, sheet_id)

def make_csv_gsheet_url(doc_id, sheet_id):
    """Build the CSV-export URL for one tab of a Google Sheets document.

    ``doc_id`` appears twice in the URL (path and ``id`` query parameter) and
    ``sheet_id`` is passed as ``gid``; values are inserted without escaping.
    """
    document_root = "https://docs.google.com/spreadsheets/d/" + f"{doc_id}"
    query = f"format=csv&id={doc_id}&gid={sheet_id}"
    return document_root + "/export?" + query


GOOGLE_SHEET_ID = '1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o'
print("Querying Doc:", make_regular_gsheet_url(GOOGLE_SHEET_ID, "0"))

# Download tab gid=0 of the sheet as CSV.
# - timeout: without one, a stalled connection would hang the notebook forever.
# - raise_for_status: fail loudly on an HTTP error instead of trying to parse
#   an error page as CSV (which would surface later as a confusing KeyError).
response = requests.get(make_csv_gsheet_url(GOOGLE_SHEET_ID, "0"), timeout=30)
response.raise_for_status()

# keepends=True hands csv.reader the original line terminators, so a line
# break inside a quoted cell is preserved rather than silently dropped.
reader = csv.reader(response.text.splitlines(keepends=True))
header = next(reader)
# Every cell stays a string: csv.reader does no type inference, which is why
# the literal 'N/A' can be compared against below.
df = pd.DataFrame(list(reader), columns=header)

# Remove rows when N/A is a filename.
# .copy() makes the filtered frame independent, so adding columns to it below
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['Filename'] != 'N/A'].copy()
df['filepath'] = 'speeches/' + df['Filename']
# Record whether each transcript is present on disk (relative to the
# notebook's working directory).
df['file_exists'] = df['filepath'].map(os.path.isfile)
df.head()

Querying Doc: https://docs.google.com/spreadsheets/d/1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o/edit#gid=0


Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists
0,Alabama_Inaugural.txt,Alabama,Kay Ivey,Female,R,Inaugural,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,https://www.al.com/news/2019/01/the-full-text-...,,speeches/Alabama_Inaugural.txt,True
1,Alabama_SOTS.txt,Alabama,Kay Ivey,Female,R,State of the state,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,,https://governor.alabama.gov/remarks-speeches/...,speeches/Alabama_SOTS.txt,True
3,Alaska_SOTS.txt,Alaska,Mike Dunleavy,Male,R,State of the state,Yes,No,West,Divided government,Divided,https://gov.alaska.gov/newsroom/2019/01/22/201...,,,https://www.adn.com/politics/2019/01/23/watch-...,https://gov.alaska.gov/newsroom/2019/01/22/201...,speeches/Alaska_SOTS.txt,True
4,Arizona_Inaugural.txt,Arizona,Doug Ducey,Male,R,Inaugural,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,,speeches/Arizona_Inaugural.txt,True
5,Arizona_SOTS.txt,Arizona,Doug Ducey,Male,R,State of the state,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,https://azgovernor.gov/governor/news/2019/01/g...,speeches/Arizona_SOTS.txt,True


## Filter Data

In [3]:
# Keep only rows whose 'Type of Speech' is 'State of the state' or 'Both'.
# NOTE(review): 'Both' presumably marks a speech that served as inaugural and
# State of the State at once — confirm against the sheet.
# .copy() detaches the result from the unfiltered frame so that later cells can
# add columns to it without a SettingWithCopyWarning.
df = df[df['Type of Speech'].isin(['State of the state','Both'])].copy()
f"Dataset is {len(df)} speeches"

'Dataset is 50 speeches'

## Read Speeches

In [4]:
def get_speeches(df, encoding="utf-8"):
    """Read the transcript file behind every row of ``df``.

    Args:
        df: DataFrame with a ``'filepath'`` column holding one path per speech.
        encoding: Text encoding used to decode each file. It is set explicitly
            (UTF-8 by default) because ``open()`` otherwise falls back to a
            platform-dependent encoding, which can garble or reject non-ASCII
            characters such as em dashes in the transcripts.

    Returns:
        list[str]: The full text of each file, in the same order as the rows
        of ``df``.

    Raises:
        OSError: If a path cannot be opened (e.g. the file is missing).
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    speeches = []
    for path in df['filepath']:
        with open(path, encoding=encoding) as f:
            speeches.append(f.read())
    return speeches

# Read the file at each df['filepath'] and store the texts, row for row, in a
# new 'speech_text' column.
df['speech_text'] = get_speeches(df)


## Truncate speeches that are too long (>8000 tokens)

In [5]:
# Import modules
import tiktoken
from openai import OpenAI
client = OpenAI()

# Set embedding model parameters
embedding_model = "text-embedding-3-large"  # the model used to make embeddings
embedding_encoding = "cl100k_base"  # tokenizer used by embedding_model (fallback name, see below)
MAX_TOKENS = 8000  # per-speech budget, kept under the model's 8191-token input limit

# Get the tokenizer that matches the embedding model. Asking tiktoken for the
# model's own encoding keeps the two settings from drifting apart; the explicit
# encoding name is only used if this tiktoken version does not know the model.
try:
    encoding = tiktoken.encoding_for_model(embedding_model)
except KeyError:
    encoding = tiktoken.get_encoding(embedding_encoding)

# Make a new column with the number of tokens in each speech
df["n_tokens"] = df['speech_text'].apply(lambda x: len(encoding.encode(x)))

# Sort by that column, longest speech first
df = df.sort_values(by='n_tokens', ascending=False)

# Display the longest speeches
df.head(7)


Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists,speech_text,n_tokens
76,WestVirginia_SOTS.txt,West Virginia,Jim Justice,Male,R,State of the state,No,No,South,R trifecta,Trifecta,https://governor.wv.gov/News/press-releases/20...,,,,https://governor.wv.gov/News/press-releases/20...,speeches/WestVirginia_SOTS.txt,True,"Sit, sit, listen. Let's get \n\nat this. Tha...",12156
52,NewYork_SOTS.txt,New York,Andrew Cuomo,Male,D,State of the state,No,No,Northeast,D trifecta,Trifecta,https://www.governor.ny.gov/news/video-audio-p...,,,,https://www.governor.ny.gov/news/video-audio-p...,speeches/NewYork_SOTS.txt,True,Thank you. Thank you. Thank you. Thank you ver...,11839
29,Kentucky_SOTS.txt,Kentucky,Matt Bevin,Male,R,State of the state,No,No,South,R trifecta,Trifecta,https://ketorg.cdn.ket.org/wp-content/uploads/...,,,,https://ketorg.cdn.ket.org/wp-content/uploads/...,speeches/Kentucky_SOTS.txt,True,I guess you all expect me to say something abo...,11464
67,SouthDakota_SOTS.txt,South Dakota,Kristi Noem,Female,R,State of the state,Yes,No,Midwest,R trifecta,Trifecta,https://kelo.com/news/articles/2019/jan/09/tra...,,,,https://kelo.com/news/articles/2019/jan/09/tra...,speeches/SouthDakota_SOTS.txt,True,"Lieutenant Governor Rhoden, Mr. Speaker, membe...",9670
54,NorthDakota_SOTS.txt,North Dakota,Doug Burgum,Male,R,State of the state,No,No,Midwest,R trifecta,Trifecta,https://www.governor.nd.gov/sites/governor/fil...,,,,https://www.governor.nd.gov/sites/governor/fil...,speeches/NorthDakota_SOTS.txt,True,Good afternoon! And thank you all for that war...,7659
69,Tennessee_SOTS.txt,Tennessee,Bill Lee,Male,R,State of the state,Yes,No,South,R trifecta,Trifecta,https://www.tn.gov/governor/about-bill-lee/sta...,,,,https://www.tn.gov/governor/about-bill-lee/sta...,speeches/Tennessee_SOTS.txt,True,"Lieutenant Governor McNally, Speaker Casada, S...",7044
74,Virginia_SOTS.txt,Virginia,Ralph Northam,Male,D,State of the state,No,No,South,Divided government,Divided,https://www.governor.virginia.gov/newsroom/all...,,,,https://www.governor.virginia.gov/newsroom/all...,speeches/Virginia_SOTS.txt,True,"My fellow Virginians, ladies and gentlemen—goo...",6720


In [6]:
def get_first_n_tokens(text, n, encoding=None):
    """Return the beginning of ``text``, cut to at most ``n`` units.

    By default a "token" here is a whitespace-separated word, NOT a model
    token: the text is split with ``str.split()``, the first ``n`` words are
    kept and re-joined with single spaces. As a side effect, newlines and runs
    of spaces in the kept part are collapsed to one space.

    Args:
        text: The string to shorten.
        n: Maximum number of units (words, or tokens when ``encoding`` is
            given) to keep.
        encoding: Optional tokenizer object exposing ``encode(str)`` returning
            a sequence and ``decode(sequence)`` returning a string (for
            example a tiktoken encoding). When supplied, the cut is made on
            that tokenizer's tokens and the original whitespace is preserved.

    Returns:
        str: The shortened text; ``text`` itself (modulo whitespace collapsing
        in word mode) when it already has ``n`` units or fewer.
    """
    if encoding is not None:
        # True token-level truncation: encode, keep the first n ids, decode.
        return encoding.decode(encoding.encode(text)[:n])
    # Word-level truncation (original behaviour).
    return ' '.join(text.split()[:n])

# Word budget for the first-pass cut. 6000 words is a heuristic chosen to land
# under MAX_TOKENS for these transcripts; it is not a guarantee, hence the
# token-level safety net in _truncate_speech.
WORD_LIMIT = 6000

def _truncate_speech(row):
    """Return the row's speech text, shortened if it exceeds MAX_TOKENS.

    Speeches within budget are returned untouched. Longer ones are first cut
    to WORD_LIMIT whitespace-separated words; if that still leaves more than
    MAX_TOKENS tokens, the text is cut again at exactly MAX_TOKENS tokens.
    """
    text = row['speech_text']
    if row['n_tokens'] <= MAX_TOKENS:
        return text
    text = get_first_n_tokens(text, WORD_LIMIT)
    token_ids = encoding.encode(text)
    if len(token_ids) > MAX_TOKENS:
        # Safety net: the word cut was not enough, so cap on real tokens.
        text = encoding.decode(token_ids[:MAX_TOKENS])
    return text

df['speech_text'] = df.apply(_truncate_speech, axis=1)
# Recount after truncation so n_tokens describes the text that will be embedded.
df["n_tokens"] = df['speech_text'].apply(lambda x: len(encoding.encode(x)))

df.head(3)

Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists,speech_text,n_tokens
76,WestVirginia_SOTS.txt,West Virginia,Jim Justice,Male,R,State of the state,No,No,South,R trifecta,Trifecta,https://governor.wv.gov/News/press-releases/20...,,,,https://governor.wv.gov/News/press-releases/20...,speeches/WestVirginia_SOTS.txt,True,"Sit, sit, listen. Let's get at this. Thank you...",7559
52,NewYork_SOTS.txt,New York,Andrew Cuomo,Male,D,State of the state,No,No,Northeast,D trifecta,Trifecta,https://www.governor.ny.gov/news/video-audio-p...,,,,https://www.governor.ny.gov/news/video-audio-p...,speeches/NewYork_SOTS.txt,True,Thank you. Thank you. Thank you. Thank you ver...,7420
29,Kentucky_SOTS.txt,Kentucky,Matt Bevin,Male,R,State of the state,No,No,South,R trifecta,Trifecta,https://ketorg.cdn.ket.org/wp-content/uploads/...,,,,https://ketorg.cdn.ket.org/wp-content/uploads/...,speeches/Kentucky_SOTS.txt,True,I guess you all expect me to say something abo...,7126


## Make Embeddings

In [7]:
from tqdm.notebook import tqdm

# Client used by get_embeddings below for the embeddings API calls.
# NOTE(review): the token-counting cell already creates a client with the same
# no-argument call, so this rebinds the name to a second, equivalent instance.
# Presumably credentials come from the environment — confirm.
client = OpenAI()

def get_embeddings(texts, model="text-embedding-3-large"):
    """Embed a batch of texts with a single embeddings API request.

    Args:
        texts: An iterable of strings. A single string is also accepted and is
            treated as a batch of one (rather than being iterated character by
            character).
        model: Name of the embedding model passed to the API.

    Returns:
        list: One embedding per input text, taken from ``response.data`` in
        the order the API returns them. An empty input yields ``[]`` without
        contacting the API.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Newlines are replaced with spaces before the texts are sent.
    cleaned = [text.replace("\n", " ") for text in texts]
    if not cleaned:
        # Nothing to embed: skip the request, which would be rejected anyway.
        return []
    # embeddings.create accepts a list of inputs and embeds them in one call.
    response = client.embeddings.create(input=cleaned, model=model)
    return [item.embedding for item in response.data]

def process_in_batches(df, column_name, batch_size=10, model="text-embedding-3-large"):
    """Embed one text column of ``df`` in consecutive batches.

    Args:
        df: DataFrame holding the texts.
        column_name: Name of the column whose values are embedded.
        batch_size: Number of rows sent per ``get_embeddings`` call; must be
            at least 1.
        model: Embedding model name forwarded to ``get_embeddings``.

    Returns:
        list: One embedding per row of ``df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. (Previously a negative
            value silently produced an empty result.)
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    column = df[column_name]
    # Break the column into positional slices of at most `batch_size` rows
    batches = [column.iloc[start:start + batch_size] for start in range(0, len(column), batch_size)]
    # Process each batch and collect embeddings in order
    all_embeddings = []
    for batch in tqdm(batches, desc="Processing batches"):
        all_embeddings.extend(get_embeddings(batch.tolist(), model=model))
    return all_embeddings

# Embed every speech with the model chosen in the configuration above
batch_size = 100  # Adjust based on your preference and rate limits
df['embedding'] = process_in_batches(df, 'speech_text', batch_size=batch_size, model=embedding_model)


Processing batches:   0%|          | 0/1 [00:00<?, ?it/s]

## Run Models and Compare

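Run several classification models that predict each governor's party from the speech embeddings, and compare their 4-fold cross-validation scores.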

In [9]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
import pandas as pd

# Metrics reported for every model. Accuracy uses scikit-learn's built-in
# scorer name; precision, recall and F1 are computed with 'R' as the positive
# class.
scoring = {
    'accuracy': 'accuracy',
    **{
        metric_name: make_scorer(metric_fn, pos_label='R')
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
}


In [16]:
# Feature matrix and labels for the classifiers.
# X stacks the per-speech embeddings into one array (assumes every embedding
# has the same length, giving a 2-D array — TODO confirm); y holds the 'Party'
# value of each speech, in the same row order.
X = np.asarray(df['embedding'].tolist())
y = df['Party'].to_numpy(copy=True)


In [11]:
# # Multinomial Naive Bayes
# # CANNOT RUN NAIVE BAYES WITH NEGATIVE VALUES


# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB()
# clf.fit(X,y)
# scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
# pd.DataFrame(scores).describe().round(2)[1:3]

In [12]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# C is the inverse regularisation strength, so C=1e9 leaves the model
# practically unregularised; max_iter is raised to give lbfgs room to converge.
clf = LogisticRegression(max_iter=4000, solver='lbfgs', C=1e9)
clf.fit(X, y)

# 4-fold cross-validation; show the mean and standard deviation across folds.
scores = cross_validate(clf, X, y, cv=4, scoring=scoring)
pd.DataFrame(scores).describe().loc[['mean', 'std']].round(2)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,0.01,0.0,0.7,0.76,0.74,0.74
std,0.01,0.0,0.18,0.2,0.13,0.13


In [13]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and feature sub-sampling so the
# scores below are the same on every run (without it they change each time).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X,y)

# 4-fold cross-validation; show the mean and standard deviation across folds.
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,0.05,0.0,0.6,0.63,0.71,0.66
std,0.0,0.0,0.16,0.14,0.11,0.12


In [14]:
# Linear Support Vector Classification.
from sklearn.svm import LinearSVC

# dual='auto' lets scikit-learn pick the primal or dual formulation from the
# data shape. The dual solver shuffles the data internally, so random_state is
# pinned to make repeated runs give identical scores.
clf = LinearSVC(dual='auto', random_state=42)
clf.fit(X, y)

# 4-fold cross-validation; show the mean and standard deviation across folds.
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,0.01,0.0,0.72,0.76,0.78,0.76
std,0.0,0.0,0.15,0.19,0.08,0.11


In [15]:
# Multi-layer perceptron (a type of Neural Network ¯\_(ツ)_/¯)
from sklearn.neural_network import MLPClassifier

# random_state fixes the weight initialisation and batch shuffling so the
# scores below are the same on every run (without it they change each time).
clf = MLPClassifier(random_state=42)
clf.fit(X,y)

# 4-fold cross-validation; show the mean and standard deviation across folds.
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,1.98,0.0,0.72,0.78,0.74,0.75
std,0.44,0.0,0.15,0.18,0.13,0.12
