In [0]:
import pandas as pd

FILE = "committee_utterances.tsv"
path = "drive/My Drive/Colab Notebooks/466-proj2/"

df = pd.read_csv(path + FILE, sep='\t')

##Prepare dataset
Get a subset of the data to include records from only the top N speakers


In [0]:
# Combine First and last name
speaker+names = []
for index,row in df.iterrows():
    speaker = row['last'] + " " + row['first']
    names.append(speaker)

# add to dataframe
df["full_name"] = names

In [39]:
# top 100 speakers with the most records in the dataframe
top_100_speakers = df.pivot_table(index=['full_name'], aggfunc='size')
top_100_speakers = df.groupby(['full_name'])['full_name']\
                .count()\
                .reset_index(name='count')\
                .sort_values(['count'], ascending=False)\
                .head(100)

Unnamed: 0,full_name,count
891,Lara Ricardo,1061
1430,Secretary Committee,897
1119,Mullin Kevin,668
716,Hernandez Ed,516
541,Frazier Jim,500
...,...,...
982,M. Randolph Liane,77
1304,Ridley-Thomas Sebastian,76
217,Cabral Edgar,75
996,Mahone Amber,75


In [0]:
df_top_100 = df[df.full_name.isin(top_100_speakers['full_name'])]

In [282]:
N = 50

speaker_utter_cnt = []
for name in top_100_speakers.full_name:
    temp_df = df_top_100[df_top_100.full_name == name]

    tot_words = 0
    for t in temp_df.text:
        tot_words += len(t.split())

    speaker_utter_cnt.append((name, tot_words))
speaker_utter_cnt.sort(key=lambda tup: tup[1], reverse=True)    

top_N_speakers = [tup[0] for tup in speaker_utter_cnt[:N]]
df_top_N = df_top_100[df_top_100.full_name.isin(top_N_speakers)]
df_top_N

Unnamed: 0,vid,fileid,cid,c_name,c_house,hid,position,pid,diarization_id,last,first,start,end,utterance_order,text,full_name
47,4527,8xcAFOvPC50,2,Agriculture,Assembly,648,1,14987,4,Lund,Jay,844,857,48,"Thank you very much for having me, it's a plea...",Lund Jay
48,4527,8xcAFOvPC50,2,Agriculture,Assembly,648,1,14987,4,Lund,Jay,857,874,49,of this year on Economic Analysis of 2015 Drou...,Lund Jay
49,4527,8xcAFOvPC50,2,Agriculture,Assembly,648,1,14987,4,Lund,Jay,875,897,50,It was done by a collection of agricultural ec...,Lund Jay
50,4527,8xcAFOvPC50,2,Agriculture,Assembly,648,1,14987,4,Lund,Jay,897,918,51,We have a water year that starts October first...,Lund Jay
51,4527,8xcAFOvPC50,2,Agriculture,Assembly,648,1,14987,4,Lund,Jay,918,943,52,four steps as each storm contributes water to ...,Lund Jay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30858,2858,nMgNQIdoyHs,148,Public Health and Developmental Services,Senate,497,4,103,1,Hernandez,Ed,1485,1494,555,"Current vote count is 9-3, and that bill is ou...",Hernandez Ed
30859,2858,nMgNQIdoyHs,148,Public Health and Developmental Services,Senate,497,4,2998,4,Secretary,Committee,1494,1498,556,Anderson? Mitchell? (aye) Moorlach?,Secretary Committee
30860,2858,nMgNQIdoyHs,148,Public Health and Developmental Services,Senate,497,4,103,1,Hernandez,Ed,1504,1512,557,Current vote count is 9-2. That bill is out. I...,Hernandez Ed
30861,2858,nMgNQIdoyHs,148,Public Health and Developmental Services,Senate,497,4,2998,4,Secretary,Committee,1512,1521,558,Morrell? (no) Anderson? Mitchell? (aye) 9-3.,Secretary Committee


##Features

In [307]:

import nltk, random, spacy
from nltk.corpus import stopwords

nltk.download('stopwords')    #these are only for Collab, on Frank the "download"is not necessary
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets') 
nltk.download('punkt')

stemmer = nltk.stem.porter.PorterStemmer() #NLTK's built-in stemmer resource
nlp = spacy.load("en_core_web_sm")
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#uses NLTK's built in word tokenizer. Output is a list now.
def myTokenizer(text):
  return nltk.word_tokenize(text)

#POS tagger.
#input is a list of words, output is a list of tuples (Word, Tag)
##Do you want to know what the tags mean? Run this: nltk.help.upenn_tagset()
def getPOS(tokenized):
  return nltk.pos_tag(tokenized)

#This removes all the stop words, and actually also punctuation and non-alphabetic words, and makes all lower case
#you can edit your own version
def filterTokens(tokenized):
  return [token.lower() for token in tokenized if token.isalpha() and token.lower() not in stopwords.words('english')]

#Using the NLTK stemmer
def stemming(tokenized):
  return [stemmer.stem(token) for token in tokenized]

def lemmatize(tokenized):
    doc = nlp(' '.join(tokenized))
    return [token.lemma_ for token in doc]

# text is lemmatized, filtered, and tokenized
def mostCommon(texts, N):
    vocab = []
    for text in texts:
        tokenized = lemmatize(filterTokens(myTokenizer(text)))
        vocab.extend(tokenized)
    
    freqdist = nltk.FreqDist(vocab) 
    freqdist = freqdist.most_common()
    return freqdist[:N]

def processSents(text):  # return [(sent, word_count)]
    sents = sent_tokenizer.tokenize(text)
    return [(sent, len(sent.split())) for sent in sents]

def getEntities(text):
    doc = nlp(text)
    
    summary = {}
    entities = {}
    # orgs = []
    # persons = []
    # gpes = []
    # other = []
    for ent in doc.ents:
        # 1 build summary of all entities
        if ent in summary:
            summary["ent_cnt(" + ent.label_ + ")"] += 1
        else:
            summary["ent_cnt(" + ent.label_ + ")"] = 1
            
        entities["contains_entity(" + ent.text +")"] = True
        # # 2 get list of orgs mentioned
        # if (ent.label_ == 'ORG'):
        #     orgs.append(ent.text)
        # # 3 get list of people mentioned
        # elif (ent.label_ == 'PERSON'):
        #     persons.append(ent.text)
        # elif (ent.label_ == 'GPE'):
        #     gpes.append(ent.text)            
        # else:
        #     other.append(ent.text)
    # return summary, orgs, persons, gpes, other
    return summary, entities

    
# Extract features of a single text
def processText(text):
    features = {}

    if (PREPROCESS_STAGE):
        cleaned = lemmatize(filterTokens(myTokenizer(text)))        
        svs = speakerVocabScore(cleaned, num_of_speakers, TOP_VOCAB)
        features.update(svs)

    sent_lengths = [sent[1] for sent in processSents(text)]
    avg_sent = sum(sent_lengths) / len(sent_lengths)
    features['avg_sent'] = avg_sent

    summary, entities = getEntities(text)
    features.update(summary)
    features.update(entities)
    
    return features


# -------------------------------------------------------------------------
# Extract additional features using the training set
# - this takes about 2 minutes to do
# -------------------------------------------------------------------------
# given a speaker and dataset, returns a list of corresponding texts
def getSpeakerTexts(speaker, data):
    print('Speaker ', speaker, ' - total texts: ', len(data[data.full_name == speaker]['text']))
    return data[data.full_name == speaker]['text']

# Process the train data and create more features
def preprocessSpeakers(speakers, train_data, label_encoder):
    if (PREPROCESS_STAGE == False):
        return []

    speakers_summary = []
    for name in speakers:
        texts = getSpeakerTexts(name, train_data)
        most_common = mostCommon(texts, 25)
        speakers_summary.append({
                                'ID': label_encoder.transform([name])[0],
                                'name':name,
                                'top_N_vocab':[tup[0] for tup in most_common]
        })

    return speakers_summary

# This feature requires a prebuilt summary of the speakers (See preprocessSpeakers(...))
def speakerVocabScore(tokenized, num_speakers, vocab_lookup):    
    if (PREPROCESS_STAGE == False):
         return {}

    score_table = {}    # {speaker_id : score}

    # rally up score
    for word in tokenized:
        if word in vocab_lookup:                    # this word in the top vocab
            for speakerID in vocab_lookup[word]:    # add score to the speaker
                if speakerID in score_table:
                    score_table["SVS_" + str(speakerID)] += 1  # SVS - speaker_vocab score
                else:
                    score_table["SVS_" + str(speakerID)] = 1
    return score_table
# -------------------------------------------------------------------------
# -------------------------------------------------------------------------



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##Split training/testing

In [0]:
import numpy as np

# train/test data will only be from the sample-size
def split_train_test(data, splitpt, sample_size = 1.0):    
    length = len(data)
    rand_df = pd.DataFrame(np.random.randn(length,2))
    msk = np.random.rand(len(rand_df)) < splitpt    

    train = data[msk]
    old_len = len(train)
    train = train.sample(frac=sample_size, replace=False)    
    # print("train: old_length = ", old_len, " new_length = ", len(train))
    
    test = data[~msk]
    old_len = len(test)
    test = test.sample(frac=sample_size, replace=False)
    # print("test: old_length = ", old_len, " new_length = ", len(test))
    return train, test

In [0]:
train_df = pd.DataFrame(columns = df_top_N.columns)
test_df = pd.DataFrame(columns = df_top_N.columns)

for name in top_N_speakers:
    temp_df = df_top_N[df_top_N.full_name == name]  # only one speaker

    train, test = split_train_test(temp_df, 0.8, .25)
    train_df = pd.concat([train_df, train])
    test_df = pd.concat([test_df, test])


In [291]:
print(len(X_train))

82


In [0]:
labelencoder = LabelEncoder()

X_train = train_df['text']
y_train = labelencoder.fit_transform(train_df['full_name'])

X_test = test_df['text']
y_test = labelencoder.transform(test_df['full_name'])

##Pre-processing stage
Extract features from the training set

In [0]:
PREPROCESS_STAGE = True
TOP_VOCAB = {}

In [313]:
if (PREPROCESS_STAGE):
    SPEAKER_SUMMARY = preprocessSpeakers(top_N_speakers, df_top_N, labelencoder)
    num_of_speakers = len(SPEAKER_SUMMARY)

    # save time checking if a word is in a speaker's top vocab with this
    for i in range(0, num_of_speakers):
        for word in SPEAKER_SUMMARY[i]['top_N_vocab']:
            if word not in TOP_VOCAB:
                TOP_VOCAB[word] = [SPEAKER_SUMMARY[i]['ID']]
            else:
                # print(word, ":", TOP_VOCAB[word])
                TOP_VOCAB[word].append(SPEAKER_SUMMARY[i]['ID'])            


Speaker  Picker Michael  - total texts:  414
Speaker  Lara Ricardo  - total texts:  1061
Speaker  Secretary Committee  - total texts:  897
Speaker  Mullin Kevin  - total texts:  668
Speaker  Pan Richard  - total texts:  375
Speaker  De Leon Kevin  - total texts:  377
Speaker  Chiu David  - total texts:  433
Speaker  Beall Jim  - total texts:  317
Speaker  Frazier Jim  - total texts:  500
Speaker  Hernandez Ed  - total texts:  516
Speaker  Bonta Rob  - total texts:  346
Speaker  Hueso Ben  - total texts:  363
Speaker  Cooley Ken  - total texts:  291
Speaker  Jones-Sawyer Reginald  - total texts:  225
Speaker  Mitchell Holly  - total texts:  397
Speaker  Cappio Claudia  - total texts:  254
Speaker  Hertzberg Robert  - total texts:  188
Speaker  Thurmond Tony  - total texts:  181
Speaker  Eggman Susan Talamantes  - total texts:  167
Speaker  Boatman Patterson Tia  - total texts:  124
Speaker  Hill Jerry  - total texts:  204
Speaker  Lund Jay  - total texts:  150
Speaker  Chau Ed  - total 

In [317]:
sample = "And are the sole producer of 14 commodities in California. Including walnuts, which my family grows. And other products such as almonds and raisins. California's agricultural exports totaled $21 billion in 2013, representing 15% of the nation's total. What those figures show, is that farmers and ranchers are adapting."
processText(sample)

{'SVS_1': 1,
 'SVS_11': 1,
 'SVS_13': 1,
 'SVS_15': 1,
 'SVS_17': 1,
 'SVS_18': 1,
 'SVS_19': 1,
 'SVS_2': 1,
 'SVS_21': 1,
 'SVS_24': 1,
 'SVS_27': 1,
 'SVS_3': 1,
 'SVS_31': 1,
 'SVS_33': 1,
 'SVS_4': 1,
 'SVS_42': 1,
 'SVS_44': 1,
 'SVS_5': 1,
 'SVS_8': 1,
 'avg_sent': 9.8,
 'contains_entity($21 billion)': True,
 'contains_entity(14)': True,
 'contains_entity(15%)': True,
 'contains_entity(2013)': True,
 'contains_entity(California)': True,
 'ent_cnt(CARDINAL)': 1,
 'ent_cnt(DATE)': 1,
 'ent_cnt(GPE)': 1,
 'ent_cnt(MONEY)': 1,
 'ent_cnt(PERCENT)': 1}

In [270]:
len(X_train)

10536

In [0]:
### Vectorize features
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(map(processText, list(X_train)))
X_test_vec = vectorizer.transform(map(processText, list(X_test)))

In [0]:

### Build model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model = model.fit(X_train_vec , y_train)

In [316]:

### Accuracy Score
print("Accuracy Scores:")
print("train: ", model.score(X_train_vec, y_train))
print("test:", model.score(X_test_vec, y_test))
print("\n\n")

Accuracy Scores:
train:  0.20418442464161177
test: 0.17602427921092564



