### Creating bigram representation and feature set

##### Parse into lists of words

In [None]:
def read_in_tloc_line(line, is_senate):
    '''
    Parses one comma-separated line of congressional text into a record dict.

    line: raw text line; a usable line splits into exactly six fields
          (date, title, speaker, text, party code, dwnom score).
    is_senate: True tags the record with chamber "senate", otherwise "house".

    Returns a dict with keys date, title, speaker, text, party, chamber and
    dwnom1. Party code '200' becomes 1 (Republican); any other code becomes 0.
    A line that does not split into six fields yields a placeholder record with
    empty-string values (party is "" so it equals neither 0 nor 1).
    '''
    chamber = "senate" if is_senate else "house"
    split_line = line.split(",")
    if len(split_line) != 6:
        # Placeholder carries the same keys as a good record so callers can read
        # 'chamber' / 'dwnom1' without a KeyError. The old 'dwnom' key is kept
        # for backward compatibility.
        return {"date": "", "title": "", "speaker": "", "text": "", "party": "",
                "chamber": chamber, "dwnom1": "", "dwnom": ""}

    party = 1 if split_line[4] == u'200' else 0  # 1 = Republicans

    return {"date": split_line[0], "title": split_line[1], "speaker": split_line[2],
            "text": split_line[3], "party": party, "chamber": chamber,
            "dwnom1": split_line[5]}

def read_in_candidate_line(line):
    '''
    Parses one comma-separated candidate document line into a record dict
    with keys date, speaker, text and type. A line that does not split into
    exactly four fields yields a record whose values are all empty strings.
    '''
    fields = line.split(",")
    if len(fields) == 4:
        date, speaker, text, doc_type = fields
        return {"date": date, "speaker": speaker, "text": text, "type": doc_type}
    return {"date": "", "speaker": "", "text": "", "type": ""}

In [None]:
# Candidate documents: one record dict per line.
candidates_raw = (
    sc.textFile('../final_data/all_election_documents.csv', minPartitions=20)
    .map(read_in_candidate_line)
)

# Congressional text: each chamber's file is parsed line by line and tagged
# with its chamber, then both are combined into a single RDD.
house_raw = (
    sc.textFile('../final_data/house_thomasloc_text.csv', minPartitions=20)
    .map(lambda line: read_in_tloc_line(line, False))
)
senate_raw = (
    sc.textFile('../final_data/senate_thomasloc_text.csv', minPartitions=20)
    .map(lambda line: read_in_tloc_line(line, True))
)
all_cong = house_raw.union(senate_raw)

In [None]:
# Split the congressional records on the 'party' flag (1 or 0).
all_republicans = all_cong.filter(lambda x: x['party']==1)
all_democrats = all_cong.filter(lambda x: x['party']==0) ## CHANGE THIS IF USING -1 FOR DEM 
# Records whose party flag is neither 0 nor 1. This has to be taken from the
# full RDD *before* all_cong is narrowed below; filtering the narrowed RDD
# (as was done previously) can only ever return an empty result.
all_other_party = all_cong.filter(lambda x: x['party'] not in [0, 1])
# From here on all_cong only holds records with party 0 or 1.
all_cong = all_republicans.union(all_democrats)

In [None]:
import re
from stemming.porter2 import stem # Note: you need to install 'stemming' on each slave + master via ssh 

_STOPWORD_CACHE = {}


def _load_stopwords(path='stop_words.txt'):
    '''
    Returns the whitespace-separated words in <path> as a frozenset. The file is
    closed after reading and the result is cached so later calls reuse it.
    '''
    if path not in _STOPWORD_CACHE:
        with open(path, 'r') as handle:
            _STOPWORD_CACHE[path] = frozenset(handle.read().split())
    return _STOPWORD_CACHE[path]


def parse_text(x): 
    ''' 
    Parses full text and returns words. Removes punctuation, numbers, and stop words. Stop word list is from http://xpo6.com/list-of-english-stop-words/
    Also uses a porter stemmer to stem each word. So "pour" "poured" "pouring" would all become "pour" 

    x: record dict with a 'text' entry. The dict is modified in place: 'text' is
       removed and a 'words' list is added. The same dict is returned.
    '''
    # Set lookup instead of re-reading the file and scanning a list per word.
    stopwords = _load_stopwords()
    # Anything that is not a letter (punctuation, digits, ...) becomes a space.
    text = re.sub('[^A-Za-z]+', ' ', x['text'].lower())
    kept = [w for w in text.split() if w not in stopwords]
    stemmed = [stem(w) for w in kept]
    # Single-character leftovers are dropped after stemming.
    x['words'] = [w for w in stemmed if len(w)>1]
    x.pop("text")
    return x

# Every corpus goes through the same parse_text step; the four mappings are
# independent of each other.
parsed_candidates = candidates_raw.map(parse_text)
parsed_all_cong = all_cong.map(parse_text)
parsed_democrats = all_democrats.map(parse_text)
parsed_republicans = all_republicans.map(parse_text)

In [None]:
def create_bigrams(x): 
    '''
    Adds the list of adjacent word pairs (bigrams) to record x.

    x: record dict with a 'words' list. It is modified in place and returned
       with two new entries: 'bigrams' (list of 2-tuples) and 'num_bigrams'.
    Records whose words, joined by single spaces, are shorter than 10
    characters get an empty bigram list and a count of 0.
    '''
    text = " ".join(x['words'])
    if len(text)<10: 
        x['bigrams'] = []
        # Set the count here too so every record has the same keys.
        x['num_bigrams'] = 0
        return x
    tokens = text.split(" ")
    # Pair each token with its successor.
    bigrams = list(zip(tokens[:-1], tokens[1:]))
    x['bigrams'] = bigrams
    x['num_bigrams'] = len(bigrams)
    return x

# Attach bigrams to every parsed corpus; the mappings are independent.
all_cong_bigrams = parsed_all_cong.map(create_bigrams)
candidates_bigrams = parsed_candidates.map(create_bigrams)
democrat_bigrams = parsed_democrats.map(create_bigrams)
republican_bigrams = parsed_republicans.map(create_bigrams)

In [None]:
# For each congressional corpus: count every bigram occurrence and keep the
# (bigram, count) pairs as a list ordered from most to least frequent.
republican_bigrams_counts = sorted(
    republican_bigrams.flatMap(lambda x: x['bigrams']).countByValue().items(),
    key=lambda pair: pair[1], reverse=True)

democrat_bigrams_counts = sorted(
    democrat_bigrams.flatMap(lambda x: x['bigrams']).countByValue().items(),
    key=lambda pair: pair[1], reverse=True)

all_cong_bigrams_counts = sorted(
    all_cong_bigrams.flatMap(lambda x: x['bigrams']).countByValue().items(),
    key=lambda pair: pair[1], reverse=True)

In [None]:
# (bigram, count) pairs for the candidate documents, most frequent first.
candidates_bigrams_counts = sorted(
    candidates_bigrams.flatMap(lambda x: x['bigrams']).countByValue().items(),
    key=lambda pair: pair[1], reverse=True)

In [None]:
# Number of distinct bigrams found in the candidate documents
# (one (bigram, count) entry per distinct bigram).
len(candidates_bigrams_counts)

In [None]:
# Number of distinct bigrams found in the congressional records
# (one (bigram, count) entry per distinct bigram).
len(all_cong_bigrams_counts)

In [None]:
# list of num of documents each bigram appears in 
def drop_duplicates(x): 
    '''
    Reduces x['bigrams'] to its distinct bigrams, keeping first-seen order.

    x: record dict with a 'bigrams' list of hashable items. It is modified in
       place and returned. After this step each bigram appears at most once
       per record, so counting bigrams across records counts records.
    '''
    seen = set()
    unique = []
    for bigram in x['bigrams']:
        if bigram not in seen:
            seen.add(bigram)
            unique.append(bigram)
    x['bigrams'] = unique
    return x

bigrams_nodups = all_cong_bigrams.map(drop_duplicates)
# Flatten the per-record bigram lists and count each value.
bigrams_doc_count = (
    bigrams_nodups
    .flatMap(lambda record: record['bigrams'])
    .countByValue()
    .items()
)

In [None]:
import numpy as np
# (bigram, document count) pairs, highest document count first.
sorted_bigrams_doc_count = sorted(bigrams_doc_count, key=lambda x: x[1], reverse=True)
doc_count = np.array([x[1] for x in sorted_bigrams_doc_count])
# One report line per percentile; the printed text is the same as the ten
# hand-written print statements this loop replaces.
for ptile in (99, 90, 80, 75, 60, 50, 40, 25, 15, 5):
    print("The %dth ptile is %d documents" % (ptile, np.percentile(doc_count, ptile)))

##### Finalizing feature set 
Based on the congressional bigrams (since that will be the training set). 

Parameters: 
- Needs to appear in >15 documents
- Needs to be mentioned 20 times
- Needs to be spoken by 5 unique individuals
- (inactive -- unclear if valuable) If in both top 50 Republican and top 50 Democrat, not worth keeping

In [None]:
from random import shuffle
def create_bigrams_rep_and_map(all_bigrams, dem_bigrams, rep_bigrams, doc_counts):
    '''
    Builds the final bigram feature list and the lookups between bigram and index.

    all_bigrams: list of (bigram, total mention count) pairs. Not modified.
    dem_bigrams, rep_bigrams: per-party (bigram, count) lists. Currently unused:
        the "drop bigrams in the top 50 of both parties" filter is inactive.
    doc_counts: list of (bigram, number of documents containing it) pairs.

    Returns (rv, bigram_to_index, index_to_bigram) where rv is the list of kept
    bigrams in random order, bigram_to_index maps each kept bigram to its
    1-based position in rv, and index_to_bigram is the exact inverse mapping.
    '''
    # Randomise the order on a copy so the caller's list is left untouched.
    candidates = list(all_bigrams)
    shuffle(candidates)
    
    # PARAMETERS
    doc_cutoff = 15
    total_mention_cutoff = 20
    
    # keep only with over cutoff mentions
    rv = [l[0] for l in candidates if l[1]>total_mention_cutoff]
    
    # in more than doc_cutoff documents (set: one hash lookup per bigram
    # instead of scanning the whole list)
    bigrams_over_doc_cutoff = set(l[0] for l in doc_counts if l[1]>doc_cutoff)
    rv = [l for l in rv if l in bigrams_over_doc_cutoff]
    
    # Some of the most common procedural bigrams
    # NOTE(review): (u'presid', u'yeild') and (u'recomit', u'desk') look like
    # misspellings and (u'read', u'title') / (u'house', u'meet') look unstemmed;
    # left unchanged -- confirm whether they can ever match.
    procedural_bigrams = set([(u'remind', u'speaker'), (u'rollcal', u'motion'), (u'hous', u'major'),(u'file', u'amend'),
                          (u'rollcal', u'motion'), (u'introduc', u'resolut'), (u'book', u'call'), (u'speaker', u'rise'),
                          (u'amend', u'minut'), (u'presid', u'speaker'), (u'oppos', u'bill'), (u'speaker', u'democrat'), (u'hous', u'chamber'), 
                          (u'congression', u'poverti'), (u'seventh', u'congression'), (u'amend', u'minut'), (u'opposit', u'call'), 
                          (u'inquir', u'schedul'),  (u'schedul', u'week'),  (u'bill', u'text'),  (u'move', u'suspend'),  (u'purpos', u'clerk'), 
                          (u'mile', u'colorado'),  (u'address', u'floor'),  (u'page', u'regul'),  (u'hollen', u'chairman'),  (u'read', u'title'), 
                          (u'text', u'bill'),  (u'barbara', u'lee'), (u'time', u'ms'), (u'build', u'clerk'), (u'house', u'meet'),(u'meet', u'noon'),
                          (u'congresswoman', u'christensen'), (u'recomit', u'desk'), (u'capitol', u'due'), (u'read', u'titl'),(u'purpos', u'inquir'),
                          (u'progress', u'messag'), (u'leader', u'harri'),(u'yield', u'floor'), (u'due', u'prior'),(u'madam', u'presid'), (u'recommit', u'desk'), 
                          (u'madam', u'speaker'), (u'piec', u'legisl'),(u'speaker', u'thank'),(u'thank', u'chairman'), (u'yield', u'time'), (u'speaker', u'yield'),
                          (u'presid', u'yeild'), (u'amend', u'desk')])
    rv = [l for l in rv if l not in procedural_bigrams]
    # There are some words leftover that are procedural and we should just remove all bigrams that include them 
    # NOTE(review): 'rollcall' and 'house' sit next to 'rollcal' / 'hous' used
    # elsewhere in this function; confirm which spelling the stemmer produces.
    procedural_words = set(['motion', 'speaker', 'dr', 'hour', 'rollcall',
                            'house', 'meet', 'noon', 'repres',
                            'monday', 'hous', 'announc', 'clerk', 'whip',
                            'floor', 'week', 'tuesday', 'pass', 'postpon'])
    rv = [l for l in rv if not any(word in procedural_words for word in l)]

    # Finally, we need an mapping to help us locate the bigrams by their index more easily
    # Both maps use the same 1-based index so index_to_bigram[bigram_to_index[b]] == b.
    bigram_to_index = {}
    index_to_bigram = {}
    for i, bigram in enumerate(rv, 1):
        bigram_to_index[bigram] = i
        index_to_bigram[i] = bigram
    
    return rv, bigram_to_index, index_to_bigram
    

In [None]:
# Build the feature list and its index lookups from the congressional counts.
bigrams_list, bigram_to_index, index_to_bigram = create_bigrams_rep_and_map(
    all_cong_bigrams_counts,
    democrat_bigrams_counts,
    republican_bigrams_counts,
    sorted_bigrams_doc_count,
)

In [None]:
# Size of the feature set at this point.
len(bigrams_list)

We also want to restrict to bigrams that have a certain number of unique speakers. If a bigram were spoken by only a single person (even multiple times), it could cause identification problems. But first we need to filter our congressional speech down to only the bigrams in the above representation.

In [None]:
def filter_features(x):
    '''
    Keeps only the bigrams of record x that are in the global bigrams_list.

    x: record dict with 'bigrams' and 'words' entries. It is modified in place
       and returned: 'features' (the kept bigrams, in their original order and
       with repeats), 'num_bigrams' and 'num_features' are added, and
       'bigrams' and 'words' are removed.
    '''
    bigrams = x['bigrams']
    # One set build per record, then a hash lookup per bigram, instead of a
    # linear scan of bigrams_list for every bigram. Skipped for empty records.
    allowed = set(bigrams_list) if bigrams else ()
    x['features'] = [l for l in bigrams if l in allowed]
    x['num_bigrams'] = len(bigrams)
    x['num_features'] = len(x['features'])
    x.pop('bigrams')
    x.pop('words')
    return x

In [None]:
# Filter every congressional record down to feature bigrams and pull the
# results back as a local list via collect().
temp_features = all_cong_bigrams.map(filter_features).collect()

In [None]:
def filter_on_unique_speakers(bigrams_list, features, cutoff): 
    ''' 
    Drops bigrams with less than <cutoff> unique speakers

    bigrams_list: the candidate bigrams (hashable).
    features: iterable of record dicts, each with a 'speaker' entry and a
              'features' list of bigrams taken from bigrams_list.
    cutoff: minimum number of distinct 'speaker' values required.

    Returns the bigrams meeting the cutoff, in bigrams_list order. Raises
    KeyError if a record contains a bigram that is not in bigrams_list.
    '''
    speakers_by_bigram = {}
    for bg in bigrams_list: 
        speakers_by_bigram[bg] = set()
    for record in features: 
        # Track the speaker's name, not the whole record: comparing records
        # would count separate speeches by one person as separate speakers.
        name = record['speaker']
        for bg in record['features']: 
            speakers_by_bigram[bg].add(name)
    rv = [] 
    emitted = set()
    for bg in bigrams_list: 
        if bg not in emitted and len(speakers_by_bigram[bg])>=cutoff: 
            emitted.add(bg)
            rv.append(bg)
    return rv 

# Bigrams that pass the unique-speaker check with a cutoff of 5.
bigrams_temp = filter_on_unique_speakers(bigrams_list, temp_features, 5)

In [None]:
# Compare the feature count after the unique-speaker filter with the count before it.
print len(bigrams_temp)
print len(bigrams_list)
# NOTE(review): this overwrites the filtered result with the unfiltered list,
# so the unique-speaker filter has no effect on bigrams_list or on anything
# built from it. If the filter is meant to apply, the assignment presumably
# should go the other way (and the index maps be rebuilt) -- confirm intent.
bigrams_temp = bigrams_list

In [None]:
# Share both lookup tables with the workers as broadcast variables.
bc_index_to_bigram = sc.broadcast(index_to_bigram)
bc_bigram_to_index = sc.broadcast(bigram_to_index)
num_bigrams = len(bigrams_list)
print("Our representation has %d bigrams" % num_bigrams)

In [None]:
# Apply the filter features map to the congressional bigrams. Don't collect (hence why can't just use temp_features)
# all_cong_features stays an un-collected RDD; the list that was already
# collected into temp_features is reused under a second name rather than
# collecting again.
all_cong_features = all_cong_bigrams.map(filter_features)
all_cong_features_collected = temp_features

## Collapsing to speaker level + summary statistics
For the congressional bigrams, we don't really care about individual speeches. We want a dataset where each row is a speaker and the columns are all the features. So first, let's collapse it so we have (speaker)-(every feature spoken). We'll later turn this into count vectors. 

Also, good time to check some basic summary statistics

In [None]:
# Collapse speech-level records to one entry per speaker. Chamber, dwnom and
# party are taken from the first record seen for that speaker; the feature
# lists of all of that speaker's records are concatenated.
final_features = {}
for record in all_cong_features_collected:
    entry = final_features.setdefault(record['speaker'], {})
    if 'features' not in entry:
        entry['features'] = []
        entry['chamber'] = record['chamber']
        entry['dwnom'] = record['dwnom1']
        entry['party'] = record['party']
    entry['features'] += record['features']

In [None]:
# Distinct (speaker, party, chamber) combinations across all collected speeches.
ac = all_cong.collect()
all_speakers = list(set(
    (speech['speaker'], speech['party'], speech['chamber']) for speech in ac
))
print("There are %d unique speakers" % len(all_speakers))
print("%d of which are Republican" % sum(1 for l in all_speakers if l[1]==1))

In [None]:
def num_words(x):
    '''Returns the number of whitespace-separated tokens in x['text'].'''
    return len(x['text'].split())
num_speeches = all_cong.count()
total_words = all_cong.map(num_words).reduce(lambda a, b: a+b)
# float() keeps the fractional part; dividing two ints truncates under Python 2.
print("Average words per speech: " + str(total_words/float(num_speeches)))
print("Number of speeches: %d " % num_speeches)

In [None]:
# Length of each speaker's combined feature list. Iterating the values directly
# avoids rebuilding the key list on every step (and indexing keys() only works
# where keys() returns a list).
num_feats = [len(entry['features']) for entry in final_features.values()]
print("Average number of active features per speaker:")
sum(num_feats)/float(len(num_feats))

In [None]:
def num_features(x):
    '''Returns the number of entries in x['features'].'''
    return len(x['features'])
feature_total = all_cong_features.map(num_features).reduce(lambda a, b: a+b)
# float() keeps the fractional part; dividing two ints truncates under Python 2.
print("Average active features per speech: " + str(feature_total/float(all_cong_features.count())))

## Exporting congressional speech data for use in R
To use these features in R, we need to have a consistent 2d representation where each row is a speaker and each column represents a feature -- this way we can store the count of # of times the speaker spoke a given feature

#### Fix the bigrams index to make room for three new columns
We need the 2d representation to have [speaker, party, dwnom] preceding the other features

In [None]:
# Row lookups: position in all_speakers <-> speaker name (first tuple element).
speaker_to_index = {}
index_to_speaker = {}
for row, speaker in enumerate(all_speakers):
    speaker_to_index[speaker[0]] = row
    index_to_speaker[row] = speaker[0]

# Column lookups: bigram indices start at 2, leaving indices 0 and 1 free.
bigram_to_index_2 = dict((bg, col) for col, bg in enumerate(bigrams_list, 2))
index_to_bigram_2 = dict((col, bg) for bg, col in bigram_to_index_2.items())

In [None]:
# Zero-filled table: one row per speaker, two leading columns plus one per bigram.
len_as = len(all_speakers)
len_bg = len(bigrams_list)+2
main = [[0 for _ in range(len_bg)] for _ in range(len_as)]

In [None]:
for speaker, info in final_features.items():
    # An entry whose dwnom is the literal text 'dwnom' is skipped.
    if info['dwnom'] == 'dwnom':
        continue
    row = main[speaker_to_index[speaker]]
    row[0] = int(info['party'])
    row[1] = float(info['dwnom'])
    # Each occurrence of a feature adds one to that feature's column.
    for feature in info['features']:
        row[bigram_to_index_2[feature]] += 1

#### Breaking into covars and feature vectors
Adds column names, generally formats nicely, etc. 

In [None]:
# First two columns are the covariates, the remaining columns the bigram counts.
main_matrix = np.asmatrix(main)
covars = main_matrix[:, :2]
cong2016 = main_matrix[:, 2:]

In [None]:
covars_colnames = ["speaker", "party", "dwnom"]
# Speaker names with periods removed, each wrapped in a one-element list so it
# can be concatenated with that speaker's covariate row.
covars_speaker_names = [[l[0].replace(".", "")] for l in all_speakers]
covars = np.asmatrix(main)[:,:2].tolist()
covars = np.asarray(
    [covars_colnames]
    + [name + values for name, values in zip(covars_speaker_names, covars)]
)

In [None]:
# Header row of "first.second" bigram names followed by the count rows.
cong2016_colnames = [first + "." + second for first, second in bigrams_list]
cong2016_list = [cong2016_colnames] + cong2016.tolist()

In [None]:
import csv
# The with-block closes the file once all rows are written, so buffered rows
# are flushed to disk.
with open("../final_data/cong2016_covars.csv", "wb+") as covars_file:
    f = csv.writer(covars_file)
    for row in covars: 
        f.writerow(row)

In [None]:
# The with-block closes the file once all rows are written, so buffered rows
# are flushed to disk.
with open("../final_data/cong2016_counts.csv", "wb+") as counts_file:
    f2 = csv.writer(counts_file)
    for row in cong2016_list: 
        f2.writerow(row)

## Exporting candidate data for use in R
Similarly, we need to have the candidate data saved in a way we can use in R. 

Instead of having each speaker-date be a row, we want to collapse to speaker-month-features. 

#### Restrict to candidates of interest and combine all ways their name appears
The different data sources have different ways of writing the candidates names. We want to restrict to a certain set of candidates and standardize the name. This dictionary provides a map to do so. 

In [None]:
# (full name, standardized key) for every candidate we keep. Both spellings map
# to the standardized key in candidates_of_interest.
_candidate_aliases = [
    ('Chris Christie', 'christie'),
    ('Hillary Clinton', 'clinton'),
    ('Bernie Sanders', 'sanders'),
    ('Ted Cruz', 'cruz'),
    ('Donald Trump', 'trump'),
    ('Rand Paul', 'paul'),
    ('Jeb Bush', 'bush'),
    ('Marco Rubio', 'rubio'),
    ('John Kasich', 'kasich'),
    ('Ben Carson', 'carson'),
]
candidates_of_interest = {}
for _full_name, _short_name in _candidate_aliases:
    candidates_of_interest[_full_name] = _short_name
    candidates_of_interest[_short_name] = _short_name

In [None]:
candidate_features = candidates_bigrams.map(filter_features)
# Collect locally and skip the first collected record.
cand_feats = candidate_features.collect()[1:]

In [None]:
# Aggregate candidate documents into final_cand_feats[name][date], summing the
# counts and concatenating the "first.second" feature names.
final_cand_feats = {}
for item in cand_feats:
    # Drop press releases. Often linked news articles
    if item['type']=='press releases':
        continue
    # Confirm we care about this speaker and substitute for the correct name
    name = candidates_of_interest.get(item['speaker'])
    if name is None:
        continue
    # The first two characters of the stripped date are dropped.
    date = item['date'].strip()[2:]
    # Fetch (or create) the bucket for this candidate and date
    bucket = final_cand_feats.setdefault(name, {}).setdefault(date, {})
    if 'features' not in bucket:
        bucket['features'] = []
        bucket['num_bigrams'] = 0
        bucket['num_features'] = 0
    bucket['features'] += [l[0]+"."+l[1] for l in item['features']]
    bucket['num_bigrams'] += item['num_bigrams']
    bucket['num_features'] += item['num_features']

In [None]:
# List of all candidate-date combos
from datetime import datetime
# Distinct "<speaker>_<month>-<year>" labels across all candidate dates.
all_cand_dates = set()
for speaker, by_date in final_cand_feats.items():
    for date in by_date:
        d = datetime.strptime(date[:8], "%y-%m-%d")
        # turn to a monthly date
        date_monthly = str(d.month) + "-" + str(d.year)
        all_cand_dates.add(speaker+"_"+date_monthly)
all_cand_dates = list(all_cand_dates)

In [None]:
# Intialize and fill main array 
len_cd = len(all_cand_dates)
len_feats = len(cong2016_list[0])
main_cand = [[0]*len_feats for _ in range(len_cd)]

# Position lookups built once, replacing a list.index() scan for every feature
# occurrence. setdefault keeps the first position, as list.index() would.
row_of = {}
for pos, label in enumerate(all_cand_dates):
    row_of.setdefault(label, pos)
col_of = {}
for pos, colname in enumerate(cong2016_list[0]):
    col_of.setdefault(colname, pos)

for speaker in final_cand_feats:
    for date in final_cand_feats[speaker]:
        d = datetime.strptime(date[:8], "%y-%m-%d")
        date_monthly = str(d.month) + "-" + str(d.year)
        key = speaker+"_"+date_monthly
        ind_r = row_of[key]
        for feat in final_cand_feats[speaker][date]['features']:
            # Features that are not a column of the congressional table are ignored.
            ind_c = col_of.get(feat)
            if ind_c is not None:
                main_cand[ind_r][ind_c]+=1

In [None]:
# Add header of bigram names
main_cand_final = [cong2016_list[0]] + main_cand
# Sanity check: one header row plus one row per candidate-month label.
assert len(main_cand_final) == len_cd + 1

#### Export to R

In [None]:
# convert to csv 
# The with-block closes the file once all rows are written, so buffered rows
# are flushed to disk.
with open("../final_data/pres_cand_counts.csv", "wb+") as cand_counts_file:
    f3 = csv.writer(cand_counts_file)
    for row in main_cand_final: 
        f3.writerow(row)

In [None]:
# Create csv of Candidate - Date that aligns with above 
# One (candidate, month-year) row per label, in all_cand_dates order. The
# with-block closes the file once all rows are written.
with open("../final_data/cand_date_labels.csv", "wb+") as labels_file:
    f4 = csv.writer(labels_file)
    for cand in all_cand_dates: 
        rv = cand.split("_")
        f4.writerow([rv[0], rv[1]])

### All speech

In [None]:
# List of all candidates
# Rebinds all_speakers to the candidate keys, replacing whatever it held before.
all_speakers = list(final_cand_feats)

In [None]:
# Intialize and fill main array 
# Zero-filled table: one row per candidate, one column per congressional feature.
len_feats = len(cong2016_list[0])
len_cd = len(all_speakers)
main_cand = [[0 for _ in range(len_feats)] for _ in range(len_cd)]

In [None]:
# Position lookups built once, replacing a list.index() scan for every feature
# occurrence. setdefault keeps the first position, as list.index() would.
row_of = {}
for pos, cand_name in enumerate(all_speakers):
    row_of.setdefault(cand_name, pos)
col_of = {}
for pos, colname in enumerate(cong2016_list[0]):
    col_of.setdefault(colname, pos)

for speaker in final_cand_feats:
    # All dates of a candidate go into the same row.
    ind_r = row_of[speaker]
    for date in final_cand_feats[speaker]:
        for feat in final_cand_feats[speaker][date]['features']:
            # Features that are not a column of the congressional table are ignored.
            ind_c = col_of.get(feat)
            if ind_c is not None:
                main_cand[ind_r][ind_c]+=1

In [None]:
# Add header of bigram names above the per-candidate count rows.
main_cand_final = [cong2016_list[0]] + main_cand

In [None]:
# convert to csv 
# The with-block closes the file once all rows are written, so buffered rows
# are flushed to disk.
with open("../final_data/candidates_all_speech.csv", "wb+") as all_speech_file:
    f3 = csv.writer(all_speech_file)
    for row in main_cand_final: 
        f3.writerow(row)

In [None]:
# Create csv of candidate names (one per row, no date), in all_speakers order.
# The with-block closes the file once all rows are written.
with open("../final_data/candidates_all_speech_labels.csv", "wb+") as speech_labels_file:
    f4 = csv.writer(speech_labels_file)
    for cand in all_speakers: 
        f4.writerow([cand])