In [1]:
from urllib.parse import urlparse

import pandas as pd
from bs4 import BeautifulSoup
from scipy.stats import ttest_rel

from helpers import *

Objectives 
- compare CA vs. NY results


#### Read in queries df 

In [2]:
# Load the query spreadsheet (member metadata plus the query1..query40 columns).
d = pd.read_excel('data/queries.xlsx')
# Trim stray whitespace from text cells only. `.str.strip()` returns NaN for every
# non-string value in an object column, which would silently blank out numbers in
# mixed-type columns, so strip element-wise and pass non-strings through untouched.
d = d.apply(
    lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
    if col.dtype == "object"
    else col
)



In [3]:
# Preview the first two rows to sanity-check the loaded columns.
d.head(2)

Unnamed: 0,id,title,short_title,api_uri,first_name,middle_name,last_name,suffix,date_of_birth,gender,...,query31,query32,query33,query34,query35,query36,query37,query38,query39,query40
0,A000374,Representative,Rep.,https://api.propublica.org/congress/v1/members...,Ralph,,Abraham,,1954-09-16,M,...,LA 5 district name,Louisiana 5 district name,LA 5 district rep name,Louisiana 5 district rep name,LA 5 district representative name,Louisiana 5 district representative name,LA 5 district congress name,Louisiana 5 district congress name,LA 5 district congressperson name,Louisiana 5 district congressperson name
1,G000577,Representative,Rep.,https://api.propublica.org/congress/v1/members...,Garret,,Graves,,1972-01-31,M,...,LA 6 district name,Louisiana 6 district name,LA 6 district rep name,Louisiana 6 district rep name,LA 6 district representative name,Louisiana 6 district representative name,LA 6 district congress name,Louisiana 6 district congress name,LA 6 district congressperson name,Louisiana 6 district congressperson name


In [4]:
# Keep only the rows whose `state` column is exactly 'CA'.
ca_df = d.loc[d['state'].eq('CA')]

In [7]:
#ca_exp_df_15 = createDiststrictDivDf(ca_df, '1_15_CA')

In [6]:
# Build the result frame for the California members from the run tagged '1_14_CA'.
# NOTE(review): createDiststrictDivDf comes from `helpers` (star import), so its
# exact output is not visible here; the ids printed below suggest it logs each
# member id as it is processed — confirm.
ca_exp_df_14 = createDiststrictDivDf(ca_df, '1_14_CA')

A000371 B001300 B001270 B001287 B001285 C000059 C001112 C001097 C001080 C001123 C001094 C001110 C001059 C001124 D000598 D000623 E000215 G000559 G000585 H001090 H001068 H001048 K000389 L000578 L000551 L000593 L000582 L000397 L000579 M001163 M001165 M001177 M001166 N000179 N000181 P000613 P000197 P000608 P000618 R000616 R000486 R000599 S001156 S001150 S000344 S001175 S001193 T000472 T000460 T000474 V000130 W000187 

In [None]:
# Same helper, run tagged '1_15_NY'.
# NOTE(review): this passes `ca_df` together with an NY tag — presumably the same
# California members' queries issued from a New York location; confirm that this
# is intentional and not a copy-paste slip.
ny_exp_df = createDiststrictDivDf(ca_df, '1_15_NY')

### Let's fix the createTextDf function to take into account accents and other Unicode characters

In [8]:
# Extract the text of each result for the '1_14_CA' run (createTextDf is a `helpers` function).
ca_exp_text_14 = createTextDf(ca_exp_df_14)

In [37]:
# Spot-check one member (row 1): for every query, print the extracted text and
# whether the accented surname survived text extraction.
# Iterating over the row itself avoids the hard-coded query count and the
# ambiguous integer `Series[i]` lookup (label-based vs. positional).
sample_row = ca_exp_text_14.iloc[1]
for i, result_text in enumerate(sample_row):
    print(i)
    print(result_text)
    print('Barragán' in result_text)
    

0
Los Angeles CountyCalifornia's 44th congressional district is a congressional district in the U.S. state of California. The district is centered in South Los Angeles and the Los Angeles Harbor Region. It is currently represented by Democrat Nanette Barragán. Wikipedia
True
1
California's 44th congressional district - Wikipediahttps://en.wikipedia.org › wiki › California's_44th_congressional_districtCalifornia's 44th congressional district is a congressional district in the U.S. state of California. The district is centered in South Los Angeles and the Los Angeles ...
List of members ... · Election results · 1998 (Special) · 2000
False
2
See results aboutCalifornia’s 44th congressional districtCalifornia's 44th congressional district is a congressional district in the U.S. state of California. The district is centered in South Los Angeles and the Los Angeles Harbor Region. It is currently represented by Democrat Nanette Barragán.
True
3
California's 44th congressional district - Wikip

True

In [None]:
# Extract result text for the CA run.
# NOTE(review): `ca_exp_df` is not defined in any visible cell — only
# `ca_exp_df_14` is, and the `ca_exp_df_15` cell is commented out. Confirm which
# run this should read before re-running the notebook on a fresh kernel.
ca_exp_text = createTextDf(ca_exp_df)
# NOTE(review): `ny_exp_text` is used by later cells, but its assignment is commented out here.
#ny_exp_text = createTextDf(ny_exp_df)

Concatenating the two dataframes of first-result text for each query, and then trying to drop the duplicates --> no rows are dropped, indicating that either 1) there are a lot of differences or 2) text is not a good thing to compare.

In [None]:
# Stack the CA and NY text frames on top of each other so identical rows can be detected.
exp_text = pd.concat([ca_exp_text, ny_exp_text])

In [None]:
# Row/column count before de-duplication.
exp_text.shape

In [None]:
# Row/column count after dropping rows that are identical across every column.
exp_text.drop_duplicates().shape

What happens when we try with URLs? Answer: D000598 is the only candidate who is dropped.

In [None]:
# Build the URL frames for both runs (creatURLDf is a `helpers` function).
# NOTE(review): `ca_exp_df` is not defined in any visible cell — confirm the intended run.
ca_exp_url = creatURLDf(ca_exp_df)
ny_exp_url = creatURLDf(ny_exp_df)

In [None]:
# Stack both URL frames, drop rows that are identical in CA and NY, and move the
# index into a regular 'index' column so it can be grouped on.
urls = (
    pd.concat([ca_exp_url, ny_exp_url])
    .drop_duplicates()
    .reset_index(level=0)
)

# Count surviving rows per index value; the smallest counts come first.
urls.groupby('index').count().sort_values('query1').head(2)

From these first two analyses, it seems possible that there are really distinct differences -- but some columns may rarely have the representative's name; therefore they are prime candidates to be influenced by geolocation.
Let's find out which queries rarely turn up the representative's name.

In [None]:
# Flag, per cell, whether the representative's name shows up in the result text.
# NOTE(review): createBoolNameDf lives in `helpers`; presumably it returns one
# True/False per (member, query) — confirm.
ca_exp_rep = createBoolNameDf(ca_exp_text)
ny_exp_rep = createBoolNameDf(ny_exp_text)

In [None]:
# Preview the first two rows of the CA name-flag frame.
ca_exp_rep.head(2)

In [None]:
# NY run: column totals, smallest first — the queries that least often surface the name.
ny_exp_rep.sum(axis=0).sort_values().head()

In [None]:
# CA run: column totals, smallest first — the queries that least often surface the name.
ca_exp_rep.sum(axis=0).sort_values().head()

It seems that we can't really eliminate any of the 40 queries, and that prompts me to think that I should solicit more queries ASAP from people. 

Moving on for now... let's see how consistently across locations the rep name pops up

In [None]:
# Row totals per run: row 0 holds the CA sums, row 1 the NY sums; one column per index value.
candidate_counts = pd.DataFrame(data = [ca_exp_rep.sum(axis=1), ny_exp_rep.sum(axis=1)])

In [None]:
# Display the two rows of totals (row 0 = CA, row 1 = NY).
candidate_counts

In [None]:
# Summary statistics of the element-wise difference, CA total minus NY total.
(candidate_counts.loc[0]- candidate_counts.loc[1]).describe()

There doesn't seem to be much difference in the results. It seems that there are slightly more results in NY than CA, but it is not statistically significant.

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Every member is measured twice — once in the CA run and once in the NY run — so
# the two samples are paired, not independent. Use the paired (related-samples)
# t-test, which tests whether the mean per-member difference is zero.
ttest_rel(candidate_counts.loc[0], candidate_counts.loc[1])

## Ok, what about the wrong rep names showing up? 

Here are our ground rules for figuring out when other names pop-up: 
    - A top result is considered incorrect when it would lead a reasonable user to think that someone other than their elected representative is their representative. 
    - We manually confirm each incorrect name

To implement this, let's make a few (imperfect) assumptions:
- If someone else's name is going to appear, it's probably going to be another elected rep's name
- If the elected rep's name appears in the top result, or no elected rep's name appears, let's assume it is not an incorrect result

In [10]:
# Array of every member's last name, used as the lookup list by hasRepName.
rep_names = d['last_name'].values
def hasRepName(div_text, names=None):
    '''Return the set of known last names that occur as words in `div_text`.

    Parameters
    ----------
    div_text : str
        Text of one search result. Anything that is not a string (None, NaN)
        is treated as empty and yields an empty set.
    names : iterable of str, optional
        Last names to look for. Defaults to the module-level `rep_names`.

    Returns
    -------
    set of str
        Every name from `names` found as a whitespace-separated word, after
        trimming punctuation attached to the word's edges.

    Notes
    -----
    Two known false positives are suppressed for the whole text: "Cook" when
    the text mentions "Cook PVI", and "Costa" when it mentions "Contra Costa".
    '''
    # Cells without extracted text cannot mention anyone.
    if not isinstance(div_text, str):
        return set()

    if names is None:
        names = rep_names
    # Build the lookup once per call instead of scanning the array for every word.
    known = set(names)

    # Place/index names that merely contain a member's surname.
    mentions_cook_pvi = 'Cook PVI' in div_text
    mentions_contra_costa = 'Contra Costa' in div_text

    # Punctuation that commonly sticks to a name ("Barragán.", "(Long)", "Costa,").
    edge_chars = '.,;:!?"\'()[]{}·…“”‘’'

    found = set()
    for token in div_text.split():
        word = token.strip(edge_chars)
        if not word or word not in known:
            continue
        if word == 'Cook' and mentions_cook_pvi:
            continue
        if word == 'Costa' and mentions_contra_costa:
            continue
        found.add(word)
    return found

The dataframe below holds all of the rep names that appear in each result.

In [11]:
# Run hasRepName on every cell, giving one set of detected last names per (member, query).
# NOTE(review): `applymap` is deprecated in recent pandas in favour of `DataFrame.map`;
# switch once the minimum supported pandas version allows it.
ca_rep_names = ca_exp_text_14.applymap(hasRepName)
# NOTE(review): `ny_rep_names` is displayed by a later cell but is never assigned while this stays commented out.
#ny_rep_names = ny_exp_text.applymap(hasRepName)

In [13]:
# Persist the detected-name sets to a CSV file in the working directory.
ca_rep_names.to_csv('ca_14_has_rep_name.csv')

In [None]:
# NOTE(review): `ny_rep_names` is not assigned in any visible cell (its assignment
# is commented out), so this raises NameError on a fresh kernel.
ny_rep_names.head(2)

In [32]:
def otherNamesDf(rep_names_df):
    '''Blank out every cell that already contains the row member's own last name.

    `rep_names_df` is indexed by member id and holds one set of detected last
    names per cell. A cell whose set includes the member's own name (as decided
    by `isNameInSet`) is replaced with an empty set; every other cell is kept
    as is. The result has the member id moved into a regular column via
    `reset_index()`.
    '''
    cleaned = {
        cong_id: [set() if isNameInSet(found, cong_id) else found for found in row]
        for cong_id, row in rep_names_df.iterrows()
    }
    return pd.DataFrame.from_dict(cleaned, orient='index').reset_index()

In [22]:
def isNameInSet(s, cong_id):
    '''Return True when the last name of member `cong_id` is contained in `s`.

    The last name is read from the first row of the global frame `d` whose
    `id` equals `cong_id`; an unknown id raises IndexError.
    '''
    member_rows = d[d['id'] == cong_id]
    surname = member_rows['last_name'].iloc[0]
    return surname in s

In [33]:
# Show the names other than the member's own that were detected in each result;
# the commented-out suffix exports the same table to Excel for manual checking.
otherNamesDf(ca_rep_names)#.to_excel('check_1_14_CA.xlsx')

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,A000371,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
1,B001300,{},{},{},{},{},{},{},{},{},...,{Long},{Long},{},{},{},{},{},{Long},{},{}
2,B001270,{},"{Scott, Underwood}",{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
3,B001287,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
4,B001285,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
5,C000059,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
6,C001112,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
7,C001097,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
8,C001080,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
9,C001123,{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}


In [28]:
# NOTE(review): repeats the previous cell's call; the saved output below was produced
# by an earlier version of otherNamesDf (member id still in the index).
otherNamesDf(ca_rep_names)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
A000371,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
B001300,{},{},{},{},{},{},{},{},{},{},...,{Long},{Long},{},{},{},{},{},{Long},{},{}
B001270,{},"{Scott, Underwood}",{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
B001287,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
B001285,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
C000059,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
C001112,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
C001097,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
C001080,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
C001123,{},{},{},{},{},{},{},{},{},{},...,{},{},{},{},{},{},{},{},{},{}
