In [78]:
import numpy as np
import pandas as pd
pd.set_option("max_colwidth", 200)

import json
import re

import spacy
from spacy.language import Language
from spacy.lang.en.stop_words import STOP_WORDS

from tqdm.notebook import tqdm

from gen_funcs_fb import *
import warnings
warnings.filterwarnings("ignore")

### Importing data and applying labels.

Preparing FB data to make predictions on.

In [79]:
fb_data = pd.read_csv('/Users/christineegan/spacy_workspace/data/facebook/en-US.csv')
print('# of observations:', len(fb_data))

# of observations: 222186


### Identifying Most Common Payers

In [80]:
# figuring out if the person who is paying for the ad is con or lib
payer_vals = pd.DataFrame(fb_data['paid_for_by'].value_counts())

# limiting the list to only those payers who places 100 or more ads 
payer_vals = payer_vals[payer_vals['paid_for_by'] >= 100]
payers = list(payer_vals.index)
# payers = [payer.lower() for payer in payers]
# payers = [payer.lower() for payer in payers]
payers = list(set(payers)) # dropping any exact duplicates
print(len(payers), 'different payers with more than 100 ads.\n')
# pprint(payers)

264 different payers with more than 100 ads.



In [81]:
# fb_data['in_payers'] = fb_data['paid_for_by'].apply(lambda x: 1 if x in payers else 0)
# data = fb_data.loc[fb_data['in_payers'] == 1]
# print(len(data), 'different ads from payers with more than 100 ads.\n')

In [82]:
fb_data['paid_for_by'] = fb_data['paid_for_by'].fillna('Missing')
fb_payers = list(fb_data['paid_for_by'])
fb_payers = [payer.lower() for payer in fb_payers if payer.lower() in payers]
fb_data['in_payers'] = fb_data['paid_for_by'].apply(lambda x: 1 if x.lower() in fb_payers else 0)

In [83]:
data = fb_data.loc[fb_data['in_payers'] == 1]
print(len(data), 'different ad from payers with more than 100 ads.\n')

266 different ad from payers with more than 100 ads.



### Creating a Payers DataFrame

In [84]:
# preprocessing

# I decided not to clean these because it messes up the NER
payer_df = pd.DataFrame(payers, columns=['payers'])

nlp = spacy.load('en_core_web_md')
docs = [nlp(doc) for doc in payer_df['payers']] # a list of nlp.docs
lemmatized_docs = [[n.lemma_ for n in doc 
                    if n.is_stop == False  # removing stops
                    and n.is_punct == False # ...punctuation
                    and n.is_digit == False # ...digits
                    and len(str(n)) >= 3]  # ...less than 3 letters
                   for doc in docs]  # list of lemmatized nlp docs
nes = [[ent.text for ent in doc.ents] for doc in docs]
ne_labels = [[ent.label_ for ent in doc.ents] for doc in docs]

payer_df['docs'] = docs
payer_df['lemmatized_docs'] = lemmatized_docs
payer_df['named_ents'] = nes
payer_df['ne_labels'] = ne_labels

In [85]:
docs # sanity check

[World Wildlife Fund,
 Everytown for Gun Safety Action Fund,
 AIDS United, Inc.,
 Covenant House International,
 Mike Gravel 2020,
 Cory 2020,
 END CITIZENS UNITED,
 HICKENLOOPER FOR COLORADO,
 the Republican National Committee,
 Feeding America,
 Shareblue Media,
 Montanans for Tester,
 Bee Site,
 Oxfam America,
 DONALD J. TRUMP FOR PRESIDENT, INC.,
 Greenpeace USA,
 National Domestic Workers Alliance,
 Beto for Texas,
 Independence USA. Not authorized by any candidate or candidate's committee.,
 UNICEF USA,
 Ben & Jerry's,
 America First Policies,
 Bullock for President,
 DNC SERVICES CORP./DEM. NAT'L COMMITTEE,
 MoveOn.org Civic Action.,
 the Trump Make America Great Again Committee,
 BOOKER FOR KENTUCKY,
 SIERRA CLUB,
 Indivisible Action,
 UNITED STATES ASSOCIATION FOR UNHCR,
 PredictIt is a political prediction market, a stock market for politics. A project of Victoria University of Wellington, P…,
 314 ACTION FUND,
 Zephyr for NY,
 Working Families Party,
 Reason To Believe PAC,


In [86]:
payer_df['original'] = payer_df['payers']
payer_df['payers'] = payer_df['payers'].str.lower()
payer_df = payer_df.drop_duplicates(subset=['payers'])
payer_df = payer_df.sort_values(by='named_ents').reset_index().drop('index', axis=1)
# display(payer_df)

# there seem to be a lot of empty ones
payer_df['label'] = payer_df['named_ents'].apply(lambda x: x[0] if len(x) != 0 else 'missing')
# display(payer_df)

In [87]:
payer_df = payer_df.drop_duplicates(subset=['payers'])
unknown_payers = payer_df.loc[payer_df['label'] == 'missing']
known_payers = payer_df.loc[payer_df['label'] != 'missing']
print('# of unknown payers:', len(unknown_payers)) # i'll call these 'unknown payers' and segment them to work with them.
print('# of known payers:', len(known_payers))

# of unknown payers: 66
# of known payers: 187


### Known payers

In [88]:
display(known_payers)

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
66,bernie 2020,"(BERNIE, 2020)",[BERNIE],[2020],[DATE],BERNIE 2020,2020
67,hickenlooper 2020,"(Hickenlooper, 2020)",[Hickenlooper],[2020],[DATE],Hickenlooper 2020,2020
68,cory 2020,"(Cory, 2020)",[Cory],[2020],[DATE],Cory 2020,2020
69,gillibrand 2020,"(Gillibrand, 2020)",[Gillibrand],[2020],[DATE],Gillibrand 2020,2020
70,314 action fund,"(314, ACTION, FUND)","[action, FUND]",[314],[CARDINAL],314 ACTION FUND,314
...,...,...,...,...,...,...,...
248,the national democratic training committee,"(the, National, Democratic, Training, Committee)","[National, Democratic, Training, Committee]",[the National Democratic Training Committee],[ORG],the National Democratic Training Committee,the National Democratic Training Committee
249,the national network of abortion funds,"(the, National, Network, of, Abortion, Funds)","[National, Network, Abortion, Funds]",[the National Network of Abortion Funds],[ORG],the National Network of Abortion Funds,the National Network of Abortion Funds
250,the progressive change campaign committee pac and not authorized by any candidate or candidate's committee. contributions …,"(the, Progressive, Change, Campaign, Committee, PAC, and, not, authorized, by, any, candidate, or, candidate, 's, committee, ., Contributions, …)","[Progressive, Change, Campaign, Committee, PAC, authorize, candidate, candidate, committee, contribution]",[the Progressive Change Campaign Committee],[ORG],the Progressive Change Campaign Committee PAC and not authorized by any candidate or candidate's committee. Contributions …,the Progressive Change Campaign Committee
251,the republican national committee,"(the, Republican, National, Committee)","[Republican, National, Committee]",[the Republican National Committee],[ORG],the Republican National Committee,the Republican National Committee


In [89]:
# some are wrong, so we should deal with those
# 2020
known_payers['label'] = known_payers['label'].apply(lambda x: x.lower())
known_payers['label'] = known_payers['label'].str.replace(r'(2020.*$)', 'missing', regex=True)

In [90]:
known_payers['label'] = known_payers['label'].str.replace(r'(^.*democrat.*$)', 'lib', regex=True)
known_payers['label'] = known_payers['label'].str.replace(r'(^.*republican.*$)', 'con', regex=True)
known_payers['label'] = known_payers['label'].str.replace(r'(^.*progressive.*$)', 'lib', regex=True)

In [91]:
known_payers

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
66,bernie 2020,"(BERNIE, 2020)",[BERNIE],[2020],[DATE],BERNIE 2020,missing
67,hickenlooper 2020,"(Hickenlooper, 2020)",[Hickenlooper],[2020],[DATE],Hickenlooper 2020,missing
68,cory 2020,"(Cory, 2020)",[Cory],[2020],[DATE],Cory 2020,missing
69,gillibrand 2020,"(Gillibrand, 2020)",[Gillibrand],[2020],[DATE],Gillibrand 2020,missing
70,314 action fund,"(314, ACTION, FUND)","[action, FUND]",[314],[CARDINAL],314 ACTION FUND,314
...,...,...,...,...,...,...,...
248,the national democratic training committee,"(the, National, Democratic, Training, Committee)","[National, Democratic, Training, Committee]",[the National Democratic Training Committee],[ORG],the National Democratic Training Committee,lib
249,the national network of abortion funds,"(the, National, Network, of, Abortion, Funds)","[National, Network, Abortion, Funds]",[the National Network of Abortion Funds],[ORG],the National Network of Abortion Funds,the national network of abortion funds
250,the progressive change campaign committee pac and not authorized by any candidate or candidate's committee. contributions …,"(the, Progressive, Change, Campaign, Committee, PAC, and, not, authorized, by, any, candidate, or, candidate, 's, committee, ., Contributions, …)","[Progressive, Change, Campaign, Committee, PAC, authorize, candidate, candidate, committee, contribution]",[the Progressive Change Campaign Committee],[ORG],the Progressive Change Campaign Committee PAC and not authorized by any candidate or candidate's committee. Contributions …,lib
251,the republican national committee,"(the, Republican, National, Committee)","[Republican, National, Committee]",[the Republican National Committee],[ORG],the Republican National Committee,con


In [92]:
# if the word 'for' appears in the payers info, it is likely a candidate, so we can check for candidates
# and their affliations by checking out payers that contain 'for'
candidates_for = known_payers[known_payers['payers'].str.contains('for')]

display(candidates_for)

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
73,"amy mcgrath for senate, inc.","(AMY, MCGRATH, FOR, SENATE, ,, INC, .)","[AMY, MCGRATH, SENATE, INC]","[AMY MCGRATH, SENATE, INC]","[PERSON, ORG, ORG]","AMY MCGRATH FOR SENATE, INC.",amy mcgrath
74,adam schiff for congress,"(Adam, Schiff, for, Congress)","[Adam, Schiff, Congress]","[Adam Schiff, Congress]","[PERSON, ORG]",Adam Schiff for Congress,adam schiff
75,alexandria ocasio-cortez for congress,"(Alexandria, Ocasio, -, Cortez, for, Congress)","[Alexandria, Ocasio, Cortez, Congress]","[Alexandria Ocasio-Cortez, Congress]","[PERSON, ORG]",Alexandria Ocasio-Cortez for Congress,alexandria ocasio-cortez
76,amy for america,"(Amy, for, America)","[Amy, America]",[America],[GPE],Amy for America,america
81,americans for prosperity,"(Americans, for, Prosperity)","[Americans, prosperity]",[Americans],[NORP],Americans for Prosperity,americans
...,...,...,...,...,...,...,...
235,"predictit is a political prediction market, a stock market for politics. a project of victoria university of wellington, p…","(PredictIt, is, a, political, prediction, market, ,, a, stock, market, for, politics, ., A, project, of, Victoria, University, of, Wellington, ,, P, …)","[PredictIt, political, prediction, market, stock, market, politic, project, Victoria, University, Wellington]",[Victoria University of Wellington],[ORG],"PredictIt is a political prediction market, a stock market for politics. A project of Victoria University of Wellington, P…",victoria university of wellington
236,"warren for president, inc.","(WARREN, FOR, PRESIDENT, ,, INC, .)","[WARREN, PRESIDENT, INC]","[WARREN, INC]","[ORG, ORG]","WARREN FOR PRESIDENT, INC.",warren
237,warren for president,"(Warren, for, President)","[Warren, President]",[Warren],[ORG],Warren for President,warren
238,"winning for women, inc.","(Winning, For, Women, ,, Inc.)","[win, Women, Inc.]","[Winning For Women, Inc.]",[ORG],"Winning For Women, Inc.","winning for women, inc."


In [93]:
# from that i extracted some known libs and cons
libs = ['elizabeth', 'nancy pelosi', 'hickenlooper', 'ammar campa-najjar', 
        'inslee for america', 'jaime harrison', 'adam schiff', 'mark kelly',
        'andrew cuomo', 'sinema', 'bill nelson', 'marianne williamson', 
        'dean phillips', 'jon ossoff', 'giffords', 'mikie sherrill', 'warren',
        'stacey abrams', 'gillibrand 2020', 'jeff merkley', 'pete for america',
        'hickenlooper 2020', 'seth moulton', 'biden', 'mike bloomberg 2020 inc',
        'andy kim', 'sara gideon', 'doug jones', 'bernie 2020', 'jb', 'kamala harris',
        'zephyr', 'cory booker', 'alexandria ocasio-cortez', 'beto', 'bernie 2020.',
        'randy bryce', 'amy mcgrath', 'sharice', 'lupe valdez', 'mccaskill', 'gillibrand',
        'mike gravel 2020', 'cory 2020', 'andrew gillum', 'joe kennedy', 'kim schrier',
        'tom steyer 2020', 'andrew janz']

cons = ['donald j. trump', 'cathy', 'tester']

In [94]:
candidates_for['label'] = candidates_for['label'].apply(lambda x: 'lib' if x.lower() in libs else x)
candidates_for['label'] = candidates_for['label'].apply(lambda x: 'con' if x.lower() in cons else x)

In [95]:
# the congress label people
candidates_for.loc[candidates_for['payers'] == 'schiff for congress', 'label'] = 'lib'
candidates_for.loc[candidates_for['payers'] == 'mccready for congress', 'label'] = 'lib'
candidates_for.loc[candidates_for['payers'] == 'o’connor for congress', 'label'] = 'con'

In [96]:
candidates_for.label.value_counts()

lib                                         48
con                                          3
everytown for gun safety action fund         2
montanans                                    2
national trust for historic preservation     1
north dakota                                 1
americans                                    1
bennet for america                           1
united states association                    1
winning for women, inc.                      1
better minnesota action fund                 1
everytown for gun safety action fund inc     1
america                                      1
better place forests                         1
concerned veterans for america               1
tony                                         1
victoria university of wellington            1
center for biological diversity              1
kentucky                                     1
cal                                          1
authorized                                   1
texas        

In [97]:
orgs = ['united states association', 'victoria university of wellington', 'center for biological diversity',
       'concerned veterans for america', 'concerned veterans for america', 'better minnesota action fund',
       'everytown for gun safety action fund inc', 'better place forests', 'winning for women, inc.', 
       'reform austin', 'national trust for historic preservation', 'everytown for gun safety action fund']

In [98]:
candidates_for['label'] = candidates_for['label'].apply(lambda x: 'org' if x in orgs else x)

In [99]:
candidates_for.label.value_counts()
candidates_for[(candidates_for['label'] != 'org') & (candidates_for['label'] != 'lib') & (candidates_for['label'] != 'con')]

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
76,amy for america,"(Amy, for, America)","[Amy, America]",[America],[GPE],Amy for America,america
81,americans for prosperity,"(Americans, for, Prosperity)","[Americans, prosperity]",[Americans],[NORP],Americans for Prosperity,americans
90,and authorized by andrew janz for congress.,"(and, Authorized, by, , Andrew, Janz, for, Congress, .)","[authorize, Andrew, Janz, Congress]","[Authorized, Andrew Janz, Congress]","[ORG, PERSON, ORG]",and Authorized by Andrew Janz for Congress.,authorized
97,bennet for america,"(Bennet, for, America)","[Bennet, America]",[Bennet for America],[ORG],Bennet for America,bennet for america
105,cal for nc,"(CAL, FOR, NC)",[CAL],"[CAL, NC]","[ORG, GPE]",CAL FOR NC,cal
162,booker for kentucky,"(BOOKER, FOR, KENTUCKY)","[BOOKER, KENTUCKY]",[KENTUCKY],[GPE],BOOKER FOR KENTUCKY,kentucky
176,montanans for bullock,"(Montanans, for, Bullock)","[Montanans, Bullock]",[Montanans],[NORP],Montanans for Bullock,montanans
177,montanans for tester,"(Montanans, for, Tester)","[Montanans, Tester]","[Montanans, Tester]","[NORP, GPE]",Montanans for Tester,montanans
191,heidi for north dakota,"(Heidi, for, North, Dakota)","[Heidi, North, Dakota]",[North Dakota],[GPE],Heidi for North Dakota,north dakota
219,mj for texas,"(MJ, for, Texas)",[Texas],[Texas],[GPE],MJ for Texas,texas


In [100]:
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*american.*$)', 'org', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*bennet.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*america.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*cal.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*kentucky.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*montanans.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*north dakota.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*texas.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*tony.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*usa.*$)', 'lib', regex=True)
candidates_for['label'] = candidates_for['label'].str.replace(r'(^.*authorized.*$)', 'lib', regex=True)

In [101]:
# concating the labeled candidates with the known_payers
candidates = list(candidates_for['payers'])
inverse_boolean_series = ~known_payers['payers'].isin(candidates)
known_payers = known_payers[inverse_boolean_series]
known_payers = pd.concat([known_payers, candidates_for], axis=0)
print(len(known_payers)) # 

187


In [102]:
# isolating the unlabeled payers in known payers
unlabeled_payers = known_payers[(known_payers['label'] != 'org') & (known_payers['label'] != 'lib') & (known_payers['label'] != 'con')]
display(unlabeled_payers)

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
66,bernie 2020,"(BERNIE, 2020)",[BERNIE],[2020],[DATE],BERNIE 2020,missing
67,hickenlooper 2020,"(Hickenlooper, 2020)",[Hickenlooper],[2020],[DATE],Hickenlooper 2020,missing
68,cory 2020,"(Cory, 2020)",[Cory],[2020],[DATE],Cory 2020,missing
69,gillibrand 2020,"(Gillibrand, 2020)",[Gillibrand],[2020],[DATE],Gillibrand 2020,missing
70,314 action fund,"(314, ACTION, FUND)","[action, FUND]",[314],[CARDINAL],314 ACTION FUND,314
...,...,...,...,...,...,...,...
240,world food program usa,"(World, Food, Program, USA)","[World, Food, Program, USA]",[World Food Program USA],[ORG],World Food Program USA,world food program usa
241,world wildlife fund,"(World, Wildlife, Fund)","[World, Wildlife, Fund]",[World Wildlife Fund],[ORG],World Wildlife Fund,world wildlife fund
242,yale program on climate change communication,"(Yale, Program, on, Climate, Change, Communication)","[Yale, Program, Climate, Change, communication]",[Yale Program],[ORG],Yale Program on Climate Change Communication,yale program
249,the national network of abortion funds,"(the, National, Network, of, Abortion, Funds)","[National, Network, Abortion, Funds]",[the National Network of Abortion Funds],[ORG],the National Network of Abortion Funds,the national network of abortion funds


In [103]:
# from the view of the df, i can spot a few more liberal payers off the bat.
libs = ['andrew yang', 'biden', 'bernie 2020', 'mike bloomberg',
        'mike gravel', 'richard ojeda', 'tom steyer']
unlabeled_payers['label'] = unlabeled_payers['label'].apply(lambda x: 'lib' if x.lower() in libs else x)
display(unlabeled_payers)

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
66,bernie 2020,"(BERNIE, 2020)",[BERNIE],[2020],[DATE],BERNIE 2020,missing
67,hickenlooper 2020,"(Hickenlooper, 2020)",[Hickenlooper],[2020],[DATE],Hickenlooper 2020,missing
68,cory 2020,"(Cory, 2020)",[Cory],[2020],[DATE],Cory 2020,missing
69,gillibrand 2020,"(Gillibrand, 2020)",[Gillibrand],[2020],[DATE],Gillibrand 2020,missing
70,314 action fund,"(314, ACTION, FUND)","[action, FUND]",[314],[CARDINAL],314 ACTION FUND,314
...,...,...,...,...,...,...,...
240,world food program usa,"(World, Food, Program, USA)","[World, Food, Program, USA]",[World Food Program USA],[ORG],World Food Program USA,world food program usa
241,world wildlife fund,"(World, Wildlife, Fund)","[World, Wildlife, Fund]",[World Wildlife Fund],[ORG],World Wildlife Fund,world wildlife fund
242,yale program on climate change communication,"(Yale, Program, on, Climate, Change, Communication)","[Yale, Program, Climate, Change, communication]",[Yale Program],[ORG],Yale Program on Climate Change Communication,yale program
249,the national network of abortion funds,"(the, National, Network, of, Abortion, Funds)","[National, Network, Abortion, Funds]",[the National Network of Abortion Funds],[ORG],the National Network of Abortion Funds,the national network of abortion funds


In [104]:
# i can also spot a few obvious libs and cons by the lem democrat, republic, progressive
unlabeled_payers['label'] = unlabeled_payers['label'].str.replace(r'(^.*emocrat.*$)', 'lib', regex=True)
unlabeled_payers['label'] = unlabeled_payers['label'].str.replace(r'(^.*epublican.*$)', 'con', regex=True)
unlabeled_payers['label'] = unlabeled_payers['label'].str.replace(r'(^.*rogressive.*$)', 'lib', regex=True)

In [105]:
# need to fix the 2020 vals
unlabeled_payers.loc[unlabeled_payers['payers'] == 'hickenlooper 2020', 'label'] = 'lib'
unlabeled_payers.loc[unlabeled_payers['payers'] == 'bernie 2020', 'label'] = 'lib'
unlabeled_payers.loc[unlabeled_payers['payers'] == 'cory 2020', 'label'] = 'lib'
unlabeled_payers.loc[unlabeled_payers['payers'] == 'gillibrand 2020', 'label'] = 'lib'
unlabeled_payers.loc[unlabeled_payers['payers'] == 'tulsi now', 'label'] = 'lib'

In [106]:
display(unlabeled_payers)

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
66,bernie 2020,"(BERNIE, 2020)",[BERNIE],[2020],[DATE],BERNIE 2020,lib
67,hickenlooper 2020,"(Hickenlooper, 2020)",[Hickenlooper],[2020],[DATE],Hickenlooper 2020,lib
68,cory 2020,"(Cory, 2020)",[Cory],[2020],[DATE],Cory 2020,lib
69,gillibrand 2020,"(Gillibrand, 2020)",[Gillibrand],[2020],[DATE],Gillibrand 2020,lib
70,314 action fund,"(314, ACTION, FUND)","[action, FUND]",[314],[CARDINAL],314 ACTION FUND,314
...,...,...,...,...,...,...,...
240,world food program usa,"(World, Food, Program, USA)","[World, Food, Program, USA]",[World Food Program USA],[ORG],World Food Program USA,world food program usa
241,world wildlife fund,"(World, Wildlife, Fund)","[World, Wildlife, Fund]",[World Wildlife Fund],[ORG],World Wildlife Fund,world wildlife fund
242,yale program on climate change communication,"(Yale, Program, on, Climate, Change, Communication)","[Yale, Program, Climate, Change, communication]",[Yale Program],[ORG],Yale Program on Climate Change Communication,yale program
249,the national network of abortion funds,"(the, National, Network, of, Abortion, Funds)","[National, Network, Abortion, Funds]",[the National Network of Abortion Funds],[ORG],the National Network of Abortion Funds,the national network of abortion funds


In [107]:
# now, it's mostly orgs/pacs/etc leftover, so I'm going to try and identify those, then find a way to class them
orgs = list(unlabeled_payers[(unlabeled_payers['label'] != 'lib') & (unlabeled_payers['label'] != 'con')]['payers'])

In [108]:
orgs # we have to use the payer val in this case or we can't see the entire org name

['314 action fund',
 'aarp',
 'the aclu',
 'stand up america',
 'define american',
 'american civil liberties union',
 'american medical association',
 'amnesty international usa',
 'anti-defamation league',
 'ballotready',
 'beachside media, inc.',
 'bee site',
 "ben & jerry's",
 'bernie 2020.',
 'doctors without borders/médecins sans frontières (msf)',
 'credo mobile',
 'crtv',
 'care2',
 'catholic relief services',
 'chad benson',
 'chesapeake bay foundation',
 'civiqs',
 'clean air clean energy washington 4347 roosevelt way ne seattle, wa 98105',
 'our colorado way of life',
 'cordray/sutton committee',
 'covenant house international',
 "dnc services corp./dem. nat'l committee",
 'the dscc',
 'dscc',
 'daily wire',
 "ditch fund and not authorized by any candidate or candidate's committee",
 "emily's list",
 'friends of the earth',
 'exxonmobil',
 'fctry',
 'greenpeace, inc.',
 'voters not politicians | po box 8362 grand rapids, mi 49518',
 'greenpeace usa',
 'hias',
 'house majorit

In [109]:
# let's knock out the ones where the label is the same as the payer val
unlabeled_payers['label'] = unlabeled_payers['label'].apply(lambda x: 'org' if x.lower() in orgs else x)

In [110]:
unlabeled_payers.label.value_counts()

org                                 64
lib                                 11
priorities usa action                2
inc                                  2
american                             1
greenpeace                           1
independence usa                     1
aclu                                 1
emily                                1
house                                1
america                              1
bernie missing                       1
colorado way of life                 1
yale program                         1
dnc services corp./dem               1
grand rapids                         1
today                                1
the christian science monitor        1
borders/médecins sans frontières     1
ditch fund                           1
march                                1
nrcc                                 1
clean air clean energy               1
earth                                1
314                                  1
patagonia                

In [111]:
# adding the ones that didn't match
orgs += ['priorities usa action', 'inc', 'nrcc', 'the christian science monitor', 
         'yale program', 'march', '314', 'today', 'earth', 'ditch fund', 
         'clean air clean energy', 'independence usa', 'greenpeace', 'house', 
         'michigan leadership committee', 'emily', 'borders/médecins sans frontières',
         'patagonia', 'aclu', 'grand rapids', 'america', 'colorado way of life',
         'dnc services corp./dem', 'american', 'bernie missing']

unlabeled_payers['label'] = unlabeled_payers['label'].apply(lambda x: 'org' if x.lower() in orgs else x)
unlabeled_payers.label.value_counts()

org    91
lib    11
Name: label, dtype: int64

In [112]:
un_payers = list(unlabeled_payers['payers'])
inverse_boolean_series = ~known_payers['payers'].isin(un_payers)
known_payers = known_payers[inverse_boolean_series]
known_payers = pd.concat([known_payers, unlabeled_payers], axis=0) # concating the unlabeled_payers df back onto known_payers

In [113]:
known_payers['label'].value_counts()

org    104
lib     78
con      5
Name: label, dtype: int64

In [114]:
import re

In [115]:
print(len(known_payers))

187


In [116]:
# i'm going to skim through the entries and fix an obvious dupes
known_payers['payers'] = known_payers['payers'].str.replace(r'(^.*everytown.*$)', 'everytown for gun safety action fund', regex=True)
known_payers['payers'] = known_payers['payers'].str.replace(r'(^.*american civil liberties.*$)', 'aclu', regex=True)
known_payers['payers'] = known_payers['payers'].str.replace(r'(^.*aclu.*$)', 'aclu', regex=True)
known_payers['payers'] = known_payers['payers'].str.replace(r'(^.*greenpeace.*$)', 'greenpeace', regex=True)
known_payers['payers'] = known_payers['payers'].str.replace(r'(^.*nrdc.*$)', 'nrdc action func', regex=True)
known_payers['payers'] = known_payers['payers'].str.replace(r'(^.*priorities usa.*$)', 'priorities usa', regex=True)
# known_payers = known_payers.drop_duplicates(subset=['payers'])
print(len(known_payers)) # knocked out 7 duplicates

187


In [117]:
known_payers.label.value_counts()

org    104
lib     78
con      5
Name: label, dtype: int64

In [118]:
known_orgs = known_payers.loc[known_payers['label'] == 'org']

In [119]:
known_orgs['payers'] = known_orgs['payers'].str.replace('^ ', '', regex=True).replace(' $')
known_orgs['label'] = known_orgs['payers'].str.replace('^ ', '', regex=True).replace(' $')

In [120]:
libs = ['aclu', 'friends of the earth ', "dnc services corp./dem. nat'l committee", 'working families party', 'national trust for historic preservation', 'ballotready', 'chesapeake bay foundation ', 'league of conservation voters', 'unicef usa', 'national domestic workers alliance', 'world food program usa ', 'the washington post', 'bernie 2020.', 'care2', 'union of concerned scientists, inc.', 'southern poverty law center', 'planned parenthood federation of america', 'natural resources defense council ', 'union of concerned scientists', 'human rights campaign, inc.', 'catholic relief services', "independence usa. not authorized by any candidate or candidate's committee.", 'yale program on climate change communication', 'national park foundation', 'the dscc', 'everytown for gun safety action fund', '314 action fund', 'ocean conservancy', 'aarp', "ben & jerry's", 'covenant house international', "women's march", 'sierra club', 'the wilderness society', 'alliance for a better minnesota action fund', 'winning for women, inc.', 'credo mobile', 'tyt network ', 'the voter participation center', 'national geographic society', 'the national network of abortion funds', 'national education association', 'sandy hook promise', 'greenpeace', 'dscc', 'world wildlife fund', 'patagonia ', 'michigan leadership committee. not authorized by any candidate.', 'nrdc action func', 'priorities usa', 'national audubon society, inc. ', 'international rescue committee', 'center for biological diversity', 'penzeys', 'oceana', 'our colorado way of life', '314 action fund ', 'amnesty international usa', "emily's list", 'tyt network', 'the climate reality project', "ditch fund and not authorized by any candidate or candidate's committee", 'anti-defamation league', 'naral pro-choice america', 'clean air clean energy washington 4347 roosevelt way ne seattle, wa 98105', 'ohio safe & healthy communities campaign', 'reform austin, inc.', 'friends of the earth', 'chesapeake bay foundation', 'reform austin, inc. ']
cons = ["the nrcc and not authorized by any candidate or candidate's committee.", 'exxonmobil', 'chad benson ', 'reform austin, inc.', 'americans for prosperity', 'crtv', 'cordray/sutton committee', 'daily wire', 'concerned veterans for america']
drop = ['civiqs', 'stand up america ', 'better place forests', 'bee site', 'doctors without borders/médecins sans frontières (msf)', 'fctry', 'american medical association', 'united states association for unhcr', 'define american', 'predictit is a political prediction market, a stock market for politics. a project of victoria university of wellington, p…', 'beachside media, inc.', 'hias ', 'ivn news ', 'world food program usa', 'the wall street journal', 'solar energy today', 'national audubon society', 'chad benson', 'natural resources defense council', 'u.s. chamber of commerce', 'national audubon society, inc.', 'the beauty of life', 'ozy media', 'voters not politicians | po box 8362 grand rapids, mi 49518', 'the humane society of the united states', 'the christian science monitor, an international news organization', 'hias', 'stand up america', 'ivn news', 'u.s. census bureau', 'secure financial advisory group']

In [121]:
known_orgs['label'] = known_orgs['label'].apply(lambda x: 'lib' if x in libs else x)
known_orgs['label'] = known_orgs['label'].apply(lambda x: 'con' if x in cons else x)
known_orgs['label'] = known_orgs['label'].apply(lambda x: 'drop' if x in drop else x)

In [122]:
known_orgs.label.value_counts()

lib     69
drop    28
con      7
Name: label, dtype: int64

In [123]:
inverse_boolean_series = ~known_orgs['label'].isin(drop)
known_orgs = known_orgs[inverse_boolean_series]

In [124]:
len(known_orgs)

104

In [125]:
known_payers = pd.concat([known_orgs, known_payers], axis=0)

In [126]:
known_payers = known_payers[known_payers['label'] != 'org']
known_payers = known_payers[known_payers['label'] != 'drop']

In [127]:
known_payers.label.value_counts()

lib    147
con     12
Name: label, dtype: int64

In [128]:
known_payers

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
81,americans for prosperity,"(Americans, for, Prosperity)","[Americans, prosperity]",[Americans],[NORP],Americans for Prosperity,con
101,alliance for a better minnesota action fund,"(Alliance, for, a, Better, Minnesota, Action, Fund)","[alliance, Better, Minnesota, Action, Fund]",[Better Minnesota Action Fund],[ORG],Alliance for a Better Minnesota Action Fund,lib
112,center for biological diversity,"(Center, for, Biological, Diversity)","[center, Biological, Diversity]",[Center for Biological Diversity],[ORG],Center for Biological Diversity,lib
118,concerned veterans for america,"(Concerned, Veterans, for, America)","[concerned, Veterans, America]",[Concerned Veterans for America],[ORG],Concerned Veterans for America,con
137,everytown for gun safety action fund,"(Everytown, for, Gun, Safety, Action, Fund, ., Not, authorized, by, any, candidate, or, candidate, ’s, committee, .)","[Everytown, Gun, Safety, Action, Fund, authorize, candidate, candidate, committee]",[Everytown for Gun Safety Action Fund],[ORG],Everytown for Gun Safety Action Fund. Not authorized by any candidate or candidate’s committee.,lib
...,...,...,...,...,...,...,...
173,mike bloomberg 2020 inc,"(Mike, Bloomberg, 2020, Inc)","[Mike, Bloomberg, Inc]","[Mike Bloomberg, 2020, Inc]","[PERSON, DATE, GPE]",Mike Bloomberg 2020 Inc,lib
174,mike gravel 2020,"(Mike, Gravel, 2020)","[Mike, Gravel]","[Mike Gravel, 2020]","[PERSON, DATE]",Mike Gravel 2020,lib
206,committee to elect richard ojeda,"(Committee, to, Elect, Richard, Ojeda)","[Committee, elect, Richard, Ojeda]",[Richard Ojeda],[PERSON],Committee to Elect Richard Ojeda,lib
217,tulsi now,"(TULSI, NOW)",[TULSI],[TULSI NOW],[ORG],TULSI NOW,lib


### unknown payers

In [130]:
# converting case to help remove dupes
unknown_payers_df = unknown_payers.sort_values(by='payers')
unknown_payers_df['payers'] = unknown_payers_df['payers'].apply(lambda x: x.lower())
# unknown_payers_df = unknown_payers_df.drop_duplicates(subset='payers')
print(len(unknown_payers_df))

66


In [131]:
# since there are only 67 unknowns, i can visually scan the list to see if there are any other easy fixes
unknown_payers_df = unknown_payers_df.reset_index().drop('index', axis=1)
display(unknown_payers)

Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
0,the nature conservancy,"(The, Nature, Conservancy)","[Nature, Conservancy]",[],[],The Nature Conservancy,missing
1,human rights campaign,"(Human, Rights, Campaign)","[Human, Rights, Campaign]",[],[],Human Rights Campaign,missing
2,bird + stone,"(Bird, +, Stone)","[bird, Stone]",[],[],Bird + Stone,missing
3,need to impeach,"(Need, to, Impeach)","[need, impeach]",[],[],Need to Impeach,missing
4,kialo,(Kialo),[Kialo],[],[],Kialo,missing
...,...,...,...,...,...,...,...
61,"moveon.org political action, not authorized by any candidate or candidate’s committee.","(MoveOn.org, Political, Action, ,, not, authorized, by, any, candidate, or, candidate, ’s, committee, .)","[MoveOn.org, Political, Action, authorize, candidate, candidate, committee]",[],[],"MoveOn.org Political Action, not authorized by any candidate or candidate’s committee.",missing
62,"aids united, inc.","(AIDS, United, ,, Inc.)","[AIDS, United, Inc.]",[],[],"AIDS United, Inc.",missing
63,trump make america great again committee,"(TRUMP, MAKE, AMERICA, GREAT, AGAIN, COMMITTEE)","[TRUMP, AMERICA, GREAT, COMMITTEE]",[],[],TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,missing
64,reason to believe pac,"(Reason, To, Believe, PAC)","[reason, believe, PAC]",[],[],Reason To Believe PAC,missing


In [132]:
# i can see there are a few examples where it is obviously the same organization
# so I can write a few quick lines and fix those obvious dupes
unknown_payers_df['payers'] = unknown_payers_df['payers'].str.replace(r'(^.*moveon\.org.*$)', 'moveon.org', regex=True)
unknown_payers_df['payers'] = unknown_payers_df['payers'].str.replace(r'(^.*trump.*$)', 'maga committee', regex=True)
unknown_payers_df['payers'] = unknown_payers_df['payers'].str.replace(r'(^.*swing left.*$)', 'swing left', regex=True)
unknown_payers_df['payers'] = unknown_payers_df['payers'].str.replace(r'(^.*prager.*$)', 'prager u', regex=True)
unknown_payers_df['payers'] = unknown_payers_df['payers'].str.replace(r'(^.*planned parenthood.*$)', 'planned parenthood', regex=True)
# unknown_payers_df = unknown_payers_df.drop_duplicates(subset='payers')

print(len(unknown_payers_df)) # 58 -- knocked off 8 duplicates
display(unknown_payers_df)

66


Unnamed: 0,payers,docs,lemmatized_docs,named_ents,ne_labels,original,label
0,"aids united, inc.","(AIDS, United, ,, Inc.)","[AIDS, United, Inc.]",[],[],"AIDS United, Inc.",missing
1,alliance defending freedom,"(Alliance, Defending, Freedom)","[Alliance, defend, Freedom]",[],[],Alliance Defending Freedom,missing
2,america first policies,"(America, First, Policies)","[America, Policies]",[],[],America First Policies,missing
3,ban assault weapons now,"(Ban, Assault, Weapons, NOW)","[Ban, Assault, Weapons]",[],[],Ban Assault Weapons NOW,missing
4,be a hero pac. not authorized by any candidate or candidate’s committee.,"(Be, a, Hero, PAC, ., Not, authorized, by, any, candidate, or, candidate, ’s, committee, .)","[Hero, PAC, authorize, candidate, candidate, committee]",[],[],Be a Hero PAC. Not authorized by any candidate or candidate’s committee.,missing
...,...,...,...,...,...,...,...
61,the nature conservancy,"(The, Nature, Conservancy)","[Nature, Conservancy]",[],[],The Nature Conservancy,missing
62,maga committee,"(the, Trump, Make, America, Great, Again, Committee)","[Trump, America, Great, Committee]",[],[],the Trump Make America Great Again Committee,missing
63,maga committee,"(TRUMP, MAKE, AMERICA, GREAT, AGAIN, COMMITTEE)","[TRUMP, AMERICA, GREAT, COMMITTEE]",[],[],TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,missing
64,votevets action fund,"(VoteVets, Action, Fund)","[VoteVets, Action, Fund]",[],[],VoteVets Action Fund,missing


In [133]:
# next, I skimmed through the 58 entries, and classified the ones i knew as liberal or conservative. 
# for the ones I was unsure of, I checked sites like open secrets

cons = ['alliance defending freedom', 'america first policies', 'care', 'civic action', 'judicial watch', 
        'keep and bear', 'life outreach international ', 'prager u', 'the lincoln project', 'maga committee']
libs = ['ban assault weapons now', 'be a hero pac. not authorized by any candidate or candidate’s committee.',
       'bird + stone ', 'bullock for president', 'color of change', 'covenant house', 'dccc', 'ditch fund',
       'dissent pins', 'end citizens united', 'environmental defense fund', 'fair fight action', 'feeding america',
        'fwd.us', 'heifer international', 'hoover institution', 'human rights campaign', 'human rights watch',
       'indivisible action', 'indivisible project', 'julián for the future', 'justice democrats pac', 'moveon.org',
       'need to impeach', 'oxfam america', 'planned parenthood', 'progressive turnout project', 'reproaction',
       'pac to the future', 'reason to believe pac', 'shareblue media', 'swing left', 'the intercept', 'the nature conservancy',
       'votevets action fund', 'the motley fool', 'the nature conservancy', 'wilderness society, the',
       'bird + stone ', 'defenders of wildlife', 'defending democracy together', 'earthjustice', '314 action fund']

drop = ['mercy corps', 'phone2action', 'cleancult', 'aids united, inc.', 'no on proposition 8: stop the dangerous dialysis proposition', 'news for democracy', 'no kid hungry', 'kialo']

unknown_payers_df['label'] = unknown_payers_df['payers'].apply(lambda x: 'lib' if x.lower() in libs else x)
unknown_payers_df['label'] = unknown_payers_df['label'].apply(lambda x: 'con' if x.lower() in cons else x)
unknown_payers_df['label'] = unknown_payers_df['label'].apply(lambda x: 'drop' if x.lower() in drop else x)
unknown_payers_df.label.value_counts()

lib     46
con     12
drop     8
Name: label, dtype: int64

In [134]:
# there are some leftover that I could not classify, that I will drop

In [135]:
boolean_series = unknown_payers_df['label'].isin(['lib', 'con'])
unknown_payers_df = unknown_payers_df[boolean_series]

In [136]:
len(unknown_payers_df)

58

In [137]:
unknown_payers_df.label.value_counts()

lib    46
con    12
Name: label, dtype: int64

In [138]:
known_payers.label.value_counts()

lib    147
con     12
Name: label, dtype: int64

In [139]:
payer_data = pd.concat([unknown_payers_df, known_payers], axis=0)

In [140]:
payer_data.label.value_counts()

lib    193
con     24
Name: label, dtype: int64

In [141]:
payer_dict = dict(zip(payer_data['original'], payer_data['label']))

In [142]:
payer_dict

{'Alliance Defending Freedom': 'con',
 'America First Policies': 'con',
 'Ban Assault Weapons NOW': 'lib',
 'Be a Hero PAC. Not authorized by any candidate or candidate’s committee.': 'lib',
 'Bird + Stone ': 'lib',
 'Bullock for President': 'lib',
 'CARE': 'con',
 'Civic Action': 'con',
 'Color Of Change': 'lib',
 'Covenant House': 'lib',
 'DCCC': 'lib',
 'Defenders of Wildlife': 'lib',
 'Defending Democracy Together': 'lib',
 'Dissent Pins': 'lib',
 'DITCH FUND': 'lib',
 'Earthjustice': 'lib',
 'END CITIZENS UNITED': 'lib',
 'Environmental Defense Fund': 'lib',
 'Fair Fight Action': 'lib',
 'Feeding America': 'lib',
 'FWD.us': 'lib',
 'Heifer International': 'lib',
 'Hoover Institution': 'lib',
 'Human Rights Campaign': 'lib',
 'Human Rights Watch': 'lib',
 'Indivisible Action': 'lib',
 'Indivisible Project': 'lib',
 'Judicial Watch': 'con',
 'Julián for the Future': 'lib',
 'JUSTICE DEMOCRATS PAC': 'lib',
 'Keep and Bear': 'con',
 'LIFE Outreach International ': 'con',
 'MOVEON.ORG 

In [143]:
fb_data['label'] = fb_data['paid_for_by'].map(payer_dict)

In [144]:
fb_data.label.value_counts()

lib    74485
con     5309
Name: label, dtype: int64

In [145]:
labeled_fb_data = fb_data.dropna(subset=['label'], axis=0)

In [146]:
len(labeled_fb_data)

79794

In [147]:
lib = labeled_fb_data[labeled_fb_data['label'] =='lib']
con = labeled_fb_data[labeled_fb_data['label'] =='con']

In [148]:
lib_5k = lib.sample(5000)
con_5k = con.sample(5000)

In [149]:
lib_5k.to_csv('lib_5k_data.csv', index=False)
con_5k.to_csv('con_5k_data.csv', index=False)