Confirm that all links in the people database go to working sites. 

In [1]:
import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

import os
import sys
import csv
from tqdm import tqdm

import tweepy

consumer_key = os.environ.get('TWITTER_API_KEY')
consumer_secret = os.environ.get('TWITTER_API_KEY_SECRET')
access_key = os.environ.get('TWITTER_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

In [2]:
people = pd.read_csv('../data/people.csv')
candidates = pd.read_csv('../data/candidates.csv')
candidate_statuses = pd.read_csv('../data/candidate_statuses.csv')

people_candidates = pd.merge(people, candidates, how='inner', on='person_id')
people_candidates_status = pd.merge(people_candidates, candidate_statuses, how='inner', on='candidate_status')
people_candidates_active = people_candidates_status[people_candidates_status['count_as_candidate']].copy()

## Twitter

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [4]:
people_candidates_active['twitter_handle_lower'] = (
    people_candidates_active['twitter_link']
    .str.replace('https://twitter.com/', '')
    .str.lower()
)

list_of_handles = (
    people_candidates_active.loc[people_candidates_active['twitter_handle_lower'].notnull(), 'twitter_handle_lower']
    .tolist()
)

In [5]:
batch_size = 40
num_batches = np.ceil(len(list_of_handles) / batch_size)
batches = np.array_split(list_of_handles, num_batches)

In [6]:
user_df_list = []

for batch in tqdm(batches):
    lookup_response = api.lookup_users(screen_names=list(batch))
    
    temp_df = pd.DataFrame()

    for u in lookup_response:
        temp_df.loc[u.id, 'screen_name'] = u.screen_name
        temp_df.loc[u.id, 'created_at'] = u.created_at
        temp_df.loc[u.id, 'followers_count'] = u.followers_count
        temp_df.loc[u.id, 'statuses_count'] = u.statuses_count
        temp_df.loc[u.id, 'verified'] = u.verified
        temp_df.loc[u.id, 'protected'] = u.protected


    user_df_list += [temp_df]
    
    time.sleep(5)

100%|██████████| 5/5 [00:30<00:00,  6.07s/it]


In [7]:
user_df = pd.concat(user_df_list)

In [8]:
user_df['screen_name_lower'] = user_df['screen_name'].str.lower()
user_df

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1946942610,LaylaBonnot,2013-10-08 13:53:51,947.0,2952.0,False,False,laylabonnot
182387417,zach_rybarczyk,2010-08-24 13:47:59,123.0,427.0,False,True,zach_rybarczyk
806545555094704128,MillerANC1A05,2016-12-07 17:07:00,734.0,3392.0,False,False,milleranc1a05
330233286,KentBoeseDC,2011-07-06 09:36:50,2454.0,8463.0,False,False,kentboesedc
874536666,mwraydc,2012-10-11 23:46:58,460.0,1494.0,False,False,mwraydc
...,...,...,...,...,...,...,...
2337449737,PranavNandaDC,2014-02-11 00:12:21,701.0,4599.0,False,False,pranavnandadc
229348030,thisisbossi,2010-12-22 05:10:30,1046.0,48651.0,False,False,thisisbossi
1304661831953395712,Yannikfor2A01,2020-09-12 06:03:24,202.0,293.0,False,False,yannikfor2a01
44801505,EdwardRyder,2009-06-05 02:41:23,403.0,182.0,False,False,edwardryder


In [9]:
working_screen_name_lower = [s.lower() for s in user_df['screen_name'].tolist()]

In [10]:
missing_users = [f for f in list_of_handles if f not in working_screen_name_lower]

In [11]:
missing_users

['kateforanc', 'rayelle_esq', 'ward7rebecca', 'lauralmgentile']

In [12]:
len(missing_users)

4

In [13]:
len(list_of_handles)

174

### Superlatives

In [14]:
user_df.sort_values(by='followers_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1305169316464259072,AncMeg,2020-09-13 15:40:28,0.0,3.0,False,False,ancmeg
1154769474261856257,Christo34514033,2019-07-26 15:04:32,0.0,5.0,False,False,christo34514033
1252919319111053313,SteveHoltzman3,2020-04-22 11:18:03,3.0,1.0,False,False,steveholtzman3
227554859,PierreCHines,2010-12-17 04:06:55,6.0,5.0,False,False,pierrechines
1285376801376698368,LatoyaM01766637,2020-07-21 00:51:35,6.0,0.0,False,False,latoyam01766637
...,...,...,...,...,...,...,...
803360187125993472,riotpedestrian,2016-11-28 22:09:29,4020.0,16371.0,False,False,riotpedestrian
33211889,keya_chatterjee,2009-04-19 14:33:28,4128.0,8707.0,False,False,keya_chatterjee
2196941238,salimadofo,2013-11-16 02:07:57,4136.0,44033.0,True,False,salimadofo
40059261,sabelharris,2009-05-14 18:53:42,5233.0,8811.0,False,False,sabelharris


In [15]:
user_df.sort_values(by='statuses_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
47946516,BTurmail,2009-06-17 12:39:56,14.0,0.0,False,False,bturmail
1285376801376698368,LatoyaM01766637,2020-07-21 00:51:35,6.0,0.0,False,False,latoyam01766637
1252919319111053313,SteveHoltzman3,2020-04-22 11:18:03,3.0,1.0,False,False,steveholtzman3
961100345496817664,michael_sriqui,2018-02-07 04:52:33,20.0,3.0,False,False,michael_sriqui
1305169316464259072,AncMeg,2020-09-13 15:40:28,0.0,3.0,False,False,ancmeg
...,...,...,...,...,...,...,...
170780048,GeorgetownMet,2010-07-25 19:06:48,2079.0,30988.0,False,False,georgetownmet
716998860548214785,Ryanfor3F05,2016-04-04 14:40:24,892.0,32149.0,False,False,ryanfor3f05
2196941238,salimadofo,2013-11-16 02:07:57,4136.0,44033.0,True,False,salimadofo
229348030,thisisbossi,2010-12-22 05:10:30,1046.0,48651.0,False,False,thisisbossi


In [16]:
user_df[user_df['verified']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1921416134,annalandre,2013-09-30 21:53:15,3216.0,2410.0,True,False,annalandre
2890872052,tmholcomb,2014-11-24 14:43:44,1040.0,1918.0,True,False,tmholcomb
2196941238,salimadofo,2013-11-16 02:07:57,4136.0,44033.0,True,False,salimadofo
305704907,MoniDiop,2011-05-26 17:22:35,956.0,6079.0,True,False,monidiop


In [17]:
user_df[user_df['protected']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
182387417,zach_rybarczyk,2010-08-24 13:47:59,123.0,427.0,False,True,zach_rybarczyk
31623503,vjsthoughts,2009-04-16 03:16:52,529.0,4942.0,False,True,vjsthoughts
1086741491308027910,BendaElias,2019-01-19 21:45:36,205.0,370.0,False,True,bendaelias
1011926797787762688,LaneiceMoore,2018-06-27 10:58:44,20.0,346.0,False,True,laneicemoore
352608476,MsGorJuss,2011-08-10 21:19:56,627.0,30497.0,False,True,msgorjuss


In [18]:
user_df['followers_count'].median()

279.0

In [19]:
user_df['statuses_count'].median()

523.5

In [20]:
user_df['created_days_ago'] = (datetime.utcnow() - user_df['created_at']).dt.days
(datetime.utcnow() - timedelta(days=user_df['created_days_ago'].median())).strftime('%Y-%m-%d')

'2014-04-06'

## Websites

In [21]:
def check_status_of_links(df, link_column):
    
    bad_sites = []

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if pd.isnull(row[link_column]):
            continue

        r = requests.get(row[link_column])

        if r.status_code != 200:
            bad_sites += [row[link_column]]

        time.sleep(1)
        
    return bad_sites

In [22]:
check_status_of_links(people_candidates_active, 'website_link')

100%|██████████| 423/423 [02:43<00:00,  2.58it/s]


['https://meghanforanc.com',
 'https://www.seanbarrydc.com/',
 'https://www.robbdooling.com/',
 'https://www.mandla4anc.com/',
 'https://isaacsmithdc.nationbuilder.com/',
 'https://www.emerald4dc.com/',
 'https://www.ashleyforanc.com/',
 'https://www.justin4anc.info/',
 'https://www.sabelfordc.com/',
 'https://www.miskefor2c02.com/',
 'https://www.stacybeckforanc.com/',
 'https://www.dandrephillips.com/']

In [23]:
requests.get('http://robbdooling.com')

<Response [406]>

In [24]:
requests.get('https://meghanforanc.com')

<Response [406]>

## Facebook

In [25]:
check_status_of_links(people_candidates_active, 'facebook_link')

100%|██████████| 423/423 [02:17<00:00,  3.08it/s]


['https://www.facebook.com/seanbbarrydc/']