Confirm that all links in the people database go to working sites. 

In [1]:
import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

import os
import sys
import csv
from tqdm import tqdm

import tweepy

consumer_key = os.environ.get('TWITTER_API_KEY')
consumer_secret = os.environ.get('TWITTER_API_KEY_SECRET')
access_key = os.environ.get('TWITTER_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

In [2]:
people = pd.read_csv('../data/people.csv')
candidates = pd.read_csv('../data/candidates.csv')
candidate_statuses = pd.read_csv('../data/candidate_statuses.csv')

people_candidates = pd.merge(people, candidates, how='inner', on='person_id')
people_candidates_status = pd.merge(people_candidates, candidate_statuses, how='inner', on='candidate_status')
people_candidates_active = people_candidates_status[people_candidates_status['count_as_candidate']].copy()

## Twitter

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [4]:
people_candidates_active['twitter_handle_lower'] = (
    people_candidates_active['twitter_link']
    .str.replace('https://twitter.com/', '')
    .str.lower()
)

list_of_handles = (
    people_candidates_active.loc[people_candidates_active['twitter_handle_lower'].notnull(), 'twitter_handle_lower']
    .tolist()
)

In [5]:
batch_size = 40
num_batches = np.ceil(len(list_of_handles) / batch_size)
batches = np.array_split(list_of_handles, num_batches)

In [6]:
user_df_list = []

for batch in tqdm(batches):
    lookup_response = api.lookup_users(screen_names=list(batch))
    
    temp_df = pd.DataFrame()

    for u in lookup_response:
        temp_df.loc[u.id, 'screen_name'] = u.screen_name
        temp_df.loc[u.id, 'created_at'] = u.created_at
        temp_df.loc[u.id, 'followers_count'] = u.followers_count
        temp_df.loc[u.id, 'statuses_count'] = u.statuses_count
        temp_df.loc[u.id, 'verified'] = u.verified
        temp_df.loc[u.id, 'protected'] = u.protected


    user_df_list += [temp_df]
    
    time.sleep(5)

100%|██████████| 6/6 [00:33<00:00,  5.63s/it]


In [7]:
user_df = pd.concat(user_df_list)

In [8]:
user_df['screen_name_lower'] = user_df['screen_name'].str.lower()
# user_df

In [9]:
working_screen_name_lower = [s.lower() for s in user_df['screen_name'].tolist()]

In [10]:
missing_users = [f for f in list_of_handles if f not in working_screen_name_lower]

In [11]:
missing_users

[]

In [12]:
len(missing_users)

0

In [13]:
len(list_of_handles)

203

### Superlatives

In [14]:
user_df.sort_values(by='followers_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1154769474261856257,Christo34514033,2019-07-26 15:04:32,0.0,5.0,False,False,christo34514033
353497656,leftofdacenter,2011-08-12 04:56:37,7.0,2.0,False,False,leftofdacenter
1168654632517865472,GerstenVicki,2019-09-02 22:39:23,7.0,1.0,False,False,gerstenvicki
1288727768893710337,EKlossou,2020-07-30 06:47:10,7.0,29.0,False,False,eklossou
1305169316464259072,AncMeg,2020-09-13 15:40:28,7.0,3.0,False,False,ancmeg
...,...,...,...,...,...,...,...
803360187125993472,riotpedestrian,2016-11-28 22:09:29,4102.0,16847.0,False,False,riotpedestrian
33211889,keya_chatterjee,2009-04-19 14:33:28,4167.0,8762.0,False,False,keya_chatterjee
2196941238,salimadofo,2013-11-16 02:07:57,4188.0,44223.0,True,False,salimadofo
40059261,sabelharris,2009-05-14 18:53:42,5231.0,8850.0,False,False,sabelharris


In [15]:
user_df.sort_values(by='statuses_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1226154390,5E09Commish,2013-02-27 22:20:44,18.0,0.0,False,False,5e09commish
1296877764818227208,AudainANC7D05,2020-08-21 18:32:18,8.0,0.0,False,False,audainanc7d05
47946516,BTurmail,2009-06-17 12:39:56,17.0,0.0,False,False,bturmail
1285376801376698368,LatoyaM01766637,2020-07-21 00:51:35,11.0,0.0,False,False,latoyam01766637
1252919319111053313,SteveHoltzman3,2020-04-22 11:18:03,8.0,1.0,False,False,steveholtzman3
...,...,...,...,...,...,...,...
170780048,GeorgetownMet,2010-07-25 19:06:48,2093.0,31167.0,False,False,georgetownmet
716998860548214785,Ryanfor3F05,2016-04-04 14:40:24,915.0,32848.0,False,False,ryanfor3f05
2196941238,salimadofo,2013-11-16 02:07:57,4188.0,44223.0,True,False,salimadofo
229348030,thisisbossi,2010-12-22 05:10:30,1059.0,48910.0,False,False,thisisbossi


In [16]:
user_df[user_df['verified']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1921416134,annalandre,2013-09-30 21:53:15,3319.0,2432.0,True,False,annalandre
2890872052,tmholcomb,2014-11-24 14:43:44,1190.0,1942.0,True,False,tmholcomb
2196941238,salimadofo,2013-11-16 02:07:57,4188.0,44223.0,True,False,salimadofo
305704907,MoniDiop,2011-05-26 17:22:35,966.0,6089.0,True,False,monidiop


In [17]:
user_df[user_df['protected']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
865959595797356544,Ladylaw1996,2017-05-20 15:57:11,69.0,18.0,False,True,ladylaw1996
2365814866,nestride,2014-02-25 22:28:14,2882.0,338.0,False,True,nestride


In [18]:
user_df['followers_count'].median()

206.0

In [19]:
user_df['statuses_count'].median()

380.0

In [20]:
user_df['created_days_ago'] = (datetime.utcnow() - user_df['created_at']).dt.days
(datetime.utcnow() - timedelta(days=user_df['created_days_ago'].median())).strftime('%Y-%m-%d')

'2014-06-15'

## Websites

In [21]:
def check_status_of_links(df, link_column):
    
    bad_sites = []

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if pd.isnull(row[link_column]):
            continue

        r = requests.get(row[link_column])

        if r.status_code != 200:
            bad_sites += [row[link_column]]

        time.sleep(1)
        
    return bad_sites

In [22]:
# check_status_of_links(people_candidates_active, 'website_link')

In [23]:
# requests.get('http://robbdooling.com')

In [24]:
# requests.get('https://meghanforanc.com')

## Facebook

In [25]:
# check_status_of_links(people_candidates_active, 'facebook_link')