Confirm that all links in the people database go to working sites. 

In [1]:
# Move working directory one directory up
import os
os.chdir('../')

import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

import os
import sys
import csv
from tqdm import tqdm

import tweepy

from scripts.data_transformations import list_commissioners

consumer_key = os.environ.get('TWITTER_API_KEY')
consumer_secret = os.environ.get('TWITTER_API_KEY_SECRET')
access_key = os.environ.get('TWITTER_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

In [2]:
people = pd.read_csv('data/people.csv')
candidates = pd.read_csv('data/candidates.csv')
candidate_statuses = pd.read_csv('data/candidate_statuses.csv')
commissioners = list_commissioners(status='current')

people_candidates = pd.merge(people, candidates, how='inner', on='person_id')
people_candidates_status = pd.merge(people_candidates, candidate_statuses, how='inner', on='candidate_status')
people_candidates_active = people_candidates_status[
    (people_candidates_status['count_as_candidate'])
    & (people_candidates_status['election_year'] == 2022)
].copy()

people_commissioners = pd.merge(people, commissioners, how='inner', on='person_id')

# Group all twitter links for active candidates and current commissioners in one list
twitter_links = pd.concat([people_commissioners.twitter_link, people_candidates_active.twitter_link])

## Twitter

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [4]:
twitter_handles = (
    twitter_links[twitter_links.notnull()]
    .str.replace('https://twitter.com/', '')
    .str.lower()
)

list_of_handles = sorted(twitter_handles.unique())

In [5]:
batch_size = 40
num_batches = np.ceil(len(list_of_handles) / batch_size)
batches = np.array_split(list_of_handles, num_batches)

In [6]:
user_df_list = []

for batch in tqdm(batches):
    lookup_response = api.lookup_users(screen_names=list(batch))
    
    temp_df = pd.DataFrame()

    for u in lookup_response:
        temp_df.loc[u.id, 'screen_name'] = u.screen_name
        temp_df.loc[u.id, 'created_at'] = u.created_at
        temp_df.loc[u.id, 'followers_count'] = u.followers_count
        temp_df.loc[u.id, 'statuses_count'] = u.statuses_count
        temp_df.loc[u.id, 'verified'] = u.verified
        temp_df.loc[u.id, 'protected'] = u.protected


    user_df_list += [temp_df]
    
    time.sleep(5)

100%|██████████| 5/5 [00:29<00:00,  5.93s/it]


In [7]:
user_df = pd.concat(user_df_list)

In [8]:
user_df['screen_name_lower'] = user_df['screen_name'].str.lower()
# user_df

In [9]:
working_screen_name_lower = [s.lower() for s in user_df['screen_name'].tolist()]

In [10]:
missing_users = [f for f in list_of_handles if f not in working_screen_name_lower]

In [11]:
missing_users

[]

In [12]:
len(missing_users)

0

In [13]:
len(list_of_handles)

176

### Superlatives

In [14]:
user_df.sort_values(by='followers_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1441237047960739843,PatrickForANC,2021-09-24 03:04:53,1.0,0.0,False,False,patrickforanc
1549570676389920769,CommissionerTRS,2022-07-20 01:43:41,4.0,2.0,False,False,commissionertrs
47946516,BTurmail,2009-06-17 12:39:56,20.0,0.0,False,False,bturmail
1285376801376698368,LatoyaM01766637,2020-07-21 00:51:35,21.0,0.0,False,False,latoyam01766637
1438868334926237697,TomMarabello,2021-09-17 14:12:20,22.0,101.0,False,False,tommarabello
...,...,...,...,...,...,...,...
1012044273364226050,ErinPalmerDC,2018-06-27 18:45:32,3789.0,21977.0,False,False,erinpalmerdc
33211889,keya_chatterjee,2009-04-19 14:33:28,5268.0,11745.0,False,False,keya_chatterjee
2196941238,salimadofo,2013-11-16 02:07:57,5405.0,50837.0,True,False,salimadofo
40059261,sabelharris,2009-05-14 18:53:42,5687.0,10006.0,False,False,sabelharris


In [15]:
user_df.sort_values(by='statuses_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1296877764818227208,AudainANC7D05,2020-08-21 18:32:18,26.0,0.0,False,False,audainanc7d05
47946516,BTurmail,2009-06-17 12:39:56,20.0,0.0,False,False,bturmail
1226154390,5E09Commish,2013-02-27 22:20:44,28.0,0.0,False,False,5e09commish
1285376801376698368,LatoyaM01766637,2020-07-21 00:51:35,21.0,0.0,False,False,latoyam01766637
1441237047960739843,PatrickForANC,2021-09-24 03:04:53,1.0,0.0,False,False,patrickforanc
...,...,...,...,...,...,...,...
17300061,oblivious_dude,2008-11-11 01:32:11,1525.0,31494.0,False,False,oblivious_dude
377706724,Capn_max,2011-09-22 00:19:16,592.0,32122.0,False,False,capn_max
2196941238,salimadofo,2013-11-16 02:07:57,5405.0,50837.0,True,False,salimadofo
128000332,TheeBlackAnglo,2010-03-30 20:43:32,1290.0,75880.0,False,True,theeblackanglo


In [16]:
user_df[user_df['verified']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1023013209035431936,Evan_Yeats,2018-07-28 01:12:10,3125.0,28077.0,True,False,evan_yeats
1377263086520954893,farnan4dc,2021-03-31 14:14:59,1223.0,1305.0,True,False,farnan4dc
2186167509,latestchristian,2013-11-18 16:31:05,1789.0,3987.0,True,False,latestchristian
305704907,MoniDiop,2011-05-26 17:22:35,1193.0,6740.0,True,False,monidiop
1180608218059411456,revwendy3,2019-10-05 22:18:43,10711.0,15210.0,True,False,revwendy3
2196941238,salimadofo,2013-11-16 02:07:57,5405.0,50837.0,True,False,salimadofo
2890872052,tmholcomb,2014-11-24 14:43:44,1605.0,2633.0,True,False,tmholcomb


In [17]:
user_df[user_df['protected']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
128000332,TheeBlackAnglo,2010-03-30 20:43:32,1290.0,75880.0,False,True,theeblackanglo


In [18]:
user_df['followers_count'].median()

437.0

In [19]:
user_df['statuses_count'].median()

839.0

In [20]:
user_df['created_days_ago'] = (datetime.utcnow() - user_df['created_at']).dt.days
(datetime.utcnow() - timedelta(days=user_df['created_days_ago'].median())).strftime('%Y-%m-%d')

'2014-06-16'

## Websites

In [21]:
def check_status_of_links(df, link_column):
    
    bad_sites = []

#     for idx, row in tqdm(df.iterrows(), total=len(df)):
    for idx, row in df.iterrows():
        if pd.isnull(row[link_column]):
            continue

        print(f'Checking {row[link_column]} ... ', end='')

        r = requests.get(row[link_column])
        time.sleep(4)
        
        print(f'status_code: {r.status_code}')

        if r.status_code != 200:
            bad_sites += [row[link_column]]

        time.sleep(1)
        
    return bad_sites

In [22]:
ancs = pd.read_csv('../data/ancs.csv')
check_status_of_links(ancs, 'anc_homepage_link')

FileNotFoundError: [Errno 2] File ../data/ancs.csv does not exist: '../data/ancs.csv'

In [None]:
# check_status_of_links(people_candidates_active, 'website_link')

In [None]:
# requests.get('http://robbdooling.com')

In [None]:
# requests.get('https://meghanforanc.com')

## Facebook

In [None]:
# check_status_of_links(people_candidates_active, 'facebook_link')