Confirm that all links in the people database go to working sites. 

In [1]:
# Move working directory one directory up
import os
os.chdir('../')

import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

from tqdm import tqdm
import random

import tweepy

from scripts.data_transformations import list_commissioners

consumer_key = os.environ.get('TWITTER_API_KEY')
consumer_secret = os.environ.get('TWITTER_API_KEY_SECRET')
access_key = os.environ.get('TWITTER_ACCESS_TOKEN')
access_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

In [2]:
people = pd.read_csv('data/people.csv')
candidates = pd.read_csv('data/candidates.csv')
candidate_statuses = pd.read_csv('data/candidate_statuses.csv')
commissioners = list_commissioners(status='current')

people_candidates = pd.merge(people, candidates, how='inner', on='person_id')
people_candidates_status = pd.merge(people_candidates, candidate_statuses, how='inner', on='candidate_status')
people_candidates_active = people_candidates_status[
    (people_candidates_status['count_as_candidate'])
    & (people_candidates_status['election_year'] == 2022)
].copy()

people_commissioners = pd.merge(people, commissioners, how='inner', on='person_id')

# Group all twitter links for active candidates and current commissioners in one list
twitter_links = pd.concat([people_commissioners.twitter_link, people_candidates_active.twitter_link])

## Twitter

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [4]:
twitter_handles = (
    twitter_links[twitter_links.notnull()]
    .str.replace('https://twitter.com/', '')
    .str.lower()
)

list_of_handles = sorted(twitter_handles.unique())

In [5]:
batch_size = 40
num_batches = np.ceil(len(list_of_handles) / batch_size)
batches = np.array_split(list_of_handles, num_batches)

In [6]:
user_df_list = []

for batch in tqdm(batches):
    lookup_response = api.lookup_users(screen_names=list(batch))
    
    temp_df = pd.DataFrame()

    for u in lookup_response:
        temp_df.loc[u.id, 'screen_name'] = u.screen_name
        temp_df.loc[u.id, 'created_at'] = u.created_at
        temp_df.loc[u.id, 'followers_count'] = u.followers_count
        temp_df.loc[u.id, 'statuses_count'] = u.statuses_count
        temp_df.loc[u.id, 'verified'] = u.verified
        temp_df.loc[u.id, 'protected'] = u.protected


    user_df_list += [temp_df]
    
    time.sleep(5)

100%|██████████| 6/6 [00:34<00:00,  5.73s/it]


In [7]:
user_df = pd.concat(user_df_list)

In [8]:
user_df['screen_name_lower'] = user_df['screen_name'].str.lower()
# user_df

In [9]:
working_screen_name_lower = [s.lower() for s in user_df['screen_name'].tolist()]

In [10]:
missing_users = [f for f in list_of_handles if f not in working_screen_name_lower]

In [11]:
missing_users

[]

In [12]:
len(missing_users)

0

In [13]:
len(list_of_handles)

201

In [14]:
# todo: extract the twitter account's URL from its profile to see if it's a working website
# todo: look for twitter accounts with zero tweets, do not list them on OpenANC


### Superlatives

In [15]:
user_df.sort_values(by='followers_count')

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1550483487756587020,AshtonInDC,2022-07-22 14:11:00,9.0,8.0,False,False,ashtonindc
1549570676389920769,CommissionerTRS,2022-07-20 01:43:41,10.0,3.0,False,False,commissionertrs
1545180195858022401,TakemaKeyes,2022-07-07 22:57:33,12.0,12.0,False,False,takemakeyes
1285376801376698368,LatoyaM01766637,2020-07-21 00:51:35,22.0,0.0,False,False,latoyam01766637
1438868334926237697,TomMarabello,2021-09-17 14:12:20,24.0,102.0,False,False,tommarabello
...,...,...,...,...,...,...,...
33211889,keya_chatterjee,2009-04-19 14:33:28,5322.0,11845.0,False,False,keya_chatterjee
2196941238,salimadofo,2013-11-16 02:07:57,5439.0,50881.0,True,False,salimadofo
40059261,sabelharris,2009-05-14 18:53:42,5687.0,10010.0,False,False,sabelharris
16005449,SeanHolihan,2008-08-27 01:02:45,9921.0,10273.0,False,False,seanholihan


In [16]:
user_df.sort_values(by='statuses_count')[['screen_name', 'statuses_count']].sort_values(
    by='statuses_count', ascending=False).head(20)

Unnamed: 0,screen_name,statuses_count
529532500,20002ist,114987.0
128000332,TheeBlackAnglo,75693.0
2196941238,salimadofo,50881.0
170780048,GeorgetownMet,35435.0
634260075,haydengise,33165.0
377706724,Capn_max,32196.0
17300061,oblivious_dude,31501.0
1023013209035431936,Evan_Yeats,28250.0
230908356,nigro4dc,27904.0
1012044273364226050,ErinPalmerDC,22208.0


In [17]:
user_df[user_df['verified']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
1023013209035431936,Evan_Yeats,2018-07-28 01:12:10,3156.0,28250.0,True,False,evan_yeats
1377263086520954893,farnan4dc,2021-03-31 14:14:59,1232.0,1324.0,True,False,farnan4dc
2186167509,latestchristian,2013-11-18 16:31:05,1801.0,4008.0,True,False,latestchristian
305704907,MoniDiop,2011-05-26 17:22:35,1205.0,6831.0,True,False,monidiop
1180608218059411456,revwendy3,2019-10-05 22:18:43,10853.0,15380.0,True,False,revwendy3
2196941238,salimadofo,2013-11-16 02:07:57,5439.0,50881.0,True,False,salimadofo
2890872052,tmholcomb,2014-11-24 14:43:44,1607.0,2650.0,True,False,tmholcomb


In [18]:
user_df[user_df['protected']]

Unnamed: 0,screen_name,created_at,followers_count,statuses_count,verified,protected,screen_name_lower
128000332,TheeBlackAnglo,2010-03-30 20:43:32,1286.0,75693.0,False,True,theeblackanglo


In [19]:
user_df['followers_count'].median()

405.0

In [20]:
user_df['statuses_count'].median()

767.0

In [21]:
user_df['created_days_ago'] = (datetime.utcnow() - user_df['created_at']).dt.days
(datetime.utcnow() - timedelta(days=user_df['created_days_ago'].median())).strftime('%Y-%m-%d')

'2015-03-15'

## Websites

In [22]:
def check_status_of_links(df, link_column):
    
    bad_sites = []

#     for idx, row in tqdm(df.iterrows(), total=len(df)):
    for idx, row in df.iterrows():
        if pd.isnull(row[link_column]):
            continue

        print(f'Checking {row[link_column]} ... ', end='')

        r = requests.get(row[link_column])
        time.sleep(4)
        
        print(f'status_code: {r.status_code}')

        if r.status_code != 200:
            bad_sites += [row[link_column]]

        time.sleep(1)
        
    return bad_sites

In [23]:
# ancs = pd.read_csv('../data/ancs.csv')
# check_status_of_links(ancs, 'anc_homepage_link')

In [24]:
# check_status_of_links(people_candidates_active, 'website_link')

In [25]:
# requests.get('http://robbdooling.com')

In [26]:
# requests.get('https://meghanforanc.com')

## Facebook

In [27]:
# check_status_of_links(people_candidates_active, 'facebook_link')