Confirm that all links in the people database go to working sites. 

In [1]:
# Move working directory one directory up
import os
os.chdir('../')

import requests
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

from tqdm import tqdm
import random

import tweepy

from scripts.data_transformations import list_commissioners

In [2]:
# Create the Twitter API client, authenticated with the bearer token stored in the
# TWITTER_BEARER_TOKEN environment variable (a missing variable raises KeyError here).
client = tweepy.Client(bearer_token=os.environ['TWITTER_BEARER_TOKEN'])

In [3]:
# Source tables. Paths are relative to the working directory set in the first cell.
people = pd.read_csv('data/people.csv')
candidates = pd.read_csv('data/candidates.csv')
candidate_statuses = pd.read_csv('data/candidate_statuses.csv')
commissioners = list_commissioners(status='current')

# Candidates: join each person to their candidacy rows, then to the row describing
# that candidacy's status.
people_candidates = people.merge(candidates, on='person_id', how='inner')
people_candidates_status = people_candidates.merge(
    candidate_statuses, on='candidate_status', how='inner'
)

# Keep only 2022 rows whose status has `count_as_candidate` set.
people_candidates_active = people_candidates_status[
    lambda rows: rows['count_as_candidate'] & (rows['election_year'] == 2022)
].copy()

# Commissioners: join each person to their commissioner row.
people_commissioners = people.merge(commissioners, on='person_id', how='inner')

# One Series holding the twitter links of current commissioners followed by those of
# active candidates (may contain nulls and duplicates at this point).
twitter_links = pd.concat(
    [people_commissioners['twitter_link'], people_candidates_active['twitter_link']]
)

## Twitter

In [4]:
# Reduce every non-null Twitter link to a bare, lower-cased handle.
#
# The pattern is passed with an explicit `regex=True` so the result does not depend on
# the pandas version's default for `Series.str.replace`. It is anchored to the start of
# the link and tolerates the common variations of a profile URL: http/https or no
# scheme, a `www.`/`mobile.` host prefix, a leading `@`, and anything trailing the
# handle (slash, query string, fragment). Links that do not look like a Twitter profile
# are left unchanged so they still surface as failures in the lookup below instead of
# being silently dropped.
twitter_handles = (
    twitter_links[twitter_links.notnull()]
    .str.strip()
    .str.lower()
    .str.replace(
        r'^(?:https?://)?(?:www\.|mobile\.)?twitter\.com/@?([^/?#]+).*$',
        r'\1',
        regex=True,
    )
)

# Sorted, de-duplicated handles: the same account can be both a commissioner and a candidate.
list_of_handles = sorted(twitter_handles.unique())

In [5]:
# Maximum number of handles sent in a single lookup request.
batch_size = 40

# Number of requests needed to cover every handle (0 when there are no handles).
num_batches = int(np.ceil(len(list_of_handles) / batch_size))

# Consecutive slices of at most `batch_size` handles. Plain list slicing keeps the
# handles as ordinary `str` objects and yields an empty list of batches for an empty
# handle list, where `np.array_split` would raise because it cannot split into 0 sections.
batches = [
    list_of_handles[start:start + batch_size]
    for start in range(0, len(list_of_handles), batch_size)
]

In [11]:
# Look up every batch of handles and collect one DataFrame per batch, indexed by
# Twitter user id.
#
# Besides the screen name, the profile fields used by the "Superlatives" cells further
# down are requested and stored under the column names those cells read.
user_columns = [
    'screen_name', 'created_at', 'followers_count', 'statuses_count', 'verified', 'protected',
]
user_df_list = []

for batch in tqdm(batches):
    lookup_response = client.get_users(
        usernames=list(batch),
        user_fields=['created_at', 'public_metrics', 'verified', 'protected'],
    )

    # `data` is None rather than an empty list when no handle in the batch resolves.
    users = lookup_response.data or []

    rows = []
    for u in users:
        metrics = u.public_metrics or {}
        rows.append({
            'screen_name': u.username,
            # Stored as a naive UTC timestamp so it can be subtracted from
            # `datetime.utcnow()`, which is how the account-age cell computes ages.
            'created_at': u.created_at.replace(tzinfo=None) if u.created_at else None,
            'followers_count': metrics.get('followers_count'),
            'statuses_count': metrics.get('tweet_count'),
            'verified': bool(u.verified),
            'protected': bool(u.protected),
        })

    temp_df = pd.DataFrame(rows, index=[u.id for u in users], columns=user_columns)

    user_df_list += [temp_df]

    # Pause between requests to stay well clear of the API rate limit.
    time.sleep(5)

100%|██████████| 6/6 [00:31<00:00,  5.19s/it]


In [17]:
# Stack the per-batch frames into a single DataFrame of every account that was found.
user_df = pd.concat(user_df_list)

In [18]:
# Lower-cased screen name, comparable with the lower-cased handles in `list_of_handles`.
user_df['screen_name_lower'] = user_df['screen_name'].str.lower()
# user_df

In [19]:
# Lower-cased screen names of every account the lookup returned.
working_screen_name_lower = [name.lower() for name in user_df['screen_name']]

In [20]:
# Handles from our data that the lookup did not return, in the order of `list_of_handles`.
# Membership is tested against a set so each check is O(1) instead of a scan of the list.
found_handles = set(working_screen_name_lower)
missing_users = [handle for handle in list_of_handles if handle not in found_handles]

In [21]:
# Display the handles that were not found; an empty list means every link resolved.
missing_users

[]

In [22]:
# Number of handles that were not found.
len(missing_users)

0

In [23]:
# Total number of distinct handles that were checked.
len(list_of_handles)

216

In [None]:
# todo: extract the twitter account's URL from its profile to see if it's a working website
# todo: look for twitter accounts with zero tweets, do not list them on OpenANC


### Superlatives

In [None]:
# Display all accounts ordered by `followers_count`, ascending.
# Requires the `followers_count` column to be present in `user_df`.
user_df.sort_values(by='followers_count')

In [None]:
# Display the 20 accounts with the highest `statuses_count`, highest first.
# A single descending sort is enough; an ascending sort beforehand would be discarded.
user_df[['screen_name', 'statuses_count']].sort_values(
    by='statuses_count', ascending=False).head(20)

In [None]:
# Display the rows whose `verified` value is True (the column is used as a boolean mask).
user_df[user_df['verified']]

In [None]:
# Display the rows whose `protected` value is True (the column is used as a boolean mask).
user_df[user_df['protected']]

In [None]:
# Median of `followers_count` across all accounts found.
user_df['followers_count'].median()

In [None]:
# Median of `statuses_count` across all accounts found.
user_df['statuses_count'].median()

In [None]:
# Age of every account in whole days, then the calendar date that lies the median age
# in the past.
#
# One timezone-aware "now" is used for both steps so they agree with each other, and
# `created_at` is normalised to UTC first so the subtraction works whether the column
# holds naive or timezone-aware timestamps (naive values are interpreted as UTC).
now_utc = pd.Timestamp.now(tz='UTC')
created_at_utc = pd.to_datetime(user_df['created_at'], utc=True)
user_df['created_days_ago'] = (now_utc - created_at_utc).dt.days
(now_utc - pd.Timedelta(days=user_df['created_days_ago'].median())).strftime('%Y-%m-%d')

## Websites

In [None]:
def check_status_of_links(df, link_column, timeout=30, delay=5):
    """Request every link in one column of `df` and return the ones that do not work.

    A link counts as bad when the request raises (DNS failure, refused connection,
    timeout, malformed URL, ...) or when the final response status is not 200.
    Null entries are skipped. Progress is printed one line per link.

    Args:
        df: DataFrame holding the links.
        link_column: Name of the column in `df` that contains the URLs.
        timeout: Seconds to wait for a server before treating the link as bad.
        delay: Seconds to pause after each request, to avoid hammering the sites.

    Returns:
        List of the links that failed, in the order they appear in `df`.
    """
    bad_sites = []

    for link in df[link_column]:
        if pd.isnull(link):
            continue

        print(f'Checking {link} ... ', end='')

        try:
            r = requests.get(link, timeout=timeout)
        except requests.RequestException as e:
            # An unreachable site is exactly what this check is looking for, so record
            # it and keep going instead of aborting the whole run.
            print(f'error: {type(e).__name__}')
            bad_sites.append(link)
        else:
            print(f'status_code: {r.status_code}')
            if r.status_code != 200:
                bad_sites.append(link)

        time.sleep(delay)

    return bad_sites

In [None]:
# ancs = pd.read_csv('data/ancs.csv')
# check_status_of_links(ancs, 'anc_homepage_link')

In [None]:
# check_status_of_links(people_candidates_active, 'website_link')

In [None]:
# requests.get('http://robbdooling.com')

In [None]:
# requests.get('https://meghanforanc.com')

## Facebook

In [None]:
# check_status_of_links(people_candidates_active, 'facebook_link')