In [1]:
import os.path 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
# Location
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim

In [2]:
# Change clean to trump or hillary
clean = "hillary"

# Input written by the scraper, plus the two files this notebook maintains:
# the per-user table and the CSV log of failed lookups.
parent_csv = "data/scraper/%s.csv" % clean
users_csv = "data/script/%s_users.csv" % clean
log = "data/script/%s_log" % clean

# Fail fast when the scraper output is missing. A bare `raise` outside an
# `except` block has no exception to re-raise and only yields
# "RuntimeError: No active exception to reraise", so raise a real error that
# names the missing file instead.
if not os.path.isfile(parent_csv):
    raise FileNotFoundError(
        "We don't have the parent csv file! Expected it at %s" % parent_csv
    )

In [3]:
if os.path.isfile(users_csv):
    # A users table from an earlier run exists: reuse it as-is.
    df_users = pd.read_csv(users_csv)
else:
    # Load the scraped tweets (';'-separated) and keep one row per tweet id.
    df_parent = pd.read_csv(parent_csv, sep=';').drop_duplicates(subset='id')
    print("The scraper found %i unique tweets regarding %s" % (df_parent.shape[0], clean))

    # One row per user id. Missing ids become -1 so the column can be cast to
    # int, and the three geo columns start out empty (NaN).
    df_users = (
        df_parent[['userid', 'user']]
        .drop_duplicates(subset='userid')
        .assign(
            userid=lambda users: users['userid'].fillna(-1).astype(int),
            loc=np.nan,
            lat=np.nan,
            lon=np.nan,
        )
    )

    # Persist the fresh table so later runs take the branch above.
    df_users.to_csv(users_csv, index=False)

In [4]:
def findLocation(user, timeout=10):
    """Return the ``(loc, lat, lon)`` triple for one user row.

    Parameters
    ----------
    user : sequence of 5 items
        ``(uid, usr, loc, lat, lon)``; in this notebook it is a row handed
        over by ``DataFrame.apply(..., axis=1)``.
    timeout : float, optional
        Seconds to wait for the profile page before giving up. Without a
        timeout a single stalled connection blocks the whole run.

    Returns
    -------
    tuple
        ``(loc, lat, lon)``. ``loc`` stays NaN when the profile could not be
        read or shows no location; ``lat``/``lon`` are whatever ``convertGeo``
        returns, or the incoming values when ``loc`` is still missing.

    Notes
    -----
    Failures (network errors, missing profile element, empty location) are
    not raised: they are recorded through ``saveError`` and the row keeps its
    missing values. The ``user`` argument itself is not modified.
    """
    uid, usr, loc, lat, lon = user
    url = "https://twitter.com/%s" % usr
    try:
        # Only hit the network when no location is stored for this user yet.
        if pd.isnull(loc):
            content = requests.get(url, timeout=timeout)
            # Name the parser explicitly so parsing does not depend on which
            # optional parser libraries happen to be installed.
            soup = BeautifulSoup(content.text, "html.parser")
            span = soup.find("span", class_="ProfileHeaderCard-locationText u-dir")
            if span is None:
                raise AttributeError("invalid user")
            loc = span.text.strip()
            if loc == '':
                loc = np.nan
                raise ValueError("no location")
            elif len(loc) == 2:
                # Two-character locations get ',USA' appended (presumably US
                # state abbreviations -- confirm against the data).
                loc += ',USA'

    except Exception as e:
        # Log with the original row, i.e. before any scraped location.
        saveError(user, e)

    if not pd.isnull(loc):
        # Hand convertGeo a fresh row carrying the resolved location rather
        # than writing into the caller's row by integer position.
        lat, lon = convertGeo([uid, usr, loc, lat, lon])
    return loc, lat, lon

In [5]:
def convertGeo(user):
    """Return ``(lat, lon)`` for one user row, geocoding ``loc`` if needed.

    ``user`` unpacks to ``(uid, usr, loc, lat, lon)``. When ``lat`` is already
    present the stored pair is returned untouched. Otherwise ``loc`` is sent
    to Nominatim; a result is accepted only when its address country is
    'United States of America'. Any failure is recorded through ``saveError``
    and the incoming ``lat``/``lon`` are returned unchanged.
    """
    uid, usr, loc, lat, lon = user
    try:
        # Coordinates already stored: nothing to look up.
        if pd.isnull(lat):
            geocoder = Nominatim(country_bias="United States of America")
            match = geocoder.geocode(loc, addressdetails=True)
            if match is None:
                raise ValueError("invalid location")
            country = match.raw['address']['country']
            if country != 'United States of America':
                raise ValueError("location not USA")
            lat = match.latitude
            lon = match.longitude

    except Exception as err:
        saveError(user, err)

    return lat, lon

In [6]:
def saveError(user, text):
    """Append one failed lookup to the error log CSV at the global ``log`` path.

    Parameters
    ----------
    user : sequence of 5 items
        ``(uid, usr, loc, lat, lon)`` for the row that failed.
    text : object
        The error to record (callers pass the caught exception); it is
        written to the ``err`` column.

    The header row is written only when the log file does not exist yet.
    """
    uid, usr, loc, lat, lon = user
    row = [uid, usr, loc, lat, lon, text]
    df = pd.DataFrame([row], columns=['uid', 'usr', 'loc', 'lat', 'lon', 'err'])

    # Let pandas open the file itself: it then controls newline handling,
    # which a plain text-mode handle does not (doubled line endings on
    # Windows). Append mode creates the file when it is missing, so a single
    # call covers both the first write and every later one.
    write_header = not os.path.isfile(log)
    df.to_csv(log, mode='a', index=False, header=write_header)

In [7]:
size = 10000        # users handled between two checkpoints of users_csv
start_chunk = 14    # first chunk to process (presumably resuming an earlier,
                    # interrupted run); set to 0 to process every user

geo_cols = ['loc', 'lat', 'lon']

# 'loc' is read back as float when every value is NaN; make it an object
# column up front so location strings can be stored in it.
df_users['loc'] = df_users['loc'].astype(object)

# Ceiling division: the final, partial chunk is handled by the same loop
# instead of a copy-pasted tail block that depends on the leaked loop variable.
n_chunks = -(-len(df_users) // size)

for chunk in range(start_chunk, n_chunks):
    print(chunk)
    df_temp = df_users.iloc[chunk * size:(chunk + 1) * size]

    # findLocation returns one (loc, lat, lon) tuple per row; build the
    # three columns in one go rather than expanding row by row.
    found = df_temp.apply(findLocation, axis=1)
    located = pd.DataFrame(found.tolist(), index=df_temp.index, columns=geo_cols)

    # Write back column by column so each column keeps its own dtype, and go
    # through .loc on df_users itself rather than assigning into a slice.
    for col in geo_cols:
        df_users.loc[df_temp.index, col] = located[col]

    # Checkpoint after every chunk so an interrupted run loses at most one.
    df_users.to_csv(users_csv, index=False)

# Show a sample rather than dumping the whole table into the notebook.
df_users.head()

13
14


KeyboardInterrupt: 