In [1]:
import geonamescache
# Load the geonames lookup tables once; the matchers below scan `cities`.
gc = geonamescache.GeonamesCache()
# Keep only the records whose country code is 'US'.
cities = [
    city_record
    for city_record in gc.get_cities().values()
    if city_record['countrycode'] == 'US'
]

In [2]:
# Path of the Twitter archive to parse; passed as `archive_location` to twitterArchiveParser below.
arc_name = "data/stream/2015/04/07/04"
# Path handed to the parser as `save_location` (presumably where extracted rows are written -- confirm in twitterArchiveParser).
save_name = "data/twitter_dump.txt"

In [3]:
# Two-letter postal abbreviation -> full name, covering the 50 states plus
# the District of Columbia and the Virgin Islands.
abbr_to_states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}

# Reverse lookup keyed by the UPPER-CASED full name, e.g. 'NEW YORK' -> 'NY'.
state_to_abbr = {
    full_name.upper(): abbr
    for abbr, full_name in abbr_to_states.items()
}

In [4]:
def match_country(st):
    """Return 'USA' if `st` is one of the accepted names for the United States.

    The comparison is case-insensitive but otherwise exact (no trimming, no
    partial matches). Any other text yields None.
    """
    us_aliases = ['us', 'usa', 'america', 'united states', 'united states of america', 'amerika', 'murica', "'murica"]
    return 'USA' if st.lower() in us_aliases else None

def match_state(st):
    """Resolve `st` to a two-letter state abbreviation, or return None.

    `st` may be a full state name or an abbreviation, in any case, optionally
    decorated with a country marker ('USA', 'AMERICA', 'US'), e.g. 'Texas USA'.

    The text is tested for a match *before* any country marker is removed and
    again after each removal. Removing markers first (as a plain substring
    replace) mangles names that merely contain the letters 'US': it turned
    'MASSACHUSETTS' into 'MASSACHETTS', so that state could never be matched.

    Relies on the module-level `state_to_abbr` mapping (upper-cased full
    name -> abbreviation).
    """
    stat = st.upper()
    known_abbrs = set(state_to_abbr.values())
    # None means "test the text as given"; the remaining entries are country
    # markers stripped one after another, in the same order as before.
    for marker in (None, 'USA', 'AMERICA', 'US'):
        if marker is not None:
            if marker not in stat:
                continue
            stat = stat.replace(marker, '').strip()
        if stat in state_to_abbr:
            return state_to_abbr[stat]
        if stat in known_abbrs:
            return stat
    return None

def match_city(cit, st = None):
    """Look up a US city record by name, optionally restricted to one state.

    Parameters:
        cit: city name in any case; it is title-cased before being compared
             with each record's 'name'.
        st:  optional state code; when given it is upper-cased and compared
             with each record's 'admin1code'.

    Returns:
        A record from the module-level `cities` list, or None if nothing
        matches. With `st`, the first record matching both name and state is
        returned. Without `st`, the record with the largest 'population' is
        returned (the first one on ties).

    The previous hand-written maximum search started from 0 and only accepted
    strictly larger populations, so several same-named cities that all report
    a population of 0 left the result variable unassigned and raised
    UnboundLocalError. `max` always yields a record.
    """
    city = cit.title()
    same_name = [c for c in cities if c['name'] == city]

    if st is not None:
        wanted_state = st.upper()
        in_state = [c for c in same_name if c['admin1code'] == wanted_state]
        return in_state[0] if in_state else None

    if not same_name:
        return None
    if len(same_name) == 1:
        # A unique hit is returned without consulting its population.
        return same_name[0]
    return max(same_name, key=lambda c: c['population'])

def match_string(st):
    """Resolve a free-text user location to a US place.

    Returns one of:
        * whatever match_city returns for a recognised city,
        * a two-letter state abbreviation (from match_state),
        * the string 'USA' when only the country is recognised,
        * None when nothing matches or the text is on the skip list.

    Handling, after trimming and removing every '.':
        * a fixed list of frequently seen non-US names is rejected outright;
        * 'la', 'sf', 'nyc' and anything containing 'nova iorque' are
          expanded to a full "City, ST" form;
        * no comma: try country, then state, then city; if that fails and the
          text contains spaces, treat the last word as the state and retry;
        * one comma ("X, Y"): if Y is the country, X may be a state or a city;
          otherwise Y must be a state and X is looked up as a city in it;
        * two commas ("X, Y, Z"): Z must be the country, then "X, Y" is retried;
        * more than two commas: None.

    Fixes over the earlier version:
        * In the "City, State" case the city lookup now receives the resolved
          abbreviation. It used to receive the raw text (e.g. 'TEXAS'), which
          never equals a record's state code, so "Austin, Texas" degraded to
          just 'TX'.
        * The best-effort lookups catch Exception rather than using a bare
          except, so KeyboardInterrupt / SystemExit are no longer swallowed.
        * Each matcher is called once per decision instead of twice.
    """
    st = st.strip()
    st = st.replace('.', '')
    lowered = st.lower()

    if lowered in ['dublin', 'melbourne', 'vancouver', 'paris', 'lima', 'bristol', 'frankfurt', 'london', 'nederland', 'lebanon', 'aus', 'ausl', 'amsterdam']:
        return None
    if lowered == 'la':
        return match_string('Los Angeles, CA')
    if lowered == 'sf':
        return match_string('San Francisco, CA')
    if lowered == 'nyc':
        return match_string('New York City, NY')
    if 'nova iorque' in lowered:
        return match_string('New York City, NY')

    commas = st.count(',')

    if commas == 0:
        # Best effort: a failing matcher simply counts as "no match".
        for matcher in (match_country, match_state, match_city):
            try:
                out = matcher(st)
            except Exception:
                out = None
            if out:
                return out
        if st.count(' ') > 0:
            # Assume the last word is the state: "Austin TX" -> "Austin, TX".
            words = st.split(' ')
            return match_string(' '.join(words[0:-1]) + ', ' + words[-1])
        return None

    if commas == 1:
        cit, region = st.split(',')
        cit = cit.strip().title()
        region = region.strip().upper()
        if match_country(region):
            # "<state or city>, USA"
            state = match_state(cit.upper())
            if state:
                return state
            city = match_city(cit)
            if city:
                return city
            return 'USA'
        state = match_state(region)
        if state:
            # Search with the abbreviation, not the raw region text.
            city = match_city(cit, state)
            if city:
                return city
            return state
        return None

    if commas == 2:
        cit, region, country = st.split(',')
        if match_country(country.strip()):
            return match_string(','.join([cit, region]))
        return None

    return None

In [5]:
def get_user_data(tweet):
    """Build one output row for the author of `tweet`, or return None.

    None is returned when the tweet is empty, has no 'user' entry, has an
    empty user location, or when match_string cannot resolve that location.

    Otherwise the row is
        [id_str, friends_count, followers_count, state, city, lat, lon, created_at]
    where the four middle fields come from the matched city record when
    match_string returned a dict, and are [<match result>, None, None, None]
    when it returned a plain value.
    """
    if not tweet:
        return None
    user = tweet.get('user', None)
    if not user:
        return None
    if not user['location']:
        return None

    loc = match_string(user['location'])
    if not loc:
        return None

    row = [user['id_str'], user['friends_count'], user['followers_count']]
    if type(loc) is dict:
        row += [loc['admin1code'], loc['name'], loc['latitude'], loc['longitude']]
    else:
        row += [loc, None, None, None]
    row.append(user['created_at'])
    return row
    

In [6]:
from twitterArchiveParser import twitterArchiveParser

In [7]:
# Configure the archive parser with the paths defined above.
tp = twitterArchiveParser(
    archive_location=arc_name,
    save_location=save_name,
    num_threads=7,
    verbose=True,
)

In [8]:
# Run the parser with get_user_data as its callback.
# NOTE(review): presumably the callback is invoked once per tweet and rows that
# are not None are written to save_name -- confirm in twitterArchiveParser.
tp.parse_archive(get_user_data)

Thread finished

Thread finished

Thread finished

Thread finished

