### Import 

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

### Load data source and fix data format

In [2]:
## load fips csv
fips = pd.read_csv("fips2county.tsv", sep='\t', dtype={'CountyFIPS_3': 'string'} )
fips['fips_code']= fips['CountyFIPS'].astype(str)
# fix data format
fips.loc[fips['fips_code'].str.len() == 4, ['fips_code']] = '0' + fips['fips_code'][fips['fips_code'].str.len() == 4]
fips.fips_code

0       01001
1       01003
2       01005
3       01007
4       01009
        ...  
3137    56037
3138    56039
3139    56041
3140    56043
3141    56045
Name: fips_code, Length: 3142, dtype: object

### Load school data and clean location data

In [7]:
school = 'data/Brown University' # change to your school's name
df = pd.read_csv(school + '_clean.csv') 

In [10]:
# replace wrong locations(string 'connections' in them) to nulls
no_value = df.current_location.str.contains('connections') 
df.loc[no_value,'current_location'] = np.NAN

# remove some strings not identified by geocode (!you can add more depending on your data)
l = ['Metropolitan','Area', 'Township of', 'Region', 'Greater', 'Metroplex', 'District']
df['current_location'] = df['current_location'].str.replace('|'.join(l), '', regex=True)

geolocator = Nominatim(user_agent="course_project")

In [11]:
# generate a dictionary mapping unique locations to corresponding coordinates (4 mins)
latitude = {}
longitude = {}
state = {}
county = {}
city = {}
unique_locs = df.current_location.unique()
for loc in unique_locs:
    location = geolocator.geocode(loc, timeout = 600,addressdetails=True)
    if pd.isna(location) or pd.isna(loc):
        latitude[loc] = np.NAN
        longitude[loc] = np.NAN
        state[loc] = np.NAN
        county[loc] = np.NAN
        city[loc] = np.NAN
    else:
        latitude[loc] = location.latitude
        longitude[loc] = location.longitude
        try:
            state[loc] = location.raw['address']['state']
        except:
            state[loc] = np.NAN
        try:
            county[loc] = location.raw['address']['county'].rsplit(' ',1)[0]
        except:
            county[loc] = np.NAN
        try:
            city[loc] = location.raw['address']['city']
        except:
            city[loc] = np.NAN

In [12]:
df['latitude'] = df.apply(lambda row: latitude[row.current_location], axis =1)
df['longitude'] = df.apply(lambda row: longitude[row.current_location], axis =1)
df['state'] = df.apply(lambda row: state[row.current_location], axis =1)
df['county'] = df.apply(lambda row: county[row.current_location], axis =1)
df['city'] = df.apply(lambda row: city[row.current_location], axis =1)

In [23]:
def mapping(row):
    ''' map state-county to fips code
    '''
    if row[['county','state']].notnull().all():
        x = fips[(fips.CountyName==row.county) & (fips.StateName==row.state)]['fips_code'].values.tolist()
        if x !=[]:
            return str(x[0])
    else:
        try:
            x = fips[(fips.CountyName==row.city) & (fips.StateName==row.state)]['fips_code'].values.tolist()
            if x !=[]:
                return str(x[0])
        except:
            pass   
    return np.NAN

In [24]:
df['fips'] = df.apply(mapping, axis=1)
df['fips']  = df['fips'] .astype(str)

In [25]:
df = df.drop(columns=['Unnamed: 0'])

KeyError: "['Unnamed: 0'] not found in axis"

In [26]:
df.fips

0       39049
1       36061
2       06073
3       06081
4       04013
        ...  
2427    06075
2428    06075
2429    36061
2430      nan
2431    17031
Name: fips, Length: 2432, dtype: object

In [27]:
df.latitude = np.round(df.latitude, 6)
df.longitude = np.round(df.longitude, 6)

In [29]:
df[df.fips!= 'nan']

Unnamed: 0,current_location,current_work,previous_work,latest_Education,major,search_school,latitude,longitude,state,county,city,fips
0,"Dublin, Ohio, United States","Director, Cyber Security Operations & Incident...",Cyber Security Manager at Capgemini,Brown University,"Computer Science, Computer Networking and Secu...",Brown University,40.099229,-83.114077,Ohio,Franklin,,39049
1,New York City,Vice President of Development at TRITEC Real E...,Development Analyst at TRITEC Real Estate Comp...,Georgetown University,Real Estate Development,Brown University,40.712728,-74.006015,New York,,New York,36061
2,"San Diego County, California, United States","Strategic Advisor at Ionis Pharmaceuticals, Inc.",Vice President Of Business Development and All...,1999–2001,Healthcare Management,Brown University,32.963784,-116.770628,California,San Diego,,06073
3,San Francisco Bay,"Strategic Enterprise Solutions, International ...","Partnerships & Sales, Corporate Solutions at Y...",Brown University,"Entrepreneurship, Organizations, and Social Ju...",Brown University,37.714029,-122.307794,California,San Mateo,,06081
4,"Mesa, Arizona, United States","Senior Director, Global Enterprise Risk Manage...",Director of Global Supply Chain Planning / New...,University of Pennsylvania - The Wharton School,Strategic Management & Planning,Brown University,33.415112,-111.831479,Arizona,Maricopa,Mesa,04013
...,...,...,...,...,...,...,...,...,...,...,...,...
2426,"Houston, Texas, United States","Vice President , Women's Health and Genomics a...","Senior Director, Medical Policy and Operations...",Department of Health and Human Services,,Brown University,29.758938,-95.367697,Texas,Harris,Houston,48201
2427,"San Francisco, California, United States","Strategic Partnerships Manager, US Retail at G...","Strategy & Planning Manager, Global Business S...",Brown University,"Commerce, Organization, and Entrepreneurship -...",Brown University,37.779026,-122.419906,California,,San Francisco,06075
2428,"San Francisco, California, United States","Strategic Partnerships Manager, US Retail at G...","Strategy & Planning Manager, Global Business S...",Brown University,,Brown University,37.779026,-122.419906,California,,San Francisco,06075
2429,"New York, New York, United States",Product Lead at Zoom,Senior Product Manager at Zoom,Brown University,,Brown University,40.712728,-74.006015,New York,,New York,36061


In [19]:
## save to a new csv file!!!
df.to_csv(school + '_with_fips.csv')

In [193]:
# bugfix (optional)
#check a specific location
location = geolocator.geocode('Frankfurt Rhine', timeout = 600)
location

Location(Frankfurter Straße, Urdenbach, Stadtbezirk 9, Düsseldorf, Nordrhein-Westfalen, 40593, Deutschland, (51.1549523, 6.8854454, 0.0))

In [194]:
#check NAN values and revise current locations to be searchable (change/add more strings to remove in l)
df[(df.coordinate.isna()) & (df.current_location.notna())]

Unnamed: 0,current_location,current_work,previous_work,latest_Education,major,search_school,coordinate,state,county,city,fips
337,"Greenville-Spartanburg-Anderson, South Carolina","President/Owner, Financial Advisor, CFP®, CRPC...","Vice President, Financial Advisor, CFP®, CRPC®...",University of South Carolina - The Moore Schoo...,Brazil/Portuguese,"University of California, Riverside",,,,,
548,Frankfurt Rhine-Main,Head of Banking and Partner Strategy and Execu...,Senior Adviser at Holland FinTech,"University of California, Riverside",German Language and Literature,"University of California, Riverside",,,,,
1001,Ezana is a startup entrepreneur currently prom...,CEO at Afri Kash,Service Management Officer at United Nations,"University of California, Riverside",Political Science,"University of California, Riverside",,,,,
1013,Laila Mickelwait is the Founder and CEO of the...,Founder/CEO at Justice Defense Fund,Founder at New Reality International,University of Southern California,Public Diplomacy,"University of California, Riverside",,,,,
1782,CEO at Cyphen Limited,CEO at Cyphen Limited,,Peking University,国际经济,"University of California, Riverside",,,,,
