In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

In [2]:
## load fips csv
fips = pd.read_csv("fips2county.tsv", sep='\t', dtype={'CountyFIPS_3': 'string'} )
fips['fips_code']= fips['CountyFIPS'].astype(str)

In [3]:
fips['fips_code'].str.len()

0       4
1       4
2       4
3       4
4       4
       ..
3137    5
3138    5
3139    5
3140    5
3141    5
Name: fips_code, Length: 3142, dtype: int64

In [4]:
fips.loc[fips['fips_code'].str.len() == 4, ['fips_code']] = '0' + fips['fips_code'][fips['fips_code'].str.len() == 4]

In [27]:
fips.fips_code

0       01001
1       01003
2       01005
3       01007
4       01009
        ...  
3137    56037
3138    56039
3139    56041
3140    56043
3141    56045
Name: fips_code, Length: 3142, dtype: object

In [7]:
school = 'data/University of California, Riverside' # change to your school's name

In [8]:
df = pd.read_csv(school + '_clean.csv') 

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202 entries, 0 to 2201
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        2202 non-null   int64 
 1   current_location  2202 non-null   object
 2   current_work      2165 non-null   object
 3   previous_work     2139 non-null   object
 4   latest_Education  2202 non-null   object
 5   major             2052 non-null   object
 6   search_school     2202 non-null   object
dtypes: int64(1), object(6)
memory usage: 120.5+ KB


In [10]:
# replace wrong locations(string 'connections' in them) to nulls
no_value = df.current_location.str.contains('connections') 
df.loc[no_value,'current_location'] = np.NAN

# remove some strings not identified by geocode (!you can add more depending on your data)
l = ['Metropolitan','Area', 'Township of', 'Region', 'Greater', 'Metroplex', 'District']
df['current_location'] = df['current_location'].str.replace('|'.join(l), '', regex=True)

geolocator = Nominatim(user_agent="course_project")

In [11]:
# generate a dictionary mapping unique locations to corresponding coordinates (4 mins)
coor = {}
state = {}
county = {}
city = {}
unique_locs = df.current_location.unique()
for loc in unique_locs:
    location = geolocator.geocode(loc, timeout = 600,addressdetails=True)
    if pd.isna(location) or pd.isna(loc):
        coor[loc] = np.NAN
        state[loc] = np.NAN
        county[loc] = np.NAN
        city[loc] = np.NAN
    else:
        coor[loc] = (location.latitude, location.longitude)
        try:
            state[loc] = location.raw['address']['state']
        except:
            state[loc] = np.NAN
        try:
            county[loc] = location.raw['address']['county'].rsplit(' ',1)[0]
        except:
            county[loc] = np.NAN
        try:
            city[loc] = location.raw['address']['city']
        except:
            city[loc] = np.NAN

In [12]:
df['coordinate'] = df.apply(lambda row: coor[row.current_location], axis =1)
df['state'] = df.apply(lambda row: state[row.current_location], axis =1)
df['county'] = df.apply(lambda row: county[row.current_location], axis =1)
df['city'] = df.apply(lambda row: city[row.current_location], axis =1)

In [28]:
def mapping(row):
    ''' map state-county to fips code
    '''
    if row[['county','state']].notnull().all():
        x = fips[(fips.CountyName==row.county) & (fips.StateName==row.state)]['fips_code'].values.tolist()
        if x !=[]:
            return str(x[0])
    else:
        try:
            x = fips[(fips.CountyName==row.city) & (fips.StateName==row.state)]['fips_code'].values.tolist()
            if x !=[]:
                return str(x[0])
        except:
            pass   
    return np.nan

In [29]:
df['fips'] = df.apply(mapping, axis=1)
df['fips']  = df['fips'] .astype(str)

In [21]:
df = df.drop(columns=['Unnamed: 0'])

KeyError: "['Unnamed: 0'] not found in axis"

In [32]:
df.fips

0       06037
1       06059
2       06075
3       48085
4       48113
        ...  
2197    08005
2198    06059
2199    06059
2200    06073
2201      nan
Name: fips, Length: 2202, dtype: object

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202 entries, 0 to 2201
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   current_location  2025 non-null   object
 1   current_work      2165 non-null   object
 2   previous_work     2139 non-null   object
 3   latest_Education  2202 non-null   object
 4   major             2052 non-null   object
 5   search_school     2202 non-null   object
 6   coordinate        2020 non-null   object
 7   state             1918 non-null   object
 8   county            1740 non-null   object
 9   city              1420 non-null   object
 10  fips              2202 non-null   object
dtypes: object(11)
memory usage: 189.4+ KB


In [34]:
## save to a new csv file!!!
df.to_csv(school + '_with_fips.csv')

In [193]:
# bugfix (optional)
#check a specific location
location = geolocator.geocode('Frankfurt Rhine', timeout = 600)
location

Location(Frankfurter Straße, Urdenbach, Stadtbezirk 9, Düsseldorf, Nordrhein-Westfalen, 40593, Deutschland, (51.1549523, 6.8854454, 0.0))

In [194]:
#check NAN values and revise current locations to be searchable (change/add more strings to remove in l)
df[(df.coordinate.isna()) & (df.current_location.notna())]

Unnamed: 0,current_location,current_work,previous_work,latest_Education,major,search_school,coordinate,state,county,city,fips
337,"Greenville-Spartanburg-Anderson, South Carolina","President/Owner, Financial Advisor, CFP®, CRPC...","Vice President, Financial Advisor, CFP®, CRPC®...",University of South Carolina - The Moore Schoo...,Brazil/Portuguese,"University of California, Riverside",,,,,
548,Frankfurt Rhine-Main,Head of Banking and Partner Strategy and Execu...,Senior Adviser at Holland FinTech,"University of California, Riverside",German Language and Literature,"University of California, Riverside",,,,,
1001,Ezana is a startup entrepreneur currently prom...,CEO at Afri Kash,Service Management Officer at United Nations,"University of California, Riverside",Political Science,"University of California, Riverside",,,,,
1013,Laila Mickelwait is the Founder and CEO of the...,Founder/CEO at Justice Defense Fund,Founder at New Reality International,University of Southern California,Public Diplomacy,"University of California, Riverside",,,,,
1782,CEO at Cyphen Limited,CEO at Cyphen Limited,,Peking University,国际经济,"University of California, Riverside",,,,,
