In [2]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

In [30]:
## load fips csv
fips = pd.read_csv("fips2county.tsv", sep='\t', dtype={'CountyFIPS_3': 'string'} )
fips['fips_code']= fips['CountyFIPS'].astype(int)

In [31]:
fips.fips_code

Unnamed: 0,StateFIPS,CountyFIPS_3,CountyName,StateName,CountyFIPS,StateAbbr,STATE-COUNTY,fips_code
0,1,001,Autauga,Alabama,1001,AL,AL-AUTAUGA,1001
1,1,003,Baldwin,Alabama,1003,AL,AL-BALDWIN,1003
2,1,005,Barbour,Alabama,1005,AL,AL-BARBOUR,1005
3,1,007,Bibb,Alabama,1007,AL,AL-BIBB,1007
4,1,009,Blount,Alabama,1009,AL,AL-BLOUNT,1009
...,...,...,...,...,...,...,...,...
3137,56,037,Sweetwater,Wyoming,56037,WY,WY-SWEETWATER,56037
3138,56,039,Teton,Wyoming,56039,WY,WY-TETON,56039
3139,56,041,Uinta,Wyoming,56041,WY,WY-UINTA,56041
3140,56,043,Washakie,Wyoming,56043,WY,WY-WASHAKIE,56043


In [4]:
school = 'University of California, Riverside' # change to your school's name

In [181]:
df = pd.read_csv(school + '.csv') 

In [182]:
# drop redundant column (if your data frame doesn't have 'Unnamed: 0', don't run this)
df = df.drop(columns =['Unnamed: 0'])

In [183]:
# drop duplicates
df = df.drop_duplicates()

In [184]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3260    False
3264    False
3273    False
3276    False
3282    False
Length: 2202, dtype: bool

In [185]:
#rename 'majro' to 'major'
df = df.rename(columns={'majro':'major'})

In [186]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2202 entries, 0 to 3282
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   current_location  2202 non-null   object
 1   current_work      2165 non-null   object
 2   previous_work     2139 non-null   object
 3   latest_Education  2202 non-null   object
 4   major             2052 non-null   object
 5   search_school     2202 non-null   object
dtypes: object(6)
memory usage: 120.4+ KB


In [187]:
# replace wrong locations(string 'connections' in them) to nulls
no_value = df.current_location.str.contains('connections') 
df.loc[no_value,'current_location'] = np.NAN

# remove some strings not identified by geocode (!you can add more depending on your data)
l = ['Metropolitan','Area', 'Township of', 'Region', 'Greater', 'Metroplex', 'District']
df['current_location'] = df['current_location'].str.replace('|'.join(l), '', regex=True)

geolocator = Nominatim(user_agent="course_project")

In [188]:
# generate a dictionary mapping unique locations to corresponding coordinates (4 mins)
coor = {}
state = {}
county = {}
city = {}
unique_locs = df.current_location.unique()
for loc in unique_locs:
    location = geolocator.geocode(loc, timeout = 600,addressdetails=True)
    if pd.isna(location) or pd.isna(loc):
        coor[loc] = np.NAN
        state[loc] = np.NAN
        county[loc] = np.NAN
        city[loc] = np.NAN
    else:
        coor[loc] = (location.latitude, location.longitude)
        try:
            state[loc] = location.raw['address']['state']
        except:
            state[loc] = np.NAN
        try:
            county[loc] = location.raw['address']['county'].rsplit(' ',1)[0]
        except:
            county[loc] = np.NAN
        try:
            city[loc] = location.raw['address']['city']
        except:
            city[loc] = np.NAN

In [189]:
df['coordinate'] = df.apply(lambda row: coor[row.current_location], axis =1)
df['state'] = df.apply(lambda row: state[row.current_location], axis =1)
df['county'] = df.apply(lambda row: county[row.current_location], axis =1)
df['city'] = df.apply(lambda row: city[row.current_location], axis =1)

In [190]:
def mapping(row):
    ''' map state-county to fips code
    '''
    if row[['county','state']].notnull().all():
        x = fips[(fips.CountyName==row.county) & (fips.StateName==row.state)]['fips_code'].values.tolist()
        if x !=[]:
            return int(x[0])
    else:
        try:
            x = fips[(fips.CountyName==row.city) & (fips.StateName==row.state)]['fips_code'].values.tolist()
            if x !=[]:
                return int(x[0])
        except:
            pass   
    return None

In [191]:
df['fips'] = df.apply(mapping, axis=1)
df['fips']  = df['fips'] .astype(pd.Int64Dtype())

In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2202 entries, 0 to 3282
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   current_location  2025 non-null   object
 1   current_work      2165 non-null   object
 2   previous_work     2139 non-null   object
 3   latest_Education  2202 non-null   object
 4   major             2052 non-null   object
 5   search_school     2202 non-null   object
 6   coordinate        2020 non-null   object
 7   state             1918 non-null   object
 8   county            1740 non-null   object
 9   city              1420 non-null   object
 10  fips              1752 non-null   Int64 
dtypes: Int64(1), object(10)
memory usage: 208.6+ KB


In [196]:
## save to a new csv file!!!
df.to_csv(school + '_with_fips.csv')

In [193]:
# bugfix (optional)
#check a specific location
location = geolocator.geocode('Frankfurt Rhine', timeout = 600)
location

Location(Frankfurter Straße, Urdenbach, Stadtbezirk 9, Düsseldorf, Nordrhein-Westfalen, 40593, Deutschland, (51.1549523, 6.8854454, 0.0))

In [194]:
#check NAN values and revise current locations to be searchable (change/add more strings to remove in l)
df[(df.coordinate.isna()) & (df.current_location.notna())]

Unnamed: 0,current_location,current_work,previous_work,latest_Education,major,search_school,coordinate,state,county,city,fips
337,"Greenville-Spartanburg-Anderson, South Carolina","President/Owner, Financial Advisor, CFP®, CRPC...","Vice President, Financial Advisor, CFP®, CRPC®...",University of South Carolina - The Moore Schoo...,Brazil/Portuguese,"University of California, Riverside",,,,,
548,Frankfurt Rhine-Main,Head of Banking and Partner Strategy and Execu...,Senior Adviser at Holland FinTech,"University of California, Riverside",German Language and Literature,"University of California, Riverside",,,,,
1001,Ezana is a startup entrepreneur currently prom...,CEO at Afri Kash,Service Management Officer at United Nations,"University of California, Riverside",Political Science,"University of California, Riverside",,,,,
1013,Laila Mickelwait is the Founder and CEO of the...,Founder/CEO at Justice Defense Fund,Founder at New Reality International,University of Southern California,Public Diplomacy,"University of California, Riverside",,,,,
1782,CEO at Cyphen Limited,CEO at Cyphen Limited,,Peking University,国际经济,"University of California, Riverside",,,,,
