# Immigration and Employment

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, linregress
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
import sqlite3
import seaborn as sns
# Global plot appearance: white grid background with 2.5-wide axes lines.
sns.set_style(
    "whitegrid",
    rc={"axes.linewidth": 2.5},
)
# Notebook context with fonts scaled by 1.45, line width 3, and a default
# figure size of 7 x 3.
sns.set_context(
    "notebook",
    font_scale=1.45,
    rc={"lines.linewidth": 3, "figure.figsize": (7, 3)},
)

## Cleaning data

### Data from Integrated Public Use Microdata Series (IPUMS)

In [None]:
# Labels of the IPUMS extracts to clean; each one is read from
# "IPUMS_<label>.csv" and written back out as "IPUMSclean_<label>.csv".
files = ["2007-08", "2009-10", "2011-12", "2013-14", "2015-16"]

# Only these columns are loaded from each extract.
ipums_columns = ["YEAR", "STATEFIP", "COUNTYFIPS", "PERWT", "SEX", "AGE",
                 "HISPAN", "CITIZEN", "MIGRATE1"]

# STATEFIP codes dropped from the analysis.
# 2 = Alaska, 15 = Hawaii, 72 = Puerto Rico, 78 = U.S. Virgin Islands.
# 3, 7, 14, 43 and 52 are kept from the original filter (they look like the
# reserved/unused FIPS slots, so they should never match real rows).
# NOTE(review): 72 and 78 were added because the stated intent is to exclude
# Puerto Rico and the Virgin Islands, and 43/52 are not their FIPS codes --
# confirm against the IPUMS STATEFIP codebook.
excluded_states = [2, 3, 7, 14, 15, 43, 52, 72, 78]

for filename in files:
    df = pd.read_csv("IPUMS_" + filename + ".csv", usecols=ipums_columns)

    # Keep persons who are not US citizens (CITIZEN == 3) and who were living
    # abroad one year ago (MIGRATE1 == 4).
    # Drop persons whose Hispanic origin was not reported (HISPAN == 9),
    # the excluded states/territories, and counties whose code is zero.
    keep = (
        (df.CITIZEN == 3)
        & (df.MIGRATE1 == 4)
        & (df.HISPAN != 9)
        & ~df.STATEFIP.isin(excluded_states)
        & (df.COUNTYFIPS > 0)
    )
    # Explicit copy: new columns are added below, and assigning into a
    # filtered slice of `df` triggers pandas' SettingWithCopyWarning.
    df2 = df[keep].copy()

    # Recode to 0/1 flags so that a group mean is a proportion.
    # HISPAN2 is 1 for NOT Hispanic (HISPAN == 0) and 0 for Hispanic, so its
    # mean is the share of non-Hispanic migrants.
    df2["HISPAN2"] = (df2["HISPAN"] == 0).astype(int)
    # SEX2 is 1 for females (SEX == 2) and 0 for males, so its mean is the
    # share of female migrants.
    df2["SEX2"] = (df2["SEX"] == 2).astype(int)

    # Group by county, state and year, then compute: mean age, share of
    # females, share of non-Hispanic people, the number of sampled migrants
    # (row count), and the sum of the person weights (PERWT).
    df3 = (
        df2.groupby(["COUNTYFIPS", "STATEFIP", "YEAR"])
        .agg({"AGE": "mean", "SEX2": "mean", "HISPAN2": "mean",
              "MIGRATE1": "count", "PERWT": "sum"})
        .reset_index()
    )

    # Convert the codes to zero-padded strings: three digits for the county,
    # two digits for the state.
    df3["COUNTYFIPS"] = df3["COUNTYFIPS"].astype(str).str.zfill(3)
    df3["STATEFIP"] = df3["STATEFIP"].astype(str).str.zfill(2)

    # Combine STATEFIP and COUNTYFIPS in one column (five-character ID) and
    # use it as the index.
    df3["ID"] = df3["STATEFIP"] + df3["COUNTYFIPS"]
    df3.set_index("ID", inplace=True)

    # Save each cleaned extract in its own file.
    df3.to_csv("IPUMSclean_" + filename + ".csv")

### Data from American Community Survey (ACS)

## Merging datasets from IPUMS and ACS