In [1]:
import pandas as pd

## Set up

In [55]:
# Load the raw data. Every column is read as a string so that codes such as
# zipcodes keep their leading zeros.
migrants = pd.read_csv('data/migrants_data.csv', dtype=str)

# Drop rows that repeat an ID (the first row seen for each ID is kept)
before = len(migrants)
print("Before dedupe: " + str(before))
migrants = migrants.drop_duplicates(subset="ID")
after = len(migrants)
print("After dedupe: " + str(after))
print("diff: " + str(before - after))

# We imported all columns as type string, so let's manually convert the date columns to datetimes
# NOTE(review): no explicit format is passed, so pandas has to infer one; consider
# adding format=... once the date layout of the file has been confirmed
migrants["Child's Date of Entry"] = pd.to_datetime(migrants["Child's Date of Entry"])
migrants["Child's Date of Release"] = pd.to_datetime(migrants["Child's Date of Release"])

# Next, let's drop rows with invalid zipcodes: a valid zipcode is exactly five ASCII digits.
# na=False makes a missing zipcode count as invalid instead of leaking NaN into the mask.
before = len(migrants)
migrants = migrants[
    migrants["Sponsor Zipcode"].str.fullmatch(r"[0-9]{5}", na=False)
]
after = len(migrants)
print("before fixing zipcodes: " + str(before))
print("after fixing zipcodes: " + str(after))
print("diff: " + str(before - after))

Before dedupe: 553322
After dedupe: 553320
diff: 2


  migrants["Child's Date of Entry"] = pd.to_datetime(migrants["Child's Date of Entry"])
  migrants["Child's Date of Release"] = pd.to_datetime(migrants["Child's Date of Release"])


before fixing zipcodes: 553320
after fixing zipcodes: 553226
diff: 94


In [56]:
# Display the cleaned dataframe to check the result of the set-up steps
migrants

Unnamed: 0,ID,Child's Country of Origin,Child's Gender,Child's Date of Entry,Child's Date of Release,Sponsor Zipcode,Sponsor Category,Relationship of Sponsor
0,1,Honduras,F,2015-01-01,2015-01-28,37863,1,Mother
1,2,Honduras,F,2015-01-01,2015-01-23,28212,1,Mother
2,3,Honduras,F,2015-01-01,2015-01-23,28212,2,Aunt
3,4,Honduras,M,2015-01-02,2015-01-30,27501,2,Brother
4,5,Honduras,M,2015-01-01,2015-02-13,78747,3,Other Cousin
...,...,...,...,...,...,...,...,...
553317,553316,Honduras,M,2023-05-20,2023-05-25,65201,1,Mother
553318,553317,Ecuador,M,2023-05-21,2023-05-26,07206,1,Mother
553319,553318,Guatemala,M,2023-05-21,2023-05-26,75080,2,Brother
553320,553319,Honduras,M,2023-05-21,2023-05-26,74055,2,Brother


## Analysis

From which countries did the most kids come?

In [57]:
# Count the children (IDs) per country of origin and show the ten largest groups
(
    migrants
    .groupby("Child's Country of Origin")[["ID"]]
    .count()
    .sort_values("ID", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,ID
Child's Country of Origin,Unnamed: 1_level_1
Guatemala,254231
Honduras,152704
El Salvador,95980
Mexico,14958
Ecuador,9082
Nicaragua,8096
India,2892
Venezuela,2427
Cuba,2388
Colombia,1766


Let's see a timeline of Guatemalan kids entering the country

In [58]:
# Add a column holding just the year in which each child entered the country
migrants["year_entered"] = migrants["Child's Date of Entry"].dt.year

# Count the IDs (children) for every country/year pair; reset_index turns the
# two grouping keys back into ordinary columns
origin_year = (
    migrants
    .groupby(["Child's Country of Origin", "year_entered"])["ID"]
    .count()
    .reset_index()
)

# Display the counts ordered by year first, then by country (both ascending)
origin_year.sort_values(
    by=["year_entered", "Child's Country of Origin"],
    ascending=[True, True],
)

Unnamed: 0,Child's Country of Origin,year_entered,ID
4,Albania,2015,2
13,Angola,2015,1
21,Argentina,2015,1
28,Armenia,2015,1
36,Azerbaijan,2015,1
...,...,...,...
455,United Kingdom,2023,2
464,United States of America,2023,33
472,Uzbekistan,2023,22
481,Venezuela,2023,640


In [59]:
# To answer the question, keep only the per-year rows whose country of origin is Guatemala
guatemala = origin_year.loc[origin_year["Child's Country of Origin"].eq("Guatemala")]
guatemala

Unnamed: 0,Child's Country of Origin,year_entered,ID
209,Guatemala,2015,18798
210,Guatemala,2016,23677
211,Guatemala,2017,14030
212,Guatemala,2018,23786
213,Guatemala,2019,25927
214,Guatemala,2020,6732
215,Guatemala,2021,69062
216,Guatemala,2022,57130
217,Guatemala,2023,15089


And let's just show off a couple more features, like renaming columns and creating new columns that rely on existing columns

In [60]:
# First, let's rename the ID column since it is the result of the count operation
guatemala = guatemala.rename(columns={"ID": "count_kids"})

# Now let's create a column that shows what share of all kids from Guatemala came in each year.
# Dividing a column by a single number is applied to every row at once, so no
# per-row function is needed. Note that the "pct" columns hold fractions
# between 0 and 1, not values out of 100.
total = guatemala["count_kids"].sum()
guatemala["pct guatemala"] = guatemala["count_kids"] / total

# Now we do the same thing, but calculate each year's share of all kids from every country
total_all = origin_year["ID"].sum()
guatemala["pct total"] = guatemala["count_kids"] / total_all
guatemala

Unnamed: 0,Child's Country of Origin,year_entered,count_kids,pct guatemala,pct total
209,Guatemala,2015,18798,0.073941,0.033979
210,Guatemala,2016,23677,0.093132,0.042798
211,Guatemala,2017,14030,0.055186,0.02536
212,Guatemala,2018,23786,0.093561,0.042995
213,Guatemala,2019,25927,0.101982,0.046865
214,Guatemala,2020,6732,0.02648,0.012169
215,Guatemala,2021,69062,0.271651,0.124835
216,Guatemala,2022,57130,0.224717,0.103267
217,Guatemala,2023,15089,0.059352,0.027275


## Advanced topic: joins

Lastly, we want to find areas in the US where relatively high numbers of kids were sent. We can do this by calculating a rate of unaccompanied migrant children per 10,000 residents in a county. To do this we will need to join a few datasets together.

First, we will join the migrants dataframe to a zipcode crosswalk. This will let us turn zipcodes into ZCTAs (ZIP Code Tabulation Areas). The details are not super important, but we need this step in order to join zipcodes to counties.

In [61]:
# ZIP code to ZCTA crosswalk, from the Health Resources and Services
# Administration: https://geocarenavigator.hrsa.gov/
# Every column is read as a string so the codes keep their leading zeros.
zctas = pd.read_csv("data/ZIP Code to ZCTA Crosswalk.csv", dtype=str)
zctas

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,zcta,zip_join_type
0,00501,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
1,00544,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
2,00601,Adjuntas,PR,Zip Code Area,00601,Zip matches ZCTA
3,00602,Aguada,PR,Zip Code Area,00602,Zip matches ZCTA
4,00603,Aguadilla,PR,Zip Code Area,00603,Zip matches ZCTA
...,...,...,...,...,...,...
41056,99926,Metlakatla,AK,Zip Code Area,99926,Zip matches ZCTA
41057,99927,Point Baker,AK,Zip Code Area,99927,Zip matches ZCTA
41058,99928,Ward Cove,AK,Post Office or large volume customer,99901,Spatial join to ZCTA
41059,99929,Wrangell,AK,Zip Code Area,99929,Zip matches ZCTA


In [62]:
# Attach a zcta to every migrant row by matching the sponsor zipcode against
# the crosswalk's ZIP_CODE. Both frames are indexed by zipcode so that join can
# line them up. From the zctas df we keep only one column, zcta.
# how="left" keeps every migrant row; a zipcode missing from the crosswalk gets
# an empty zcta.
migrants = (
    migrants
    .set_index("Sponsor Zipcode")
    .join(zctas.set_index("ZIP_CODE")[["zcta"]], how="left")
)
migrants

Unnamed: 0_level_0,ID,Child's Country of Origin,Child's Gender,Child's Date of Entry,Child's Date of Release,Sponsor Category,Relationship of Sponsor,year_entered,zcta
Sponsor Zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37863,1,Honduras,F,2015-01-01,2015-01-28,1,Mother,2015,37863
28212,2,Honduras,F,2015-01-01,2015-01-23,1,Mother,2015,28212
28212,3,Honduras,F,2015-01-01,2015-01-23,2,Aunt,2015,28212
27501,4,Honduras,M,2015-01-02,2015-01-30,2,Brother,2015,27501
78747,5,Honduras,M,2015-01-01,2015-02-13,3,Other Cousin,2015,78747
...,...,...,...,...,...,...,...,...,...
65201,553316,Honduras,M,2023-05-20,2023-05-25,1,Mother,2023,65201
07206,553317,Ecuador,M,2023-05-21,2023-05-26,1,Mother,2023,07206
75080,553318,Guatemala,M,2023-05-21,2023-05-26,2,Brother,2023,75080
74055,553319,Honduras,M,2023-05-21,2023-05-26,2,Brother,2023,74055


In [63]:
# Now we want to join zctas to counties, so we load the dataset that has a county for each zcta.
# Every column is read as a string (dtype=str), which keeps the leading zeros in the COUNTYFP codes.
zcta_to_county = pd.read_csv("data/ZCTAS to counties.csv", dtype=str)
# Display the lookup table
zcta_to_county

Unnamed: 0,ZCTA,COUNTY,STATE,COUNTYFP
0,35592,Lamar County,AL,01075
1,35616,Colbert County,AL,01033
2,35621,Morgan County,AL,01103
3,35651,Lawrence County,AL,01079
4,36010,Pike County,AL,01109
...,...,...,...,...
33786,10540,Westchester County,NY,36119
33787,23081,James City County,VA,51095
33788,23147,Charles City County,VA,51036
33789,23298,Richmond city,VA,51760


In [64]:
# Attach county information to every migrant row by matching on the zcta field
# of each dataset. This time all columns of the county dataset are kept.
# how="left" keeps every migrant row; a zcta without a county match gets empty
# county columns.
migrants = (
    migrants
    .set_index("zcta")
    .join(zcta_to_county.set_index("ZCTA"), how="left")
)
migrants

Unnamed: 0_level_0,ID,Child's Country of Origin,Child's Gender,Child's Date of Entry,Child's Date of Release,Sponsor Category,Relationship of Sponsor,year_entered,COUNTY,STATE,COUNTYFP
zcta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
37863,1,Honduras,F,2015-01-01,2015-01-28,1,Mother,2015,Sevier County,TN,47155
28212,2,Honduras,F,2015-01-01,2015-01-23,1,Mother,2015,Mecklenburg County,NC,37119
28212,3,Honduras,F,2015-01-01,2015-01-23,2,Aunt,2015,Mecklenburg County,NC,37119
27501,4,Honduras,M,2015-01-02,2015-01-30,2,Brother,2015,Harnett County,NC,37085
78747,5,Honduras,M,2015-01-01,2015-02-13,3,Other Cousin,2015,Travis County,TX,48453
...,...,...,...,...,...,...,...,...,...,...,...
65201,553316,Honduras,M,2023-05-20,2023-05-25,1,Mother,2023,Boone County,MO,29019
07206,553317,Ecuador,M,2023-05-21,2023-05-26,1,Mother,2023,Union County,NJ,34039
75080,553318,Guatemala,M,2023-05-21,2023-05-26,2,Brother,2023,Dallas County,TX,48113
74055,553319,Honduras,M,2023-05-21,2023-05-26,2,Brother,2023,Tulsa County,OK,40143


In [65]:
# Lastly, load the dataset with a population for each county so that we can calculate a rate
pop = pd.read_csv("data/PopulationEstimates.csv", dtype=str)

# Everything was read in as a string, so strip the commas out of the population
# column and then parse what is left as a number
pop["CENSUS_2020_POP"] = pd.to_numeric(pop["CENSUS_2020_POP"].str.replace(",", ""))
pop

Unnamed: 0,FIPStxt,State,Area_Name,CENSUS_2020_POP
0,00000,US,United States,331449281.0
1,01000,AL,Alabama,5024279.0
2,01001,AL,Autauga County,58805.0
3,01003,AL,Baldwin County,231767.0
4,01005,AL,Barbour County,25223.0
...,...,...,...,...
3278,72145,PR,Vega Baja Municipio,54414.0
3279,72147,PR,Vieques Municipio,8249.0
3280,72149,PR,Villalba Municipio,22093.0
3281,72151,PR,Yabucoa Municipio,30426.0


In [66]:
# Attach the 2020 census population to every migrant row by matching the
# county FIPS code (COUNTYFP) against the population table's FIPStxt.
# Only the CENSUS_2020_POP column is taken from the population table.
migrants = (
    migrants
    .set_index("COUNTYFP")
    .join(pop.set_index("FIPStxt")["CENSUS_2020_POP"], how="left")
)
migrants

Unnamed: 0_level_0,ID,Child's Country of Origin,Child's Gender,Child's Date of Entry,Child's Date of Release,Sponsor Category,Relationship of Sponsor,year_entered,COUNTY,STATE,CENSUS_2020_POP
COUNTYFP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
47155,1,Honduras,F,2015-01-01,2015-01-28,1,Mother,2015,Sevier County,TN,98380.0
37119,2,Honduras,F,2015-01-01,2015-01-23,1,Mother,2015,Mecklenburg County,NC,1115482.0
37119,3,Honduras,F,2015-01-01,2015-01-23,2,Aunt,2015,Mecklenburg County,NC,1115482.0
37085,4,Honduras,M,2015-01-02,2015-01-30,2,Brother,2015,Harnett County,NC,133568.0
48453,5,Honduras,M,2015-01-01,2015-02-13,3,Other Cousin,2015,Travis County,TX,1290188.0
...,...,...,...,...,...,...,...,...,...,...,...
29019,553316,Honduras,M,2023-05-20,2023-05-25,1,Mother,2023,Boone County,MO,183610.0
34039,553317,Ecuador,M,2023-05-21,2023-05-26,1,Mother,2023,Union County,NJ,575345.0
48113,553318,Guatemala,M,2023-05-21,2023-05-26,2,Brother,2023,Dallas County,TX,2613539.0
40143,553319,Honduras,M,2023-05-21,2023-05-26,2,Brother,2023,Tulsa County,OK,669279.0


In [68]:
# Collapse the per-child rows into one row per county (grouped by county FIPS code).
# For each county we count the IDs to get the number of kids, and take the first
# population, county name and state value found in the group. Each result
# column is named directly in the aggregation:
#   pop        <- first CENSUS_2020_POP
#   count_kids <- count of ID
migrants = migrants.reset_index().groupby("COUNTYFP").agg(
    pop=("CENSUS_2020_POP", "first"),
    count_kids=("ID", "count"),
    COUNTY=("COUNTY", "first"),
    STATE=("STATE", "first"),
)
migrants

Unnamed: 0_level_0,pop,count_kids,COUNTY,STATE
COUNTYFP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01001,58805.0,6,Autauga County,AL
01003,231767.0,573,Baldwin County,AL
01005,25223.0,131,Barbour County,AL
01007,22293.0,47,Bibb County,AL
01009,59134.0,110,Blount County,AL
...,...,...,...,...
72031,154815.0,1,Carolina Municipio,
72127,342259.0,9,San Juan Municipio,
72137,75293.0,2,Toa Baja Municipio,
78010,,2,St. Croix Island,


In [71]:
# Lastly we will create a new rate column: kids per 10,000 residents of each county.
# The arithmetic is done on whole columns at once rather than row by row.
# Counties with no population value end up with an empty (NaN) rate.
migrants["rate"] = migrants["count_kids"] / migrants["pop"] * 10000

# Show the ten counties with the highest rate
migrants.sort_values("rate", ascending=False).head(10)

Unnamed: 0_level_0,pop,count_kids,COUNTY,STATE,rate
COUNTYFP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
27105,22290.0,696,Nobles County,MN,312.247645
20075,2518.0,74,Hamilton County,KS,293.884035
48111,7115.0,198,Dallam County,TX,278.285313
48369,9869.0,229,Parmer County,TX,232.03972
22023,5617.0,122,Cameron Parish,LA,217.197792
31037,10582.0,224,Colfax County,NE,211.680212
28123,27990.0,579,Scott County,MS,206.859593
46005,19149.0,365,Beadle County,SD,190.610476
48017,6904.0,131,Bailey County,TX,189.745075
46003,2747.0,51,Aurora County,SD,185.65708
