In [None]:
import pandas as pd

## Set up

In [None]:
# Load the raw data. Every column is read as a string; the columns that need
# another type are converted explicitly below.
migrants = pd.read_csv('data/migrants_data.csv', dtype=str)

# Drop rows that repeat an ID (drop_duplicates keeps the first row for each ID)
before = len(migrants)
print(f"Before dedupe: {before}")
migrants = migrants.drop_duplicates(subset="ID")
after = len(migrants)
print(f"After dedupe: {after}")
print(f"diff: {before - after}")

# We imported all columns as type string, so let's manually convert the date columns to datetimes
migrants["Child's Date of Entry"] = pd.to_datetime(migrants["Child's Date of Entry"])
migrants["Child's Date of Release"] = pd.to_datetime(migrants["Child's Date of Release"])

# Next, let's drop rows with invalid zipcodes: keep only values that consist of
# exactly five ASCII digits.
# - na=False marks missing zipcodes as invalid explicitly, instead of relying on a
#   second condition to cancel out a missing value that was coerced to True.
# - [0-9] rejects non-ASCII numeric characters that str.isnumeric() would accept.
before = len(migrants)
has_valid_zipcode = migrants["Sponsor Zipcode"].str.fullmatch(r"[0-9]{5}", na=False)
migrants = migrants[has_valid_zipcode]
after = len(migrants)
print(f"before fixing zipcodes: {before}")
print(f"after fixing zipcodes: {after}")
print(f"diff: {before - after}")

In [None]:
# Display the migrants DataFrame as it stands after the set-up cell above
migrants

## Analysis

From which countries did the most kids come?

In [None]:
# Count the IDs recorded for each country of origin, then show the ten countries
# with the largest counts
kids_per_country = migrants.groupby("Child's Country of Origin")[["ID"]].count()
kids_per_country.sort_values("ID", ascending=False).head(10)

Let's see a timeline of Guatemalan kids entering the country

In [None]:
# Create a new column that holds just the year in which each child entered the country
migrants["year_entered"] = migrants["Child's Date of Entry"].dt.year

# Count the IDs (children) for every country/year pair, then turn the two
# grouping keys back into regular columns with reset_index
kids_by_country_and_year = migrants.groupby(["Child's Country of Origin", "year_entered"])["ID"].count()
origin_year = kids_by_country_and_year.reset_index()

# Display this new dataframe sorted by year and then by country
# (ascending order is the default for both sort keys)
origin_year.sort_values(["year_entered", "Child's Country of Origin"])

In [None]:
# And now to answer the question, keep only the rows of the per-country,
# per-year counts whose country of origin is Guatemala
is_guatemala = origin_year["Child's Country of Origin"].eq("Guatemala")
guatemala = origin_year[is_guatemala]
guatemala

And let's just show off a couple more features, like renaming columns and creating new columns that rely on existing columns

In [None]:
# First, let's rename the ID column since it is the result of the count operation
guatemala = guatemala.rename(columns={"ID": "count_kids"})

# Now let's create a column showing what share of all kids from Guatemala came in each year.
# Dividing a column by a number is applied to every row at once, so no apply()/lambda is needed.
# Note: the values are fractions (0 to 1), not percentages scaled to 100.
total = guatemala["count_kids"].sum()
guatemala["pct guatemala"] = guatemala["count_kids"] / total

# Now we do the same thing, but as a share of all kids counted in origin_year
# (every country and every year)
total_all = origin_year["ID"].sum()
guatemala["pct total"] = guatemala["count_kids"] / total_all
guatemala

## Advanced topic: joins

Lastly, we want to find areas in the US where relatively high numbers of kids were sent. We can do this by calculating a rate of unaccompanied migrant children per 10,000 residents in a county. To do this we will need to join a few datasets together.

First, we will join the migrants dataframe to a zipcode crosswalk. This will let us turn zipcodes into zctas. The details are not super important, but this is an important step in order to join zipcodes to counties.

In [None]:
# ZIP code -> ZCTA crosswalk, from the Health Resources and Services Administration
# (https://geocarenavigator.hrsa.gov/). Every column is read as a string.
zcta_crosswalk_path = 'data/ZIP Code to ZCTA Crosswalk.csv'
zctas = pd.read_csv(zcta_crosswalk_path, dtype=str)
zctas

In [None]:
# Join the migrants df to the zcta crosswalk by matching "Sponsor Zipcode" to "ZIP_CODE".
# From the zctas df we only keep one column, zcta. join() is a left join by default,
# so migrant rows without a match are kept with a missing zcta.
# NOTE(review): if a ZIP_CODE appears more than once in the crosswalk, the matching
# migrant rows are repeated by the join — confirm the crosswalk has one row per ZIP code.
migrants_by_zipcode = migrants.set_index("Sponsor Zipcode")
zcta_by_zipcode = zctas.set_index("ZIP_CODE")[["zcta"]]
migrants = migrants_by_zipcode.join(zcta_by_zipcode)
migrants

In [None]:
# Next we want to go from zctas to counties, so load the dataset that lists a
# county for each zcta (again reading every column as a string)
zcta_county_path = "data/ZCTAS to counties.csv"
zcta_to_county = pd.read_csv(zcta_county_path, dtype=str)
zcta_to_county

In [None]:
# Join the two datasets by matching the migrants' "zcta" to the county dataset's "ZCTA".
# This time every column of the county dataset is kept.
# NOTE(review): if a ZCTA is listed under more than one county, each matching migrant
# row is repeated once per county, which would inflate the counts computed later — confirm
# the county file has one row per ZCTA.
migrants_by_zcta = migrants.set_index("zcta")
county_by_zcta = zcta_to_county.set_index("ZCTA")
migrants = migrants_by_zcta.join(county_by_zcta)
migrants

In [None]:
# Finally, load the county population estimates so that we can calculate a rate
pop = pd.read_csv("data/PopulationEstimates.csv", dtype=str)

# Everything was read in as a string, so strip the commas out of the population
# figures and then convert the cleaned text into numbers
census_2020_text = pop["CENSUS_2020_POP"].str.replace(",", "")
pop["CENSUS_2020_POP"] = pd.to_numeric(census_2020_text)
pop

In [None]:
# Attach the 2020 census population to every migrant row by matching the
# migrants' "COUNTYFP" to the population dataset's "FIPStxt"
migrants_by_county = migrants.set_index("COUNTYFP")
population_by_fips = pop.set_index("FIPStxt")["CENSUS_2020_POP"]
migrants = migrants_by_county.join(population_by_fips)
migrants

In [None]:
# Collapse the data to one row per county so that a rate can be calculated.
# reset_index turns the index back into a regular column before grouping on "COUNTYFP".
# Named aggregation builds each output column directly under its final name:
#   pop        - first population value in the group
#   count_kids - number of IDs in the group
#   COUNTY     - first county name in the group
#   STATE      - first state value in the group
migrants = migrants.reset_index().groupby("COUNTYFP").agg(
    pop=("CENSUS_2020_POP", "first"),
    count_kids=("ID", "count"),
    COUNTY=("COUNTY", "first"),
    STATE=("STATE", "first"),
)
migrants

In [None]:
# Lastly we will create a new rate column: kids per 10,000 residents of the county.
# Column arithmetic is applied to all rows at once, so there is no need for a
# row-by-row apply(); it also works on an empty frame, and a missing or zero
# population produces NaN/inf in that row rather than stopping the cell.
migrants["rate"] = migrants["count_kids"] / migrants["pop"] * 10000

# Show the ten counties with the highest rate
migrants.sort_values("rate", ascending=False).head(10)