### Imports

In [None]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt

### Data processing

In [None]:
# Provenance note: the original export began with rows (1-6) that were not part
# of the table. Those rows were deleted by hand and the file was re-saved as
# .csv, so the .csv files in this folder can be used as they are.

# Load the arrests table, discarding rows that have no birth year
arrests_csv = '2025-ICLI-00019_2024-ICFO-39357_ERO Admin Arrests.csv'
arrests = pd.read_csv(arrests_csv).dropna(subset='Birth Year')

# Uncomment to list every column name
# print(arrests.columns)

# Show the first row field by field; head() below often truncates the columns
first_record = arrests.iloc[0]
print(first_record)

# Preview the top five rows
arrests.head()

In [None]:
# The arrests data has no age (or birth date) to work with, only a birth year,
# so age is approximated as a difference between two calendar years.

# Parse the apprehension timestamp first so its year can be used below
arrests['Apprehension Date'] = pd.to_datetime(arrests['Apprehension Date'], format='%m/%d/%Y %H:%M')

# Anchor age to the year of the apprehension rather than to the year in which
# the notebook happens to be run; otherwise every re-run in a later year shifts
# all ages upward. Rows without an apprehension date fall back to the current
# year, which is what was used for every row before.
current_year = datetime.datetime.today().year
reference_year = arrests['Apprehension Date'].dt.year.fillna(current_year)
arrests['age'] = reference_year - arrests['Birth Year']

# Calendar day of the apprehension (time of day dropped), used for daily grouping
arrests['Apprehension Day'] = arrests['Apprehension Date'].dt.date

## Histogram example

In [None]:
# Width of each age bin in years: 1 gives one bar per year of age,
# 10 groups by ten years
agegroup = 1
# Bin edges start at 0 and run to at least 100 (assumed to be the oldest group).
# np.arange keeps every bin exactly `agegroup` wide even when it does not divide
# 100 evenly; an evenly spaced grid from 0 to 100 would silently stretch the bins.
bins = np.arange(0, 100 + agegroup, agegroup)

# Plot and label
plt.hist(arrests['age'], bins=bins)
plt.title('Histogram of Arrests by Age')
plt.xlabel('Age')
plt.ylabel('Count')

## Density Example

In [None]:
## Density example
import seaborn as sns

# Kernel density estimate of age; `bw_adjust` can tweak smoothness
density_ax = sns.kdeplot(arrests['age'], bw_adjust=1)

# Label the axes that kdeplot drew on
density_ax.set_xlabel('Age')
density_ax.set_ylabel('Density')
density_ax.set_title('Probability Density Function of Age')
density_ax.grid(True)

## Any observations on this graph?

## Scatter Plot

In [None]:
# One translucent marker per row of `arrests`: apprehension day on x, age on y.
# alpha=0.1 lets overlapping markers build up so dense regions stay readable.
plt.scatter(arrests['Apprehension Day'], arrests['age'], alpha=0.1)
# Label the axes and title the figure, as the other plots in this notebook do
plt.xlabel('Date')
plt.ylabel('Age')
plt.title('Arrests by age and date')

### Time series

In [None]:
# Group the arrests by calendar day and count the non-null entries of every
# column within each day
daily_arrest_groups = arrests.groupby(by='Apprehension Day')
daily_arrest_counts = daily_arrest_groups.count()
# Turn the day back into an ordinary column so it can be plotted and merged on
arrests_by_day = daily_arrest_counts.reset_index()


In [None]:
# Line chart of the daily count; the 'age' column holds the per-day count of
# rows that have an age
arrests_ax = plt.gca()
arrests_ax.plot(arrests_by_day['Apprehension Day'], arrests_by_day['age'])
arrests_ax.set_xlabel('Date')
arrests_ax.set_ylabel('Count')
arrests_ax.set_title('Arrests by date')
arrests_ax.grid(True)

### Ratio of removals

In [None]:
# Load the removals table
removals_csv = '2025-ICLI-00019_2024-ICFO-39357_ICE Removals_2025.csv'
removals = pd.read_csv(removals_csv)

# Parse the departure date and keep only the calendar date
departed_on = pd.to_datetime(removals['Departed Date'], format='%m/%d/%Y')
removals['Departed Date'] = departed_on.dt.date

In [None]:
# Count of removals by day: group on the departure date and count the non-null
# entries of every column within each day
daily_removal_groups = removals.groupby(by='Departed Date')
daily_removal_counts = daily_removal_groups.count()
# Turn the date back into an ordinary column so it can be merged on
removals_by_day = daily_removal_counts.reset_index()

In [None]:
# Align days for arrests and removals. The left join keeps every arrest day;
# days with no removals get NaN in the removals columns.
combined_df = pd.merge(
    arrests_by_day,
    removals_by_day,
    how='left',
    left_on='Apprehension Day',
    right_on='Departed Date',
)

In [None]:
# Two generic columns are used as the daily counts: 'Apprehension Date' for
# arrests and 'Birth Date_y' for removals (presumably the removals-side column;
# `_y` is the suffix merge gives the right-hand frame when a name clashes).
# NOTE(review): these come from count(), which counts non-null entries per
# column, so they equal the number of rows per day only if these columns have
# no missing values -- confirm against the data.
arrest_counts = combined_df['Apprehension Date']
removal_counts = combined_df['Birth Date_y']
combined_df['ratio'] = arrest_counts.div(removal_counts)

# Discard days for which no ratio could be computed
combined_df = combined_df.dropna(subset=['ratio'])

In [None]:
plt.plot(combined_df['Apprehension Day'],combined_df['ratio'])
plt.xlabel('Date')
plt.ylabel('Ratio of arrests to removals')
plt.title('Ratio of arrests to removals by date')
plt.grid(True)

### Summary stats

In [None]:
def _first_mode(ages):
    """Return the first value of ``ages.mode()``, or None if it is empty."""
    # Compute the mode once per group instead of once for the emptiness check
    # and again for the lookup
    modes = ages.mode()
    return modes.iloc[0] if not modes.empty else None


# Per-day summary statistics of age: one row per apprehension day, one column
# per statistic
summary = arrests.groupby('Apprehension Day')['age'].agg(
    mean='mean',
    median='median',
    mode=_first_mode,
    percentile_25=lambda x: x.quantile(0.25),
    percentile_75=lambda x: x.quantile(0.75)
).reset_index()

In [None]:
# Bare expression: the notebook renders the per-day summary table as this cell's output
summary

In [None]:
# Daily mean age over time. `summary` is built from the arrests table, so the
# labels refer to arrests, not removals.
plt.plot(summary['Apprehension Day'], summary['mean'])
plt.xlabel('Date')
plt.ylabel('Mean arrest age')
plt.title('Mean arrest age over time')
plt.grid(True)