In [2]:
import pandas as pd
import datetime
import numpy as np

In [3]:
# Load the arrests data into a DataFrame.
# NOTE(review): 'arrests.csv' is presumably a renamed local copy of
# '2025-ICLI-00019_2024-ICFO-39357_ERO Admin Arrests.csv' — confirm.
arrests = pd.read_csv('arrests.csv')

In [4]:
# The arrests data carries no age or birth date, only a birth year, so an
# approximate age is derived from the birth year instead.
# NOTE(review): the age is measured against the year in which the notebook is run
# (2025 when written), not the year of the apprehension, so every statistic below
# shifts if the notebook is re-run in a later year.
current_year = datetime.date.today().year
arrests['age'] = current_year - arrests['Birth Year']

# Parse the apprehension timestamp (month/day/year hour:minute) and keep the
# calendar day of each apprehension in its own column.
arrests['Apprehension Date'] = pd.to_datetime(
    arrests['Apprehension Date'],
    format='%m/%d/%Y %H:%M',
)
arrests['Apprehension Day'] = arrests['Apprehension Date'].dt.date

These are basic ways to compute descriptive statistics from a pandas Series or a NumPy array.

In [4]:
print("=== Descriptive Statistics on 'age' ===")

# --- MEAN ---
# Mean of the 'age' column using the pandas Series method.
mean_pandas = arrests['age'].mean()
# The same statistic by hand: the total of the values divided by how many there are.
mean_manual = sum(arrests['age']) / len(arrests['age'])
def find_mean_verbose(series):
    """Compute the arithmetic mean of `series` with an explicit loop.

    Args:
        series: A sized iterable of numbers (for example a pandas Series or a list).

    Returns:
        The sum of the values divided by their count, or NaN when `series`
        is empty (an empty pandas Series also has a NaN mean).
    """
    count = len(series)
    if count == 0:
        # Nothing to average: return NaN instead of raising ZeroDivisionError.
        return float('nan')
    running_total = 0
    for val in series:
        running_total += val
    return running_total / count
# Loop-based mean of the same column, for comparison with mean_pandas and mean_manual.
mean_verbose = find_mean_verbose(arrests['age'])

# Print the three results one after another so they can be compared directly.
print(f"Mean (Pandas): {mean_pandas}")
print(f"Mean (Manual): {mean_manual}")
print(f"Mean (Verbose): {mean_verbose}")

=== Descriptive Statistics on 'age' ===
Mean (Pandas): 35.04964445416362
Mean (Manual): 35.04964445416362
Mean (Verbose): 35.04964445416362


In [5]:
# --- MEDIAN ---
# Median of the 'age' column using the pandas Series method.
median_pandas = arrests['age'].median()
# Median of the same column using NumPy's function.
median_manual = np.median(arrests['age'])

# The leading "\n" prints a blank line before the first result.
print(f"\nMedian (Pandas): {median_pandas}")
print(f"Median (Manual - NumPy): {median_manual}")


Median (Pandas): 34.0
Median (Manual - NumPy): 34.0


In [9]:
# --- MODE ---
# pandas returns every value that is tied for most frequent.
mode_pandas = arrests['age'].mode()

def find_mode_verbose(series):
    """Return the most frequent value in `series`, counting in a single pass.

    Args:
        series: An iterable of hashable values (for example a pandas Series or a list).

    Returns:
        The value that occurs most often. When several values are tied, the
        smallest of them is returned so the result does not depend on
        iteration order. Returns None when `series` is empty.
    """
    counts = {}
    for val in series:
        counts[val] = counts.get(val, 0) + 1
    if not counts:
        return None
    highest = max(counts.values())
    return min(val for val, seen in counts.items() if seen == highest)

# One pass over the column instead of re-scanning the whole list once per distinct value.
mode_manual = find_mode_verbose(arrests['age'])  # Only shows one mode

print(f"\nMode (Pandas): {mode_pandas.tolist()} (can show multiple modes)")
print(f"Mode (Manual): {mode_manual}")


Mode (Pandas): [33] (can show multiple modes)
Mode (Manual): 33


In [8]:
# --- VARIANCE ---
variance_pandas = arrests['age'].var()  # Sample variance (ddof=1)
# ddof=1 is passed explicitly so the NumPy call computes the sample variance as well.
variance_manual = np.var(arrests['age'], ddof=1)
def find_variance_verbose(series, ddof=1):
    """Compute the variance of `series` with explicit loops.

    Args:
        series: A sized iterable of numbers (for example a pandas Series or a list).
        ddof: Delta degrees of freedom. The sum of squared deviations is divided
            by ``len(series) - ddof``; the default of 1 gives the sample variance.

    Returns:
        The variance, or NaN when `series` is empty or ``len(series) - ddof``
        is not positive (for example a single value with ``ddof=1``).
    """
    count = len(series)
    divisor = count - ddof
    if count == 0 or divisor <= 0:
        # Too few values for this ddof: return NaN instead of dividing by zero.
        return float('nan')
    # The mean is accumulated by hand so plain lists work, not only pandas objects.
    total = 0
    for val in series:
        total += val
    mean = total / count
    sum_of_squares = 0
    for val in series:
        sum_of_squares += (val - mean) ** 2
    return sum_of_squares / divisor
# Loop-based variance of the same column, for comparison with variance_pandas and variance_manual.
variance_verbose = find_variance_verbose(arrests['age'])

# Each result is formatted to two decimal places.
print(f"\nVariance (Pandas): {variance_pandas:.2f}")
print(f"Variance (Manual - NumPy): {variance_manual:.2f}")
print(f"Variance (Verbose): {variance_verbose:.2f}")


Variance (Pandas): 121.40
Variance (Manual - NumPy): 121.40
Variance (Verbose): 121.40


### These approaches are more involved, but they allow the statistics to be grouped by other columns

In [16]:
# Mean, median, mode and count of 'age' for each apprehension state.
arrests.groupby('Apprehension State')['age'].agg(
    mean='mean',
    median='median',
    # First value returned by Series.mode(), or None when the group has no mode.
    # next(iter(...), None) evaluates x.mode() once per group instead of twice.
    mode=lambda x: next(iter(x.mode()), None),
    count='count'
)

Unnamed: 0_level_0,mean,median,mode,count
Apprehension State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALABAMA,34.150597,33.0,26,4190
ALASKA,37.617647,38.5,41,68
ARIZONA,35.268225,34.0,31,5391
ARKANSAS,34.948914,34.0,25,3132
ARMED FORCES - EUROPE,36.500000,36.5,34,2
...,...,...,...,...
VIRGINIA,34.092712,33.0,30,5447
WASHINGTON,34.444690,34.0,31,2260
WEST VIRGINIA,36.960967,36.0,31,538
WISCONSIN,34.990297,34.0,31,1649


In [21]:
# Mean, median, mode and count of 'age' for each (apprehension state, gender) pair.
arrests.groupby(['Apprehension State','Gender'])['age'].agg(
    mean='mean',
    median='median',
    # First value returned by Series.mode(), or None when the group has no mode.
    # next(iter(...), None) evaluates x.mode() once per group instead of twice.
    mode=lambda x: next(iter(x.mode()), None),
    count='count'
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,mode,count
Apprehension State,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALABAMA,Female,37.493289,36.0,36,596
ALABAMA,Male,33.590808,32.0,26,3590
ALABAMA,Unknown,38.500000,37.0,37,4
ALASKA,Female,37.000000,38.0,56,7
ALASKA,Male,37.688525,39.0,41,61
...,...,...,...,...,...
WISCONSIN,Female,31.527675,31.0,24,271
WISCONSIN,Male,35.671263,35.0,31,1378
WYOMING,Female,29.482759,29.0,26,29
WYOMING,Male,33.824207,33.0,32,347
