In [30]:
import pandas as pd

# load the dataset
data = pd.read_csv("./data/NCRB_CII-2019_Table_19B.2.csv")

# clean column names - strip whitespaces, lowercase, replace spaces with underscores
data.columns = data.columns.str.strip().str.lower().str.replace(' ','_').str.replace('-','_')

# rename a few long column names for clarity
data.rename(columns={
    'total_____male': 'total_male',
    'total_____female': 'total_female',
    'total_____total_persons_arrested_by_age_and_sex': 'total_arrests', 
},  
            inplace=True)

# drop s._no column
if 's._no'in data.columns:
    data.drop(columns='s._no', inplace=True)

# check and handle missing values
data.fillna(0, inplace=True)

# convert relevant columns to integer 
numeric_cols = data.columns.drop('city')
data[numeric_cols] = data[numeric_cols].astype(int)

# set city as index for easier analysis
data.set_index('city', inplace=True)

print(data.head())

                        juveniles_apprehended___boys  \
city                                                   
Ahmedabad (Gujarat)                              298   
Bengaluru(Karnataka)                             197   
Chennai(Tamil Nadu)                              816   
Coimbatore(Tamil Nadu)                            35   
Delhi                                           3073   

                        juveniles_apprehended___girls  \
city                                                    
Ahmedabad (Gujarat)                                 0   
Bengaluru(Karnataka)                                3   
Chennai(Tamil Nadu)                                 0   
Coimbatore(Tamil Nadu)                              0   
Delhi                                               2   

                        juveniles_apprehended___total  \
city                                                    
Ahmedabad (Gujarat)                               298   
Bengaluru(Karnataka)                

In [31]:
# Exploratory Data Analysis

# basic overview of the dataset
print(data.info())
print(data.describe())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, Ahmedabad (Gujarat) to TOTAL CITIES
Data columns (total 18 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   juveniles_apprehended___boys              20 non-null     int64
 1   juveniles_apprehended___girls             20 non-null     int64
 2   juveniles_apprehended___total             20 non-null     int64
 3   18_and_above_and_below_30_years___male    20 non-null     int64
 4   18_and_above_and_below_30_years___female  20 non-null     int64
 5   18_and_above_and_below_30_years___total   20 non-null     int64
 6   30_and_above_and_below_45_years___male    20 non-null     int64
 7   30_and_above_and_below_45_years___female  20 non-null     int64
 8   30_and_above_and_below_45_years___total   20 non-null     int64
 9   45_and_above_and_below_60_years___male    20 non-null     int64
 10  45_and_above_and_below_60_years___female 

In [21]:
# to check which cities reported the most crime-related arrests

top_arrests = data['total_arrests'].sort_values(ascending=False).head(10)
print(top_arrests)

city
TOTAL CITIES            434769
Delhi                    99086
Chennai(Tamil Nadu)      58508
Mumbai(Maharashtra)      54543
Ahmedabad (Gujarat)      34952
Bengaluru(Karnataka)     28358
Surat(Gujarat)           25620
Pune(Maharashtra)        17123
Jaipur(Rajasthan)        16344
Kochi(Kerala)            15209
Name: total_arrests, dtype: int64


In [23]:
# to confirm the gender bias in criminal involvement 

gender_totals = data[['total_male', 'total_female']].sum()
print(gender_totals)

total_male      821700
total_female     47838
dtype: int64


In [26]:
# to check how many under-18s are getting arrested - critical for youth crime policy

juveniles = data['juveniles_apprehended___total'].sum()
total = data['total_arrests'].sum()
print("Juveniles:", juveniles,"|Total:", total)
print("Juveniles %:", (juveniles / total) * 100)

Juveniles: 15612 |Total: 869538
Juveniles %: 1.7954361971529709


In [29]:
# to analyse which group had the most arrests

age_groups = {
    '18-30': int(data['18_and_above_and_below_30_years___total'].sum()),
    '30-45': int(data['30_and_above_and_below_45_years___total'].sum()),
    '45-60': int(data['45_and_above_and_below_60_years___total'].sum()),
    '60+': int(data['60_years_and_above___total'].sum())
}
print(age_groups)

{'18-30': 419708, '30-45': 305632, '45-60': 116770, '60+': 11816}
