In [2]:
import zipfile
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

-----------

# Pre-processing

Once we added our data to the map, we observed that there were some spelling mistakes in the county names.
So for this part we had to check the data state by state.

## California

This correction only needs to be run once, which is why the cell below is commented out.

In [14]:
"""df_ca = pd.read_pickle("../data/ca_cleaned.pkl", compression= "gzip")
df_ca['county_name'] = df_ca['county_name'].apply(lambda x : x.replace("Siskyou County", "Siskiyou County"))
df_ca['county_name'] = df_ca['county_name'].apply(lambda x : x.replace("Santa Barbera County", "Santa Barbara County"))
df_ca.to_pickle("../data/ca_cleaned.pkl", compression= "gzip")"""

--------------

# Data processing

## Creation of data by ethnicity and gender for each county

In [6]:
def pre_process(path):
    """Load a gzip-compressed pickle and derive the columns used downstream.

    Parameters:
        path: location of the gzip-compressed pickle to read.

    Returns:
        The loaded DataFrame with ``date`` converted to datetime, a ``year``
        column extracted from it, and ``county_name`` shortened by its last
        seven characters.
    """
    # NOTE(review): unpickling can execute arbitrary code; only read trusted files.
    stops = pd.read_pickle(path, compression="gzip")

    parsed_dates = pd.to_datetime(stops["date"])
    stops["date"] = parsed_dates
    stops["year"] = parsed_dates.dt.year

    # Drops the trailing 7 characters -- presumably the " County" suffix.
    stops["county_name"] = stops["county_name"].str.slice(stop=-7)
    return stops

def count_year_one_state(df, object = 'ethnicity'):
    """Count rows per year and county, split by race or by sex.

    Parameters:
        df: frame with ``year``, ``county_name``, ``date`` and the relevant
            ``subject_race`` / ``subject_sex`` column.
        object: ``'ethnicity'`` groups by ``subject_race``; any other value
            groups by ``subject_sex``.

    Returns:
        A flat DataFrame with the three grouping columns and ``nb_arrest``,
        the number of non-null ``date`` values in each group.
    """
    # Anything other than 'ethnicity' falls through to the per-sex breakdown.
    breakdown = "subject_race" if object == 'ethnicity' else "subject_sex"
    group_keys = ["year", "county_name", breakdown]

    per_group = df.groupby(group_keys)["date"].count()
    return per_group.to_frame(name="nb_arrest").reset_index()

def load_population(state, path='../data/us_county.csv'):
    """Load county-level population figures for a single state.

    Parameters:
        state: value to match against the ``state`` column of the CSV
            (callers in this notebook pass full names such as 'California';
            presumably the column uses the same spelling -- verify against
            the data file).
        path: CSV file to read. Defaults to the notebook's data file.

    Returns:
        A DataFrame restricted to rows of ``state``, without the ``fips``,
        ``female_percentage``, ``long``, ``lat``, ``median_age``,
        ``state_code`` and ``state`` columns, and with ``county`` shortened
        by its last seven characters.

    Raises:
        KeyError: if one of the columns to drop is missing from the CSV.
    """
    counties = pd.read_csv(path)

    # Filter on the *argument*. The earlier ``query("state == state")``
    # compared the column with itself, so every state was kept and later
    # merges on county name matched same-named counties in other states.
    in_state = counties[counties['state'] == state]

    unused_columns = ['fips', 'female_percentage', 'long', 'lat',
                      'median_age', 'state_code', 'state']
    # Build a new frame instead of mutating the filtered slice in place.
    return in_state.drop(columns=unused_columns).assign(
        # Drops the trailing 7 characters -- presumably the " County" suffix.
        county=lambda frame: frame['county'].str[:-7]
    )

def relative_arrest_computation(df_state, population, object='ethnicity'):
    """Attach population figures to the counts and compute a per-capita rate.

    Parameters:
        df_state: counts with ``county_name`` and ``nb_arrest`` columns (plus
            ``subject_sex`` when ``object`` is not ``'ethnicity'``).
        population: frame with ``county``, ``population``, ``male`` and
            ``female`` columns.
        object: ``'ethnicity'`` divides by the total ``population``; any other
            value divides female rows by ``female`` and male rows by ``male``.

    Returns:
        The inner-joined frame (without ``county``) plus ``relative_arrest``.
        In the per-sex case the result holds the female rows followed by the
        male rows; rows with any other ``subject_sex`` value are not included.
    """
    # Inner join: counties missing from either side are dropped.
    merged = df_state.merge(
        population, left_on='county_name', right_on='county'
    ).drop(columns='county')

    if object == 'ethnicity':
        merged['relative_arrest'] = merged['nb_arrest'] / merged['population']
        return merged

    per_sex = []
    for sex in ('female', 'male'):
        rows = merged.query(f"subject_sex == '{sex}'")
        # The denominator column carries the same name as the sex value.
        per_sex.append(rows.assign(relative_arrest=rows['nb_arrest'] / rows[sex]))
    return pd.concat(per_sex)

def create_df_one_year(path, state):
    """Build the per-county relative-arrest tables for one state.

    Parameters:
        path: gzip-compressed pickle handed to ``pre_process``.
        state: state name handed to ``load_population`` and written into the
            ``state`` column of both results.

    Returns:
        A ``(df_ethnicity, df_gender)`` tuple of DataFrames.
    """
    stops = pre_process(path)
    counts_by_race = count_year_one_state(stops)
    counts_by_sex = count_year_one_state(stops, object = 'gender')
    county_population = load_population(state)

    df_ethnicity = relative_arrest_computation(
        counts_by_race, county_population
    ).assign(state=state)
    df_gender = relative_arrest_computation(
        counts_by_sex, county_population, object='gender'
    ).assign(state=state)

    return df_ethnicity, df_gender

In [15]:
# Per-county relative-arrest tables for California (by ethnicity and by gender).
df_ethnicity_ca, df_gender_ca = create_df_one_year("../data/ca_cleaned.pkl", 'California')

In [16]:
# Per-county relative-arrest tables for Texas (by ethnicity and by gender).
df_ethnicity_tx, df_gender_tx = create_df_one_year("../data/tx_cleaned.pkl", 'Texas')

In [17]:
# Stack the California and Texas per-county tables and export them.
# concat keeps each frame's own row index, so index values can repeat across
# the two states; to_csv (default index=True) writes that index as the first,
# unnamed column of each file.
arrest_ethnicity = pd.concat([df_ethnicity_ca, df_ethnicity_tx])
arrest_gender = pd.concat([df_gender_ca, df_gender_tx])
arrest_ethnicity.to_csv('../data/arrest_ethnicity.csv')
arrest_gender.to_csv('../data/arrest_gender.csv')

## Creation of data frame by Ethnicity and gender for each state

In [42]:
def pre_process_by_state(path, state):
    """Load a gzip-compressed pickle and tag every row with a state label.

    Parameters:
        path: location of the gzip-compressed pickle to read.
        state: value stored in the new ``state`` column.

    Returns:
        The loaded DataFrame with ``date`` as datetime, a derived ``year``
        column, ``county_name`` shortened by its last seven characters, and
        a constant ``state`` column.
    """
    # NOTE(review): unpickling can execute arbitrary code; only read trusted files.
    frame = pd.read_pickle(path, compression="gzip")

    timestamps = pd.to_datetime(frame["date"])
    frame["date"] = timestamps
    frame["year"] = timestamps.dt.year

    # Drops the trailing 7 characters -- presumably the " County" suffix.
    frame["county_name"] = frame["county_name"].str.slice(stop=-7)
    frame["state"] = state
    return frame

def all_arrest_one_state(df):
    """Count rows per year.

    Parameters:
        df: frame with ``year`` and ``date`` columns.

    Returns:
        A DataFrame with ``year`` and ``nb_arrest_state``, the number of
        non-null ``date`` values in that year.
    """
    yearly_total = df.groupby(["year"])["date"].count()
    return yearly_total.to_frame(name="nb_arrest_state").reset_index()

def all_arrest_one_state_by_object(df, object = 'ethnicity'):
    """Count rows per year and state, split by race or by sex.

    Parameters:
        df: frame with ``year``, ``state``, ``date`` and the relevant
            ``subject_race`` / ``subject_sex`` column.
        object: ``'ethnicity'`` groups by ``subject_race``; any other value
            groups by ``subject_sex``.

    Returns:
        A flat DataFrame with the three grouping columns and ``nb_arrest``,
        the number of non-null ``date`` values in each group.
    """
    # Anything other than 'ethnicity' falls through to the per-sex breakdown.
    split_column = "subject_race" if object == 'ethnicity' else "subject_sex"

    grouped_counts = df.groupby(["year", "state", split_column])["date"].count()
    return grouped_counts.to_frame(name="nb_arrest").reset_index()

def compute_percentage(count_year, count_year_by_state):
    """Express each group's count as a percentage of its year's total.

    Parameters:
        count_year: frame with ``year`` and ``nb_arrest_state``.
        count_year_by_state: frame with ``year`` and ``nb_arrest``.

    Returns:
        ``count_year_by_state`` inner-joined with ``count_year`` on ``year``,
        plus ``percentage`` = ``nb_arrest / nb_arrest_state * 100``.
    """
    joined = count_year_by_state.merge(count_year, left_on='year', right_on='year')
    share_of_total = joined['nb_arrest'] / joined['nb_arrest_state']
    joined['percentage'] = share_of_total * 100
    return joined
        
def create_df_one_year_by_state(path, state):
    """Build the state-level percentage tables by ethnicity and by gender.

    Parameters:
        path: gzip-compressed pickle handed to ``pre_process_by_state``.
        state: label stored in the ``state`` column of the loaded frame.

    Returns:
        A ``(df_ethnicity, df_gender)`` tuple of DataFrames, each holding the
        per-group counts, the yearly totals and the resulting percentage.
    """
    stops = pre_process_by_state(path, state)
    yearly_totals = all_arrest_one_state(stops)
    counts_by_race = all_arrest_one_state_by_object(stops)
    counts_by_sex = all_arrest_one_state_by_object(stops, object = 'gender')

    return (
        compute_percentage(yearly_totals, counts_by_race),
        compute_percentage(yearly_totals, counts_by_sex),
    )

In [43]:
# State-level percentage tables; the second argument is the label written to
# the 'state' column (here the two-letter codes 'ca' / 'tx').
df_ca_ethnicity, df_ca_gender = create_df_one_year_by_state("../data/ca_cleaned.pkl", 'ca')
df_tx_ethnicity, df_tx_gender = create_df_one_year_by_state("../data/tx_cleaned.pkl", 'tx')

In [35]:
# NOTE(review): the saved output of this cell shows 'percentage' as a fraction
# (e.g. 163572 / 2494343 = 0.0656), whereas compute_percentage multiplies by
# 100. The output looks stale -- re-run this cell after the cells above.
df_ca_ethnicity.head()

Unnamed: 0,year,state,subject_race,nb_arrest,nb_arrest_state,percentage
0,2009,ca,asian/pacific islander,163572,2494343,0.065577
1,2009,ca,black,199846,2494343,0.08012
2,2009,ca,hispanic,798562,2494343,0.320149
3,2009,ca,other,164346,2494343,0.065887
4,2009,ca,white,1168017,2494343,0.468266


In [44]:
# Stack both states' ethnicity tables under a fresh 0..n-1 index and export.
# to_csv (default index=True) writes that index as the first, unnamed column.
count_ethnicity_state = pd.concat([df_ca_ethnicity, df_tx_ethnicity]).reset_index(drop=True)
count_ethnicity_state.to_csv('../data/count_ethnicity_state.csv')

In [45]:
# Stack both states' gender tables under a fresh 0..n-1 index and export.
# to_csv (default index=True) writes that index as the first, unnamed column.
count_gender_state = pd.concat([df_ca_gender, df_tx_gender]).reset_index(drop=True)
count_gender_state.to_csv('../data/count_gender_state.csv')