/
cleaning-catalan-juvenile-recidivism-data.py
131 lines (109 loc) · 5.27 KB
/
cleaning-catalan-juvenile-recidivism-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#%% Importing
import pandas as pd
#%% Loading data
raw_file_path = 'data\\raw\\catalan-juvenile-recidivism\\reincidenciaJusticiaMenors.xlsx'
processed_file_path = 'data\\processed\\catalan-juvenile-recidivism'
column_translation_file_path = 'data\\raw\\catalan-juvenile-recidivism\\catalan-juvenile-recidivism-columns.xlsx'
raw = pd.read_excel(raw_file_path)
columns_translation = pd.read_excel(column_translation_file_path)
# Shape of data
#raw.shape # (4753, 143)
# renaming to english columns
catalan_names = raw.columns
raw.columns = columns_translation.translation
#%% Filling or dropping NaNs
# Dropping columns with only NaN
df = raw.drop(raw.columns[(raw.isnull().sum(axis = 0) == raw.shape[0])], axis = 1)
# Offenders with missing geographical areas of origin all come from Espanya
# so we impute Espanya on missing values
df.loc[df.V4_area_origin.isnull()].V2_nationality_type.unique()
df.loc[df.V4_area_origin.isnull(), 'V4_area_origin'] = 'Espanya'
# For V12 and V20 NaN is equivalent to zero
df.V12_n_criminal_record.fillna(0, inplace = True)
df.V20_n_juvenile_records.fillna(0, inplace = True)
# For V26 we combine with column V25 because it has info about MRM and ATM,
# which matches the places with NaNs in V26
df.V26_finished_measure_grouped.fillna(df.V25_MRM_ATM_or_enforcement_actions, inplace = True)
# Dropping rows with NaNs in V5, V6, V7 and V8
df.dropna(axis = 0, subset=['V5_age_cat', 'V6_province', 'V7_region', 'V8_age'], inplace = True)
#%% Filtering out columns, which are either specific MRM, ATM, Savry, 2013 or 2015 data
dfsub = df.filter(list(df.columns[list(range(31))+[114]]))
#%%
# Dropping the column V14_main_crime, V7_region,
# and V3_nationality_country as they are too specific
# Also dropping V5_age_cat, as we have the numeric variable
dfsub.drop(columns=['V2_nationality_type',
'V3_nationality_country',
'V5_age_cat',
'V7_region',
'V14_main_crime',
'V25_MRM_ATM_or_enforcement_actions'], inplace=True)
#%% Replacing some of the catalan words with english
V1_map = {'Dona': 'female', 'Home': 'male'}
V4_map = {'Espanya': 'Spain',
'Magrib': 'Maghreb',
'Centre i Sud Amèrica': 'Latin America',
'Altres': 'Other',
'Europa': 'Europe'}
V11_map = {'Amb antecedents': 1, 'Sense antecedents': 0}
V12_map = {'1 o 2 antecedents': '1-2',
'De 3 a 5 antecedents': '3-5',
'Més de 5 antecedents': '5+',
0: '0'}
V13_map = {'3 fets o més': '3+',
'2 fets': '2',
'1 fet' : '1'}
V15_map = {'Contra les persones': 'Against People',
'Contra la propietat violent': 'Against Property',
'Contra la propietat no violent': 'Against Property',
'Altres': 'Other'}
V16_map = {'Violent': 1, 'No violent': 0}
V17_map = {'Delicte': 1, 'Falta': 0}
V26_map = {'Internament': 'Internment',
'LV': 'Probation', #Llibertat vigilada
'ATM': 'ATM', #report of technical advice
'MRM': 'MRM', #meditation
'Altres medi obert': 'Other',
'PBC': 'Community Service'}
V27_map = {'Menys de 6 mesos': '<6 months',
'De 6 mesos a 1 any': '6 months < 1 year',
"Més d'1 any": '>1 year'}
V115_map = {'Sí': 1, 'No': 0}
dfsub = dfsub.assign(
V1_sex = lambda x: x.V1_sex.map(V1_map),
V4_area_origin = lambda x: x.V4_area_origin.map(V4_map),
V8_age = lambda x: x.V8_age.astype(int),
V11_criminal_record = lambda x: x.V11_criminal_record.map(V11_map),
V12_n_criminal_record = lambda x: x.V12_n_criminal_record.map(V12_map),
V13_n_crime_cat = lambda x: x.V13_n_crime_cat.map(V13_map),
V15_main_crime_cat = lambda x: x.V15_main_crime_cat.map(V15_map),
V16_violent_crime = lambda x: x.V16_violent_crime.map(V16_map),
V17_crime_classification = lambda x: x.V17_crime_classification.map(V17_map),
V26_finished_measure_grouped = lambda x: x.V26_finished_measure_grouped.map(V26_map),
V27_program_duration_cat = lambda x: x.V27_program_duration_cat.map(V27_map),
V115_RECID2015_recid = lambda x: x.V115_RECID2015_recid.map(V115_map)
)
#%% Imputing missing values
#There are some missing values in 'V28_days_from_crime_to_program
#we impute them by calculating the difference in days from crime committed
#to start of program
tmp = (dfsub.loc[
dfsub.V28_days_from_crime_to_program.isnull(),
['V22_main_crime_date', 'V30_program_start']
])
n_nans = tmp.shape[0]
vals = [(tmp.V22_main_crime_date.iloc[i].date()
-tmp.V30_program_start.iloc[i].date()).days for i in range(n_nans)]
dfsub.loc[dfsub.V28_days_from_crime_to_program.isnull(), 'V28_days_from_crime_to_program'] = vals
# Checking for nulls
#dfsub.isnull().sum(axis=0)
#%% handling dates to extract month and year from each. Day is omitted.
date_cols = dfsub.select_dtypes(include=['datetime64']).columns
for i, col in enumerate(date_cols):
dfsub[f"{col}_year"] = dfsub[col].dt.year
dfsub[f"{col}_month"] = dfsub[col].dt.month
dfsub.drop(date_cols, axis = 1, inplace=True)
# 'V31_program_end_year' is 2010 in all instances. Column is dropped.
dfsub.drop(["V31_program_end_year"], axis = 1, inplace=True)
#%% Writing to Excel
dfsub.to_csv(f"{processed_file_path}/catalan-juvenile-recidivism-subset.csv", index = False)