In [1]:
import os

import numpy as np
import pandas as pd

## Load datasets

In [80]:
# The CSVs are too big to download/upload alongside the notebook, so they are
# read from a local directory. Set the DATASETS_LOCATION environment variable
# to point at your own copy of the data; the fallback below is the original
# machine-specific path, kept only so existing setups continue to work.
# DATASETS_LOCATION stays a plain string because later cells concatenate to it.
DATASETS_LOCATION = os.environ.get("DATASETS_LOCATION", "/Users/gomerudo/workspace/datasets")

DATASET_CRIMES_PATH = os.path.join(DATASETS_LOCATION, "chicago_crimes_2001_present.csv")
DATASET_INDICATORS_PATH = os.path.join(DATASETS_LOCATION, "chicago_socioeconomic_indicators_2008_2012.csv")

# Load CSVs. Empty and single-space cells are passed as extra NaN markers so
# that the later dropna() calls remove rows containing them.
crimes_df = pd.read_csv(DATASET_CRIMES_PATH, na_values=["", " "])
indicators_df = pd.read_csv(DATASET_INDICATORS_PATH, na_values=["", " "])

## Clean crimes dataset

In [87]:
# Columns dropped from the crimes dataset before cleaning
to_be_removed = [
    'Location', 'Updated On', 'X Coordinate',
    'Y Coordinate', 'FBI Code', 'Ward',
]
# errors="ignore" skips labels that are not present instead of raising
crimes_df = crimes_df.drop(columns=to_be_removed, errors="ignore")
N_ORIGINAL = len(crimes_df)

# Discard every row that has a NaN in any of the remaining columns
crimes_df = crimes_df.dropna()
N_CLEAN = len(crimes_df)

# Report how many rows the NaN filter removed
print("Original size:", N_ORIGINAL)
print("Removed: {}".format(N_ORIGINAL - N_CLEAN))
print("Final size:", N_CLEAN)

# Cast the District and Community Area columns to integers in one pass
crimes_df = crimes_df.astype({'District': 'int', 'Community Area': 'int'})

# Merge the 'NON - CRIMINAL' spelling into 'NON-CRIMINAL'
crimes_df['Primary Type'] = crimes_df['Primary Type'].replace('NON - CRIMINAL', 'NON-CRIMINAL')

# Preview the cleaned frame
crimes_df.head()

Original size: 5856102
Removed: 0
Final size: 5856102


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Community Area,Year,Latitude,Longitude
0,10000092,HY189866,03/18/2015 07:44:00 PM,047XX W OHIO ST,041A,BATTERY,AGGRAVATED: HANDGUN,STREET,False,False,1111,11,25,2015,41.891399,-87.744385
1,10000094,HY190059,03/18/2015 11:00:00 PM,066XX S MARSHFIELD AVE,4625,OTHER OFFENSE,PAROLE VIOLATION,STREET,True,False,725,7,67,2015,41.773372,-87.665319
2,10000095,HY190052,03/18/2015 10:45:00 PM,044XX S LAKE PARK AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,222,2,39,2015,41.813861,-87.596643
3,10000096,HY190054,03/18/2015 10:30:00 PM,051XX S MICHIGAN AVE,0460,BATTERY,SIMPLE,APARTMENT,False,False,225,2,40,2015,41.800802,-87.622619
4,10000097,HY189976,03/18/2015 09:00:00 PM,047XX W ADAMS ST,031A,ROBBERY,ARMED: HANDGUN,SIDEWALK,False,False,1113,11,25,2015,41.878065,-87.743354


## Clean indicators

In [81]:
# Row count before any cleaning
M_ORIGINAL = len(indicators_df)

# Keep only the rows that have no NaN in any column
indicators_df = indicators_df.dropna()
M_CLEAN = len(indicators_df)

# Report how many rows the NaN filter removed
print("Original size:", M_ORIGINAL)
print("Removed: {}".format(M_ORIGINAL - M_CLEAN))
print("Final size:", M_CLEAN)

# Cast the community area number to an integer
indicators_df = indicators_df.astype({'Community Area Number': 'int'})

# Preview the cleaned frame
indicators_df.head()

Original size: 78
Removed: 1
Final size: 77


Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0


## Save datasets

In [88]:
# Save the cleaned datasets under DATASETS_LOCATION, with a "_clean" suffix
# added to the original file names.

DATASET_CRIMES_CLEAN_PATH = DATASETS_LOCATION + "/chicago_crimes_2001_present_clean.csv"
DATASET_INDICATORS_CLEAN_PATH = DATASETS_LOCATION + "/chicago_socioeconomic_indicators_2008_2012_clean.csv"
# Backward-compatible alias: the constant was originally misspelled ("INICATORS");
# keep the old name bound in case other cells still reference it.
DATASET_INICATORS_CLEAN_PATH = DATASET_INDICATORS_CLEAN_PATH

# NOTE(review): to_csv() writes the row index as an unnamed first column by
# default. Left unchanged so the output files keep their current layout;
# consider index=False once the readers of these files have been checked.
crimes_df.to_csv(DATASET_CRIMES_CLEAN_PATH)
indicators_df.to_csv(DATASET_INDICATORS_CLEAN_PATH)