In [3]:
import os

import numpy as np
import pandas as pd

# Root directory that holds the locally stored datasets.
# Set the DATASETS_LOCATION environment variable to run the notebook against a
# copy in another location; when it is unset, the original hard-coded path is
# used, so existing setups keep working. The value stays a plain string because
# the cells below build file paths with string concatenation.
DATASETS_LOCATION = os.environ.get("DATASETS_LOCATION", "/Users/gomerudo/workspace/datasets")


## Load clean datasets

In [4]:
# The datasets are read from a local directory because they are too large to
# download or upload from within the notebook.


def _read_clean_csv(csv_path):
    """Read one CSV file, treating empty and single-space cells as missing values."""
    return pd.read_csv(csv_path, na_values=["", " "])


# Full paths of the two cleaned CSV files under DATASETS_LOCATION.
DATASET_CRIMES_PATH = "".join(
    [DATASETS_LOCATION, "/chicago_crimes_2001_present_clean.csv"]
)
DATASET_INDICATORS_PATH = "".join(
    [DATASETS_LOCATION, "/chicago_socioeconomic_indicators_2008_2012_clean.csv"]
)

# Load both files into DataFrames (crimes first, then indicators).
crimes_df = _read_clean_csv(DATASET_CRIMES_PATH)
indicators_df = _read_clean_csv(DATASET_INDICATORS_PATH)

In [5]:
# Preview: first five rows of the crimes DataFrame.
crimes_df.head(n=5)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Community Area,Year,Latitude,Longitude
0,10000092,HY189866,03/18/2015 07:44:00 PM,047XX W OHIO ST,041A,BATTERY,AGGRAVATED: HANDGUN,STREET,False,False,1111,11,25,2015,41.891399,-87.744385
1,10000094,HY190059,03/18/2015 11:00:00 PM,066XX S MARSHFIELD AVE,4625,OTHER OFFENSE,PAROLE VIOLATION,STREET,True,False,725,7,67,2015,41.773372,-87.665319
2,10000095,HY190052,03/18/2015 10:45:00 PM,044XX S LAKE PARK AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,222,2,39,2015,41.813861,-87.596643
3,10000096,HY190054,03/18/2015 10:30:00 PM,051XX S MICHIGAN AVE,0460,BATTERY,SIMPLE,APARTMENT,False,False,225,2,40,2015,41.800802,-87.622619
4,10000097,HY189976,03/18/2015 09:00:00 PM,047XX W ADAMS ST,031A,ROBBERY,ARMED: HANDGUN,SIDEWALK,False,False,1113,11,25,2015,41.878065,-87.743354


In [6]:
# Preview: first five rows of the socioeconomic indicators DataFrame.
indicators_df.head(n=5)

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0


## Build the inputs for the network creation

### First, create the CSV for the nodes

In [7]:
# Build the node table for the network. Every row has the layout
# [NodeID, 'NodeLabel', <label>, 'Name', <display name>], matching the column
# list given to the DataFrame at the end of this cell.
#
# Rows are produced with generator expressions so that each loop variable is
# scoped to its own section and cannot be picked up by another section.
nodes_list = []

# Communities: one node per distinct 'Community Area' value, with ID "C<value>".
communities = np.sort(crimes_df['Community Area'].unique())
nodes_list.extend(
    ["C{}".format(community), 'NodeLabel', 'Community', 'Name', "Community {}".format(community)]
    for community in communities
)

# Districts: one node per distinct 'District' value, with ID "D<value>".
# Fix: the display name is now built from the district itself. It used to be
# formatted with the leftover `community` loop variable, which gave every
# district node the same, wrong name.
districts = np.sort(crimes_df['District'].unique())
nodes_list.extend(
    ["D{}".format(district), 'NodeLabel', 'District', 'Name', "District {}".format(district)]
    for district in districts
)

# Crime types: one node per distinct 'Primary Type' value. The ID is
# positional ("CT_<i>"), where i is the index of the type in the sorted array,
# so the IDs depend on the set of types present in the data.
crime_types = np.sort(crimes_df['Primary Type'].unique())
nodes_list.extend(
    ["CT_{}".format(i), 'NodeLabel', 'CrimeType', 'Name', crime_type]
    for i, crime_type in enumerate(crime_types)
)

# Cases: one node per distinct 'Case Number' value, with ID "CASE_<number>".
cases = np.sort(crimes_df['Case Number'].unique())
nodes_list.extend(
    ["CASE_{}".format(case), 'NodeLabel', 'CaseNumber', 'Name', case]
    for case in cases
)

# Assemble the node table; it is written to disk in a separate cell.
nodes_df = pd.DataFrame(nodes_list, columns = ["NodeID", "Property1", "Value1", "Property2", "Value2"])
nodes_df

Unnamed: 0,NodeID,Property1,Value1,Property2,Value2
0,C1,NodeLabel,Community,Name,Community 1
1,C2,NodeLabel,Community,Name,Community 2
2,C3,NodeLabel,Community,Name,Community 3
3,C4,NodeLabel,Community,Name,Community 4
4,C5,NodeLabel,Community,Name,Community 5
5,C6,NodeLabel,Community,Name,Community 6
6,C7,NodeLabel,Community,Name,Community 7
7,C8,NodeLabel,Community,Name,Community 8
8,C9,NodeLabel,Community,Name,Community 9
9,C10,NodeLabel,Community,Name,Community 10


In [None]:
# Persist the node table as a CSV file (without the index column) so it can be
# reused later.
DATASET_NODES_PATH = "".join([DATASETS_LOCATION, "/timewise_nodes.csv"])
nodes_df.to_csv(path_or_buf=DATASET_NODES_PATH, index=False)