In [1]:
import pandas as pd
import numpy as np

import time
import datetime

DATASETS_LOCATION = "/Users/gomerudo/workspace/datasets"
REDUCED_FILE = False

In [2]:
# Helper functions
def format_timestamp(timestamp_string):
    timestamp_format = '%m/%d/%Y %I:%M:%S %p'
    return datetime.datetime.strptime(timestamp_string, timestamp_format)
    
# String in the format: DD/MM/YYYY HH:MM:SS AM|PM
def roundupdown(timestamp_string):
    timestamp_format = '%m/%d/%Y %I:%M:%S %p'
    my_time = datetime.datetime.strptime(timestamp_string, timestamp_format)
    down_time = my_time.replace(microsecond = 0,second = 0,minute = 0)
    up_time = datetime.timedelta(minutes = 59) + down_time
    up_time = up_time.replace(second = 59)
    return down_time, up_time

def crime_type_to_key(crime_type):
    return "CT_{}".format(crime_type.replace(' ', '_'))

def case_number_to_key(case_number):
    return "CASE_{}".format(case_number)

def community_to_key(community_number):
    return "C{}".format(community_number)

def district_to_key(district_number):
    return "D{}".format(district_number)

## Load clean datasets

In [3]:
# Loading from absolute path, because the dataset is too big to download and to upload

DATASET_CRIMES_PATH = DATASETS_LOCATION + "/chicago_crimes_2001_present_clean.csv"
DATASET_INDICATORS_PATH = DATASETS_LOCATION + "/chicago_socioeconomic_indicators_2008_2012_clean.csv"

# Load CSVs
crimes_df = pd.read_csv(DATASET_CRIMES_PATH, na_values = ["", " "])

if REDUCED_FILE:
    crimes_df = crimes_df[:100000]

indicators_df = pd.read_csv(DATASET_INDICATORS_PATH, na_values = ["", " "])

In [4]:
# Show preview of crimes
crimes_df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Community Area,Year,Latitude,Longitude
0,10000092,HY189866,03/18/2015 07:44:00 PM,047XX W OHIO ST,041A,BATTERY,AGGRAVATED: HANDGUN,STREET,False,False,1111,11,25,2015,41.891399,-87.744385
1,10000094,HY190059,03/18/2015 11:00:00 PM,066XX S MARSHFIELD AVE,4625,OTHER OFFENSE,PAROLE VIOLATION,STREET,True,False,725,7,67,2015,41.773372,-87.665319
2,10000095,HY190052,03/18/2015 10:45:00 PM,044XX S LAKE PARK AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,222,2,39,2015,41.813861,-87.596643
3,10000096,HY190054,03/18/2015 10:30:00 PM,051XX S MICHIGAN AVE,0460,BATTERY,SIMPLE,APARTMENT,False,False,225,2,40,2015,41.800802,-87.622619
4,10000097,HY189976,03/18/2015 09:00:00 PM,047XX W ADAMS ST,031A,ROBBERY,ARMED: HANDGUN,SIDEWALK,False,False,1113,11,25,2015,41.878065,-87.743354


In [5]:
# Show preview of indicators
indicators_df.head()

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0


## Do the stuff for the network creation :D

In [6]:
# Loading from absolute path, because the dataset is too big to download and to upload
DATASET_NODES_PATH = DATASETS_LOCATION + "/timewise_nodes.csv"
DATASET_EDGES_PATH = DATASETS_LOCATION + "/timewise_edges.csv"

# Load CSVs
nodes_df = pd.read_csv(DATASET_NODES_PATH, na_values = ["", " "])
edges_df = pd.read_csv(DATASET_EDGES_PATH, na_values = ["", " "])

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
nodes_list = nodes_df.values.tolist()
edges_list = edges_df.values.tolist()

### - First create the csv for the nodes

In [6]:
import time

start_time = time.time()

nodes_list = []

headers = ["NodeID", 
           "Property1", "Value1", 
           "Property2", "Value2", 
           "Property3", "Value3", 
           "Property4", "Value4",
           "Property5", "Value5"]

# For communities
communities = crimes_df['Community Area'].unique()
communities = np.sort(communities)

for community in communities:
    nodes_list.append([community_to_key(community), # NodeID
                       'NodeLabel', # Property1
                       'Community', # Value1
                       'Name', # Property2
                       community_to_key(community), # Value2
                       np.NAN, # Property3
                       np.NAN, # Value3
                       np.NAN, # Property4
                       np.NAN, # Value4
                       np.NAN, # Property5
                       np.NAN]) # Value5

# For districts
districts = crimes_df['District'].unique()
districts = np.sort(districts)

for district in districts:
    nodes_list.append([district_to_key(district), #NodeID
                       'NodeLabel', # Property1
                       'District', # Value1
                       'Name', # Property2
                       district_to_key(district), # Value2
                       np.NAN, # Property3
                       np.NAN, # Value3
                       np.NAN, # Property4
                       np.NAN, # Value4
                       np.NAN, # Property5
                       np.NAN]) # Value5

# For primary type...
crime_types = crimes_df['Primary Type'].unique()
crime_types = np.sort(crime_types)

for i, crime_type in enumerate(crime_types):
    nodes_list.append([crime_type_to_key(crime_type), # NodeID
                       'NodeLabel', # Property1
                       'CrimeType', # Value1
                       'Name', # Property2
                       crime_type, # Value2
                       '', # Property 3
                       '']) # Value3

# For case numbers
for index, row in crimes_df.iterrows():
    nodes_list.append([case_number_to_key(row['Case Number']), # NodeID
                       'NodeLabel', # Property1
                       'CaseNumber', # Value1
                       'Name', # Property2
                       row['Case Number'], # Value2
                       'Type', # Property3
                       crime_type_to_key(row['Primary Type']), # Value3
                       'Latitude', # Property4
                       row['Latitude'], # Value4
                       'Longitude', # Property5
                       row['Longitude'] ]) # Value5

# Create other nodes (based on criterias of indicators and so on)
# Based on hardship
nodes_list.append(["CR_DAN".format(), # NodeID
                   'NodeLabel', # Property1
                   'Category', # Value1
                   'Name', # Property2
                   'Dangereous', # Value2
                   'Comment', # Property3
                   'Bassed on hardship index', # Value3
                   np.NAN, # Property4
                   np.NAN, # Value4
                   np.NAN, # Property5
                   np.NAN]) # Value5

nodes_list.append(["CR_CMP".format(), # NodeID
                   'NodeLabel', # Property1
                   'Category', # Value1
                   'Name', # Property2
                   'Compliant', # Value2
                   'Comment', # Property3
                   'Bassed on the overall percentage of arrests', # Value3
                   np.NAN, # Property4
                   np.NAN, # Value4
                   np.NAN, # Property5
                   np.NAN]) # Value5


# Create the pandas DataFrame to save the results
nodes_df = pd.DataFrame(nodes_list, columns = headers)

end_time = time.time()
print("Creation of nodes data frame took {:.2f} sec".format(end_time - start_time))

nodes_df

Creation of nodes data frame took 921.34 sec


Unnamed: 0,NodeID,Property1,Value1,Property2,Value2,Property3,Value3,Property4,Value4,Property5,Value5
0,C1,NodeLabel,Community,Name,C1,,,,,,
1,C2,NodeLabel,Community,Name,C2,,,,,,
2,C3,NodeLabel,Community,Name,C3,,,,,,
3,C4,NodeLabel,Community,Name,C4,,,,,,
4,C5,NodeLabel,Community,Name,C5,,,,,,
5,C6,NodeLabel,Community,Name,C6,,,,,,
6,C7,NodeLabel,Community,Name,C7,,,,,,
7,C8,NodeLabel,Community,Name,C8,,,,,,
8,C9,NodeLabel,Community,Name,C9,,,,,,
9,C10,NodeLabel,Community,Name,C10,,,,,,


In [8]:
# Save nodes for further usage
if REDUCED_FILE:
    DATASET_NODES_PATH = DATASETS_LOCATION + "/timewise_nodes_reduced.csv"
else:
    DATASET_NODES_PATH = DATASETS_LOCATION + "/timewise_nodes.csv"

nodes_df.to_csv(DATASET_NODES_PATH, index = False)

### - Now, for the edges...

#### Relate communties with cases

In theory, this would never change, so we shouldn't modify this cell too much...

In [8]:
import time

# Log time
start_time = time.time()

# Array to store the edges
edges_list = []

headers = ["EdgeID", "FromNodeID", "ToNodeID",
           "Property1", "Value1", 
           "Property2", "Value2", 
           "Property3", "Value3"]

# Get the communities satisfying a given criteria
communities_criteria = indicators_df[indicators_df['HARDSHIP INDEX'] > 50]

# We iterate over the whole dataset to associate all the cases.
for index, row in crimes_df.iterrows():
    if index % 500000 == 0:
        print("Reached row {} ...".format(index))
    
    crime_timestamp = format_timestamp(row['Date'])
    crime_start_time, crime_end_time = roundupdown(row['Date'])

    edges_list.append(
        [
            "OCURRED_IN", # EdgeID
            case_number_to_key(row['Case Number']), # FromNodeID (A case)
            community_to_key(row['Community Area']), # ToNodeID (A community)
            "StartTime", # Property1
            str(crime_start_time), # Value1. TODO: save start of period we should define
            "EndTime", # Property2
            str(crime_end_time), # Value2. TODO: save end of period we should define
            "Timestamp", # Property3
            str(crime_timestamp) # Value3
        ]
    )

pd.DataFrame(edges_list, columns = headers)

edges_df = pd.DataFrame(edges_list, columns = headers)

end_time = time.time()
print("Creation of edges data frame took {:.2f} sec".format(end_time - start_time))

edges_df

Reached row 0 ...
Reached row 500000 ...
Reached row 1000000 ...
Reached row 1500000 ...
Reached row 2000000 ...
Reached row 2500000 ...
Reached row 3000000 ...
Reached row 3500000 ...
Reached row 4000000 ...
Reached row 4500000 ...
Reached row 5000000 ...
Reached row 5500000 ...
Creation of edges data frame took 1168.20 sec


Unnamed: 0,EdgeID,FromNodeID,ToNodeID,Property1,Value1,Property2,Value2,Property3,Value3
0,OCURRED_IN,CASE_HY189866,C25,StartTime,2015-03-18 19:00:00,EndTime,2015-03-18 19:59:59,Timestamp,2015-03-18 19:44:00
1,OCURRED_IN,CASE_HY190059,C67,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
2,OCURRED_IN,CASE_HY190052,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:45:00
3,OCURRED_IN,CASE_HY190054,C40,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:30:00
4,OCURRED_IN,CASE_HY189976,C25,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:00:00
5,OCURRED_IN,CASE_HY190032,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:00:00
6,OCURRED_IN,CASE_HY190047,C68,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
7,OCURRED_IN,CASE_HY189988,C38,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:35:00
8,OCURRED_IN,CASE_HY190020,C59,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:09:00
9,OCURRED_IN,CASE_HY189964,C49,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:25:00


#### Relate communities with categories (dangereous, poor, etc...)

In [9]:
# This cell appends the content. So, if you run it multiple times, you will add multiple times.
# Proceed carefully...

import time

# Log time
start_time = time.time()

# Get the communities satisfying a given criteria
# communities_criteria = indicators_df[indicators_df['HARDSHIP INDEX'] > 50]
communities_criteria = indicators_df

for index, row in communities_criteria.iterrows():
    edges_list.append(
        [
            "CONSIDERED", # EdgeID
            community_to_key(row['Community Area Number']), # FromNodeID (A community)
            "CR_DAN", # ToNodeID (A criteria node)
            "HardshipIndex", # Property1
            row['HARDSHIP INDEX'], # Value1
            np.NaN, # Property2
            np.NaN, # Value2
            np.NaN, # Property3
            np.NaN  # Value3
        ]
    )

edges_df = pd.DataFrame(edges_list, columns = headers)

end_time = time.time()
print("Append to edges data frame took {:.2f} sec".format(end_time - start_time))

edges_df
    

Append to edges data frame took 19.84 sec


Unnamed: 0,EdgeID,FromNodeID,ToNodeID,Property1,Value1,Property2,Value2,Property3,Value3
0,OCURRED_IN,CASE_HY189866,C25,StartTime,2015-03-18 19:00:00,EndTime,2015-03-18 19:59:59,Timestamp,2015-03-18 19:44:00
1,OCURRED_IN,CASE_HY190059,C67,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
2,OCURRED_IN,CASE_HY190052,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:45:00
3,OCURRED_IN,CASE_HY190054,C40,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:30:00
4,OCURRED_IN,CASE_HY189976,C25,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:00:00
5,OCURRED_IN,CASE_HY190032,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:00:00
6,OCURRED_IN,CASE_HY190047,C68,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
7,OCURRED_IN,CASE_HY189988,C38,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:35:00
8,OCURRED_IN,CASE_HY190020,C59,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:09:00
9,OCURRED_IN,CASE_HY189964,C49,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:25:00


#### Relate CrymeType with the communities, by temporality and strength

In [10]:
import time

# Log time
start_time = time.time()

all_years = crimes_df['Year'].unique()
print("Original", all_years, "has", crimes_df.shape[0], "items")

for year in all_years:
    # Subset for a given year
    subset = crimes_df.loc[crimes_df['Year'] == year]
    # Get statistics for the crime types by community area, for the current subset
    grouped = subset.groupby(['Community Area', 'Primary Type']).count()[['ID']]
    # This will give us the total number of crimes per community in the subset
    sum_communities = grouped.sum(level = 'Community Area')

    # Iterate each community in the year
    for community, total_crimes in sum_communities.iterrows():
        # Number of crimes in a community for the current year
        n_year_community = total_crimes[0] 
#         print("\n\nIndex:", index, "has total", total[0])
        for crime_type, count in grouped.loc[community].iterrows():
            edges_list.append(
                [
                    "HAS_OCURRENCES_OF", # EdgeID
                    community_to_key(community), # FromNodeID (A community)
                    crime_type_to_key(crime_type), # ToNodeID (A crime type)
                    "Strength", # Property1
                    count['ID']/n_year_community, # Value1
                    "StartTime", # Property2
                    "{}-01-01 00:00:00".format(year), # Value2
                    "EndTime", # Property3
                    "{}-12-31 23:59:59".format(year)  # Value3
                ]
            )


edges_df = pd.DataFrame(edges_list, columns = headers)

end_time = time.time()
print("Append to edges data frame took {:.2f} sec".format(end_time - start_time))

edges_df


Original [2015 2014 2008 2001 2016 2017 2006 2013 2009 2018 2011 2007 2010 2002 2012
 2003 2005 2004] has 5855655 items
Append to edges data frame took 20.39 sec


Unnamed: 0,EdgeID,FromNodeID,ToNodeID,Property1,Value1,Property2,Value2,Property3,Value3
0,OCURRED_IN,CASE_HY189866,C25,StartTime,2015-03-18 19:00:00,EndTime,2015-03-18 19:59:59,Timestamp,2015-03-18 19:44:00
1,OCURRED_IN,CASE_HY190059,C67,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
2,OCURRED_IN,CASE_HY190052,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:45:00
3,OCURRED_IN,CASE_HY190054,C40,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:30:00
4,OCURRED_IN,CASE_HY189976,C25,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:00:00
5,OCURRED_IN,CASE_HY190032,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:00:00
6,OCURRED_IN,CASE_HY190047,C68,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
7,OCURRED_IN,CASE_HY189988,C38,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:35:00
8,OCURRED_IN,CASE_HY190020,C59,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:09:00
9,OCURRED_IN,CASE_HY189964,C49,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:25:00


#### Now we include the compliant nodes

In [23]:
# This cell appends the content. So, if you run it multiple times, you will add multiple times.
# Proceed carefully...

import time
#edges_list_2 = edges_list.copy()
# edges_list_2 = []

# Log time
start_time = time.time()

headers = ["EdgeID", "FromNodeID", "ToNodeID",
           "Property1", "Value1", 
           "Property2", "Value2", 
           "Property3", "Value3"]

all_years = crimes_df['Year'].unique()

for year in all_years:
#     print(year)

    grouped = crimes_df[crimes_df['Year'] == year].groupby(['Community Area', 'Arrest']).count()[['ID']]
    # grouped

    flag = False
    false_count = 0
    true_count = 0
    rates = []
    for index, row in grouped.iterrows():
        if flag: # Meaning is the count for "true"
            true_count = row['ID']
            rate = true_count/(true_count + false_count)
    #         if rate >= 0.240443:
    #             print("Community {}. T vs F: {}-{}. Rate: {}".format(index[0], true_count, false_count, rate ))
            edges_list.append(
                [
                    "CONSIDERED", # EdgeID
                    community_to_key(index[0]), # FromNodeID (A community)
                    "CR_CMP", # ToNodeID (A crime type)
                    "Rate", # Property1
                    rate, # Value1
                    "Year", # Property2
                    year, # Value2
                    np.NaN, # Property3
                    np.NaN  # Value3
                ]
            )

    #         rates.append(rate)
        else:
            false_count = row['ID']
        flag = not flag


edges_df = pd.DataFrame(edges_list, columns = headers)

end_time = time.time()
print("Append to edges data frame took {:.2f} sec".format(end_time - start_time))

edges_df

Append to edges data frame took 28.96 sec


Unnamed: 0,EdgeID,FromNodeID,ToNodeID,Property1,Value1,Property2,Value2,Property3,Value3
0,OCURRED_IN,CASE_HY189866,C25,StartTime,2015-03-18 19:00:00,EndTime,2015-03-18 19:59:59,Timestamp,2015-03-18 19:44:00
1,OCURRED_IN,CASE_HY190059,C67,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
2,OCURRED_IN,CASE_HY190052,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:45:00
3,OCURRED_IN,CASE_HY190054,C40,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:30:00
4,OCURRED_IN,CASE_HY189976,C25,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:00:00
5,OCURRED_IN,CASE_HY190032,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:00:00
6,OCURRED_IN,CASE_HY190047,C68,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
7,OCURRED_IN,CASE_HY189988,C38,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:35:00
8,OCURRED_IN,CASE_HY190020,C59,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:09:00
9,OCURRED_IN,CASE_HY189964,C49,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:25:00


#### - This is for the cases to crime relationship

In [24]:
headers = ["EdgeID", "FromNodeID", "ToNodeID",
           "Property1", "Value1", 
           "Property2", "Value2", 
           "Property3", "Value3"]

start_time = time.time()

# For case numbers
for index, row in crimes_df.iterrows():
    edges_list.append(
        [
            "IS_OF_TYPE", # EdgeID
            case_number_to_key(row['Case Number']), # FromNodeID (A community)
            crime_type_to_key(row['Primary Type']), # ToNodeID (A crime type)
            np.NaN, # Property1
            np.NaN, # Value1
            np.NaN, # Property2
            np.NaN, # Value2
            np.NaN, # Property3
            np.NaN  # Value3
        ]
    )

edges_df = pd.DataFrame(edges_list, columns = headers)

end_time = time.time()
print("Append to edges data frame took {:.2f} sec".format(end_time - start_time))

edges_df

Append to edges data frame took 757.21 sec


Unnamed: 0,EdgeID,FromNodeID,ToNodeID,Property1,Value1,Property2,Value2,Property3,Value3
0,OCURRED_IN,CASE_HY189866,C25,StartTime,2015-03-18 19:00:00,EndTime,2015-03-18 19:59:59,Timestamp,2015-03-18 19:44:00
1,OCURRED_IN,CASE_HY190059,C67,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
2,OCURRED_IN,CASE_HY190052,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:45:00
3,OCURRED_IN,CASE_HY190054,C40,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:30:00
4,OCURRED_IN,CASE_HY189976,C25,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:00:00
5,OCURRED_IN,CASE_HY190032,C39,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:00:00
6,OCURRED_IN,CASE_HY190047,C68,StartTime,2015-03-18 23:00:00,EndTime,2015-03-18 23:59:59,Timestamp,2015-03-18 23:00:00
7,OCURRED_IN,CASE_HY189988,C38,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:35:00
8,OCURRED_IN,CASE_HY190020,C59,StartTime,2015-03-18 22:00:00,EndTime,2015-03-18 22:59:59,Timestamp,2015-03-18 22:09:00
9,OCURRED_IN,CASE_HY189964,C49,StartTime,2015-03-18 21:00:00,EndTime,2015-03-18 21:59:59,Timestamp,2015-03-18 21:25:00


In [47]:
pd.DataFrame(rates).describe()

Unnamed: 0,0
count,77.0
mean,0.245826
std,0.07105
min,0.115959
25%,0.194718
50%,0.240443
75%,0.280778
max,0.453269


In [25]:
# Save edges for further usage
if REDUCED_FILE:
    DATASET_EDGES_PATH = DATASETS_LOCATION + "/timewise_edges_reduced.csv"
else:
    DATASET_EDGES_PATH = DATASETS_LOCATION + "/timewise_edges.csv"

edges_df.to_csv(DATASET_EDGES_PATH, index = False)

In [21]:
#edges_df.groupby("EdgeID").count()