In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
#!pip install dask[dataframe]
#!pip install pandas --upgrade
# We have to do this as dask is bad at guessing the types of objects for some reason
# Also, if you are having trouble with running the dataframe and continuously get 
# Fatal Python error: GC object already tracked, try to update pandas
data_types = {
    'ID':  np.int64,
    'Case Number':  object,
    'Date': object,
    'Block': object,
    'IUCR': object,
    'Primary Type': object,
    'Description':  object,
    'Location Description': object,
    'Arrest':  bool,
    'Domestic': bool,
    'Beat': np.int64,
    'District': np.float64,
    'Ward': np.float64,
    'Community Area': np.float64,
    'FBI Code': object,
    'X Coordinate': np.float64,
    'Y Coordinate': np.float64,
    'Year': np.int64,
    'Updated On': object,
    'Latitude': np.float64,
    'Longitude': np.float64,
    'Location': object
}
df = pd.read_csv('crimes.csv')
print("finished loading")

In [None]:
import datetime
import locale

store = pd.HDFStore('store.h5')


data_subset = pd.DataFrame(df.subset(20000).Date)

# add columns
data_subset.insert(0, 'Hour', 0)
data_subset.insert(0, 'Day', 0)
data_subset.insert(0, 'Month', 0)
data_subset.insert(0, 'Year', 0)

# FORMAT : 06/25/2012 03:00:00 PM

# convert string date time to 24 hour (just hour)
# output as string
def to24Hour(x):
    hour = x[11:13]
    am = x[-2:] == 'AM'
    if(am):
        if(hour == '12'):
            hour = '00'
        else:
            return hour
    else:
        if(hour != '12'):
            hour = str(int(hour) + 12)

    return hour

# convert string to datetime object with only date
def toDate(x):
    month = x[0:2]
    date = x[3:5]
    year = x[6:10]
    return datetime.datetime(int(year), int(month), int(date))

def toMonth(x):
    return x[0:2]

def toYear(x):
    return x[6:10]
    
    
    


# convert date
locale.setlocale(locale.LC_ALL, '')
#datetime.datetime.strptime('06/25/2012 03:00:00 PM', '%x %X %p').strftime('%A')
#https://docs.python.org/2/library/datetime.html#strftime-strptime-behavior
data_subset['Hour'] = data_subset['Date'].apply(to24Hour)
data_subset['Day'] = data_subset['Date'].apply(lambda x: toDate(x).strftime('%w'))
data_subset['Month'] = data_subset['Date'].apply(toMonth)
data_subset['Year'] = data_subset['Date'].apply(toYear)

# save to HDF5
store['data_subset'] = data_subset
#retrieve with varName = store['data_subset']

store.close()

data_subset



In [4]:
#start from here if reading from .h5

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

store = pd.HDFStore('store.h5')

data_subset = store['data_subset']

print("finished loading")

In [5]:
hourCounts = data_subset.Hour.value_counts().sort_index()

X = np.arange(len(hourCounts))

plt.bar(X, hourCounts.values)

plt.xticks(X, hourCounts.axes[0])

plt.show()

In [13]:
dayCounts = data_subset.Day.value_counts().sort_index()

X = np.arange(len(dayCounts))

plt.bar(X, dayCounts.values, align='center')

plt.xticks(X, ['S', 'M', 'T', 'W', 'Th', 'F', 'Sa'])#

# axes = plt.gca()
# axes.set_ylim([800000,1000000])

plt.show()

In [39]:
def monthNormalize(monthCounts):
    i = 1
    for key in monthCounts:
        if(i in [1, 3, 5, 7, 8, 10, 12]):
            monthCounts[i-1] /= 31
        elif (i == 2):
            monthCounts[i-1] /= 28.25 #2001-16 has 4 leap febs and 12 non-leap febs
        else :
            monthCounts[i-1] /= 30 
        i += 1
            
    return monthCounts
    

monthCounts = data_subset.Month.value_counts().sort_index()

monthCounts = monthNormalize(monthCounts)

X = np.arange(len(monthCounts))

plt.bar(X, monthCounts.values, align='center')

plt.xticks(X,  ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

axes = plt.gca()
axes.set_ylim([14000,18000])

plt.show()

In [43]:
yearCounts = data_subset.Year.value_counts().sort_index()

X = np.arange(len(yearCounts))

plt.bar(X, yearCounts.values, align='center')

plt.xticks(X, yearCounts.axes[0].map(lambda x : x[-2:]))

plt.show()

In [91]:
rows_list = []

for i in range(2001, 2016):
    year = str(i)
    ser = pd.Series(data_subset.where(data_new['Year'] == year).Month.value_counts(), name=year)
    rows_list.append(ser)

counts_frame = pd.DataFrame(rows_list)

In [95]:
counts_frame

from sklearn import datasets, linear_model
from sklearn.cross_validation import train_test_split

indep_vars = ['01', '02', '03']
dep_vars = ['04', '05', '06', '07', '08', '09', '10', '11', '12']
indep_data = counts_frame[indep_vars]
dep_data = counts_frame[dep_vars]
indep_train, indep_test, dep_train, dep_test = train_test_split(indep_data, dep_data, test_size=0.33, random_state=42)

regr = linear_model.LinearRegression()
regr.fit(indep_train, dep_train)

regr_predict = regr.predict(indep_test)
regr_predict

array([[ 31448.73599254,  34413.04680637,  33971.64551268,  35259.0173458 ,
         35133.10770914,  33226.3101517 ,  33834.24449234,  30395.57687882,
         28255.56779079],
       [ 28386.64298305,  30364.83602264,  30086.60695081,  31231.05447456,
         31550.04169345,  29466.07001782,  29089.28420158,  26814.01724725,
         25033.57990024],
       [ 41018.97591303,  43932.87208923,  43579.87852796,  46229.31226952,
         45516.33513491,  43349.58438948,  44599.89605861,  39058.42986564,
         36330.60133221],
       [ 21476.79395302,  23181.72992809,  22895.38607591,  23206.94564897,
         23991.66202176,  21981.57310198,  20910.06910887,  20247.56670712,
         19014.11290828],
       [ 38562.46925811,  41365.43238911,  40981.94811215,  43867.68377597,
         42993.3031957 ,  40555.6307573 ,  41634.70454038,  36171.2865898 ,
         34252.53035922]])