# Machine Learning Assessment

## 1. Initialisation

Load and parse the raw Seattle 911 calls data, caching the parsed table as a Parquet file.

[Performance](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#performance-considerations)

In [1]:
%%writefile init.py

import os
import pandas as pd
from time import process_time as timer

def calls_parser(fname) :
    """Load the raw Seattle 911 calls CSV and index it by local time.

    Parameters
    ----------
    fname : str
        Path of the raw CSV file. It must contain a 'Datetime' column whose
        values look like '01/31/2020 11:59:59 PM'.

    Returns
    -------
    pandas.DataFrame
        The rows of the CSV that have no missing value, indexed by a
        tz-aware ('US/Pacific') index named 'datetime' and sorted by it.
    """
    print('Loading raw Seattle 911 calls database from ' + fname)
    tim = timer()
    calls_df = pd.read_csv(fname)
    print('Raw Seattle 911 calls database loaded in ' + str(timer() - tim) + ' s')

    # Wall-clock times that are repeated (autumn) or skipped (spring) by a
    # daylight-saving change cannot be localised unambiguously. They are
    # turned into NaT so that the dropna below discards those rows instead
    # of a single bad timestamp aborting the whole import.
    calls_df['Datetime'] = pd.to_datetime(
        calls_df['Datetime'], format="%m/%d/%Y %I:%M:%S %p"
    ).dt.tz_localize(tz='US/Pacific', ambiguous='NaT', nonexistent='NaT')

    return (calls_df.dropna()
                    .set_index('Datetime')
                    .rename_axis('datetime')
                    .sort_index())


def init_calls() :
    """Return the parsed Seattle 911 calls, using a Parquet cache if present.

    When './tmp/calls_df.parquet' exists it is loaded directly. Otherwise the
    raw CSV './data/calls.csv' is parsed with calls_parser (after trying to
    download it through get_calls.sh when it is missing) and the result is
    written to the cache.

    Returns
    -------
    pandas.DataFrame
        The parsed calls table.

    Raises
    ------
    FileNotFoundError
        If the raw CSV is still missing after the download attempt.
    """
    calls_df_pt = './tmp/calls_df.parquet'

    # Standard-library call instead of shelling out to `mkdir`; harmless
    # when the directory already exists.
    os.makedirs('./tmp', exist_ok=True)

    if os.path.isfile(calls_df_pt) :
        print('Loading parsed Seattle 911 calls database from ' + str(calls_df_pt))
        tim = timer()
        calls_df = pd.read_parquet(calls_df_pt)
        print('Parsed Seattle 911 calls database loaded in ' + str(timer() - tim) + ' s')
    else :
        calls_pt = './data/calls.csv'

        if not os.path.exists(calls_pt) :
            print('Downloading missing raw Seattle 911 calls database to ' + calls_pt)
            os.system('cat get_calls.sh | sh')
            # Fail here, with a message naming the real cause, rather than
            # letting the parser report a missing file further down.
            if not os.path.exists(calls_pt) :
                raise FileNotFoundError('Download failed: ' + calls_pt +
                                        ' is still missing after running get_calls.sh')

        calls_df = calls_parser(calls_pt)

        print('Saving parsed Seattle 911 calls database to ' + calls_df_pt)
        tim = timer()
        calls_df.to_parquet(calls_df_pt)
        print('Parsed Seattle 911 calls database saved in ' + str(timer() - tim) + ' s')

    return calls_df


Overwriting init.py


Load and parse the Seattle weather data, caching the parsed table as a Parquet file.

In [2]:
%%writefile -a init.py

def weather_parser(fname) :
    """Read a raw weather CSV and index it by local ('US/Pacific') time.

    Parameters
    ----------
    fname : str
        Path of the CSV file. Its 'dt' column is interpreted as a number of
        seconds since the Unix epoch (UTC).

    Returns
    -------
    pandas.DataFrame
        Every column of the CSV, indexed by a tz-aware index named
        'datetime'. Rows are kept in file order; duplicates are not removed.
    """
    print('Loading raw Seattle weather database from ' + fname)
    t_start = timer()
    raw = pd.read_csv(fname)
    print('Raw Seattle weather database loaded in ' + str(timer() - t_start) + ' s')

    # Epoch seconds -> UTC timestamps -> Seattle local time.
    utc_stamps = pd.to_datetime(raw['dt'], unit='s').dt.tz_localize(tz='UTC')
    raw['datetime'] = utc_stamps.dt.tz_convert('US/Pacific')

    return raw.set_index('datetime')


def init_weather() :
    """Return the parsed Seattle weather table, using a Parquet cache if present.

    When './tmp/wtr_df.parquet' exists it is loaded directly. Otherwise the
    raw CSV is parsed with weather_parser, rows whose timestamp was already
    seen are dropped (the first occurrence is kept) and the result is
    written to the cache.

    Returns
    -------
    pandas.DataFrame
        The parsed weather table indexed by local time.
    """
    wtr_df_pt = './tmp/wtr_df.parquet'

    # Standard-library call instead of shelling out to `mkdir`; harmless
    # when the directory already exists.
    os.makedirs('./tmp', exist_ok=True)

    if os.path.isfile(wtr_df_pt) :
        print('Loading parsed Seattle weather database from ' + str(wtr_df_pt))
        tim = timer()
        wtr_df = pd.read_parquet(wtr_df_pt)
        print('Parsed Seattle weather database loaded in ' + str(timer() - tim) + ' s')
    else :
        wtr_df = weather_parser('./data/Seattle Weatherdata 2002 to 2020.csv')

        # Keep a single row per timestamp so that later index joins stay
        # one-to-one.
        wtr_df = wtr_df[~wtr_df.index.duplicated()]

        print('Saving parsed Seattle weather database to ' + wtr_df_pt)
        tim = timer()
        wtr_df.to_parquet(wtr_df_pt)
        print('Parsed Seattle weather database saved in ' + str(timer() - tim) + ' s')

    return wtr_df


Appending to init.py


In [3]:
%%writefile -a init.py

def feature_parser(wtr_df) :
    """Build the model feature table from a parsed weather table.

    Parameters
    ----------
    wtr_df : pandas.DataFrame
        Weather table with a DatetimeIndex and at least the columns 'temp',
        'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
        'wind_deg' and 'weather_id'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in the same order, with the calendar columns
        'year', 'week', 'day' (ISO calendar) and 'hour' followed by the
        weather columns listed above. The index is named 'datetime'.
    """
    weather_cols = ['temp', 'temp_min', 'temp_max', 'pressure', 'humidity',
                    'wind_speed', 'wind_deg', 'weather_id']

    # rename() returns a new Index, so the caller's index keeps its name
    # (renaming the index of a column slice in place can leak to the input).
    idx = wtr_df.index.rename('datetime')

    x = wtr_df[weather_cols].copy()
    x.index = idx

    x_tim = idx.isocalendar()
    x_tim['hour'] = idx.hour

    # Both frames carry the very same index, so concatenating column-wise
    # keeps the rows aligned one-to-one. An index join would multiply the
    # rows of every timestamp that occurs more than once.
    return pd.concat([x_tim, x], axis=1)


def y_parser(calls_df) :
    """Count the incidents of every hour.

    Parameters
    ----------
    calls_df : pandas.DataFrame
        Calls table with a DatetimeIndex and an 'Incident Number' column.

    Returns
    -------
    pandas.DataFrame
        A single 'incident_count' column holding the number of non-missing
        incident numbers in each hourly bin (0 for empty hours), indexed by
        the start of the bin under the name 'datetime'.
    """
    # 'h' is the supported hourly alias; the upper-case 'H' spelling has
    # been deprecated by pandas.
    hourly = calls_df['Incident Number'].resample('h').count()
    return hourly.to_frame('incident_count').rename_axis('datetime')


def init() :
    """Return the machine-learning data set as a (features, target) pair.

    When './tmp/xy_df.parquet' exists it is loaded directly. Otherwise the
    hourly incident counts (y_parser) are joined on their index with the
    weather features (feature_parser), incomplete rows are dropped and the
    result is written to the cache.

    Returns
    -------
    tuple
        (x, y) where y is the first column of the joined table (the hourly
        'incident_count') and x holds all remaining columns.
    """
    xy_df_pt = './tmp/xy_df.parquet'

    # Standard-library call instead of shelling out to `mkdir`; harmless
    # when the directory already exists.
    os.makedirs('./tmp', exist_ok=True)

    if os.path.isfile(xy_df_pt) :
        print('Loading ML database from ' + str(xy_df_pt))
        tim = timer()
        xy_df = pd.read_parquet(xy_df_pt)
        print('ML database loaded in ' + str(timer() - tim) + ' s')
    else :
        calls_df = init_calls()
        wtr_df = init_weather()

        x_raw = feature_parser(wtr_df)
        y_raw = y_parser(calls_df)

        # The target goes first so that the column split of the return
        # statement can rely on positions.
        xy_df = y_raw.join(x_raw).dropna()

        # NOTE: drop_duplicates compares column values only, the index is
        # not taken into account.
        xy_df = xy_df.drop_duplicates()

        print('Saving ML database to ' + xy_df_pt)
        tim = timer()
        xy_df.to_parquet(xy_df_pt)
        print('ML database saved in ' + str(timer() - tim) + ' s')

    return xy_df.iloc[:, 1:], xy_df.iloc[:, 0]


Appending to init.py


## 2. Preprocessing

In [4]:
%%writefile prep.py

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
import sklearn.preprocessing as skl_prep
from sklearn.model_selection import TimeSeriesSplit

def fwd_splitter(n_spi=2, tr_week=261, te_week=52) :
    """Build a TimeSeriesSplit whose window sizes are expressed in weeks.

    Parameters
    ----------
    n_spi : int
        Number of splits.
    tr_week : int
        Maximum size of a training window, in weeks.
    te_week : int
        Size of a test window, in weeks.

    Returns
    -------
    sklearn.model_selection.TimeSeriesSplit
        Splitter whose sizes are the week counts multiplied by 7*24
        (presumably one sample per hour; confirm against the data set).
    """
    train_window = tr_week*7*24
    test_window = te_week*7*24
    return TimeSeriesSplit(
        n_splits=n_spi,
        max_train_size=train_window,
        test_size=test_window,
    )


Overwriting prep.py


In [5]:
%%writefile -a prep.py

def nai_splitter() :
    """Return three fixed blocks of negative positional indices.

    The indices count back from the end of the data set (presumably one row
    per hour; confirm against the caller).

    Returns
    -------
    tuple of numpy.ndarray
        (training, evaluation 1, evaluation 2) where
        evaluation 1 covers the last 365*24 positions,
        training covers the (365*5+1)*24 positions right before it, and
        evaluation 2 covers the 365*24 positions right before training.
    """
    year_len = 365*24
    train_len = (365*5+1)*24

    eval1_start = -year_len
    train_start = eval1_start - train_len
    eval2_start = train_start - year_len

    train_idx = np.arange(train_start, eval1_start)
    eval1_idx = np.arange(eval1_start, 0)
    eval2_idx = np.arange(eval2_start, train_start)
    return train_idx, eval1_idx, eval2_idx


Appending to prep.py


In [6]:
%%writefile -a prep.py

def prep_ppl() :
    """Return the preprocessing pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A pipeline with a single step, 'std_scl', holding a StandardScaler.
    """
    steps = [('std_scl', skl_prep.StandardScaler())]
    return Pipeline(steps)


Appending to prep.py


## 3. Modelling

In [7]:
%%writefile model.py

from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

import prep

def model_ppl(rseed=0) :
    """Return the full modelling pipeline.

    Parameters
    ----------
    rseed : int
        Value passed as random_state to the regressor.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: 'preprocessor' (the pipeline built by prep.prep_ppl)
        followed by 'regressor' (a GradientBoostingRegressor).
    """
    preprocessor = prep.prep_ppl()
    regressor = GradientBoostingRegressor(random_state=rseed)
    return Pipeline([('preprocessor', preprocessor), ('regressor', regressor)])
    

Overwriting model.py


## 4. Training

[parameters](https://stackoverflow.com/a/49501713/2682621)

In [8]:
%%writefile train.py

import os
from sklearn.model_selection import GridSearchCV
from time import process_time as timer
import joblib

import init
import prep
import model

def train() :
    """Return the fitted grid search, training it only when no dump exists.

    When './tmp/reg.pkl' exists it is loaded with joblib. Otherwise a
    GridSearchCV over the pipeline of model.model_ppl is fitted on the
    training positions given by prep.nai_splitter and dumped to that path.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    # Standard-library call instead of shelling out to `mkdir`; harmless
    # when the directory already exists.
    os.makedirs('./tmp', exist_ok=True)

    train_pt = './tmp/reg.pkl'
    if os.path.isfile(train_pt) :
        # NOTE: joblib.load unpickles arbitrary objects. Only load dumps
        # written by this project, never a file from an untrusted source.
        reg = joblib.load(train_pt)
        print('Regressor loaded from ' + train_pt)
    else:
        x, y = init.init()
        # Only the training positions are needed here; the two evaluation
        # blocks are left untouched.
        tr_ind, _, _ = prep.nai_splitter()

        # Keys follow the '<pipeline step>__<parameter>' convention.
        p_grid = {
            'regressor__learning_rate': [.5, .25, .1, .05, .01],
            'regressor__n_estimators': [25, 50, 100, 200, 400],
            'regressor__max_depth': [3, 4, 5]
        }

        # NOTE(review): no `cv` is given, so the library's default splitter
        # is used; it is not aware of the time ordering of the rows.
        # Consider passing prep.fwd_splitter() - TODO confirm.
        reg = GridSearchCV(estimator=model.model_ppl(),
                           param_grid=p_grid,
                           n_jobs=-1,
                           verbose=2)

        print('Cross validating by Grid Search')
        reg.fit(x.iloc[tr_ind], y.iloc[tr_ind])

        joblib.dump(reg, train_pt)
        print('Regressor dumped to ' + train_pt)

    return reg


Overwriting train.py


## 5. Main

In [9]:
%%writefile main.py

import sys

import init
import train

def main() :
    """Predict hourly incident counts for a weather file.

    Usage: python main.py input.csv [calls.csv]

    The predictions for the weather rows of input.csv are written to
    'output.csv'. When calls.csv is given as well, the regressor is scored
    against the hourly incident counts derived from it.
    """
    # Local import: this script does not import pandas at module level.
    import pandas as pd

    if len(sys.argv) == 1 or len(sys.argv) > 3 :
        print('usage: python main.py input.csv [calls.csv]')
        return

    wtr_new = init.weather_parser(sys.argv[1])
    x_new = init.feature_parser(wtr_new)
    reg = train.train()

    # predict() returns a plain array, which has no to_csv method: wrap it
    # in a Series that carries the timestamps of the feature rows.
    y_pred = pd.Series(reg.predict(x_new), index=x_new.index, name='incident_count')
    y_pred.to_csv('output.csv')

    if len(sys.argv) == 3 :
        calls_new = init.calls_parser(sys.argv[2])
        # Same construction as the training set: target first, joined with
        # the features on the timestamp, incomplete rows dropped.
        xy_new = init.y_parser(calls_new).join(x_new).dropna()
        print('Score: ' + str(reg.score(xy_new.iloc[:, 1:], xy_new.iloc[:, 0])))

if __name__ == '__main__':
    main()

Writing main.py
