In [1]:
import pandas as pd
import pickle
import sqlalchemy
import pymysql
import os
from urllib.parse import quote
from cleaningScript import cleanDatav2
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load variables from a local .env file (if present) into the process
# environment, then read the database password from it. The password is kept
# out of the notebook so it is never committed with the code.
load_dotenv()
password = os.getenv('DB_PASSWORD')
if password is None:
    # Fail here with a clear message; otherwise the next cell dies inside
    # urllib's quote() with an unrelated-looking TypeError.
    raise RuntimeError(
        "DB_PASSWORD is not set; define it in the environment or in a .env file"
    )

In [3]:
# Build the engine for the local `dublinbus` MySQL database (user `student`,
# PyMySQL driver). The password is percent-encoded before being placed in the
# URL so that reserved characters in it do not corrupt the connection string.
dbUrlTemplate = "mysql+pymysql://student:{}@localhost:3306/dublinbus"
DBengine = sqlalchemy.create_engine(dbUrlTemplate.format(quote(password)))

In [4]:
# Fetch the result of the `all_lines_directions` stored procedure into a
# DataFrame; its rows are later unpacked as (line, direction) pairs.
linesDirectionsQuery = 'call dublinbus.all_lines_directions();'
dfBase = pd.read_sql(sql=linesDirectionsQuery, con=DBengine)

In [5]:
# Flatten the query result into a plain list of [line, direction] rows so the
# training loop can unpack each entry directly.
lines = dfBase.values.tolist()

In [None]:
# Train one random-forest journey-time model per (line, direction) pair and
# pickle each fitted model as models/model<line>_<direction>.pkl.

# Number of trees per forest. The original cell passed an undefined name `m`
# as the first positional argument (n_estimators), which raised NameError
# before any model was fitted. 100 is scikit-learn's current default --
# TODO confirm the value that was actually intended.
N_ESTIMATORS = 100
# Fixed seed so that re-running the notebook reproduces the same forests.
RANDOM_STATE = 42
MODEL_DIR = 'models'

# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before the (long) training loop starts.
os.makedirs(MODEL_DIR, exist_ok=True)

for line, direction in lines:
    print('route:', line, direction)
    df = cleanDatav2(line, direction)
    print('loaded!')

    # Derive calendar / time-of-day features from the cleaned data.
    df['weekday'] = df.date.dt.weekday
    df['month'] = df.date.dt.month
    # Integer-divide by 3600 and wrap at 24 to get an hour-of-day bucket
    # (assumes stopActualArr is in seconds and may run past midnight --
    # TODO confirm against cleanDatav2).
    df['hour'] = (df.stopActualArr // 3600) % 24
    df = df.drop(columns=['stopActualArr', 'rain', 'temp', 'pressure', 'dwelltime'])

    # One-hot encode the three categorical features. drop_first=True drops the
    # first level of each, so the dummy columns depend on which months/hours/
    # weekdays occur in this route's data.
    for col in ('month', 'hour', 'weekday'):
        df[col] = df[col].astype('category')
    monthDummies = pd.get_dummies(df.month, prefix='m', drop_first=True)
    hourDummies = pd.get_dummies(df.hour, prefix='h', drop_first=True)
    dayDummies = pd.get_dummies(df.weekday, prefix='d', drop_first=True)
    df = pd.concat([df, monthDummies, hourDummies, dayDummies], axis=1)
    df = df.drop(columns=['month', 'hour', 'weekday'])

    # Everything except the target and the identifier/unused columns is a
    # model input.
    # NOTE(review): the X.head() preview for route 46A further down shows an
    # `index` column among the features; it is not excluded here either --
    # confirm whether that is intended.
    Xfeatures = df.columns[df.columns != 'journeytime']
    Xfeatures = Xfeatures.drop(['date', 'tripid', 'humidity'])
    X = df[Xfeatures]
    y = df.journeytime
    print('ready!')

    rfc = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
    rfc.fit(X, y)

    print('saving!')
    modelPath = os.path.join(MODEL_DIR, 'model{}_{}.pkl'.format(line, direction))
    with open(modelPath, 'wb') as file:
        pickle.dump(rfc, file, pickle.HIGHEST_PROTOCOL)

In [7]:
# Display the [line, direction] pairs that the training loop iterates over.
lines

[['14', 1],
 ['14', 2],
 ['77A', 1],
 ['77A', 2],
 ['25', 2],
 ['25', 1],
 ['46A', 2],
 ['46A', 1],
 ['39', 2],
 ['39', 1],
 ['43', 1],
 ['43', 2],
 ['16', 1],
 ['16', 2],
 ['16C', 2],
 ['76', 1],
 ['76', 2],
 ['27A', 2],
 ['27A', 1],
 ['32', 2],
 ['32', 1],
 ['40', 1],
 ['40', 2],
 ['39A', 2],
 ['39A', 1],
 ['25B', 2],
 ['25B', 1],
 ['25A', 1],
 ['25A', 2],
 ['151', 1],
 ['151', 2],
 ['25X', 2],
 ['15', 1],
 ['15', 2],
 ['29A', 2],
 ['29A', 1],
 ['31B', 2],
 ['31A', 1],
 ['31A', 2],
 ['31', 1],
 ['31', 2],
 ['65', 1],
 ['65', 2],
 ['7A', 1],
 ['7A', 2],
 ['7', 1],
 ['7', 2],
 ['47', 1],
 ['47', 2],
 ['33', 2],
 ['33', 1],
 ['33A', 2],
 ['13', 2],
 ['13', 1],
 ['66B', 1],
 ['66B', 2],
 ['26', 1],
 ['26', 2],
 ['66', 2],
 ['145', 1],
 ['145', 2],
 ['27B', 2],
 ['27B', 1],
 ['45A', 2],
 ['45A', 1],
 ['15B', 1],
 ['15B', 2],
 ['37', 2],
 ['37', 1],
 ['27', 1],
 ['27', 2],
 ['40B', 1],
 ['40B', 2],
 ['40D', 1],
 ['40D', 2],
 ['16C', 1],
 ['17A', 1],
 ['17A', 2],
 ['238', 2],
 ['238', 1],
 

# Print the feature-matrix head for implementation reference

In [9]:
# Rebuild the feature matrix for a single route (46A, direction 1) using the
# same feature steps as the training loop, without fitting a model, so the
# resulting columns can be inspected.
print('route:', "46A", "1")
df = cleanDatav2('46A', 1)
print('loaded!')

# Calendar and hour features derived from the cleaned columns.
df['weekday'] = df['date'].dt.weekday
df['month'] = df['date'].dt.month
df['hour'] = df['stopActualArr'] // 3600

# Drop the raw columns that are not model inputs (the identifier columns go
# here too, rather than being filtered out of the feature list later).
unusedColumns = [
    'stopActualArr', 'rain', 'temp', 'pressure', 'dwelltime',
    'date', 'tripid', 'humidity',
]
df = df.drop(columns=unusedColumns)

# Wrap the hour value at 24, then mark the three features as categorical.
df['hour'] = df['hour'] % 24
for col in ('month', 'hour', 'weekday'):
    df[col] = df[col].astype('category')

# One-hot encode each categorical feature, dropping its first level.
monthDummies = pd.get_dummies(df['month'], prefix='m', drop_first=True)
hourDummies = pd.get_dummies(df['hour'], prefix='h', drop_first=True)
dayDummies = pd.get_dummies(df['weekday'], prefix='d', drop_first=True)
df = pd.concat([df, monthDummies, hourDummies, dayDummies], axis=1)
df = df.drop(columns=['month', 'hour', 'weekday'])

# Split into inputs (every column but the target) and the target itself.
isFeature = df.columns != 'journeytime'
Xfeatures = df.columns[isFeature]
X = df[Xfeatures]
y = df['journeytime']
print("finished")

route: 46A 1
loaded!
finished


In [11]:
# Preview the first rows of the 46A feature matrix.
X.head()

Unnamed: 0,progrnumber,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,h_20,h_21,h_22,h_23,d_1,d_2,d_3,d_4,d_5,d_6
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Lift pandas' column-display limit so wide frames are rendered in full
# instead of being elided with "...".
pd.options.display.max_columns = None

In [11]:
# Preview the feature matrix again; with display.max_columns set to None all
# columns are shown.
X.head()

Unnamed: 0,index,progrnumber,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,h_6,h_7,h_8,h_9,h_10,h_11,h_12,h_13,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23,d_1,d_2,d_3,d_4,d_5,d_6
0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
