In [1]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[K     |████████████████████████████████| 720 kB 5.5 MB/s 
[K     |████████████████████████████████| 189 kB 35.9 MB/s 
[K     |████████████████████████████████| 46 kB 2.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 31.7 MB/s 
[K     |████████████████████████████████| 56 kB 2.2 MB/s 
[K     |████████████████████████████████| 51 kB 211 kB/s 
[?25hMounted at /content/gdrive


In [2]:
from fastai.tabular.all import *

In [3]:
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Read data

In [4]:
df = pd.read_csv('gdrive/MyDrive/DVA-project/data.csv', index_col=0)

  mask |= (ar1 == a)


In [5]:
df = df.sort_values(by=['station','half_day','date']).reset_index(drop=True)
df

Unnamed: 0,station,date,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT
0,1 Ave & E 110 St,2018-01-01,0,1.0,0.0,12,0.0,0.0
1,1 Ave & E 110 St,2018-01-02,0,6.0,2.0,18,0.0,0.0
2,1 Ave & E 110 St,2018-01-03,0,6.0,3.0,20,0.0,0.0
3,1 Ave & E 110 St,2018-01-04,0,2.0,1.0,25,0.0,0.0
4,1 Ave & E 110 St,2018-01-05,0,1.0,3.0,15,0.0,0.0
...,...,...,...,...,...,...,...,...
4395614,York St & Jay St,2021-09-26,1,0.0,0.0,69,848.0,14.0
4395615,York St & Jay St,2021-09-27,1,0.0,0.0,72,1372.0,16.0
4395616,York St & Jay St,2021-09-28,1,0.0,0.0,72,1112.0,13.0
4395617,York St & Jay St,2021-09-29,1,0.0,0.0,64,1092.0,13.0


In [6]:
#move up
df['y1_pickup'] = df.pickup[1:].reset_index(drop=True)
df['y2_dropoff'] = df.dropoff[1:].reset_index(drop=True)

In [7]:
df

Unnamed: 0,station,date,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT,y1_pickup,y2_dropoff
0,1 Ave & E 110 St,2018-01-01,0,1.0,0.0,12,0.0,0.0,6.0,2.0
1,1 Ave & E 110 St,2018-01-02,0,6.0,2.0,18,0.0,0.0,6.0,3.0
2,1 Ave & E 110 St,2018-01-03,0,6.0,3.0,20,0.0,0.0,2.0,1.0
3,1 Ave & E 110 St,2018-01-04,0,2.0,1.0,25,0.0,0.0,1.0,3.0
4,1 Ave & E 110 St,2018-01-05,0,1.0,3.0,15,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4395614,York St & Jay St,2021-09-26,1,0.0,0.0,69,848.0,14.0,0.0,0.0
4395615,York St & Jay St,2021-09-27,1,0.0,0.0,72,1372.0,16.0,0.0,0.0
4395616,York St & Jay St,2021-09-28,1,0.0,0.0,72,1112.0,13.0,0.0,0.0
4395617,York St & Jay St,2021-09-29,1,0.0,0.0,64,1092.0,13.0,0.0,0.0


In [62]:
#delete columns of date = 2021-9-30
df = df[df['date']!='2021-09-30'].reset_index(drop=True)
df

Unnamed: 0,station,date,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT,y1_pickup,y2_dropoff,y2_dropoff_pred
0,1 Ave & E 110 St,2018-01-01,0,1.0,0.0,12,0.0,0.0,6.0,2.0,2.968942
1,1 Ave & E 110 St,2018-01-02,0,6.0,2.0,18,0.0,0.0,6.0,3.0,4.670134
2,1 Ave & E 110 St,2018-01-03,0,6.0,3.0,20,0.0,0.0,2.0,1.0,5.127507
3,1 Ave & E 110 St,2018-01-04,0,2.0,1.0,25,0.0,0.0,1.0,3.0,4.545990
4,1 Ave & E 110 St,2018-01-05,0,1.0,3.0,15,0.0,0.0,1.0,0.0,4.187149
...,...,...,...,...,...,...,...,...,...,...,...
4392408,York St & Jay St,2021-09-25,1,0.0,0.0,69,798.0,15.0,0.0,0.0,3.522367
4392409,York St & Jay St,2021-09-26,1,0.0,0.0,69,848.0,14.0,0.0,0.0,3.607436
4392410,York St & Jay St,2021-09-27,1,0.0,0.0,72,1372.0,16.0,0.0,0.0,4.129452
4392411,York St & Jay St,2021-09-28,1,0.0,0.0,72,1112.0,13.0,0.0,0.0,3.878763


In [63]:
df2 = df.groupby(['station','half_day']).last()
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,date,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT,y1_pickup,y2_dropoff,y2_dropoff_pred
station,half_day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1 Ave & E 110 St,0,2021-09-29,30.0,23.0,64,1092.0,13.0,29.0,24.0,19.094231
1 Ave & E 110 St,1,2021-09-29,71.0,76.0,64,1092.0,13.0,53.0,57.0,54.549244
1 Ave & E 16 St,0,2021-09-29,128.0,80.0,64,1092.0,13.0,134.0,89.0,70.995361
1 Ave & E 16 St,1,2021-09-29,208.0,253.0,64,1092.0,13.0,239.0,283.0,196.075745
1 Ave & E 18 St,0,2021-09-29,133.0,84.0,64,1092.0,13.0,132.0,82.0,62.213551
...,...,...,...,...,...,...,...,...,...,...
Wythe Ave & Metropolitan Ave,1,2021-09-29,141.0,130.0,64,1092.0,13.0,159.0,154.0,119.213684
Yankee Ferry Terminal,0,2021-09-29,2.0,2.0,64,1092.0,13.0,1.0,2.0,7.260398
Yankee Ferry Terminal,1,2021-09-29,11.0,8.0,64,1092.0,13.0,3.0,3.0,22.048653
York St & Jay St,0,2021-09-29,0.0,0.0,64,1092.0,13.0,0.0,0.0,3.749078


In [10]:
# how to deal with stations that don't have 9-30, or in the middle

In [64]:
splits = RandomSplitter(valid_pct=0.2)(range_of(df))

In [65]:
to = TabularPandas(df, procs=[Categorify, FillMissing,Normalize],
                   cat_names = ['station', 'half_day'],
                   cont_names = ['pickup',	'dropoff',	'AVG_TEMP',	'CASE_COUNT',	'DEATH_COUNT'],
                   y_names='y1_pickup', #'y2_dropoff'],
                   splits=splits)

In [66]:
to.xs.iloc[:10]

Unnamed: 0,station,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT
1131896,414,1,-0.506676,-0.50838,-0.98167,-0.521128,-0.309265
8856,4,1,2.351309,2.327068,1.371623,-0.521128,-0.309265
3123578,1141,1,-0.506676,-0.50838,-1.039067,-0.521128,-0.309265
2335872,853,2,-0.506676,-0.50838,1.027239,-0.521128,-0.309265
2775017,1013,2,-0.280301,-0.250612,-1.039067,3.413874,0.770049
3398594,1241,1,-0.393489,-0.451098,-0.98167,3.604045,0.886732
2636893,963,1,0.738387,1.41056,1.084636,0.2548,-0.265509
2014703,736,1,-0.506676,-0.50838,-1.670438,1.0219,0.201221
3298522,1204,2,-0.506676,-0.50838,-0.063311,-0.073385,-0.265509
3762534,1374,1,-0.421786,-0.479739,1.084636,-0.521128,-0.309265


In [67]:
len(to)

4392413

In [68]:
dls = to.dataloaders(bs=64)

In [69]:
dls.show_batch()

Unnamed: 0,station,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT,y1_pickup
0,Freeman St & Reverend James A Polite Ave,1,3.863718e-07,3.003995e-07,54.0,1231.999988,104.0,0.0
1,41 St & 3 Ave,0,3.863718e-07,3.003995e-07,82.0,296.999992,8.0,0.0
2,Macon St & Nostrand Ave,1,17.0,16.0,43.0,4187.999866,57.0,6.0
3,Marion Ave & Mosholu Pkwy,0,3.863718e-07,3.003995e-07,53.0,620.999999,4.0,0.0
4,10 Ave & W 204 St,1,3.863718e-07,3.003995e-07,68.0,613.0,4.0,0.0
5,Waterloo Pl & Crotona Park East,1,3.863718e-07,3.003995e-07,69.0,279.999995,5.0,0.0
6,E 115 St & Lexington Ave,1,36.0,31.0,58.0,628.999999,21.0,27.0
7,Myrtle Ave & Grove St,0,3.863718e-07,3.003995e-07,76.999999,5e-06,-2.398975e-07,0.0
8,Fulton St & Broadway,0,11.0,21.0,40.0,3723.999941,85.0,6.0
9,Lexington Ave & E 24 St,1,3.863718e-07,3.003995e-07,42.0,5e-06,-2.398975e-07,0.0


In [70]:
learn = tabular_learner(dls, metrics=mse)

In [71]:
learn.loss_func

FlattenedLoss of MSELoss()

In [72]:
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(1604, 100)
    (1): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=108, out_features=200, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=1, bias=True)
    )
  )
)

In [73]:
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,mse,time
0,232.61142,222.283661,222.283661,12:52
1,209.002823,223.372757,223.372757,12:38
2,206.844131,362.45578,362.45578,12:37
3,215.71109,1273.441895,1273.441895,12:32
4,210.264374,2159.739258,2159.739258,12:29


In [74]:
learn.show_results()

Unnamed: 0,station,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT,y1_pickup,y1_pickup_pred
0,1078.0,2.0,-0.506676,-0.50838,1.543815,-0.521128,-0.309265,0.0,1.021559
1,1177.0,1.0,-0.506676,-0.50838,-0.52249,-0.521128,-0.309265,0.0,1.610842
2,864.0,2.0,-0.506676,-0.50838,0.797649,-0.521128,-0.309265,0.0,0.866056
3,439.0,1.0,-0.506676,-0.50838,-1.785233,-0.521128,-0.309265,0.0,1.325295
4,1283.0,1.0,-0.506676,-0.50838,0.56806,0.7218,1.995217,0.0,2.4283
5,132.0,1.0,-0.223708,-0.19333,-1.383451,-0.521128,-0.309265,5.0,8.982565
6,499.0,1.0,0.172449,-0.307894,0.625457,-0.521128,-0.309265,15.0,19.139715
7,1538.0,1.0,-0.506676,-0.50838,-0.350298,1.666233,3.030776,0.0,2.135983
8,1309.0,2.0,0.71009,0.866383,1.256828,-0.43527,-0.148827,30.0,33.178093


In [75]:
cd gdrive/MyDrive/DVA-project/

[Errno 2] No such file or directory: 'gdrive/MyDrive/DVA-project/'
/content/gdrive/MyDrive/DVA-project


In [76]:
learn.save('y1_pickup_pred')

Path('models/y1_pickup_pred.pth')

In [24]:
# test_df = df.copy()
# test_df.drop(['salary'], axis=1, inplace=True)
# dl = learn.dls.test_dl(test_df)

In [77]:
dls.valid

<fastai.tabular.core.TabDataLoader at 0x7f5840feae10>

In [82]:
valid_pred = learn.get_preds(dl=dls.valid)

In [83]:
pred, y = valid_pred

In [84]:
y

tensor([[  0.],
        [ 31.],
        [127.],
        ...,
        [  0.],
        [  0.],
        [  0.]])

In [85]:
pred

tensor([[  2.8202],
        [ 26.3870],
        [138.0936],
        ...,
        [  1.5113],
        [  1.5123],
        [  5.9565]])

In [87]:
df2 = df.drop(['date','y1_pickup',  'y2_dropoff'],axis=1)
learn.predict(df2.iloc[0])
dl = learn.dls.test_dl(df2)
y, pred = learn.get_preds(dl=dl)

In [88]:
y

tensor([[3.7907],
        [7.4753],
        [7.6230],
        ...,
        [2.3468],
        [2.7720],
        [2.2584]])

In [89]:
df['y1_pickup_pred'] = y.numpy()

In [90]:
df

Unnamed: 0,station,date,half_day,pickup,dropoff,AVG_TEMP,CASE_COUNT,DEATH_COUNT,y1_pickup,y2_dropoff,y2_dropoff_pred,y1_pickup_pred
0,1 Ave & E 110 St,2018-01-01,0,1.0,0.0,12,0.0,0.0,6.0,2.0,2.968942,3.790713
1,1 Ave & E 110 St,2018-01-02,0,6.0,2.0,18,0.0,0.0,6.0,3.0,4.670134,7.475292
2,1 Ave & E 110 St,2018-01-03,0,6.0,3.0,20,0.0,0.0,2.0,1.0,5.127507,7.623029
3,1 Ave & E 110 St,2018-01-04,0,2.0,1.0,25,0.0,0.0,1.0,3.0,4.545990,6.145617
4,1 Ave & E 110 St,2018-01-05,0,1.0,3.0,15,0.0,0.0,1.0,0.0,4.187149,5.828378
...,...,...,...,...,...,...,...,...,...,...,...,...
4392408,York St & Jay St,2021-09-25,1,0.0,0.0,69,798.0,15.0,0.0,0.0,3.522367,2.383316
4392409,York St & Jay St,2021-09-26,1,0.0,0.0,69,848.0,14.0,0.0,0.0,3.607436,2.443176
4392410,York St & Jay St,2021-09-27,1,0.0,0.0,72,1372.0,16.0,0.0,0.0,4.129452,2.346811
4392411,York St & Jay St,2021-09-28,1,0.0,0.0,72,1112.0,13.0,0.0,0.0,3.878763,2.772028


In [91]:
df.to_csv('y1_y2_pred.csv', index=False)

# Try with other models

In [30]:
X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_test, y_test = to.valid.xs, to.valid.ys.values.ravel()

In [31]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC, LinearSVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import Perceptron
# from sklearn.linear_model import SGDClassifier
# from sklearn.tree import DecisionTreeClassifier

In [32]:
model = SVC(dkjfdkjfkdf)
model.fit(X_train, y_train)

NameError: ignored