In [3]:
import os
import pandas as pd

def load_predictions_data(predictions_path=None):
    """Load the predictions CSV into a pandas DataFrame.

    Parameters
    ----------
    predictions_path : str or path-like, optional
        Location of the CSV file to read. When omitted, the file
        ``datasets/predictions_prod_smallest.csv`` (relative to the current
        working directory) is used, so existing zero-argument calls keep
        reading the same file as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    if predictions_path is None:
        # Default location used throughout this notebook.
        predictions_path = os.path.join("datasets", "predictions_prod_smallest.csv")
    return pd.read_csv(predictions_path)

In [5]:
# Read the predictions CSV once; later cells filter, preview and copy `data`.
data = load_predictions_data()

In [19]:
# Show only the rows that carry a (non-null) boarding status.
has_boarding_status = data['predictions_boarding_status'].notna()
data[has_boarding_status]

Unnamed: 0,predictions_id,predictions_trip_id,predictions_arrival_time,predictions_boarding_status,predictions_departure_time,predictions_stop_id,predictions_stop_sequence,predictions_stops_away,predictions_vehicle_event_id,predictions_file_timestamp,predictions_route_id,predictions_vehicle_id,predictions_direction_id
26,648188038,39988632-20:30-FKenmoreStMaryC,1.553864e+09,Stopped 6 stops away,1.553864e+09,70157,100,6.0,10213965,1553862833,Green-C,G-10076,0
69,615671268,39783408,1.553249e+09,Stopped 1 stop away,1.553249e+09,70040,10,1.0,9662499,1553249378,Blue,B-545C10BB,1
83,648081068,ADDED-1553782534,1.553862e+09,Stopped 4 stops away,1.553862e+09,70202,40,4.0,10212573,1553861692,Green-E,G-10139,0
84,648167117,ADDED-1553782534,1.553863e+09,Stopped 2 stops away,1.553863e+09,70202,40,2.0,10212573,1553862592,Green-E,G-10139,0
100,648193708,39988632-20:30-FKenmoreStMaryC,1.553864e+09,Stopped 5 stops away,1.553864e+09,70159,90,5.0,10213765,1553862894,Green-C,G-10076,0
101,648193709,39988632-20:30-FKenmoreStMaryC,1.553864e+09,Stopped 6 stops away,1.553864e+09,70157,100,6.0,10213965,1553862894,Green-C,G-10076,0
139,648199348,39988632-20:30-FKenmoreStMaryC,1.553864e+09,Stopped 5 stops away,1.553864e+09,70159,90,5.0,10213765,1553862952,Green-C,G-10076,0
140,648199349,39988632-20:30-FKenmoreStMaryC,1.553864e+09,Stopped 6 stops away,1.553864e+09,70157,100,6.0,10213965,1553862952,Green-C,G-10076,0
341,644223024,40132889-L,1.553793e+09,Stopped 3 stops away,1.553793e+09,70098,40,3.0,10148828,1553792572,Red,R-545C3042,1
343,644226980,40132889-L,1.553793e+09,Stopped 3 stops away,1.553793e+09,70098,40,3.0,10148828,1553792632,Red,R-545C3042,1


In [13]:
# Preview the first rows (DataFrame.head defaults to five) to see the raw columns.
data.head()

Unnamed: 0,predictions_id,predictions_trip_id,predictions_arrival_time,predictions_boarding_status,predictions_departure_time,predictions_stop_id,predictions_stop_sequence,predictions_stops_away,predictions_vehicle_event_id,predictions_file_timestamp,predictions_route_id,predictions_vehicle_id,predictions_direction_id
0,648187443,39783376,1553863000.0,,1553863000.0,70040,10,1.0,10212508,1553862833,Blue,B-545C365A,1
1,648187455,39988585-20:30-FKenmoreStMaryC,1553863000.0,,1553863000.0,70203,620,1.0,10212521,1553862833,Green-C,G-10142,1
2,648187514,40033949,1553864000.0,,1553864000.0,70007,30,8.0,10214673,1553862833,Orange,O-545C36A0,1
3,648187604,ADDED-1553782585,1553864000.0,,1553864000.0,70156,580,9.0,10214713,1553862833,Green-D,G-10152,1
4,648187732,40033948,1553863000.0,,1553863000.0,70005,20,2.0,10213047,1553862833,Orange,O-545C364E,1


In [20]:
# Keep: predictions_arrival_time, predictions_departure_time, predictions_stop_id (1-hot), predictions_direction_id
# Lose: predictions_id, predictions_trip_id, predictions_stop_sequence, predictions_vehicle_event_id, predictions_vehicle_id
# Maybe someday: predictions_stops_away, predictions_route_id (1-hot)
# Transform: is_stopped (from boarding status), predictions_file_timestamp (to get time of day / day of week)

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Remove columns that are not used as features.

    The identifier-like columns listed in ``_ALWAYS_DROPPED`` are removed on
    every call. ``predictions_stops_away`` and ``predictions_route_id`` are
    removed as well unless the matching ``include_*`` flag is set.

    Parameters
    ----------
    include_stops_away : bool, default False
        Keep ``predictions_stops_away`` in the output when True.
    include_route_id : bool, default False
        Keep ``predictions_route_id`` in the output when True.
    """

    # Columns removed regardless of the constructor flags.
    _ALWAYS_DROPPED = (
        "predictions_id",
        "predictions_trip_id",
        "predictions_stop_sequence",
        "predictions_vehicle_event_id",
        "predictions_vehicle_id",
    )

    def __init__(self, include_stops_away=False, include_route_id=False):
        self.include_stops_away = include_stops_away
        self.include_route_id = include_route_id

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the dropped columns; ``X`` is not modified.

        Raises
        ------
        KeyError
            If one of the ``_ALWAYS_DROPPED`` columns is missing from ``X``.
        """
        optional = []
        if not self.include_stops_away:
            optional.append("predictions_stops_away")
        if not self.include_route_id:
            optional.append("predictions_route_id")

        # Optional columns are dropped only when present, so frames that never
        # had them are still accepted, exactly as before the flags were honoured.
        to_drop = list(self._ALWAYS_DROPPED)
        to_drop.extend(col for col in optional if col in X.columns)
        return X.drop(columns=to_drop)

# Transform: is_stopped (from boarding status), predictions_file_timestamp (to get time of day / day of week)

class IsStoppedTransformer(BaseEstimator, TransformerMixin):
    """Swap the boarding-status column for a boolean ``is_stopped`` column.

    ``is_stopped`` is True on rows whose ``predictions_boarding_status`` is
    non-null and False otherwise. The transformer has no parameters and
    learns nothing in ``fit``.
    """

    def __init__(self):
        """No configuration is needed."""

    def fit(self, X, y=None):
        """Stateless: return ``self`` without inspecting the data."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``is_stopped`` set and the status column removed."""
        has_status = X['predictions_boarding_status'].notnull()
        # Work on a copy so the caller's frame is left untouched.
        flagged = X.copy()
        flagged['is_stopped'] = has_status
        return flagged.drop(columns='predictions_boarding_status')
# TODO: add a transformer for predictions_file_timestamp (time of day / day of week)

In [50]:
is_stopped_transform = IsStoppedTransformer()

# Create the working copy here so this cell runs on a fresh kernel instead of
# relying on a later cell having defined `new_data`.
new_data = data.copy()
is_stopped_transform.transform(new_data)

Unnamed: 0,predictions_id,predictions_trip_id,predictions_arrival_time,predictions_departure_time,predictions_stop_id,predictions_stop_sequence,predictions_stops_away,predictions_vehicle_event_id,predictions_file_timestamp,predictions_route_id,predictions_vehicle_id,predictions_direction_id,is_stopped
0,648187443,39783376,1.553863e+09,1.553863e+09,70040,10,1.0,10212508,1553862833,Blue,B-545C365A,1,False
1,648187455,39988585-20:30-FKenmoreStMaryC,1.553863e+09,1.553863e+09,70203,620,1.0,10212521,1553862833,Green-C,G-10142,1,False
2,648187514,40033949,1.553864e+09,1.553864e+09,70007,30,8.0,10214673,1553862833,Orange,O-545C36A0,1,False
3,648187604,ADDED-1553782585,1.553864e+09,1.553864e+09,70156,580,9.0,10214713,1553862833,Green-D,G-10152,1,False
4,648187732,40033948,1.553863e+09,1.553863e+09,70005,20,2.0,10213047,1553862833,Orange,O-545C364E,1,False
5,648187713,ADDED-1553782592,1.553863e+09,1.553863e+09,70151,130,4.0,10213114,1553862833,Green-C,G-10037,0,False
6,648187729,40132824-L,1.553863e+09,,70105,220,3.0,10213408,1553862833,Red,R-545C3658,0,False
7,615570917,ADDED-1552921638,,1.553249e+09,70260,1,20.0,9662483,1553247638,Green-E,G-10058,1,False
8,615573906,ADDED-1552921638,,1.553249e+09,70260,1,20.0,9662483,1553247698,Green-E,G-10058,1,False
9,615577063,ADDED-1552921638,,1.553249e+09,70260,1,19.0,9662483,1553247758,Green-E,G-10058,1,False


In [39]:
# Work on a copy so `data` keeps the values as loaded.
new_data = data.copy()
# The transformed frame is only displayed; the return value is not assigned.
is_stopped_transform.transform(new_data)

Unnamed: 0,predictions_id,predictions_trip_id,predictions_arrival_time,predictions_departure_time,predictions_stop_id,predictions_stop_sequence,predictions_stops_away,predictions_vehicle_event_id,predictions_file_timestamp,predictions_route_id,predictions_vehicle_id,predictions_direction_id,is_stopped
0,648187443,39783376,1.553863e+09,1.553863e+09,70040,10,1.0,10212508,1553862833,Blue,B-545C365A,1,False
1,648187455,39988585-20:30-FKenmoreStMaryC,1.553863e+09,1.553863e+09,70203,620,1.0,10212521,1553862833,Green-C,G-10142,1,False
2,648187514,40033949,1.553864e+09,1.553864e+09,70007,30,8.0,10214673,1553862833,Orange,O-545C36A0,1,False
3,648187604,ADDED-1553782585,1.553864e+09,1.553864e+09,70156,580,9.0,10214713,1553862833,Green-D,G-10152,1,False
4,648187732,40033948,1.553863e+09,1.553863e+09,70005,20,2.0,10213047,1553862833,Orange,O-545C364E,1,False
5,648187713,ADDED-1553782592,1.553863e+09,1.553863e+09,70151,130,4.0,10213114,1553862833,Green-C,G-10037,0,False
6,648187729,40132824-L,1.553863e+09,,70105,220,3.0,10213408,1553862833,Red,R-545C3658,0,False
7,615570917,ADDED-1552921638,,1.553249e+09,70260,1,20.0,9662483,1553247638,Green-E,G-10058,1,False
8,615573906,ADDED-1552921638,,1.553249e+09,70260,1,20.0,9662483,1553247698,Green-E,G-10058,1,False
9,615577063,ADDED-1552921638,,1.553249e+09,70260,1,19.0,9662483,1553247758,Green-E,G-10058,1,False


In [40]:
# Display the frame that was passed to transform().
# NOTE(review): the saved output below shows an `is_stopped` column next to
# `predictions_boarding_status`; the current transform() works on a copy, so
# this output looks stale from an earlier run. Re-run to confirm.
new_data

Unnamed: 0,predictions_id,predictions_trip_id,predictions_arrival_time,predictions_boarding_status,predictions_departure_time,predictions_stop_id,predictions_stop_sequence,predictions_stops_away,predictions_vehicle_event_id,predictions_file_timestamp,predictions_route_id,predictions_vehicle_id,predictions_direction_id,is_stopped
0,648187443,39783376,1.553863e+09,,1.553863e+09,70040,10,1.0,10212508,1553862833,Blue,B-545C365A,1,False
1,648187455,39988585-20:30-FKenmoreStMaryC,1.553863e+09,,1.553863e+09,70203,620,1.0,10212521,1553862833,Green-C,G-10142,1,False
2,648187514,40033949,1.553864e+09,,1.553864e+09,70007,30,8.0,10214673,1553862833,Orange,O-545C36A0,1,False
3,648187604,ADDED-1553782585,1.553864e+09,,1.553864e+09,70156,580,9.0,10214713,1553862833,Green-D,G-10152,1,False
4,648187732,40033948,1.553863e+09,,1.553863e+09,70005,20,2.0,10213047,1553862833,Orange,O-545C364E,1,False
5,648187713,ADDED-1553782592,1.553863e+09,,1.553863e+09,70151,130,4.0,10213114,1553862833,Green-C,G-10037,0,False
6,648187729,40132824-L,1.553863e+09,,,70105,220,3.0,10213408,1553862833,Red,R-545C3658,0,False
7,615570917,ADDED-1552921638,,,1.553249e+09,70260,1,20.0,9662483,1553247638,Green-E,G-10058,1,False
8,615573906,ADDED-1552921638,,,1.553249e+09,70260,1,20.0,9662483,1553247698,Green-E,G-10058,1,False
9,615577063,ADDED-1552921638,,,1.553249e+09,70260,1,19.0,9662483,1553247758,Green-E,G-10058,1,False


In [48]:
# Positional selection on a DataFrame goes through .iloc; plain [] treats the
# (slice, int) tuple as a column label and raises TypeError.
new_data.iloc[:, 1]