# Bixi Data Mining Project

Students: Quan Hao, 11248609; Gabriel Lainesse, 11189782; Chaoyang Zheng, 11249259 

Course: Data Mining Techniques

# Data Pre-Processing

Importing, appending and merging data. Performing some feature engineering as well as type conversions and data cleaning.

## Preparing the Data

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline
import plotly
from types import *
import math

### Importing and Processing Data

In [2]:
# Setting up path variables
try:
    # A previous run in this session already recorded the notebook folder:
    # go back to it so the relative data paths below resolve from there.
    os.chdir(notebook_path)
except NameError:
    # First run in this session: the current directory is taken as the notebook folder.
    notebook_path = os.path.abspath(os.getcwd())

# Absolute locations of each data folder, resolved against the current directory
(path_bixi_data,
 path_weather_data,
 path_weather_data2,
 path_mtl_data,
 path_festival_data,
 path_fuel_data) = map(os.path.abspath, (r"Data/Bixi/",
                                         r"Data/Weather",
                                         r"Data/Weather2",
                                         r"Data/Montreal",
                                         r"Data/Festival",
                                         r"Data/Fuel"))


#### Bixi Stations

In [3]:
# Import yearly Bixi Stations tables
# The CSV paths are relative, so move into the Bixi data folder first.
os.chdir(path_bixi_data)
(stations_2014,
 stations_2015,
 stations_2016,
 stations_2017,
 stations_2018) = (
    pd.read_csv(stations_csv)
    for stations_csv in (
        r"Modified Stations Data/Stations_2014.csv",
        r"Modified Stations Data/Stations_2015.csv",
        r"Modified Stations Data/Stations_2016.csv",
        r"Modified Stations Data/Stations_2017.csv",
        r"Modified Stations Data/Stations_2018.csv",
    )
)

In [4]:
stations_2014.head()

Unnamed: 0.1,Unnamed: 0,code,name,latitude,longitude,neighborhood,great_park,affectation
0,0,6209,Milton / Clark,45.51252,-73.57062,Le Plateau-Mont-Royal,,mixte
1,1,6436,Côte St-Antoine / Clarke,45.486452,-73.595234,Westmount,,
2,2,6214,Square St-Louis,45.51735,-73.56906,Le Plateau-Mont-Royal,,residentiel
3,3,6248,St-Dominique / Rachel,45.518593,-73.581566,Le Plateau-Mont-Royal,,mixte
4,4,6164,Chambord / Laurier,45.532955,-73.584194,Le Plateau-Mont-Royal,,residentiel


### Merging the Bixi Bike Rental Activity with the Stations Data and Appending them together.

#### Bixi Bike Rental Activity

In [5]:
# Define the Mass Append Table Function:
def mass_append(df_list):
    """
    mass_append(df_list)

    Description:
        This function appends together all DataFrames within a list, in the
        order given, and returns the result as one new DataFrame. The row
        index of each input is kept as-is (rows are not renumbered).

    Argument(s):
        df_list : List (or any other iterable) of pandas DataFrame objects.

    Returns:
        A pandas DataFrame holding the rows of every input, or an empty
        DataFrame when df_list contains nothing.
    """
    frames = list(df_list)
    if not frames:
        # pd.concat raises ValueError on an empty sequence; keep returning
        # an empty DataFrame in that case.
        return pd.DataFrame()

    # One concat call instead of repeated DataFrame.append: append was removed
    # in pandas 2.0 and re-copied every accumulated row on each iteration.
    return pd.concat(frames)

In [6]:
# Import Bixi Bike Rental Activity for each year
# List the Bixi data folder once and explicitly, instead of relying on whatever
# working directory an earlier cell left behind. Sorting makes the order in
# which the files are later appended reproducible (os.listdir order is arbitrary).
# The entries are bare file names, so they are still read relative to that folder.
_bixi_dir_entries = sorted(os.listdir(path_bixi_data))

# For each year, store filenames in a list
files_rentals_2014 = [x for x in _bixi_dir_entries if "OD_2014" in x]
files_rentals_2015 = [x for x in _bixi_dir_entries if "OD_2015" in x]
files_rentals_2016 = [x for x in _bixi_dir_entries if "OD_2016" in x]
files_rentals_2017 = [x for x in _bixi_dir_entries if "OD_2017" in x]
files_rentals_2018 = [x for x in _bixi_dir_entries if "OD_2018" in x]

# Pair each year's trip files with that year's stations table
files_rentals_all = [(files_rentals_2014, stations_2014),
                     (files_rentals_2015, stations_2015),
                     (files_rentals_2016, stations_2016),
                     (files_rentals_2017, stations_2017),
                     (files_rentals_2018, stations_2018)]

In [7]:
# Changing current directory to bixi data (the trip file names are relative to it)
os.chdir(path_bixi_data)

# Build the 2014 trip table in a single pipeline:
#   1. read every 2014 trip history csv file and append them together,
#   2. left-join the stations table on the starting station and prefix its columns with 'start_',
#   3. left-join the stations table again on the ending station and prefix its columns with 'end_'.
df_rentals_2014 = (
    mass_append([pd.read_csv(trip_csv) for trip_csv in files_rentals_2014])
    .merge(stations_2014, how='left',
           left_on='start_station_code', right_on='code')
    .rename(columns={'code': 'start_code',
                     'name': 'start_name',
                     'latitude': 'start_latitude',
                     'longitude': 'start_longitude',
                     'neighborhood': 'start_neighborhood',
                     'great_park': 'start_great_park',
                     'affectation': 'start_affectation'})
    .merge(stations_2014, how='left',
           left_on='end_station_code', right_on='code')
    .rename(columns={'code': 'end_code',
                     'name': 'end_name',
                     'latitude': 'end_latitude',
                     'longitude': 'end_longitude',
                     'neighborhood': 'end_neighborhood',
                     'great_park': 'end_great_park',
                     'affectation': 'end_affectation'})
)

In [None]:
# 2015 trips: append the yearly trip files, then left-join the 2015 stations table
# twice -- once on the starting station ('start_' columns), once on the ending
# station ('end_' columns).
df_rentals_2015 = (
    mass_append([pd.read_csv(trip_csv) for trip_csv in files_rentals_2015])
    .merge(stations_2015, how='left',
           left_on='start_station_code', right_on='code')
    .rename(columns={'code': 'start_code',
                     'name': 'start_name',
                     'latitude': 'start_latitude',
                     'longitude': 'start_longitude',
                     'neighborhood': 'start_neighborhood',
                     'great_park': 'start_great_park',
                     'affectation': 'start_affectation'})
    .merge(stations_2015, how='left',
           left_on='end_station_code', right_on='code')
    .rename(columns={'code': 'end_code',
                     'name': 'end_name',
                     'latitude': 'end_latitude',
                     'longitude': 'end_longitude',
                     'neighborhood': 'end_neighborhood',
                     'great_park': 'end_great_park',
                     'affectation': 'end_affectation'})
)

In [None]:
# 2016 trips: append the yearly trip files, then left-join the 2016 stations table
# twice -- once on the starting station ('start_' columns), once on the ending
# station ('end_' columns).
df_rentals_2016 = (
    mass_append([pd.read_csv(trip_csv) for trip_csv in files_rentals_2016])
    .merge(stations_2016, how='left',
           left_on='start_station_code', right_on='code')
    .rename(columns={'code': 'start_code',
                     'name': 'start_name',
                     'latitude': 'start_latitude',
                     'longitude': 'start_longitude',
                     'neighborhood': 'start_neighborhood',
                     'great_park': 'start_great_park',
                     'affectation': 'start_affectation'})
    .merge(stations_2016, how='left',
           left_on='end_station_code', right_on='code')
    .rename(columns={'code': 'end_code',
                     'name': 'end_name',
                     'latitude': 'end_latitude',
                     'longitude': 'end_longitude',
                     'neighborhood': 'end_neighborhood',
                     'great_park': 'end_great_park',
                     'affectation': 'end_affectation'})
)

In [7]:
# 2017 trips: append the yearly trip files, then left-join the 2017 stations table
# twice -- once on the starting station ('start_' columns), once on the ending
# station ('end_' columns).
df_rentals_2017 = (
    mass_append([pd.read_csv(trip_csv) for trip_csv in files_rentals_2017])
    .merge(stations_2017, how='left',
           left_on='start_station_code', right_on='code')
    .rename(columns={'code': 'start_code',
                     'name': 'start_name',
                     'latitude': 'start_latitude',
                     'longitude': 'start_longitude',
                     'neighborhood': 'start_neighborhood',
                     'great_park': 'start_great_park',
                     'affectation': 'start_affectation'})
    .merge(stations_2017, how='left',
           left_on='end_station_code', right_on='code')
    .rename(columns={'code': 'end_code',
                     'name': 'end_name',
                     'latitude': 'end_latitude',
                     'longitude': 'end_longitude',
                     'neighborhood': 'end_neighborhood',
                     'great_park': 'end_great_park',
                     'affectation': 'end_affectation'})
)

In [66]:
# 2018 trips: append the yearly trip files, then left-join the 2018 stations table
# twice -- once on the starting station ('start_' columns), once on the ending
# station ('end_' columns).
df_rentals_2018 = (
    mass_append([pd.read_csv(trip_csv) for trip_csv in files_rentals_2018])
    .merge(stations_2018, how='left',
           left_on='start_station_code', right_on='code')
    .rename(columns={'code': 'start_code',
                     'name': 'start_name',
                     'latitude': 'start_latitude',
                     'longitude': 'start_longitude',
                     'neighborhood': 'start_neighborhood',
                     'great_park': 'start_great_park',
                     'affectation': 'start_affectation'})
    .merge(stations_2018, how='left',
           left_on='end_station_code', right_on='code')
    .rename(columns={'code': 'end_code',
                     'name': 'end_name',
                     'latitude': 'end_latitude',
                     'longitude': 'end_longitude',
                     'neighborhood': 'end_neighborhood',
                     'great_park': 'end_great_park',
                     'affectation': 'end_affectation'})
)

## Changing Final Output to either the full dataset or just one year:

Due to performance constraints, we chose to build our model using only one year of the dataset. The two cells below allow us to either export everything, or export only a single year:

In [8]:
# Export only one year. Uncomment to perform, comment out to prevent execution.
df_rentals = df_rentals_2017

In [23]:
# Export all data. Uncomment to perform, comment out to prevent execution.
#df_rentals = mass_append([df_rentals_2014, df_rentals_2015, df_rentals_2016, 
#                       df_rentals_2017, df_rentals_2018])

In [None]:
# Deleting non-merged DataFrames to save memory.
# Each name is removed independently, so a year that was never loaded (its
# cell was not run) does not prevent the remaining names from being removed.
# The previous try/except used `continue` outside of a loop, which is a
# SyntaxError, so the cell could not run at all.
# Note: the year assigned to `df_rentals` stays alive through that name.
for _stale_name in ('df_rentals_2014',
                    'df_rentals_2015',
                    'df_rentals_2016',
                    'df_rentals_2017',
                    'df_rentals_2018',
                    'stations_2014',
                    'stations_2015',
                    'stations_2016',
                    'stations_2017',
                    'stations_2018'):
    globals().pop(_stale_name, None)
del _stale_name

## Data Processing

#### Data Type Adjustments

In [9]:
# Coercing start_date and end_date to datetime type.
df_rentals['start_date'] = pd.to_datetime(df_rentals['start_date'])
df_rentals['end_date'] = pd.to_datetime(df_rentals['end_date'])

In [10]:
# Creating two new variables containing the hour of the day (0-23) extracted from 'start_date' and 'end_date'
# in order to create categories for the time of the day
df_rentals = df_rentals.assign(start_date_hour = df_rentals['start_date'].dt.hour)
df_rentals = df_rentals.assign(end_date_hour = df_rentals['end_date'].dt.hour)

# Importing and merging Weather Data

### First version of weather data (from Weatherstats.ca)

In [11]:
# Importing the first version of weather data
os.chdir(path_weather_data)
weather = pd.read_csv("weatherstats_montreal_hourly.csv", parse_dates=["date_time_local"])

### Second version of weather data (from Kaggle)

In [None]:
os.chdir(path_weather_data2)

**Weather descriptions**

In [14]:
# Import Weather descriptions and keep only Montreal
w2_desc = pd.read_csv("weather_description.csv", parse_dates=['datetime'])
w2_desc = w2_desc.filter(axis=1,items=['datetime','Montreal'])
w2_desc = w2_desc.rename({'Montreal':'Weather Condition'}, axis=1)

**Humidity**

In [17]:
# Import Humidity and keep only Montreal
w2_humid = pd.read_csv("humidity.csv", parse_dates=['datetime'])
w2_humid = w2_humid.filter(axis=1, items=['datetime', 'Montreal']).rename({'Montreal':'Humidity'}, axis=1)

**Atmospheric Pressure**

In [19]:
# Import Atmospheric Pressure and keep only Montreal
w2_pressure = pd.read_csv("pressure.csv", parse_dates=['datetime'])
w2_pressure = w2_pressure.filter(axis=1, items=['datetime', 'Montreal']).rename({'Montreal':'Pressure'}, axis=1)

**Wind Speed**

In [21]:
# Import Wind Speed and keep only Montreal
w2_wspeed = pd.read_csv("wind_speed.csv", parse_dates=['datetime'])
w2_wspeed = w2_wspeed.filter(axis=1, items=['datetime', 'Montreal']).rename({'Montreal':'Wind Speed'}, axis=1)

**Temperature**

In [23]:
# Import Temperature and keep only Montreal
w2_temp = pd.read_csv("temperature.csv", parse_dates=['datetime'])
w2_temp = w2_temp.filter(axis=1, items=['datetime', 'Montreal']).rename({'Montreal':'Temperature'}, axis=1)
# Transforming Kelvin measures into Celsius
w2_temp['Temperature'] = w2_temp['Temperature'] - 273.15 

**Merging all weather metrics together**

In [26]:
weather2 = w2_desc.merge(w2_humid, how='left', left_on='datetime', right_on='datetime')

In [27]:
weather2 = weather2.merge(w2_pressure, how='left', left_on='datetime', right_on='datetime')

In [28]:
weather2 = weather2.merge(w2_wspeed, how='left', left_on='datetime', right_on='datetime')

In [29]:
weather2 = weather2.merge(w2_temp, how='left', left_on='datetime', right_on='datetime')

In [30]:
weather2.head()

Unnamed: 0,datetime,Weather Condition,Humidity,Pressure,Wind Speed,Temperature
0,2012-10-01 12:00:00,,,,,
1,2012-10-01 13:00:00,overcast clouds,93.0,1001.0,4.0,12.68
2,2012-10-01 14:00:00,sky is clear,91.0,986.0,4.0,12.68465
3,2012-10-01 15:00:00,sky is clear,87.0,945.0,4.0,12.69779
4,2012-10-01 16:00:00,sky is clear,84.0,904.0,4.0,12.710929


Looking good!

## Merging Activity and Weather

In [31]:
# Choose weather data, version 1 or 2
weather_version = 2 

In [32]:
# Remove unnecessary columns from weather data version 1
weather.drop(['health_index', 'cloud_cover_4', 'cloud_cover_10', 'wind_dir', 'wind_dir_10s', 'wind_gust',
              'solar_radiation'], 
             axis=1, inplace=True)

In [33]:
# Sort trip history by start_date in order to merge with weather data
df_rentals.sort_values(by='start_date', inplace=True)

In [35]:
# Merge trip history with weather data depending on the version of weather data used.
# merge_asof with direction='nearest' pairs each trip with the weather row whose
# timestamp is closest to the trip's start_date.
if weather_version == 1:
    merged_data = pd.merge_asof(df_rentals, weather, left_on='start_date',
                                right_on='date_time_local', direction='nearest')
elif weather_version == 2:
    merged_data = pd.merge_asof(df_rentals, weather2, left_on='start_date',
                                right_on='datetime', direction='nearest')
else:
    # Fail here rather than leave merged_data undefined (or stale from an
    # earlier run of the notebook) for the cells that follow.
    raise ValueError("weather_version must be 1 or 2, got {!r}".format(weather_version))

In [36]:
# Deleting weather data to free up memory
del weather
del weather2

## Adding Festival Data

In [37]:
# change current working directory
os.chdir(path_festival_data)

In [39]:
# Import festival data
festival = pd.read_csv("csv_dataset_holiday_festival.csv", parse_dates=['date'], encoding = "gb2312")

In [40]:
# Remove unnecessary column
festival.drop(['weenkend'], axis=1, inplace=True)

In [41]:
# Inspecting data
festival.head()

Unnamed: 0,date,festival_1,festivial_name_1,latitude_1,longitude_1,festival_2,festival_name_2,latitude_2,longitude_2,statutory_holiday,statutory_holiday_name
0,2014-04-15,0.0,,,,0.0,,,,0.0,
1,2014-04-16,0.0,,,,0.0,,,,0.0,
2,2014-04-17,0.0,,,,0.0,,,,0.0,
3,2014-04-18,0.0,,,,0.0,,,,1.0,good friday
4,2014-04-19,0.0,,,,0.0,,,,0.0,


In [42]:
# Keeping only rows with relevant information
# (i.e. days with at least one festival or a statutory holiday)
# .copy() makes the filtered table independent of the original one, so the
# column assignments made on it afterwards do not trigger pandas'
# chained-assignment (SettingWithCopy) warnings.
festival = festival[(festival['festival_1'] == 1.0)
                    | (festival['festival_2'] == 1.0)
                    | (festival['statutory_holiday'] == 1.0)].copy()

In [43]:
# Creating a binary variable 'has_festival'
festival['has_festival'] = (festival['festival_1'] == 1.0) | (festival['festival_2'] == 1.0)

In [44]:
# Coercing 'statutory_holiday' into a binary variable
festival['statutory_holiday'] = (festival['statutory_holiday'] == 1.0)

In [45]:
# Removing now unnecessary variables
festival.drop(['festival_1', 'festival_2'], axis=1, inplace=True)

In [46]:
# Inspecting the data
festival.head()

Unnamed: 0,date,festivial_name_1,latitude_1,longitude_1,festival_name_2,latitude_2,longitude_2,statutory_holiday,statutory_holiday_name,has_festival
3,2014-04-18,,,,,,,True,good friday,False
6,2014-04-21,,,,,,,True,easter monday,False
34,2014-05-19,,,,,,,True,victoria day/national patriots day,False
53,2014-06-07,2014 Canadian Grand Prix,45.503231,-73.52669,,,,False,,True
54,2014-06-08,2014 Canadian Grand Prix,45.503231,-73.52669,,,,False,,True


### Merge Festival data with Merged Activity and Weather data

In [None]:
# Merge the festival data with the dataset.
# At this point 'start_date' still carries the time of day, whereas the
# festival 'date' column holds one midnight timestamp per day. Joining the two
# directly would only match trips that started at exactly 00:00:00, so the
# join is done on the trip's calendar day through a temporary key column.
merged_data['_start_day'] = merged_data['start_date'].dt.normalize()
merged_data = merged_data.merge(festival, how='left', left_on='_start_day', right_on='date')
merged_data.drop(columns=['_start_day'], inplace=True)

In [48]:
# Inspecting columns
merged_data.columns

Index(['start_date', 'start_station_code', 'end_date', 'end_station_code',
       'duration_sec', 'is_member', 'Unnamed: 0_x', 'start_code', 'start_name',
       'start_latitude', 'start_longitude', 'start_neighborhood',
       'start_great_park', 'start_affectation', 'Unnamed: 0_y', 'end_code',
       'end_name', 'end_latitude', 'end_longitude', 'end_neighborhood',
       'end_great_park', 'end_affectation', 'start_date_hour', 'end_date_hour',
       'datetime', 'Weather Condition', 'Humidity', 'Pressure', 'Wind Speed',
       'Temperature', 'date', 'festivial_name_1', 'latitude_1', 'longitude_1',
       'festival_name_2', 'latitude_2', 'longitude_2', 'statutory_holiday',
       'statutory_holiday_name', 'has_festival'],
      dtype='object')

In [49]:
# Dropping superfluous columns:
merged_data.drop(['Unnamed: 0_x', 'Unnamed: 0_y',
                    'start_station_code', 'end_station_code'],
                  axis=1, inplace=True)

In [50]:
# Renaming columns
merged_data.rename({'latitude_1':'festival1_lat',
                        'longitude_1':'festival1_long', 
                        'latitude_2':'festival2_lat', 
                        'longitude_2':'festival2_long'}, axis='columns', inplace=True)

## DateTime & Route Feature Engineering & Changing Data Types

Creating new features based on datetime values and combined values for the start station and the end station (considered to be a 'route').

In [51]:
# Keeping the full timestamps under new names, 'start_datetime' and 'end_datetime'
merged_data['start_datetime'] = pd.to_datetime(merged_data['start_date'])
merged_data['end_datetime'] = pd.to_datetime(merged_data['end_date'])

# Overwriting 'start_date' and 'end_date' so they hold only the date portion
# (time set to midnight). normalize() does this in one vectorized step instead
# of building a Python date object per row and parsing it back.
merged_data['start_date'] = merged_data['start_datetime'].dt.normalize()
merged_data['end_date'] = merged_data['end_datetime'].dt.normalize()

In [52]:
# Creating two new variables, 'start_month' and 'end_month' to hold only the month portion of the datetime columns
merged_data['start_month'] = merged_data['start_datetime'].dt.month
merged_data['end_month'] = merged_data['end_datetime'].dt.month

In [53]:
# Creating two new variables, 'start_daynum' and 'end_daynum', to hold only the day-of-year portion of the datetime columns
merged_data['start_daynum'] = merged_data['start_datetime'].dt.dayofyear
merged_data['end_daynum'] = merged_data['end_datetime'].dt.dayofyear

In [54]:
# Creating two new variables, 'start_weeknum' and 'end_weeknum', to hold only the ISO week-of-year portion of the datetime columns.
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its replacement (pandas >= 1.1).
# The cast turns the UInt32 result back into a plain int64 column, as weekofyear produced.
# NOTE(review): the cast raises if a datetime column contains NaT -- confirm none are expected here.
merged_data['start_weeknum'] = merged_data['start_datetime'].dt.isocalendar().week.astype('int64')
merged_data['end_weeknum'] = merged_data['end_datetime'].dt.isocalendar().week.astype('int64')

In [55]:
# Creating two new variables, 'start_weekday' and 'end_weekday' to hold only the day of the week portion of the datetime columns
merged_data['start_weekday'] = merged_data['start_datetime'].dt.dayofweek
merged_data['end_weekday'] = merged_data['end_datetime'].dt.dayofweek
# Monday = 0, Tuesday = 1, ..., Saturday = 5, Sunday = 6

In [56]:
# Adding route code and name
merged_data['route'] = merged_data['start_code'].map(str) + merged_data['end_code'].map(str)
merged_data['route_name'] = merged_data['start_name'] + '-' + merged_data['end_name']

In [57]:
# Adding period of the day : categorical variable for the time of day : with more categories
def hour_mapping_large(hour):
    """
    Map an hour of the day to a fine-grained period label (nine periods).

    Argument(s):
        hour : Number compared against the upper bound of each period.

    Returns:
        The label of the first period whose (exclusive) upper bound is
        greater than hour, or 'Error' when no bound is (24 and above, or NaN).
    """
    # (exclusive upper bound, label), in increasing order of the bound
    periods = (
        (3, 'Late Night'),
        (6, 'Early Morning'),
        (9, 'Morning'),
        (11, 'Late Morning'),
        (13, 'Noon'),
        (15, 'Early Afternoon'),
        (17, 'Late Afternoon'),
        (20, 'Evening'),
        (24, 'Early Night'),
    )
    for upper_bound, label in periods:
        if hour < upper_bound:
            return label
    return 'Error'
merged_data['period_of_day_large'] = merged_data['start_date_hour'].apply(hour_mapping_large)

In [58]:
# Adding period of the day : categorical variable for the time of day : with fewer categories
def hour_mapping_small(hour):
    """
    Map an hour of the day to a coarse period label (six periods).

    Argument(s):
        hour : Number compared against the upper bound of each period.

    Returns:
        The label of the first period whose (exclusive) upper bound is
        greater than hour, or 'Error' when no bound is (24 and above, or NaN).
    """
    # (exclusive upper bound, label), in increasing order of the bound
    periods = (
        (6, 'Early Morning'),
        (11, 'Morning'),
        (14, 'Noon'),
        (17, 'Afternoon'),
        (22, 'Evening'),
        (24, 'Night'),
    )
    matching_labels = (label for upper_bound, label in periods if hour < upper_bound)
    return next(matching_labels, 'Error')
merged_data['period_of_day_small'] = merged_data['start_date_hour'].apply(hour_mapping_small)

## Feature Engineering : Adding Fuel Prices Data from Toronto

In [62]:
# Importing fuel data
os.chdir(path_fuel_data)
fuel = pd.read_csv("fueltypesall.csv", parse_dates=['Date'])

In [63]:
# Calculating the mean fuel price in Toronto
fuel['Toronto_mean'] = (fuel['Toronto West/Ouest'] + fuel['Toronto East/Est']) / 2

In [64]:
# Keeping only prices for Regular Unleaded Gasoline
fuel = fuel[fuel['Fuel Type'] == "Regular Unleaded Gasoline"]

In [65]:
# Creating a datetime index in a new dataframe in order to fill it with the nearest gas price value
fuel_dates = pd.DataFrame(index=pd.date_range(start=pd.to_datetime("2014-04-15"), end=pd.to_datetime("2018-10-31")))
fuel_dates['date'] = pd.to_datetime(fuel_dates.index)

In [67]:
# Merging fuel dates with fuel prices
fuel_dates_merged = fuel_dates.merge(fuel, how='left', left_on='date', right_on='Date')

In [68]:
# Filtering only on required columns
fuel_dates_merged = fuel_dates_merged.filter(['price', 'date', 'Toronto_mean'], axis=1)

In [69]:
# Filling missing values forward (each day without a price takes the last known one).
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
fuel_dates_merged.ffill(axis=0, inplace=True)

In [70]:
# Filling missing values backward (for the first week, which has no earlier price to carry forward).
# DataFrame.bfill replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
fuel_dates_merged.bfill(axis=0, inplace=True)

In [71]:
# Inspecting dataframe
fuel_dates_merged.head()

Unnamed: 0,date,Toronto_mean
0,2014-04-15,138.4
1,2014-04-16,138.4
2,2014-04-17,138.4
3,2014-04-18,138.4
4,2014-04-19,138.4


In [72]:
# Adding fuel data to the dataset
merged_data = merged_data.merge(fuel_dates_merged, how='left', left_on='start_date', right_on='date')

## Final Pre-Processing for the dataset

In [74]:
# Removing duplicate columns
merged_data.drop(['date_x', 'date_y'], axis=1, inplace=True)

In [75]:
# Dropping the date time column that came from merging weather data
if weather_version == 2:
    merged_data.drop(['datetime'], axis=1, inplace=True)

In [76]:
# Renaming the Fuel Price column
merged_data.rename({"Toronto_mean":"Fuel Price"}, axis='columns', inplace=True)

In [77]:
# Creating an affectation feature for routes
merged_data['route_affectations'] = merged_data['start_affectation'].map(str) + "-" + merged_data['end_affectation'].map(str)

In [78]:
# Creating a neighborhood feature for routes
merged_data['route_neighborhood'] = merged_data['start_neighborhood'].map(str) + "-" + merged_data['end_neighborhood'].map(str)

In [79]:
# Creating a great park feature for routes
merged_data['route_great_park'] = merged_data['start_great_park'].map(str) + "-" + merged_data['end_great_park'].map(str)

In [80]:
# Inspecting the final dataset
merged_data.head()

Unnamed: 0,start_date,end_date,duration_sec,is_member,start_code,start_name,start_latitude,start_longitude,start_neighborhood,start_great_park,...,start_weekday,end_weekday,route,route_name,period_of_day_large,period_of_day_small,Fuel Price,route_affectations,route_neighborhood,route_great_park
0,2017-04-15,2017-04-15,1841,1,7060,de l'Église / de Verdun,45.463001,-73.571569,Verdun,,...,5,5,70607060,de l'Église / de Verdun-de l'Église / de Verdun,Late Night,Early Morning,114.55,residentiel-residentiel,Verdun-Verdun,nan-nan
1,2017-04-15,2017-04-15,553,1,6173,Berri / Cherrier,45.519088,-73.569509,Le Plateau-Mont-Royal,,...,5,5,61736173,Berri / Cherrier-Berri / Cherrier,Late Night,Early Morning,114.55,institution-institution,Le Plateau-Mont-Royal-Le Plateau-Mont-Royal,nan-nan
2,2017-04-15,2017-04-15,195,1,6203,Hutchison / Sherbrooke,45.50781,-73.57208,Le Plateau-Mont-Royal,,...,5,5,62036204,Hutchison / Sherbrooke-Milton / Durocher,Late Night,Early Morning,114.55,mixte-residentiel,Le Plateau-Mont-Royal-Le Plateau-Mont-Royal,nan-nan
3,2017-04-15,2017-04-15,285,1,6104,Wolfe / René-Lévesque,45.516818,-73.554188,Ville-Marie,,...,5,5,61046114,Wolfe / René-Lévesque-Métro Papineau (Cartier ...,Late Night,Early Morning,114.55,mixte-mixte,Ville-Marie-Ville-Marie,nan-nan
4,2017-04-15,2017-04-15,569,1,6174,Roy / St-Denis,45.51908,-73.5727,Le Plateau-Mont-Royal,,...,5,5,61746174,Roy / St-Denis-Roy / St-Denis,Late Night,Early Morning,114.55,mixte-mixte,Le Plateau-Mont-Royal-Le Plateau-Mont-Royal,nan-nan


In [81]:
# Inspecting the shape of the final dataset
merged_data.shape

(4740357, 52)

## Saving the data to disk

In [84]:
os.chdir("/Users/gabriel/")

In [85]:
# Saving to HDF, the format which has the best read time among all other formats we tried.
merged_data.to_hdf("dataset_2017.hdf",key="dataset")



your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['start_name', 'start_neighborhood', 'start_great_park', 'start_affectation', 'end_name', 'end_neighborhood', 'end_great_park', 'end_affectation', 'Weather Condition', 'festivial_name_1', 'festival_name_2', 'statutory_holiday', 'statutory_holiday_name', 'has_festival', 'route', 'route_name', 'period_of_day_large', 'period_of_day_small', 'route_affectations', 'route_neighborhood', 'route_great_park']]


