In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import math

%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

from numpy.random import randn
np.random.seed(123)
import os
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_rows = 10
#pd.options.display.max_rows = 20

In [48]:
#Pull in the file containing bike data
bike = ['id', 'start_date', 'start_station', 'end_date', 'end_station', 'duration_sec', 'blank1', 'is_member', 'blank2', 'blank3']
bike_data = pd.read_table('OD_2017 - Condensed.csv', sep=None, header=None, names=bike, skiprows=1, engine='python')
to_drop = ['blank1', 'blank2', 'blank3']
bike_data.drop(to_drop, inplace=True, axis=1)
bike_data.set_index(['start_date'])
bike_data['start_date'] = pd.to_datetime(bike_data['start_date'], yearfirst=True)
bike_data.set_index(['start_date'])

Unnamed: 0_level_0,id,start_station,end_date,end_station,duration_sec,is_member
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-04-15 00:00:00,0.0,7060.0,2017-04-15 0:31,7060.0,1841.0,1.0
2017-04-15 00:01:00,1.0,6173.0,2017-04-15 0:10,6173.0,553.0,1.0
2017-04-15 00:01:00,2.0,6203.0,2017-04-15 0:04,6204.0,195.0,1.0
2017-04-15 00:01:00,3.0,6104.0,2017-04-15 0:06,6114.0,285.0,1.0
2017-04-15 00:01:00,4.0,6174.0,2017-04-15 0:11,6174.0,569.0,1.0
...,...,...,...,...,...,...
NaT,,,,,,
NaT,,,,,,
NaT,,,,,,
NaT,,,,,,


In [5]:
#Pull in station location data
location = ['code', 'latitude', 'longitude']
location_data = pd.read_table('Stations_2017.csv', sep=None, header=None, names=location, skiprows=1, engine='python')
location_data['code']=location_data['code'].astype(int)
location_data.set_index(['code'])

Unnamed: 0_level_0,latitude,longitude
code,Unnamed: 1_level_1,Unnamed: 2_level_1
6173,45.519088,-73.569509
6203,45.507810,-73.572080
6204,45.508144,-73.574772
6104,45.516818,-73.554188
6114,45.523530,-73.551990
...,...,...
6241,45.519489,-73.598496
6359,45.502602,-73.527503
6106,45.521140,-73.549260
6334,45.526870,-73.626616


In [43]:
#isolate trips with different start and end locations and add lattitude and longitude info
trip_data=bike_data.loc[bike_data['start_station'] != bike_data['end_station']]
trip_data.dropna(inplace=True)
trip_data=location_data.merge(trip_data,how='left',left_on='code', right_on='start_station')
trip_data.rename(columns={'latitude': 'start_latitude', 'longitude': 'start_longitude'}, inplace=True)
trip_data=location_data.merge(trip_data,how='left',left_on='code', right_on='end_station')
trip_data.rename(columns={'latitude': 'end_latitude', 'longitude': 'end_longitude'}, inplace=True)
trip_data.dropna(inplace=True)
trip_data = trip_data.loc[:, ['id','start_latitude','start_longitude','end_latitude','end_longitude']]
trip_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,start_latitude,start_longitude,end_latitude,end_longitude
0,75970.0,45.507810,-73.572080,45.519088,-73.569509
1,75973.0,45.507810,-73.572080,45.519088,-73.569509
2,164723.0,45.507810,-73.572080,45.519088,-73.569509
3,34006.0,45.516818,-73.554188,45.519088,-73.569509
4,123143.0,45.516818,-73.554188,45.519088,-73.569509
...,...,...,...,...,...
244115,243257.0,45.487915,-73.569643,45.521769,-73.534859
244116,230709.0,45.498673,-73.552563,45.521769,-73.534859
244117,230713.0,45.498673,-73.552563,45.521769,-73.534859
244118,230718.0,45.498673,-73.552563,45.521769,-73.534859


In [44]:
#calculate distances travelled for each trip with a different start and end station

distance_array = []

def calc_distance(lat1, lon1, lat2, lon2):
    earthRadiusKm = 6371
    dLat = math.radians(lat2-lat1)
    dLon = math.radians(lon2-lon1)
    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)
    a = math.sin(dLat/2) * math.sin(dLat/2) + math.sin(dLon/2) * math.sin(dLon/2) * math.cos(lat1) * math.cos(lat2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    return round(earthRadiusKm * c,4)

def get_distances():
    
    last=len(trip_data.index)
    
    for i in range(last):
        trip_id=trip_data.loc[i,'id']
        lat1=trip_data.loc[i,'start_latitude']
        lat2=trip_data.loc[i,'end_latitude']
        lon1=trip_data.loc[i,'start_longitude']
        lon2=trip_data.loc[i,'end_longitude']
        distance=calc_distance(lat1,lon1,lat2,lon2)
        distance_array.append(distance)
            
get_distances()

In [45]:
#convert the distance array to a usable format
distance_data = pd.DataFrame(data=distance_array)
distance_data

Unnamed: 0,0
0,1.2700
1,1.2700
2,1.2700
3,1.2201
4,1.2201
...,...
244115,4.6388
244116,2.9152
244117,2.9152
244118,2.9152


In [47]:
#put the pieces back together into a final data frame
trip_data_final = pd.DataFrame({'distance':distance_data.loc[:,0],'end_date':bike_data.end_date,'start_station':bike_data.start_station,'end_station':bike_data.end_station, 'duration': bike_data.duration_sec, 'is_member':bike_data.is_member, 'id':bike_data.id, "start_date":bike_data.start_date})
trip_data_final.dropna(inplace=True)
trip_data_final

Unnamed: 0,distance,duration,end_date,end_station,id,is_member,start_date,start_station
0,1.2700,1841.0,2017-04-15 0:31,7060.0,0.0,1.0,2017-04-15 00:00:00,7060.0
1,1.2700,553.0,2017-04-15 0:10,6173.0,1.0,1.0,2017-04-15 00:01:00,6173.0
2,1.2700,195.0,2017-04-15 0:04,6204.0,2.0,1.0,2017-04-15 00:01:00,6203.0
3,1.2201,285.0,2017-04-15 0:06,6114.0,3.0,1.0,2017-04-15 00:01:00,6104.0
4,1.2201,569.0,2017-04-15 0:11,6174.0,4.0,1.0,2017-04-15 00:01:00,6174.0
...,...,...,...,...,...,...,...,...
244115,4.6388,1768.0,2017-05-04 15:49,6314.0,244115.0,1.0,2017-05-04 15:19:00,6092.0
244116,2.9152,958.0,2017-05-04 15:35,6434.0,244116.0,1.0,2017-05-04 15:19:00,6042.0
244117,2.9152,412.0,2017-05-04 15:26,6132.0,244117.0,1.0,2017-05-04 15:19:00,6134.0
244118,2.9152,208.0,2017-05-04 15:23,6235.0,244118.0,1.0,2017-05-04 15:19:00,6227.0


In [None]:
#Pull in file containing humidity data
humidity = ['datetime', 'Montreal']
humidity_data = pd.read_table('Humidity - Montreal.csv', sep=None, header=None, names=humidity, skiprows=1, engine='python')
humidity_data.set_index(['datetime'])

In [None]:
#Pull in file containing temperature data
temperature = ['datetime', 'Montreal']
temperature_data = pd.read_table('Temperature - Montreal.csv', sep=None, header=None, names=humidity, skiprows=1, engine='python')
temperature_data.set_index(['datetime'])
temperature_data['datetime'] = pd.to_datetime(temperature_data['datetime'], yearfirst=True)

In [None]:
temperature_data