In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import math

%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

from numpy.random import randn
np.random.seed(123)
import os
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_rows = 10
#pd.options.display.max_rows = 20

#Pull in the file containing bike data
bike = ['id', 'start_date', 'start_station', 'end_date', 'end_station', 'duration_sec', 'blank1', 'is_member', 'blank2', 'blank3']
bike_data = pd.read_table('OD_2017 - Condensed.csv', sep=None, header=None, names=bike, skiprows=1, engine='python')
to_drop = ['blank1', 'blank2', 'blank3']
bike_data.drop(to_drop, inplace=True, axis=1)
bike_data.set_index(['start_date'])
bike_data['start_date'] = pd.to_datetime(bike_data['start_date'], yearfirst=True)
bike_data.set_index(['start_date'])

Unnamed: 0_level_0,id,start_station,end_date,end_station,duration_sec,is_member
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-04-15 00:00:00,0,7060,17-04-15 0:31,7060,1841,1
2017-04-15 00:01:00,1,6173,17-04-15 0:10,6173,553,1
2017-04-15 00:01:00,2,6203,17-04-15 0:04,6204,195,1
2017-04-15 00:01:00,3,6104,17-04-15 0:06,6114,285,1
2017-04-15 00:01:00,4,6174,17-04-15 0:11,6174,569,1
...,...,...,...,...,...,...
2017-05-04 18:21:00,249994,6131,17-05-04 18:32,6926,657,1
2017-05-04 18:21:00,249995,6014,17-05-04 18:26,6214,313,1
2017-05-04 18:21:00,249996,6015,17-05-04 18:25,6017,295,1
2017-05-04 18:21:00,249997,6011,17-05-04 18:30,6108,591,1


In [12]:
#Pull in station location data
location = ['code', 'latitude', 'longitude']
location_data = pd.read_table('Stations_2017.csv', sep=None, header=None, names=location, skiprows=1, engine='python')
location_data['code']=location_data['code'].astype(int)
location_data.set_index(['code'])

#isolate trips with different start and end locations and add lattitude and longitude info
trip_data=bike_data.loc[bike_data['start_station'] != bike_data['end_station']]
trip_data.dropna(inplace=True)
trip_data=location_data.merge(trip_data,how='left',left_on='code', right_on='start_station')
trip_data.rename(columns={'latitude': 'start_latitude', 'longitude': 'start_longitude'}, inplace=True)
trip_data=location_data.merge(trip_data,how='left',left_on='code', right_on='end_station')
trip_data.rename(columns={'latitude': 'end_latitude', 'longitude': 'end_longitude'}, inplace=True)
trip_data.dropna(inplace=True)
trip_data = trip_data.loc[:, ['id','start_latitude','start_longitude','end_latitude','end_longitude']]
trip_data

#calculate distances travelled for each trip with a different start and end station

distance_array = []

def calc_distance(lat1, lon1, lat2, lon2):
    earthRadiusKm = 6371
    dLat = math.radians(lat2-lat1)
    dLon = math.radians(lon2-lon1)
    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)
    a = math.sin(dLat/2) * math.sin(dLat/2) + math.sin(dLon/2) * math.sin(dLon/2) * math.cos(lat1) * math.cos(lat2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    return round(earthRadiusKm * c,4)

def get_distances():
    
    last=len(trip_data.index)
    
    for i in range(last):
        trip_id=trip_data.loc[i,'id']
        lat1=trip_data.loc[i,'start_latitude']
        lat2=trip_data.loc[i,'end_latitude']
        lon1=trip_data.loc[i,'start_longitude']
        lon2=trip_data.loc[i,'end_longitude']
        distance=calc_distance(lat1,lon1,lat2,lon2)
        distance_array.append(distance)
            
get_distances()

#convert the distance array to a usable format
distance_data = pd.DataFrame(data=distance_array)
distance_data

#put the pieces back together into a final data frame
trip_data_final = pd.DataFrame({'distance':distance_data.loc[:,0],'end_date':bike_data.end_date,'start_station':bike_data.start_station,'end_station':bike_data.end_station, 'duration': bike_data.duration_sec, 'is_member':bike_data.is_member, 'id':bike_data.id, 'start_date':bike_data.start_date})
trip_data_final.dropna(inplace=True)
trip_data_final
trip_data_final = trip_data_final.set_index(['start_date'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [13]:
trip_data_final

Unnamed: 0_level_0,distance,duration,end_date,end_station,id,is_member,start_station
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-04-15 00:00:00,1.2700,1841,17-04-15 0:31,7060,0,1,7060
2017-04-15 00:01:00,1.2700,553,17-04-15 0:10,6173,1,1,6173
2017-04-15 00:01:00,1.2700,195,17-04-15 0:04,6204,2,1,6203
2017-04-15 00:01:00,1.2201,285,17-04-15 0:06,6114,3,1,6104
2017-04-15 00:01:00,1.2201,569,17-04-15 0:11,6174,4,1,6174
...,...,...,...,...,...,...,...
2017-05-04 15:19:00,4.6388,1768,17-05-04 15:49,6314,244115,1,6092
2017-05-04 15:19:00,2.9152,958,17-05-04 15:35,6434,244116,1,6042
2017-05-04 15:19:00,2.9152,412,17-05-04 15:26,6132,244117,1,6134
2017-05-04 15:19:00,2.9152,208,17-05-04 15:23,6235,244118,1,6227


In [16]:
#Pull in file containing temperature data and resample
temperature = ['datetime', 'Montreal']
temperature_data = pd.read_table('Temperature - Montreal.csv', sep=None, header=None, names=temperature, skiprows=1, engine='python')
temperature_data['datetime'] = pd.to_datetime(temperature_data['datetime'], yearfirst=True)
temperature_data = temperature_data.set_index(['datetime'])
temperature_data = temperature_data.resample('1min').pad()
temperature_data = temperature_data.rename(index=str, columns={'Montreal':'Temperature'})

In [17]:
#Pull in file containing humidity data and resample
humidity = ['datetime', 'Montreal']
humidity_data = pd.read_table('Humidity - Montreal.csv', sep=None, header=None, names=humidity, skiprows=1, engine='python')
humidity_data['datetime'] = pd.to_datetime(humidity_data['datetime'], yearfirst=True)
humidity_data = humidity_data.set_index(['datetime'])
humidity_data = humidity_data.resample('1min').pad()
humidity_data = humidity_data.rename(index=str, columns={'Montreal':'Humidity'})

In [20]:
master = pd.merge(trip_data_final, temperature_data, how = 'inner', left_index=True, right_index=True)

In [21]:
master = pd.merge(master, humidity_data, how = 'inner', left_index=True, right_index=True)

In [23]:
#check for any NaNs
master.isnull().sum(), len(master)

(distance         0
 duration         0
 end_date         0
 end_station      0
 id               0
 is_member        0
 start_station    0
 Temperature      0
 Humidity         0
 dtype: int64, 244120)

In [24]:
master

Unnamed: 0,distance,duration,end_date,end_station,id,is_member,start_station,Temperature,Humidity
2017-04-15 00:00:00,1.2700,1841,17-04-15 0:31,7060,0,1,7060,13.0,22
2017-04-15 00:01:00,1.2700,553,17-04-15 0:10,6173,1,1,6173,13.0,22
2017-04-15 00:01:00,1.2700,195,17-04-15 0:04,6204,2,1,6203,13.0,22
2017-04-15 00:01:00,1.2201,285,17-04-15 0:06,6114,3,1,6104,13.0,22
2017-04-15 00:01:00,1.2201,569,17-04-15 0:11,6174,4,1,6174,13.0,22
...,...,...,...,...,...,...,...,...,...
2017-05-04 15:19:00,4.6388,1768,17-05-04 15:49,6314,244115,1,6092,10.0,82
2017-05-04 15:19:00,2.9152,958,17-05-04 15:35,6434,244116,1,6042,10.0,82
2017-05-04 15:19:00,2.9152,412,17-05-04 15:26,6132,244117,1,6134,10.0,82
2017-05-04 15:19:00,2.9152,208,17-05-04 15:23,6235,244118,1,6227,10.0,82
