In [1]:
import pandas as pd
# Read the trip records in 10,000-row chunks and stitch the chunks back
# together into a single DataFrame.
file = 'train.csv'
trip_df = pd.concat(list(pd.read_csv(file, chunksize=10000)), axis='rows')

In [10]:
#The dataset is too large for my computer's memory so I am downsampling. This step may be skipped on machines with larger memory.
import numpy as np

# Fixed seed so that a fresh "Restart & Run All" keeps the same rows every time.
SAMPLE_SEED = 42
# Each row is kept independently with this probability (roughly 20% of trips).
SAMPLE_FRACTION = 0.2

rng = np.random.RandomState(SAMPLE_SEED)
mask = rng.rand(trip_df.shape[0]) <= SAMPLE_FRACTION
# .copy() makes the sample an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
# NOTE: this cell overwrites trip_df with its own sample; re-running it without
# reloading the data samples the already-sampled frame again.
trip_df = trip_df.loc[mask].copy()

In [16]:
# Summarize trip_df: index range, per-column non-null counts, dtypes and memory usage.
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291372 entries, 2 to 1458642
Data columns (total 11 columns):
id                    291372 non-null object
vendor_id             291372 non-null int64
pickup_datetime       291372 non-null object
dropoff_datetime      291372 non-null object
passenger_count       291372 non-null int64
pickup_longitude      291372 non-null float64
pickup_latitude       291372 non-null float64
dropoff_longitude     291372 non-null float64
dropoff_latitude      291372 non-null float64
store_and_fwd_flag    291372 non-null object
trip_duration         291372 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 22.2+ MB


In [17]:
# Plain-text preview of the first five trips.
print(trip_df.iloc[:5])

           id  vendor_id      pickup_datetime     dropoff_datetime  \
2   id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
10  id1436371          2  2016-05-10 22:08:41  2016-05-10 22:29:55   
22  id2352683          1  2016-04-09 03:34:27  2016-04-09 03:41:30   
23  id1603037          1  2016-06-25 10:36:26  2016-06-25 10:55:49   
25  id0129640          2  2016-02-14 13:27:56  2016-02-14 13:49:19   

    passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
2                 1        -73.979027        40.763939         -74.005333   
10                1        -73.982651        40.763840         -74.002228   
22                1        -73.995865        40.758812         -73.993324   
23                1        -73.993553        40.747173         -74.006142   
25                1        -73.956581        40.771358         -73.974968   

    dropoff_latitude store_and_fwd_flag  trip_duration  
2          40.710087                  N           2124  
10

In [18]:
# Calendar date of each pickup, stored as a datetime64 value at midnight; it is
# the key used when the daily weather table is merged in.
# Vectorized (no Python-level loop over rows), and it works whether
# pickup_datetime still holds strings or has already been parsed to datetimes.
trip_df['date'] = pd.to_datetime(trip_df['pickup_datetime']).dt.normalize()

In [19]:
# Give three columns more appropriate dtypes in one step:
#   - vendor_id: int64 -> category
#   - pickup_datetime / dropoff_datetime: string -> datetime64
trip_df = trip_df.assign(
    vendor_id=trip_df['vendor_id'].astype('category'),
    pickup_datetime=pd.to_datetime(trip_df['pickup_datetime']),
    dropoff_datetime=pd.to_datetime(trip_df['dropoff_datetime'])
)

In [20]:
#Import weather data
file2 = 'NYCweather2016.csv'
weather = pd.read_csv(file2)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
weather.info()
print(weather.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2375 entries, 0 to 2374
Data columns (total 20 columns):
STATION      2375 non-null object
NAME         2375 non-null object
LATITUDE     2375 non-null float64
LONGITUDE    2375 non-null float64
ELEVATION    2375 non-null float64
DATE         2375 non-null object
AWND         1088 non-null float64
PRCP         2372 non-null float64
SNOW         1871 non-null float64
SNWD         1132 non-null float64
TAVG         732 non-null float64
TMAX         1098 non-null float64
TMIN         1098 non-null float64
WT01         353 non-null float64
WT02         33 non-null float64
WT03         44 non-null float64
WT04         7 non-null float64
WT06         7 non-null float64
WT08         197 non-null float64
WT09         4 non-null float64
dtypes: float64(17), object(3)
memory usage: 343.3+ KB
None
       STATION                         NAME  LATITUDE  LONGITUDE  ELEVATION  \
0  USW00094728  NY CITY CENTRAL PARK, NY US  40.77898  -73.96925       42

In [21]:
# Parse the DATE column (read from the CSV as strings) into datetime64 values.
weather = weather.assign(DATE=pd.to_datetime(weather['DATE']))

In [22]:
#Review data for each weather station
# unique() yields station names in order of first appearance.
for name in weather['NAME'].unique():
    print(name)
    # info() prints its own report and returns None; calling it directly avoids
    # the extra "None" line that print(...info()) produced after every station.
    weather.loc[weather['NAME'] == name].info()

NY CITY CENTRAL PARK, NY US
<class 'pandas.core.frame.DataFrame'>
Int64Index: 366 entries, 0 to 365
Data columns (total 20 columns):
STATION      366 non-null object
NAME         366 non-null object
LATITUDE     366 non-null float64
LONGITUDE    366 non-null float64
ELEVATION    366 non-null float64
DATE         366 non-null datetime64[ns]
AWND         356 non-null float64
PRCP         366 non-null float64
SNOW         366 non-null float64
SNWD         366 non-null float64
TAVG         0 non-null float64
TMAX         366 non-null float64
TMIN         366 non-null float64
WT01         131 non-null float64
WT02         6 non-null float64
WT03         0 non-null float64
WT04         1 non-null float64
WT06         2 non-null float64
WT08         99 non-null float64
WT09         0 non-null float64
dtypes: datetime64[ns](1), float64(17), object(2)
memory usage: 57.2+ KB
None
MIDDLE VILLAGE 0.5 SW, NY US
<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 366 to 716
Data columns (

In [23]:
# Keep only the LaGuardia Airport station's records: it was chosen for having
# the highest availability of data and the most central location.
weather = weather[weather['NAME'].eq('LA GUARDIA AIRPORT, NY US')]

In [24]:
#Merge dataframes
# A left join silently duplicates trips when the weather table has more than
# one row per DATE (e.g. if the single-station filter was skipped), so check
# the join key is unique before merging.
assert weather['DATE'].is_unique, 'weather must have exactly one row per DATE before merging'
# Left join: every trip is kept and picks up the weather row whose DATE equals
# the trip's pickup date.
nyc_taxi = pd.merge(trip_df, weather, how='left', left_on='date', right_on='DATE')

In [26]:
# Summarize the merged frame: row count, per-column non-null counts and dtypes.
nyc_taxi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 291372 entries, 0 to 291371
Data columns (total 32 columns):
id                    291372 non-null object
vendor_id             291372 non-null category
pickup_datetime       291372 non-null datetime64[ns]
dropoff_datetime      291372 non-null datetime64[ns]
passenger_count       291372 non-null int64
pickup_longitude      291372 non-null float64
pickup_latitude       291372 non-null float64
dropoff_longitude     291372 non-null float64
dropoff_latitude      291372 non-null float64
store_and_fwd_flag    291372 non-null object
trip_duration         291372 non-null int64
date                  291372 non-null datetime64[ns]
STATION               291372 non-null object
NAME                  291372 non-null object
LATITUDE              291372 non-null float64
LONGITUDE             291372 non-null float64
ELEVATION             291372 non-null float64
DATE                  291372 non-null datetime64[ns]
AWND                  291372 non-null fl

In [27]:
# Show the last seven columns for the first 20 rows.
nyc_taxi.head(20).iloc[:, -7:]

Unnamed: 0,WT01,WT02,WT03,WT04,WT06,WT08,WT09
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,
7,,,,,,,
8,,,,,,,
9,,,,,,,


In [28]:
# The weather-type columns (WT01 - WT09) hold 1 for "yes" and NaN for "no".
# Turn each of the last seven columns into booleans: NaN -> 0 -> False, 1 -> True.
bools = {1: True, 0: False}
for col in nyc_taxi.columns[-7:]:
    nyc_taxi[col] = nyc_taxi[col].fillna(value=0).map(bools)

In [29]:
# Plain-text view of the last seven columns for the first 20 rows.
print(nyc_taxi.head(20).iloc[:, -7:])

     WT01   WT02   WT03   WT04   WT06   WT08   WT09
0   False  False  False  False  False  False  False
1   False  False  False  False  False  False  False
2   False  False  False  False  False  False  False
3   False  False  False  False  False  False  False
4   False  False  False  False  False  False  False
5   False  False  False  False  False  False  False
6   False  False  False  False  False  False  False
7   False  False  False  False  False  False  False
8   False  False  False  False  False  False  False
9   False  False  False  False  False  False  False
10  False  False  False  False  False  False  False
11   True  False  False  False  False  False  False
12  False  False  False  False  False  False  False
13   True   True   True  False  False  False  False
14  False  False  False  False  False  False  False
15  False  False  False  False  False   True  False
16  False  False  False  False  False  False  False
17  False  False  False  False  False  False  False
18  False  F

In [None]:
# Print every column label of the merged frame, one per line.
for col in nyc_taxi.columns:
    print(col)

In [31]:
# Remove the columns that are not carried into the final table.
# A single drop() call is atomic: if any name is missing it raises KeyError
# before removing anything, whereas deleting column by column would leave the
# frame half-trimmed when a later name fails.
c = ['id','vendor_id','store_and_fwd_flag','ELEVATION','DATE']
nyc_taxi = nyc_taxi.drop(c, axis=1)

In [32]:
# New, readable labels for the merged table. They are applied positionally, so
# the order here must match the frame's column order exactly.
columns = [
    # trip fields
    'pickup_datetime', 'dropoff_datetime', 'passenger_count',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'trip_duration', 'date',
    # weather station identity and position
    'station_num', 'station_name', 'station_latitude', 'station_longitude',
    # daily weather measurements
    'average_wind', 'precipitation', 'snowfall', 'snow_depth',
    'avg_temp', 'max_temp', 'min_temp',
    # boolean weather-type flags
    'fog', 'fog_heavy', 'thunder', 'ice_pellets', 'glaze',
    'smoke_haze', 'blowing_snow',
]

In [33]:
# Relabel every column positionally: the i-th entry of `columns` becomes the
# label of the i-th column. pandas raises an error if the list length does not
# match the number of columns in nyc_taxi.
nyc_taxi.columns = columns

In [34]:
# Summarize just the first 1000 rows of the renamed frame.
nyc_taxi.head(1000).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 27 columns):
pickup_datetime      1000 non-null datetime64[ns]
dropoff_datetime     1000 non-null datetime64[ns]
passenger_count      1000 non-null int64
pickup_longitude     1000 non-null float64
pickup_latitude      1000 non-null float64
dropoff_longitude    1000 non-null float64
dropoff_latitude     1000 non-null float64
trip_duration        1000 non-null int64
date                 1000 non-null datetime64[ns]
station_num          1000 non-null object
station_name         1000 non-null object
station_latitude     1000 non-null float64
station_longitude    1000 non-null float64
average_wind         1000 non-null float64
precipitation        1000 non-null float64
snowfall             1000 non-null float64
snow_depth           1000 non-null float64
avg_temp             1000 non-null float64
max_temp             1000 non-null float64
min_temp             1000 non-null float64
fog                

In [35]:
# Save the merged trip + weather table to disk. The row index is written as
# well (the pandas default), as an unnamed first column.
nyc_taxi.to_csv(path_or_buf='nyc_taxi.csv')