In [1]:
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import dask
import dask.dataframe as dd
import gc
import subprocess
import pandas as pd
import numpy as np
!pip install meteostat
from meteostat import Point, Daily
import missingno as msno
import calendar
from datetime import datetime

In [2]:
# we'll import the taxi data inside a function to save memory:
def imports2(year, months_number, sample_part, random_state=None):
    """Build a dask DataFrame of sampled yellow-taxi trips for one year.

    For every month from January up to ``months_number`` the monthly CSV is
    read (anonymously) from the ``nyc-tlc`` S3 bucket, cleaned, restricted to
    pickups that fall inside that calendar month, down-sampled, and finally
    stacked with the other months.  The returned frame is a dask object; the
    caller materialises it with ``.compute()``.

    Parameters
    ----------
    year : int
        Calendar year of the trip files to read.
    months_number : int
        Number of months to load, counted from January (12 = the whole year).
    sample_part : float
        Fraction of each month's rows to keep (passed to ``sample(frac=...)``).
    random_state : int, optional
        Seed forwarded to ``sample`` so the draw can be reproduced.  The
        default ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    dask.dataframe.DataFrame
        The sampled months concatenated row-wise, in month order.

    Raises
    ------
    ValueError
        If ``months_number`` is smaller than 1 (there would be nothing to
        return).
    """
    if months_number < 1:
        raise ValueError('months_number must be at least 1')

    read_dtypes = {'tolls_amount': 'float64', 'RatecodeID': 'float64',
                   'trip_distance': 'float64', 'store_and_fwd_flag': 'category',
                   'fare_amount': 'float64'}
    integerize = ['passenger_count', 'VendorID', 'RatecodeID', 'payment_type']

    monthly_frames = []
    for month_number in range(1, months_number + 1):
        month = str(month_number).zfill(2)
        link = 's3://nyc-tlc/trip data/yellow_tripdata_' + str(year) + '-' + month + '.csv'
        df = dd.read_csv(link, dtype=read_dtypes,
                         storage_options={'anon': True, 'use_ssl': False})

        # 1. remove rows without a payment type and insanely long trips:
        df = df[~df['payment_type'].isnull()]
        df = df[df['trip_distance'] < 500]

        # 2. parse the timestamps once and derive the pickup date by flooring
        #    the parsed pickup time to midnight (no second string parse):
        df['tpep_pickup_datetime'] = dd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_dropoff_datetime'] = dd.to_datetime(df['tpep_dropoff_datetime'])
        df['pickup_date'] = df['tpep_pickup_datetime'].dt.floor('D')

        # 3. keep only rows whose pickup date lies inside this file's month:
        month_prefix = str(year) + '-' + month + '-'
        first_day = month_prefix + '01'
        last_day = month_prefix + str(calendar.monthrange(year, month_number)[1])
        df = df[(df['pickup_date'] >= first_day) & (df['pickup_date'] <= last_day)]

        # 4. sample only some % of the month:
        df = df.sample(frac=sample_part, random_state=random_state)

        # 5. store a few columns as integers to save memory
        #    (astype(int) already yields a numeric dtype, no to_numeric needed):
        for col in integerize:
            df[col] = df[col].astype(int)

        monthly_frames.append(df)

    # A single concat over all months replaces the pairwise concat-in-a-loop;
    # the row order (month 1, month 2, ...) is the same.
    if len(monthly_frames) == 1:
        return monthly_frames[0]
    return dd.concat(monthly_frames, axis=0)

In [3]:
def merge_yrs():
    """Return the 5 % samples of the 2017 and 2018 trips stacked row-wise.

    Each year is loaded through ``imports2`` with all 12 months; the two
    resulting dask frames are concatenated with 2017 first.
    """
    trips_2017 = imports2(2017, 12, 0.05)
    trips_2018 = imports2(2018, 12, 0.05)
    both_years = dd.concat([trips_2017, trips_2018], axis=0)
    # Release the per-year handles before handing back the combined frame.
    del trips_2017, trips_2018
    gc.collect()
    return both_years
# Build the combined two-year dask frame; it is materialised later via .compute().
dfs = merge_yrs()

In [4]:
def computing(df):
    """Materialise ``df`` by calling its ``.compute()`` and return the result.

    The local reference to the input is dropped and a garbage-collection pass
    is run before returning.
    """
    materialised = df.compute()
    del df  # only the local name; the caller's reference is untouched
    gc.collect()
    return materialised
# Materialise the dask frame, then drop the module-level reference to the
# dask frame and run a garbage-collection pass.
df = computing(dfs)
del dfs
gc.collect()

In [5]:
# initial basic feature engineering:
def feature_eng(df):
    """Add basic trip features to ``df`` (modified in place) and return it.

    Columns added
    -------------
    trip_length : float
        Whole minutes (rounded down) between pickup and dropoff; NaN when
        either timestamp is missing.
    day_of_week : int8
        Code of the English day name of ``pickup_date`` in alphabetical order
        (Friday=0, Monday=1, Saturday=2, Sunday=3, Thursday=4, Tuesday=5,
        Wednesday=6); -1 when the date is missing.
    time_of_day : int8
        Pickup hour, 0-23; -1 when the pickup time is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime`` and ``pickup_date``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # trip length: floor-divide the elapsed seconds instead of casting to
    # 'timedelta64[m]', which pandas >= 2.0 rejects (only s/ms/us/ns allowed).
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['trip_length'] = elapsed.dt.total_seconds() // 60

    # day of the week: fixed category list so a given day always maps to the
    # same code, even if some weekday is absent from this particular frame.
    day_dtype = pd.CategoricalDtype(
        ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'])
    df['day_of_week'] = df['pickup_date'].dt.day_name().astype(day_dtype).cat.codes

    # time of day: same idea, code == hour regardless of which hours occur.
    hour_dtype = pd.CategoricalDtype(list(range(24)))
    df['time_of_day'] = df['tpep_pickup_datetime'].dt.hour.astype(hour_dtype).cat.codes
    return df
# Add the trip_length, day_of_week and time_of_day columns.
df = feature_eng(df)

In [6]:
# importing weather data:
# Earliest and latest pickup dates in the taxi data; passed to get_weather
# below as the weather query window.
start = df['pickup_date'].min()
end = df['pickup_date'].max()

def get_weather(start, end):
    """Fetch daily weather for NYC between ``start`` and ``end``.

    Parameters
    ----------
    start, end : datetime-like
        Bounds of the period requested from meteostat's ``Daily``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Daily(...).fetch()`` without the ``wpgt`` and
        ``tsun`` columns, with missing values replaced by the column mean and
        the index duplicated into a ``pickup_date`` column so the frame can be
        merged on it.
    """
    # Create Point for NYC
    location = Point(40.785091, -73.968285)

    # Get daily data for the requested period
    data = Daily(location, start, end).fetch()
    data = data.drop(columns=['wpgt', 'tsun'])

    # Fill gaps with each column's mean *before* adding the date column, so
    # the mean is only ever taken over the numeric weather columns.
    data = data.fillna(data.mean(numeric_only=True))

    # Expose the index as a regular column to merge on.
    data['pickup_date'] = data.index
    return data
# Fetch the daily weather rows for the [start, end] window computed above.
data = get_weather(start, end)

In [7]:
# merging weather data with our taxi dataset:
def data_merge(data, df):
    """Left-join the weather frame onto the trips by ``pickup_date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Weather rows; must have a ``pickup_date`` column with one row per date.
    df : pandas.DataFrame
        Trip rows with a ``pickup_date`` column.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` with the matching weather columns appended; trips
        without a matching date get NaN in the weather columns.

    Raises
    ------
    pandas.errors.MergeError
        If ``data`` contains the same ``pickup_date`` more than once, which
        would otherwise silently duplicate trip rows.
    """
    # validate='many_to_one' guards the one-weather-row-per-day assumption.
    return df.merge(data, on='pickup_date', how='left', validate='many_to_one')
# Join the weather onto the trips, then drop the module-level references to
# both inputs and run a garbage-collection pass.
df2 = data_merge(data,df)
del data
del df
gc.collect()

In [8]:
# removing outliers in the dataset:
def iqring(df, col):
    """Return the rows of ``df`` whose ``col`` value is not an IQR outlier.

    A value counts as an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``.  Rows where ``col`` is NaN compare as neither and are
    therefore kept.
    """
    values = df[col]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    return df[~is_outlier]
# Keep only rows whose total_amount lies within the 1.5 * IQR fences, then
# drop the reference to the unfiltered frame and run a garbage-collection pass.
df3 = iqring(df2, 'total_amount')
del df2
gc.collect()

In [9]:
# Write the filtered frame to a CSV file in the current working directory.
# NOTE(review): the index is written as the first column because `index` is
# left at its default — confirm that is wanted.
df3.to_csv('my_first_dask_pd.csv')