In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold
from itertools import combinations
from sklearn.feature_selection import SelectKBest, f_classif
!pip install meteostat
from meteostat import Point, Daily
import missingno as msno
import calendar
from datetime import datetime
%load_ext memory_profiler

In [38]:
from meteostat import Hourly
# import warnings
# warnings.simplefilter('ignore')
import gc
import subprocess
import dask
import dask.dataframe as dd

In the function below we'll import our data. Given the size of the data we're going to use a bit of a different approach than regular 'pd.read_csv':

* the sheer size of a single file prevents us from downloading all 12 files and merging them into one dataframe - we will have to sample 10% of each file and merge all the samples into 1 massive dataframe
    * the 10 % is just a number we'll set for now, but our imports function will have an option to change that at the input stage
* reading only 1 file is already a big task - we'll split that process into 'chunks' and merge all chunks together after all of them are ready
* every file represents 1 month of trip data, sort of... unfortunately it's common to come across dates that are out of range for their file, unnaturally long trips, etc.
    * this creates our first problem: we have to do some basic data cleaning (steps 1-3) before we start sampling the dataframe; this prevents us from sampling incorrect/dirty/null data. The tradeoff: we have to work on the full-size file, which uses a lot of memory
* we'll try to downcast numeric columns whenever possible to save memory

All of the above steps are going to be conducted inside a function - to prevent memory leaks. This method creates a new scope for the intermediate variables and removes them automatically when the interpreter exits the function.


In [39]:
months = []
# we'll import the data inside a function to save memory:
def imports2(year, months_number, sample_part, random_state=None):
    """Download, clean and sample the monthly yellow-taxi trip files of one year.

    For every month ``1..months_number`` the monthly CSV is read in chunks,
    cleaned, restricted to pickups that fall inside that calendar month,
    randomly sampled and appended to a running result frame.

    Parameters
    ----------
    year : int
        Year used to build the file URL and the month boundaries.
    months_number : int
        Number of months to load, counted from January (1-12).
    sample_part : float
        Fraction of the cleaned monthly rows to keep (e.g. ``0.1`` for 10%).
    random_state : int, optional
        Seed for the row sampling. ``None`` (the default) keeps the previous
        behaviour of drawing a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all requested months, stacked in month order.
        Each month keeps its own row labels, so labels can repeat.

    Raises
    ------
    ValueError
        If ``months_number`` is not between 1 and 12.
    """
    if not 1 <= months_number <= 12:
        raise ValueError(
            "months_number must be between 1 and 12, got %r" % (months_number,)
        )

    # One generator for the whole call, so every month draws a different
    # (but, for a given seed, repeatable) sample.
    rng = None if random_state is None else np.random.RandomState(random_state)

    read_dtypes = {'tolls_amount': 'float64', 'RatecodeID': 'float64',
                   'trip_distance': 'float64', 'store_and_fwd_flag': 'category'}
    integerize = ['passenger_count', 'VendorID', 'RatecodeID', 'payment_type']

    result = None
    for el in range(1, months_number + 1):
        month = str(el).zfill(2)
        link = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_'+str(year)+'-'+month+'.csv'
        chunks = pd.read_csv(link, dtype=read_dtypes, chunksize=40000)
        df = pd.concat(chunks, ignore_index=True)
        del chunks
        gc.collect()

        # 1. remove rows without a payment type and insanely long trips.
        #    A single combined mask avoids materialising two intermediate frames.
        df = df[df['payment_type'].notna() & (df['trip_distance'] < 500)]

        # 2. parse the timestamps and derive the pickup date (midnight of the
        #    pickup day) from the parsed value instead of parsing a second time.
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['pickup_date'] = df['tpep_pickup_datetime'].dt.normalize()

        # 3. keep only pickups that belong to this file's calendar month.
        first_day = pd.Timestamp(year=year, month=el, day=1)
        last_day = pd.Timestamp(year=year, month=el,
                                day=calendar.monthrange(year, el)[1])
        df = df[(df['pickup_date'] >= first_day) & (df['pickup_date'] <= last_day)]

        # 4. keep only `sample_part` of the cleaned month.
        df = df.sample(n=int(len(df) * sample_part), random_state=rng)

        # 5. store small integer codes in the smallest unsigned dtype.
        #    assumes no missing values remain in these columns after step 1 -
        #    TODO confirm against the raw files.
        for col in integerize:
            df[col] = pd.to_numeric(df[col].astype(int), downcast="unsigned")

        # 6. fold the month into the running result right away, so at most one
        #    accumulated frame plus one monthly sample is alive at a time.
        result = df if result is None else pd.concat([result, df], axis=0)
        del df
        gc.collect()
    return result


In [None]:
# Load a 10% sample of all 12 months for each year; the temporary names match
# the year that is actually requested from imports2.
df_2017 = imports2(2017,12,0.1)
df_2018 = imports2(2018,12,0.1)
df = pd.concat([df_2017,df_2018], axis=0)
# Drop the per-year frames right away so only the combined frame stays in memory.
del df_2017
del df_2018
gc.collect()

In [5]:
len(df)

In [None]:
df.groupby('PULocationID')['passenger_count'].std()

In [6]:
df.head()

In [None]:
def feature_eng(df):
    """Add trip-duration and calendar features to the trips frame.

    The columns are added to ``df`` itself (no copy is made, to save memory)
    and the same frame is returned.

    Added columns
    -------------
    trip_length : float
        Whole minutes between pickup and drop-off.
    day_of_week : int8
        Day of the pickup date, Monday=0 ... Sunday=6.
    time_of_day : int8
        Hour of the pickup timestamp, 0-23.

    Expects ``tpep_pickup_datetime``, ``tpep_dropoff_datetime`` and
    ``pickup_date`` to be datetime columns without missing values.
    """
    # trip length in whole minutes. `astype('timedelta64[m]')` is rejected by
    # pandas >= 2.0, so go through seconds; floor division keeps whole minutes.
    duration = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['trip_length'] = duration.dt.total_seconds() // 60

    # day of the week: use the calendar number directly. Category codes of the
    # day *names* are alphabetical and shift when a day is absent from the data.
    df['day_of_week'] = df['pickup_date'].dt.dayofweek.astype('int8')

    # time of day: the hour itself, independent of which hours occur in the data.
    df['time_of_day'] = df['tpep_pickup_datetime'].dt.hour.astype('int8')
    return df
df = feature_eng(df)

In [None]:
# Aggregate once and reuse the result for both axes instead of running the
# same groupby twice over the full trips frame.
daily_passengers = df.groupby('pickup_date')['passenger_count'].sum()

fig, ax = plt.subplots(figsize=(16,8))
ax.scatter(daily_passengers.index, daily_passengers.values)
ax.set(title='Passengers per day (sampled trips)',
       xlabel='Pickup date',
       ylabel='Passengers')
ax.tick_params(axis = 'x',labelsize=12, rotation=45)
plt.show()

# Weather 
Let's try importing daily weather data for NYC from Meteostat.

In [None]:
start = df['pickup_date'].min()
end = df['pickup_date'].max()

def get_weather(start, end):
    """Fetch daily Meteostat weather for the NYC point between two dates.

    Parameters
    ----------
    start, end
        Bounds of the period, passed straight through to ``Daily``.

    Returns
    -------
    The fetched frame without the ``wpgt`` and ``tsun`` columns and with its
    index duplicated into a ``pickup_date`` column (the index is left as is).
    """
    # Fixed coordinates of the NYC location used throughout this notebook
    nyc = Point(40.785091,-73.968285)

    # Daily observations for the requested period
    weather = Daily(nyc, start, end).fetch()
    weather = weather.drop(columns=['wpgt','tsun'])

    # Expose the index as a regular column so it can be used as a merge key
    weather['pickup_date'] = weather.index
    return weather


In [9]:
data = get_weather(start, end)

In [None]:
def data_merge(data, df):
    """Left-join the weather rows in ``data`` onto the trips in ``df``.

    Parameters
    ----------
    data : pandas.DataFrame
        Weather frame with one row per ``pickup_date``.
    df : pandas.DataFrame
        Trips frame with a ``pickup_date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``df`` plus the matching weather
        columns (missing where no weather row exists for that date).

    Raises
    ------
    pandas.errors.MergeError
        If ``data`` holds more than one row for the same ``pickup_date``;
        without this check such duplicates would silently multiply trips.
    """
    # Deleting the parameters here would not release any memory: the caller
    # still holds references to both frames and has to `del` them itself.
    return df.merge(data, on='pickup_date', how='left', validate='many_to_one')
df2 = data_merge(data,df)
del data
del df
gc.collect()

In [11]:
len(df2)

WORKS!

In [None]:
def iqring(df, col, k=1.5):
    """Drop the rows whose ``col`` value is an outlier by the IQR rule.

    A value is an outlier when it lies below ``Q1 - k*IQR`` or above
    ``Q3 + k*IQR``. Rows with a missing value in ``col`` are kept, because
    they are neither below nor above the fences.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Numeric column the fences are computed on.
    k : float, default 1.5
        Whisker length in multiples of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The non-outlier rows of ``df`` (original row labels preserved).
    """
    values = df[col]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df[~is_outlier]
df3 = iqring(df2, 'total_amount')
del df2
gc.collect()

In [None]:
def col_dummy(df, col):
    """Replace ``col`` with one-hot indicator columns.

    The values of ``col`` are first turned into category codes, so the new
    columns are named ``<col>_<code>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame without ``col`` and with the indicator columns appended,
        and the names of the columns that were added, in column order.
    """
    existing = set(df.columns)
    # Work on a separate Series so the caller's frame keeps its original column.
    codes = df[col].astype('category').cat.codes
    hood_series = pd.get_dummies(codes, prefix=col)
    encoded = pd.concat([df.drop(columns=col), hood_series], axis=1)
    # Keep the indicator columns in their natural order: a list built from a
    # set difference has no guaranteed order between runs.
    new_cols = [name for name in hood_series.columns if name not in existing]
    return encoded, new_cols

In [None]:
len(df3)

In [None]:
def train_test(df, features, train_frac=0.5, seed=1):
    """Fit a linear regression on a random part of ``df`` and score the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``features`` columns and ``total_amount``.
    features : list of str
        Columns used as predictors.
    train_frac : float, default 0.5
        Share of the rows used for fitting; the remainder is the test set.
    seed : int, default 1
        Seed of the shuffle, so repeated calls give the same split.

    Returns
    -------
    float
        Root-mean-squared error of the ``total_amount`` predictions on the
        test rows.
    """
    # A private generator keeps the split reproducible without resetting the
    # global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    # Shuffle row *positions*: this also works when row labels repeat and
    # avoids building a shuffled copy of the whole frame.
    order = rng.permutation(len(df))
    split_loc = int(train_frac * len(df))
    train_pos, test_pos = order[:split_loc], order[split_loc:]

    # Only the columns the model needs are copied, not the full frame.
    X = df[features]
    y = df["total_amount"]

    lr = linear_model.LinearRegression()
    lr.fit(X.iloc[train_pos], y.iloc[train_pos])
    predictions = lr.predict(X.iloc[test_pos])
    mse = mean_squared_error(y.iloc[test_pos], predictions)
    rmse = np.sqrt(mse)

    return rmse

In [None]:
def train_test1(df, features, cutoff='2018-01-01'):
    """Fit a linear regression on trips before ``cutoff`` and score the later ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_date``, ``total_amount`` and the ``features``
        columns.
    features : list of str
        Columns used as predictors.
    cutoff : str or datetime-like, default '2018-01-01'
        Rows with ``pickup_date < cutoff`` are used for fitting, rows with
        ``pickup_date >= cutoff`` for testing.

    Returns
    -------
    float
        Root-mean-squared error of the ``total_amount`` predictions on the
        test rows.

    Raises
    ------
    ValueError
        If either side of the cutoff contains no rows.
    """
    # Two explicit masks: a row with a missing pickup_date ends up in neither set.
    in_train = df['pickup_date'] < cutoff
    in_test = df['pickup_date'] >= cutoff
    if not in_train.any() or not in_test.any():
        raise ValueError(
            "cutoff %r leaves an empty train or test set" % (cutoff,)
        )

    # Only the columns the model needs are copied, not the full frame.
    X = df[features]
    y = df["total_amount"]

    lr = linear_model.LinearRegression()
    lr.fit(X[in_train], y[in_train])
    predictions = lr.predict(X[in_test])
    mse = mean_squared_error(y[in_test], predictions)
    rmse = np.sqrt(mse)

    return rmse

In [None]:
%load_ext memory_profiler

In [21]:
features1 = ['trip_distance', 'trip_length']
%memit rmse = train_test(df3, features1)

In [22]:
features1 = ['trip_distance', 'trip_length']
%memit rmse = train_test1(df3, features1)

In [None]:
features1 = ['trip_distance', 'trip_length']
rmse = train_test(df3, features1)
# f1 2.6936316597764085
rmse

In [None]:
features1 = ['trip_distance', 'trip_length']
rmse = train_test1(df3, features1)
# f1 2.6936316597764085
rmse

In [24]:
features2 = ['trip_distance', 'trip_length', 'day_of_week', 'time_of_day']
rmse = train_test(df3, features2)
# f2 2.671386816892119
rmse

In [23]:
features2 = ['trip_distance', 'trip_length', 'day_of_week', 'time_of_day']
rmse = train_test1(df3, features2)
# f2 3.48
rmse

In [25]:
features3 = ['trip_length', 'day_of_week', 'time_of_day']
rmse = train_test(df3, features3)
rmse

In [20]:
features4 = ['trip_length']
rmse = train_test(df3, features4)
# f4 5.548512496765388
rmse

In [18]:
features5 = ['trip_distance', 'trip_length', 'day_of_week', 'time_of_day', 'tmin', 'tmax']
rmse = train_test(df3, features5)
# f5 2.6680336382474628
rmse

In [30]:
%memit rmse = train_test(df3, features5)
#1318

In [20]:
features5 = ['trip_distance', 'trip_length', 'day_of_week', 'time_of_day', 'tmin', 'tmax']
rmse = train_test1(df3, features5)
# f 5 3.4780707610134134
rmse

In [21]:
%memit rmse = train_test1(df3, features5)
#  1842.56 MiB

In [22]:
df3, new_cols = col_dummy(df3, 'day_of_week')

In [31]:

features6 = ['trip_distance', 'trip_length', 'time_of_day', 'tmin', 'tmax'] + new_cols
rmse = train_test(df3, features6)
# f6 2.634177334931782
rmse

In [34]:
%memit rmse = train_test(df3, features6)
# 1316

In [28]:

features6 = ['trip_distance', 'trip_length', 'time_of_day', 'tmin', 'tmax'] + new_cols
rmse = train_test1(df3, features6)
# f6 3.4658308154700666
rmse
# 2.6013369689567116

In [35]:
%memit rmse = train_test1(df3, features6)
# peak memory: 5329.01 MiB, increment: 2193.05 MiB

In [33]:
df3, new_cols2 = col_dummy(df3, 'time_of_day')

In [34]:

features7 = ['trip_distance', 'trip_length', 'tmin', 'tmax'] + new_cols + new_cols2
rmse = train_test(df3, features7)
# f7 2.5576173097913086
rmse

In [35]:

features7 = ['trip_distance', 'trip_length', 'tmin', 'tmax'] + new_cols + new_cols2
rmse = train_test1(df3, features7)
# f7 2.5576173097913086
rmse

In [24]:
df3['total_amount'].mean()

In [43]:
df3.columns

In [25]:
def train_test2(df, features, train_frac=0.5, seed=1):
    """Random-split linear regression that also returns the scored test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``features`` columns and ``total_amount``; it is
        not modified.
    features : list of str
        Columns used as predictors.
    train_frac : float, default 0.5
        Share of the rows used for fitting; the remainder is the test set.
    seed : int, default 1
        Seed of the shuffle, so repeated calls give the same split.

    Returns
    -------
    (float, pandas.DataFrame)
        The root-mean-squared error on the test rows, and a copy of the test
        rows (all columns) with the predictions in a ``predicted`` column.
    """
    # A private generator keeps the split reproducible without resetting the
    # global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    # Shuffle row *positions*: this also works when row labels repeat and
    # avoids building a shuffled copy of the whole frame.
    order = rng.permutation(len(df))
    split_loc = int(train_frac * len(df))
    train_pos, test_pos = order[:split_loc], order[split_loc:]

    lr = linear_model.LinearRegression()
    # The training side only needs the predictor and target columns.
    lr.fit(df[features].iloc[train_pos], df["total_amount"].iloc[train_pos])

    # The test side is returned to the caller, so it keeps every column.
    test = df.iloc[test_pos].copy()
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test["total_amount"], predictions)
    rmse = np.sqrt(mse)
    test['predicted'] = predictions

    return rmse, test

In [42]:
# By this point `day_of_week` and `time_of_day` have been replaced by their
# one-hot columns (see the col_dummy cells above), so the raw column names
# would raise a KeyError on a top-to-bottom run. Use the indicator columns,
# under a name that does not overwrite the earlier `features4`.
features4_dummies = ['trip_distance', 'trip_length'] + new_cols + new_cols2
rmse, test_df = train_test2(df3, features4_dummies)
test_df[['total_amount', 'predicted']]

In [56]:
df3.groupby('pickup_date')['passenger_count'].sum()

In [36]:
df3.to_csv('myfirst.csv')

In [37]:
2+2