In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

In [3]:
def preprocess(filename, nrows, seed=None):
    """Load a fare CSV and return shuffled feature/target arrays.

    Reads the first ``nrows`` rows of ``filename``, derives ``Hour`` and
    ``day`` (day of week, Monday=0) from ``pickup_datetime``, drops every row
    containing a missing value, removes the ``key`` and ``pickup_datetime``
    columns, and keeps only rows whose values lie strictly inside the
    intervals listed in ``bounds`` below.

    Parameters
    ----------
    filename : str or path-like
        CSV file with at least the columns ``key``, ``fare_amount``,
        ``pickup_datetime``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude`` and ``passenger_count``.
    nrows : int
        Number of rows to read from the top of the file (before filtering).
    seed : int or None, optional
        When ``None`` (default) rows are shuffled with NumPy's global random
        state, exactly as before. When given, a dedicated generator seeded
        with this value is used so the row order is reproducible.

    Returns
    -------
    trainX : numpy.ndarray
        Float array of shape ``(n_kept, n_features)``; columns are every
        remaining column except ``fare_amount``, in DataFrame order.
    trainY : numpy.ndarray
        Float array of shape ``(n_kept,)`` holding ``fare_amount``.
    """
    df = pd.read_csv(filename, nrows=nrows)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['Hour'] = df['pickup_datetime'].dt.hour
    df['day'] = df['pickup_datetime'].dt.dayofweek
    df = df.dropna().drop(columns=['key', 'pickup_datetime'])

    # (column, exclusive lower bound, exclusive upper bound)
    # NOTE(review): pickup and dropoff coordinate windows differ in width;
    # kept as in the original -- confirm this is intentional.
    bounds = [
        ('passenger_count', 0, 7),
        ('fare_amount', 0.1, 500),
        ('pickup_latitude', 39, 41),
        ('pickup_longitude', -80, -70),
        ('dropoff_latitude', 35, 45),
        ('dropoff_longitude', -75, -73),
    ]
    # Build one mask instead of rebuilding the frame once per condition.
    keep = np.ones(len(df), dtype=bool)
    for column, low, high in bounds:
        values = df[column]
        keep &= ((values > low) & (values < high)).to_numpy()
    df = df[keep]

    # Put the target first by name so the positional split below stays
    # correct regardless of the column order in the CSV.
    feature_columns = [name for name in df.columns if name != 'fare_amount']
    train = df[['fare_amount'] + feature_columns].to_numpy(dtype=float)

    # Shuffle rows (features and target together, so pairs stay aligned).
    if seed is None:
        np.random.shuffle(train)
    else:
        np.random.default_rng(seed).shuffle(train)

    trainX = train[:, 1:]
    trainY = train[:, 0]
    return trainX, trainY

 



In [3]:
def preprocessForKnn(filename, nrows, seed=None):
    """Load a fare CSV and return shuffled coordinate features and targets.

    Reads the first ``nrows`` rows of ``filename``, drops every row containing
    a missing value (in any column, including ones removed afterwards),
    removes the ``key``, ``pickup_datetime`` and ``passenger_count`` columns,
    and keeps only rows whose fare and coordinates lie strictly inside the
    intervals listed in ``bounds`` below.

    Parameters
    ----------
    filename : str or path-like
        CSV file with at least the columns ``key``, ``fare_amount``,
        ``pickup_datetime``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude`` and ``passenger_count``.
    nrows : int
        Number of rows to read from the top of the file (before filtering).
    seed : int or None, optional
        When ``None`` (default) rows are shuffled with NumPy's global random
        state, exactly as before. When given, a dedicated generator seeded
        with this value is used so the row order is reproducible.

    Returns
    -------
    trainX : numpy.ndarray
        Float array of shape ``(n_kept, n_features)``; columns are every
        remaining column except ``fare_amount``, in DataFrame order.
    trainY : numpy.ndarray
        Float array of shape ``(n_kept,)`` holding ``fare_amount``.
    """
    df = pd.read_csv(filename, nrows=nrows)
    df = df.dropna().drop(columns=['key', 'pickup_datetime', 'passenger_count'])

    # (column, exclusive lower bound, exclusive upper bound)
    # NOTE(review): pickup and dropoff coordinate windows differ in width;
    # kept as in the original -- confirm this is intentional.
    bounds = [
        ('fare_amount', 0.1, 500),
        ('pickup_latitude', 39, 41),
        ('pickup_longitude', -80, -70),
        ('dropoff_latitude', 35, 45),
        ('dropoff_longitude', -75, -73),
    ]
    # Build one mask instead of rebuilding the frame once per condition.
    keep = np.ones(len(df), dtype=bool)
    for column, low, high in bounds:
        values = df[column]
        keep &= ((values > low) & (values < high)).to_numpy()
    df = df[keep]

    # Put the target first by name so the positional split below stays
    # correct regardless of the column order in the CSV.
    feature_columns = [name for name in df.columns if name != 'fare_amount']
    train = df[['fare_amount'] + feature_columns].to_numpy(dtype=float)

    # Shuffle rows (features and target together, so pairs stay aligned).
    if seed is None:
        np.random.shuffle(train)
    else:
        np.random.default_rng(seed).shuffle(train)

    trainX = train[:, 1:]
    trainY = train[:, 0]
    return trainX, trainY

 



In [5]:
# Test-set inputs for the KNN model: every column of test.csv except
# key, pickup_datetime and passenger_count, as a float matrix.
ts = pd.read_csv('test.csv')
# Row identifiers are set aside before the column is removed.
submit = ts['key'].to_numpy()
ts = ts.drop(columns=['key', 'pickup_datetime', 'passenger_count'])
test = ts.to_numpy(dtype=float)

In [7]:
# Fit a 7-nearest-neighbours regressor on up to 30000000 rows of train.csv
# (read count is before filtering) and predict a fare for every test row.
trainX, trainY = preprocessForKnn('train.csv', 30000000)
clf = KNeighborsRegressor(n_neighbors=7)
clf.fit(trainX, trainY)
predict = clf.predict(test)

# Two-column table: key next to its predicted fare, written without an index.
output = np.column_stack((submit, predict))
sub = pd.DataFrame(output)
sub.to_csv('knn.csv', header=['key', 'fare_amount'], index=False)

In [4]:
# Test-set inputs for the tree models: parse pickup_datetime, append the
# derived Hour and day (day-of-week) columns, then drop the raw key and
# timestamp so only numeric columns remain.
ts = (
    pd.read_csv('test.csv')
    .assign(pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']))
    .assign(
        Hour=lambda frame: frame['pickup_datetime'].dt.hour,
        day=lambda frame: frame['pickup_datetime'].dt.dayofweek,
    )
)
# Row identifiers are set aside before the column is removed.
submit = ts['key'].to_numpy()
ts = ts.drop(columns=['key', 'pickup_datetime'])
test = ts.to_numpy(dtype=float)

In [32]:
# Fit an XGBRegressor with default settings on the first 100000 rows of
# train.csv (read count is before filtering) and predict every test row.
trainX, trainY = preprocess('train.csv', 100000)
clf = XGBRegressor()
clf.fit(trainX, trainY)
predict = clf.predict(test)

# Two-column table: key next to its predicted fare, written without an index.
output = np.column_stack((submit, predict))
sub = pd.DataFrame(output)
sub.to_csv('xgb.csv', header=['key', 'fare_amount'], index=False)


In [7]:
# Fit a gradient-boosting regressor (500 estimators, max depth 6) on the
# first 100000 rows of train.csv (read count is before filtering) and
# predict every test row.
trainX, trainY = preprocess('train.csv', 100000)
clf = GradientBoostingRegressor(n_estimators=500, max_depth=6)
clf.fit(trainX, trainY)
predict = clf.predict(test)

# Two-column table: key next to its predicted fare, written without an index.
output = np.column_stack((submit, predict))
sub = pd.DataFrame(output)
sub.to_csv('grad.csv', header=['key', 'fare_amount'], index=False)
