In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the files available in the input directory.
input_dir = "../input"
print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

In [None]:
#load python modules that might be needed
import sys
import os

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

import datetime as dt
from datetime import datetime

%matplotlib inline

#input locations and the number of training rows to read
train_csv_path = '../input/train.csv'
test_csv_path = '../input/test.csv'
train_row_limit = 10000000

#training data: only the first `train_row_limit` rows of the file are read
data_train = pd.read_csv(train_csv_path, nrows=train_row_limit)
#data_train.describe()

#test data: read in full
data_test = pd.read_csv(test_csv_path)
#data_test.describe()

In [None]:
#....CLEANING THE DATA....#
# - using the 'describe' method, a few anomalies were observed
# - minimum value of the fare was negative, which is absurd, so these entries were removed
# - minimum passenger count was 0, which again, doesn't make any sense, and corresponding entries were removed
# - a passenger count of 0 was probably an erroneous entry, or the taxi may just have been transporting goods; since there are no such entries in the test data, it was simpler to discard those rows
# - finally, looking at the test data, I set the bounds of the number of passengers to 1 and 6
# - latitude and longitudinal entries were out of bounds, so I observed the test data to obtain a set of limits for the entries
# - Using the info, entries with positional data out of the limits were removed
# - Also, rows with missing entries were removed

#drop every row that has at least one missing value, reporting the row count before and after
print('Missing entries removal')
rows_before = len(data_train)
print('Old size: %d' % rows_before)
data_train = data_train.dropna(axis='rows', how='any')
rows_after = len(data_train)
print('New size: %d' % rows_after)

In [None]:
#keep only fares in the half-open range [0, 350): negative fares are invalid and
#fares of $350 or more are treated as outliers
print('Negative fares removal')
print('Old size: %d' % len(data_train))
fare = data_train.fare_amount
fare_is_plausible = (fare >= 0) & (fare < 350)
data_train = data_train[fare_is_plausible]
print('New size: %d' % len(data_train))

In [None]:
#keep only rows whose passenger count is strictly between 0 and 7
print('Passenger Count Check')
print('Old size: %d' % len(data_train))
passengers = data_train.passenger_count
passenger_count_ok = (passengers > 0) & (passengers < 7)
data_train = data_train[passenger_count_ok]
print('New size: %d' % len(data_train))

In [None]:
#bounding box spanned by all pickup and dropoff coordinates of the test set
test_longitudes = (data_test.pickup_longitude, data_test.dropoff_longitude)
test_latitudes = (data_test.pickup_latitude, data_test.dropoff_latitude)
min_longitude = min(col.min() for col in test_longitudes)
min_latitude = min(col.min() for col in test_latitudes)
max_longitude = max(col.max() for col in test_longitudes)
max_latitude = max(col.max() for col in test_latitudes)

#keep only training rows whose pickup AND dropoff both lie inside that box (bounds inclusive)
print('Positional Check')
print('Old size: %d' % len(data_train))
inside_box = (
    data_train.pickup_longitude.between(min_longitude, max_longitude)
    & data_train.dropoff_longitude.between(min_longitude, max_longitude)
    & data_train.pickup_latitude.between(min_latitude, max_latitude)
    & data_train.dropoff_latitude.between(min_latitude, max_latitude)
)
data_train = data_train[inside_box]
print('New size: %d' % len(data_train))

In [None]:
#Calculation of Pearson correlation

#Haversine distance formula
#this was obtained from 'https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula'
def hav_distance(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in miles between two points.

    Coordinates are given in degrees. Because only numpy element-wise
    functions are used, each argument may be a scalar or an array-like
    (e.g. a pandas Series); the usual broadcasting rules apply.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    The distance in miles, with the same shape as the broadcast inputs.
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    km_to_miles = 0.6213712
    earth_diameter_km = 12742  # 2*R

    # Haversine term: hav(dlat) + cos(lat1) * cos(lat2) * hav(dlon),
    # using hav(t) = (1 - cos(t)) / 2.
    half_cos_dlat = np.cos((lat2 - lat1) * deg_to_rad) / 2
    lon_part = (
        np.cos(lat1 * deg_to_rad)
        * np.cos(lat2 * deg_to_rad)
        * (1 - np.cos((lon2 - lon1) * deg_to_rad))
        / 2
    )
    hav = 0.5 - half_cos_dlat + lon_part

    # distance = 2 * R * asin(sqrt(hav)), converted from kilometres to miles
    return km_to_miles * earth_diameter_km * np.arcsin(np.sqrt(hav))

#add a column with the haversine (great-circle) distance in miles between pickup and dropoff
data_train['distance_miles'] = hav_distance(
    lat1=data_train['pickup_latitude'],
    lon1=data_train['pickup_longitude'],
    lat2=data_train['dropoff_latitude'],
    lon2=data_train['dropoff_longitude'],
)

In [None]:
#correlation between the fare and the trip distance
fare_column = data_train.fare_amount
distance_column = data_train.distance_miles
corr1 = fare_column.corr(distance_column)
print(corr1)

In [None]:
#parse the pickup timestamp strings into datetime values so that date/time parts
#(hour, minute, weekday, ...) can be read through the `.dt` accessor
pickup_ts = pd.to_datetime(data_train['pickup_datetime'], format='%Y-%m-%d  %H:%M:%S %Z')
data_train['pickup_datetime'] = pickup_ts

#new column holding the time of day, expressed as minutes since midnight
minutes_since_midnight = (pickup_ts.dt.hour * 60) + pickup_ts.dt.minute
data_train['time'] = minutes_since_midnight

In [None]:
#correlation between the time of day and the trip distance
time_column = data_train.time
corr2 = time_column.corr(data_train.distance_miles)
print(corr2)

In [None]:
#correlation between the time of day and the fare
time_column = data_train.time
corr3 = time_column.corr(data_train.fare_amount)
print(corr3)

In [None]:
#scatter plot: trip distance (x axis) against fare (y axis)
data_train.plot.scatter("distance_miles", "fare_amount")

In [None]:
#scatter plot: time of day (x axis) against trip distance (y axis)
data_train.plot.scatter("time", "distance_miles")

In [None]:
#scatter plot: time of day (x axis) against fare (y axis)
data_train.plot.scatter("time", "fare_amount")

In [None]:
#pickup locations, with the axes restricted to the longitude/latitude window below
long_range = (-74.03, -73.75)
lat_range = (40.63, 40.85)
data_train.plot.scatter("pickup_longitude", "pickup_latitude", s=0.03, alpha=0.5, color='g')
plt.ylim(*lat_range)
plt.xlim(*long_range)

In [None]:
#dropoff locations, using the same longitude/latitude window as the pickup plot
data_train.plot.scatter("dropoff_longitude", "dropoff_latitude", s=0.03, alpha=0.5, color='r')
plt.ylim(*lat_range)
plt.xlim(*long_range)

In [None]:
#day of the week of the pickup, taken from the parsed pickup timestamp
pickup_weekday = data_train.pickup_datetime.dt.weekday
data_train['week_day'] = pickup_weekday
#data_train.week_day.describe()

In [None]:
#calendar year of the pickup, taken from the parsed pickup timestamp
pickup_year = data_train.pickup_datetime.dt.year
data_train['year'] = pickup_year
#data_train.year.describe()

In [None]:
#calendar month of the pickup, taken from the parsed pickup timestamp
pickup_month = data_train.pickup_datetime.dt.month
data_train['month'] = pickup_month
#data_train.month.describe()

In [None]:
#distance of every pickup and dropoff point from a fixed reference point
nyc_center = (-74.006, 40.712)  # stored as (longitude, latitude)
center_lon, center_lat = nyc_center
data_train['pick_dist_cent'] = hav_distance(
    center_lat, center_lon,
    data_train.pickup_latitude, data_train.pickup_longitude,
)
data_train['drop_dist_cent'] = hav_distance(
    center_lat, center_lon,
    data_train.dropoff_latitude, data_train.dropoff_longitude,
)

In [None]:
#simple linear regression: assemble the feature matrices and the target vector
feature_list = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'distance_miles',
    'passenger_count',
]

#the test set needs the same derived distance column before its features can be selected
data_test['distance_miles'] = hav_distance(
    data_test.pickup_latitude,
    data_test.pickup_longitude,
    data_test.dropoff_latitude,
    data_test.dropoff_longitude,
)

train_features = data_train[feature_list]
test_features = data_test[feature_list]
x = train_features.values
y = data_train['fare_amount'].values
x_test = test_features.values

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

#Standardize the features. The scaler is fitted on the TRAINING features only and
#the same fitted transformation is then applied to the test features. Fitting a
#second scaler on the test set would standardize the test features with a different
#mean/std than the ones the model was trained on, so the learned coefficients
#would be applied to inconsistently scaled inputs.
scaler = StandardScaler().fit(x)
rescaledX = scaler.transform(x)
rescaledX_test = scaler.transform(x_test)

#Fit an ordinary least-squares model on the standardized training data,
#show its coefficients (one per entry of feature_list) and predict the test fares.
regr = linear_model.LinearRegression()
regr.fit(rescaledX, y)
print(regr.coef_)
y_test_pred = regr.predict(rescaledX_test)

In [None]:
#write the predictions to submission.csv with the columns 'key' and 'fare_amount' (no index column)
submission_columns = ['key', 'fare_amount']
submission_data = {'key': data_test.key, 'fare_amount': y_test_pred}
submission = pd.DataFrame(submission_data, columns=submission_columns)
submission.to_csv('submission.csv', index=False)

In [None]:
#The two triple-quoted strings below are disabled experiments (alternative regressors).
#They are plain string expressions, so nothing in them is executed.
'''from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=4)
neigh.fit(rescaledX, y)
y_test_pred_knr = neigh.predict(rescaledX_test)
print('Done')'''

#RandomForestRegressor is provided by sklearn.ensemble (not sklearn.neighbors); the
#import inside the disabled snippet is corrected so it runs if it is ever re-enabled.
'''from sklearn.ensemble import RandomForestRegressor

neigh = RandomForestRegressor(n_estimators=10, max_depth=10, min_samples_leaf=10)
neigh.fit(rescaledX, y)
y_test_pred_random = neigh.predict(rescaledX_test)
print('Done')'''