In [29]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import string

In [30]:
# Lookup alphabet for encoding categorical codes as integers: feature() calls
# all_chars.index(...) on the CALL_TYPE and DAY_TYPE values of each row.
# NOTE(review): presumably those values are single printable characters — confirm.
all_chars = string.printable

In [31]:
# Load the training set. Later cells read its POLYLINE, TIMESTAMP, CALL_TYPE,
# ORIGIN_CALL, ORIGIN_STAND, TAXI_ID and DAY_TYPE columns.
# The path is relative to the kernel's working directory.
df_train = pd.read_csv("train.csv")

In [32]:
# Derive a trip's duration from its POLYLINE string (15 per step between consecutive points).
def polyline_to_trip_duration(polyline):
    """Return the trip duration implied by a POLYLINE string.

    The number of points is taken to be ``polyline.count("[") - 1`` (one "["
    for the outer list, one per point). Each step between consecutive points
    contributes 15 to the duration (presumably seconds -- confirm against the
    dataset description).

    Args:
        polyline: the raw POLYLINE cell as a string.

    Returns:
        ``(points - 1) * 15`` as an int, or 0 when there are fewer than two
        points.
    """
    point_count = polyline.count("[") - 1
    step_count = point_count - 1
    # Empty or single-point polylines have no steps; never return a negative duration.
    if step_count <= 0:
        return 0
    return step_count * 15

# Add a "LEN" column holding the duration derived from each trip's POLYLINE
# string; it is used as the regression target further down.
# With n_points = count("[") - 1, the value is (n_points - 1) * 15, floored at 0.
df_train["LEN"] = df_train["POLYLINE"].apply(polyline_to_trip_duration)

In [33]:
from datetime import datetime, timezone

def parse_time(x):
    """Split a row's Unix timestamp into calendar features.

    The timestamp is decoded in UTC. Without an explicit ``tz``,
    ``datetime.fromtimestamp`` uses the local time zone of whatever machine
    runs the kernel, which makes DAY/HR/WK differ between machines.
    https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp

    Args:
        x: a mapping with a "TIMESTAMP" entry in seconds since the epoch. When
           called through ``DataFrame.apply(..., axis=1)`` on a one-column
           frame, each x is a one-element pandas Series.

    Returns:
        A ``(year, month, day, hour, weekday)`` tuple, where weekday is
        0 for Monday through 6 for Sunday.
    """
    dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=timezone.utc)
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# parse_time returns one (year, month, day, hour, weekday) tuple per row, so it
# is applied row-wise (axis=1) and result_type="expand" spreads each tuple
# across five new columns.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
df_train[["YR", "MON", "DAY", "HR", "WK"]] = (
    df_train[["TIMESTAMP"]]
    .apply(parse_time, axis=1, result_type="expand")
)

In [34]:
def feature(datum):
    """Build the numeric feature vector for one trip record.

    Args:
        datum: a row-like object indexable by column name (a pandas Series
            row in this notebook).

    Returns:
        A list laid out as
        [1, CALL_TYPE code, ORIGIN_CALL, ORIGIN_STAND, TAXI_ID, DAY_TYPE code,
         YR, MON, DAY, HR, WK],
        where the leading 1 is a constant term, the two codes are positions
        in ``all_chars``, and missing ORIGIN_CALL / ORIGIN_STAND become 0.
    """
    def or_zero(column):
        # Missing identifiers are encoded as 0.
        value = datum[column]
        return 0 if pd.isna(value) else value

    row = [1, all_chars.index(datum['CALL_TYPE'])]
    row.append(or_zero('ORIGIN_CALL'))
    row.append(or_zero('ORIGIN_STAND'))
    row.append(datum['TAXI_ID'])
    row.append(all_chars.index(datum['DAY_TYPE']))
    # Calendar columns derived from TIMESTAMP, in a fixed order.
    for column in ('YR', 'MON', 'DAY', 'HR', 'WK'):
        row.append(datum[column])
    return row

In [35]:
# Build the training design matrix and targets.
# iterrows() yields each row as a Series indexed by column name, which is what
# feature() expects, without a positional .iloc lookup per row.
X = [feature(row) for _, row in df_train.iterrows()]
# The target is a single column: read it directly instead of materialising a
# full row per trip just to pick out one value.
y = list(df_train["LEN"])

In [36]:
# Fit a linear regression of the targets y on the feature rows X.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare `reg` keeps the fitted estimator as the cell's displayed value.
reg = linear_model.LinearRegression().fit(X, y)
reg

LinearRegression()

In [37]:
# Load the public test set and derive the same calendar columns that were
# added to the training frame.
df_test = pd.read_csv("test_public.csv")
df_test[["YR", "MON", "DAY", "HR", "WK"]] = (
    df_test[["TIMESTAMP"]]
    .apply(parse_time, axis=1, result_type="expand")
)

In [38]:
# Featurise the test rows the same way as the training rows, then predict.
# iterrows() yields each row as a Series indexed by column name, avoiding a
# positional .iloc lookup per row.
Xpred = [feature(row) for _, row in df_test.iterrows()]
ypred = reg.predict(Xpred)

In [39]:
# Start from the sample submission file provided on Kaggle and overwrite only
# its TRAVEL_TIME column with our predictions.
df_sample = pd.read_csv("sampleSubmission.csv")

# NOTE(review): assumes sampleSubmission.csv lists trips in the same order as
# test_public.csv, since predictions are assigned by position -- confirm.
df_sample["TRAVEL_TIME"] = ypred

# `index` is documented as a bool; pass False explicitly rather than relying on
# None being treated as falsy.
df_sample.to_csv("my_pred.csv", index=False)

In [40]:
"""
def RMSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return (sum(differences) / len(differences)) ** 0.5
"""

'\ndef RMSE(predictions, labels):\n    differences = [(x-y)**2 for x,y in zip(predictions,labels)]\n    return (sum(differences) / len(differences)) ** 0.5\n'

In [41]:
#labels = [df_test.iloc[i]['LEN'] for i in range(len(df_test))]

In [42]:
#RMSE(ypred, labels)