In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
%matplotlib inline
plt.rcParams["figure.figsize"] = 10, 6

# Importing

In [None]:
# Read the train and test CSV files and stack them into one frame.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
# ignore_index=True gives the stacked frame a fresh 0..n-1 index, so the row
# labels of the two source frames do not collide.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Print the (rows, columns) shape of each frame, then show the number of
# non-null values per column of the combined frame.
frame_shapes = [frame.shape for frame in (train_df, test_df, combined_df)]
print(*frame_shapes)
combined_df.count()

Great, all fields are filled. The only missing values are in the dropoff/duration columns, which the test data does not contain.

In [None]:
# Peek at the first five training rows.
train_df.head(n=5)

In [None]:
# Peek at the first five test rows.
test_df.head(n=5)

There are two missing columns in test, both corresponding to the duration. From the training data it seems as if the duration is measured in seconds. Let's check.

In [None]:
# Sanity check on the first training row: the difference between dropoff and
# pickup, expressed in seconds, is printed next to the recorded trip_duration.
# The time of day is parsed with an explicit %H:%M:%S rather than %X, because
# %X means "the locale's time representation" and stops matching this layout
# as soon as a different LC_TIME locale is active.
timestamp_format = "%Y-%m-%d %H:%M:%S"
first_trip = train_df.loc[0]
start = dt.datetime.strptime(first_trip.pickup_datetime, timestamp_format)
end = dt.datetime.strptime(first_trip.dropoff_datetime, timestamp_format)
print((end - start).total_seconds(), first_trip.trip_duration)

In [None]:
# Count how often each store_and_fwd_flag value occurs in the training data.
train_df["store_and_fwd_flag"].value_counts()

The duration is given in seconds. The fields are:

| field | type | categorical | interpretation |
| --- | --- | --- | --- |
| id | string "id" + int | no | id of the entry |
| vendor_id | int | yes | id of the provider |
| pickup_datetime | string representing datetime | no | time of pickup using the meter |
| dropoff_datetime | string representing datetime | no | time of dropoff using the meter |
| passenger_count | int | no | manually entered number of passengers |
| pickup_{longitude, latitude} | float | no | {longitude, latitude} of pickup at pickup_datetime |
| dropoff_{longitude, latitude} | float | no | {longitude, latitude} of dropoff at dropoff_datetime |
| store_and_fwd_flag | str representing bool | yes | whether the record was stored in the vehicle and forwarded later because the taxi had no connection to the server |
| trip_duration | int | no | trip duration in seconds |

## Duration

In [None]:
# Summary statistics of the trip duration in the training data.
train_df["trip_duration"].describe()

In [None]:
# Convert the longest recorded duration from seconds to days and report it.
seconds_per_day = 60. * 60 * 24
longest_trip_days = train_df.trip_duration.max() / seconds_per_day
print("Longest trip took {} days.".format(longest_trip_days))

One of the trip durations is incredibly long. Let us look at the corresponding entry.

In [None]:
# Select the row(s) holding the maximum trip duration with a boolean mask.
# Unlike where(...).dropna(), this keeps the original column dtypes and does
# not discard a matching row merely because one of its other columns is NaN.
longest_duration = train_df.trip_duration.max()
max_entry = train_df[train_df.trip_duration == longest_duration]
max_entry.index

The beginning and the end are very close. There are two possible explanations: either a very long round trip (say, from NYC to SF and back), or someone forgot to turn off the meter. Either way this entry is of no use to us, so let us delete it.

In [None]:
# Summary statistics with the longest trip left out. drop() returns a new
# frame here, so train_df itself is not modified by this cell.
without_longest = train_df.drop(max_entry.index, axis=0)
without_longest.describe()

And there is another absurdly long trip. Let us say everything longer than 10 hours is certainly too long. 10 hours = 10\*60\*60 seconds = 36000 seconds. Let us round that up to a nice 4e+4.

In [None]:
# Collect every trip longer than 4e+4 seconds with a boolean mask.
# A mask keeps the column dtypes intact and, unlike where(...).dropna(),
# cannot lose a matching row that has a NaN in some other column.
long_durations = train_df[train_df.trip_duration > 4e+4]
long_durations.describe()

We see a lot of durations around 86000 seconds, i.e. close to 24 hours (86400 seconds). It seems like some drivers turned the meter on and just let it run.

In [None]:
# Remove the over-long trips from the training data.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: on a second run the labels are
# already gone, which would otherwise raise a KeyError.
train_df = train_df.drop(long_durations.index, errors="ignore")
train_df.describe()

In [None]:
# Histogram of the trip durations, split into 100 bins.
duration_hist_options = {"column": "trip_duration", "bins": 100}
train_df.hist(**duration_hist_options)

Most trips are relatively short, below 5000 seconds ≈ 83 minutes. So most trips take under one and a half hours, although there are some quite long trips.

## Vendor

First let us look at the vendors and how many we have.

In [None]:
# Number of rows per vendor across the combined train and test data.
combined_df["vendor_id"].value_counts()

In [None]:
# Bar chart of the number of rows per vendor, smallest count first.
vendor_counts = combined_df["vendor_id"].value_counts(sort=True, ascending=True)
# Turn the counts into a one-column frame so the bar series is labelled "counts".
df = vendor_counts.to_frame(name="counts")
df.plot.bar(stacked=True)

The vendors seem to be fairly balanced. Now let us check the distribution of the times.

In [None]:
def _plot_vendor_duration_hist(trips, x_limits=None):
    """Show a stacked 1000-bin histogram of trip_duration for vendors 1 and 2.

    trips: frame providing the vendor_id and trip_duration columns.
    x_limits: optional (low, high) pair handed to plt.xlim before showing.
    Returns the (n, bins, patches) triple produced by plt.hist.
    """
    durations_by_vendor = [
        trips[trips.vendor_id == vendor].trip_duration for vendor in (1, 2)
    ]
    hist_result = plt.hist(durations_by_vendor, stacked=True, edgecolor="k", bins=1000)
    # The third element holds the drawn patches, one group per vendor.
    plt.legend(hist_result[2], ("Vendor 1", "Vendor 2"), loc="best")
    if x_limits is not None:
        plt.xlim(*x_limits)
    plt.show()
    return hist_result


# All trips, with the x axis limited to durations up to 10000 seconds.
n, bins, patches = _plot_vendor_duration_hist(train_df, x_limits=(0., 10000))
# Only the tail: trips longer than 10000 seconds.
n, bins, patches = _plot_vendor_duration_hist(train_df[train_df.trip_duration > 10000])

In [None]:
# Look at the whole distribution
sns.kdeplot(train_df[train_df.vendor_id == 1].trip_duration, label="Vendor 1", shade=True)
sns.kdeplot( train_df[train_df.vendor_id == 2].trip_duration, label="Vendor 2", shade=True)
plt.xlim(0., 10000)
plt.show()

# Look only at trips < 10000 s
sns.kdeplot(train_df[train_df.vendor_id == 1].trip_duration, label="Vendor 1", shade=True, clip=[0., 10000])
sns.kdeplot( train_df[train_df.vendor_id == 2].trip_duration, label="Vendor 2", shade=True, clip=[0., 10000])
plt.xlim(0., 10000)
plt.show()

If we restrict ourselves to trips < 10000 seconds (a bit under 3 hours), both vendors have the same distribution, but as soon as we look at the whole distribution we find different ones. Vendor 2 seems to have generally longer trips, whereas Vendor 1 always stays below about 1000 seconds.

## Pickup time

In [None]:
# Add the pickup year as a column to both frames.
# The timestamps are parsed in one vectorised pd.to_datetime call per frame
# instead of one strptime call per row, and the time of day is spelled out as
# %H:%M:%S because %X is locale-dependent.
pickup_format = "%Y-%m-%d %H:%M:%S"
combined_df["year"] = pd.to_datetime(combined_df.pickup_datetime, format=pickup_format).dt.year
train_df["year"] = pd.to_datetime(train_df.pickup_datetime, format=pickup_format).dt.year
combined_df.head()