In [None]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the trip records into a DataFrame. pandas' default compression='infer'
# picks the decompressor from the '.gz' file extension.
babs = pd.read_csv('data/201508_trip_data.csv.gz')

In [None]:
def convert_date(date_str):
    """Parse a trip timestamp written as ``month/day/year hour:minute``.

    Parameters
    ----------
    date_str : str
        Text in ``'%m/%d/%Y %H:%M'`` format, e.g. ``'8/31/2015 23:26'``.
        A value that is already a ``datetime.datetime`` is returned
        unchanged, so applying the conversion twice is harmless.

    Returns
    -------
    datetime.datetime or None
        The parsed timestamp, or ``None`` when the value is not a string
        (for example the float NaN pandas uses for an empty CSV cell) or
        does not match the expected format.
    """
    # Already parsed (e.g. the conversion cell was re-run): keep the value.
    if isinstance(date_str, datetime.datetime):
        return date_str
    try:
        return datetime.datetime.strptime(date_str, '%m/%d/%Y %H:%M')
    except (TypeError, ValueError):
        # ValueError: a string that does not match the format.
        # TypeError: a non-string input such as NaN or None.
        return None

In [None]:
# Swap the raw 'Start Date' text for the values returned by convert_date.
babs['Start Date'] = babs['Start Date'].map(convert_date)

In [None]:
# Histogram of trip durations in 30 bins; the trailing None keeps the
# cell from echoing plt.hist's return value.
plt.hist(babs['Duration'], bins=30)
None

In [None]:
# Mean and standard deviation of trip duration, used to standardise it.
mu = babs['Duration'].mean()
sd = babs['Duration'].std()

In [None]:
# Standard score of each trip's duration: (value - mean) / std.
babs['z'] = babs['Duration'].sub(mu).div(sd)

In [None]:
# Distribution of duration z-scores below 5, with a log-scaled count axis.
plt.hist(babs.loc[babs['z'] < 5, 'z'], bins=30)
plt.yscale('log')

In [None]:
# Calendar date on which each trip started.
babs['date'] = babs['Start Date'].map(lambda ts: ts.date())

In [None]:
# Hour of day in which each trip started.
babs['hour'] = babs['Start Date'].map(lambda ts: ts.hour)

In [None]:
# Number of trips started in each (date, hour) pair, kept as regular columns.
counts = (
    babs
    .groupby(['date', 'hour'], as_index=False)['Trip ID']
    .count()
)

In [None]:
# One row per date, one column per hour; (date, hour) pairs with no trips become 0.
X = (
    counts
    .pivot_table(index='date', columns='hour', values='Trip ID')
    .fillna(0)
)

In [None]:
# Peek at the first five daily profiles.
X.head(5)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Candidate cluster counts for the elbow scan.
ks = range(2, 10)
# KMeans.score returns the negated k-means objective, so flip the sign to get
# a cost that shrinks as k grows. A fixed random_state keeps the curve
# identical across kernel restarts.
scores = [-KMeans(n_clusters=k, random_state=0).fit(X).score(X) for k in ks]

In [None]:
# Elbow plot: clustering cost against the number of clusters.
plt.plot(ks, scores)
plt.xlabel('number of clusters (k)')
plt.ylabel('k-means objective (lower is better)')

In [None]:
# Fit the final 4-cluster model. The seed is fixed because later cells refer
# to clusters by their integer label, which is otherwise arbitrary per run.
model = KMeans(n_clusters=4, random_state=0).fit(X)

In [None]:
# Draw each cluster centre as one line, tagged with its cluster label.
for cluster_id, centroid in enumerate(model.cluster_centers_):
    legend_text = "class = {}".format(cluster_id)
    plt.plot(centroid, label=legend_text)
plt.legend()

In [None]:
# Pair every date in X's index with the cluster label the model gave it.
labels = pd.DataFrame(
    {
        'date': X.index,
        'cluster': model.labels_,
    }
)

In [None]:
from calendar import day_abbr, month_abbr

In [None]:
# Abbreviated weekday and month names for each clustered date.
labels['day'] = labels['date'].map(lambda d: day_abbr[d.weekday()])
labels['month'] = labels['date'].map(lambda d: month_abbr[d.month])

In [None]:
# How many dates fall in each (cluster, weekday) pair.
weekday_counts = labels.groupby(['cluster', 'day'])['cluster'].count()

In [None]:
# Show the number of dates in each (cluster, weekday) pair.
weekday_counts

In [None]:
# Name the series 'count' so its values no longer share the name 'cluster'
# with one of its index levels.
weekday_counts = weekday_counts.rename('count')

In [None]:
# Reshape to a cluster-by-weekday table; missing combinations become 0.
better_counts = (
    weekday_counts
    .reset_index()
    .pivot(index='cluster', columns='day', values='count')
    .fillna(0)
)
better_counts

In [None]:
# Grouped bars: one group per cluster, one bar per weekday.
better_counts.plot(kind='bar')
plt.ylabel('count')

In [None]:
# Same breakdown by month: count dates per (cluster, month), reshape to a
# cluster-by-month table and draw it as grouped bars.
(
    labels
    .groupby(['cluster', 'month'])['day']
    .count()
    .reset_index()
    .pivot(index='cluster', columns='month', values='day')
    .fillna(0)
    .plot(kind='bar')
)
plt.legend(bbox_to_anchor=(1, 1))
plt.ylabel('count')

In [None]:
# Remember X's current column labels (the hour columns produced by the pivot)
# so they can still be selected after more columns are added to X.
hours = X.columns

In [None]:
# Attach to every date the centre vector of its assigned cluster (one array
# per row) and the cluster label itself.
X['center'] = list(model.cluster_centers_[model.labels_])
X['cluster'] = model.labels_

In [None]:
from numpy.linalg import norm

In [None]:
# Euclidean distance from each day's hourly profile to its cluster centre.
X['dist'] = [
    norm(profile - centroid)
    for profile, centroid in zip(X[hours].values, X['center'])
]

In [None]:
# Per-cluster mean and standard deviation of the distance to the centre.
# The aggregations are named as strings, so no NumPy import is needed, and the
# resulting columns are called 'mean' and 'std'.
stats = X.groupby(model.labels_)['dist'].agg(['mean', 'std'])

In [None]:
# Bring each row's cluster-level 'mean' and 'std' alongside its own distance.
joined = X.join(stats, on='cluster', how='left')

In [None]:
# Standardise each distance against the statistics of its own cluster.
joined['z'] = joined['dist'].sub(joined['mean']).div(joined['std'])

In [None]:
# Distribution of the within-cluster distance z-scores.
joined['z'].plot(kind='hist')

In [None]:
# Days lying more than 3 standard deviations from their cluster centre.
joined.loc[joined['z'] > 3]

In [None]:
# Overlay the hourly profiles of cluster 2's days with z > 3.5 on that
# cluster's centre.
for day, profile in joined.loc[(joined['z'] > 3.5) & (joined['cluster'] == 2), list(hours)].iterrows():
    plt.plot(profile, label='outlier')
plt.plot(model.cluster_centers_[2], label='center')
plt.legend()
plt.xlabel('hour')
plt.ylabel('trip count')