In [280]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [53]:
# Calendar lookups: read each CSV once, then keep its 'datetime' column as a plain Python list.
holidays_raw = pd.read_csv('days_holidays.csv')
workdays_raw = pd.read_csv('days_workingdays.csv')
list_holidays = holidays_raw['datetime'].values.tolist()
list_workdays = workdays_raw['datetime'].values.tolist()

In [2]:
# Load the 2011 trip log and normalise the column dtypes used by later cells.
df_biketrips = pd.read_csv('biketrips2011.csv')

# pd.to_datetime is used instead of .astype('datetime64'): pandas >= 2.0 raises a
# TypeError for the unit-less 'datetime64' dtype.
df_biketrips['StartTime'] = pd.to_datetime(df_biketrips['StartTime'])
df_biketrips['EndTime'] = pd.to_datetime(df_biketrips['EndTime'])

# Station numbers are used as integer pivot-column keys further down.
df_biketrips['StartNum'] = df_biketrips['StartNum'].astype(int)
df_biketrips['EndNum'] = df_biketrips['EndNum'].astype(int)

In [4]:
# Hour buckets: split each trip timestamp into (calendar day, hour of day) and
# rebuild a timestamp from those two parts, so trips can be grouped per hour.
trip_start = df_biketrips['StartTime']
trip_end = df_biketrips['EndTime']

df_dhcombo_out = pd.DataFrame({'Day': trip_start.dt.date, 'Hour': trip_start.dt.hour})
df_dhcombo_in = pd.DataFrame({'Day': trip_end.dt.date, 'Hour': trip_end.dt.hour})

# day (as a midnight timestamp) + hour offset
start_day = pd.to_datetime(df_dhcombo_out['Day'])
start_offset = pd.to_timedelta(df_dhcombo_out['Hour'], unit='h')
df_biketrips['StartHour'] = start_day + start_offset

end_day = pd.to_datetime(df_dhcombo_in['Day'])
end_offset = pd.to_timedelta(df_dhcombo_in['Hour'], unit='h')
df_biketrips['EndHour'] = end_day + end_offset

In [5]:
# Hourly totals of the 'count' column: one row per hour bucket, one column per
# station number; hour/station pairs with no trips are filled with 0.
pivot_options = dict(values='count', aggfunc='sum', fill_value=0)
df_hourly_start = df_biketrips.pivot_table(index='StartHour', columns='StartNum', **pivot_options)
df_hourly_end = df_biketrips.pivot_table(index='EndHour', columns='EndNum', **pivot_options)

In [648]:
# Build the hourly feature table for a single station.
STATION_ID = 31217  # station number whose in/out flow is analysed

# 'in' = trips ending at the station, 'out' = trips starting there, per hour.
# The two series are aligned on the union of their hour indexes; an hour that is
# missing on one side becomes 0. Cast to float to keep a uniform dtype.
df_flow_new = (
    pd.DataFrame({
        'in': df_hourly_end[STATION_ID],
        'out': df_hourly_start[STATION_ID],
    })
    .fillna(0)
    .astype(float)
)

# Calendar features taken from the hourly index.
df_flow_new['workday'] = df_flow_new.index.isin(list_workdays).astype(int)
df_flow_new['hourofday'] = df_flow_new.index.hour

# Net flow and its standardised version. fit_transform returns an (n, 1) array,
# so it is flattened before being stored as a single column.
df_flow_new['net'] = df_flow_new['in'] - df_flow_new['out']
df_flow_new['netscaled'] = StandardScaler().fit_transform(df_flow_new[['net']].to_numpy()).ravel()

# Share of arrivals / departures in each hour. Hours with no traffic give
# 0/0 = NaN, which is mapped to 0.
flow_total = df_flow_new['in'] + df_flow_new['out']
df_flow_new['pct_in'] = (df_flow_new['in'] / flow_total).fillna(0)
df_flow_new['pct_out'] = (df_flow_new['out'] / flow_total).fillna(0)

# log(1 + share), computed vectorised rather than row by row.
df_flow_new['log_in'] = np.log1p(df_flow_new['pct_in'])
df_flow_new['log_out'] = np.log1p(df_flow_new['pct_out'])

# 0/1 time-window flags.
on_workday = df_flow_new['workday'] == 1
hour_of_day = df_flow_new['hourofday']
df_flow_new['am_rush'] = (on_workday & hour_of_day.isin([6, 7, 8])).astype(int)
df_flow_new['pm_rush'] = (on_workday & hour_of_day.isin([15, 16, 17])).astype(int)
df_flow_new['evening'] = hour_of_day.between(18, 23).astype(int)               # hours 18..23, any day
df_flow_new['daytime_nwd'] = (~on_workday & hour_of_day.between(9, 17)).astype(int)  # hours 9..17, non-workdays

In [649]:
# Preview the first rows of August 2011 rather than displaying the whole month.
df_flow_new.loc['2011-08'].head(10)

Unnamed: 0,in,out,workday,hourofday,net,netscaled,pct_in,pct_out,log_in,log_out,am_rush,pm_rush,evening,daytime_nwd
2011-08-01 00:00:00,0.0,0.0,1,0,0.0,-0.106123,0.000000,0.000000,0.000000,0.000000,0,0,0,0
2011-08-01 01:00:00,0.0,0.0,1,1,0.0,-0.106123,0.000000,0.000000,0.000000,0.000000,0,0,0,0
2011-08-01 02:00:00,0.0,0.0,1,2,0.0,-0.106123,0.000000,0.000000,0.000000,0.000000,0,0,0,0
2011-08-01 03:00:00,0.0,0.0,1,3,0.0,-0.106123,0.000000,0.000000,0.000000,0.000000,0,0,0,0
2011-08-01 04:00:00,0.0,0.0,1,4,0.0,-0.106123,0.000000,0.000000,0.000000,0.000000,0,0,0,0
2011-08-01 05:00:00,0.0,0.0,1,5,0.0,-0.106123,0.000000,0.000000,0.000000,0.000000,0,0,0,0
2011-08-01 06:00:00,1.0,0.0,1,6,1.0,0.158057,1.000000,0.000000,0.693147,0.000000,1,0,0,0
2011-08-01 07:00:00,6.0,3.0,1,7,3.0,0.686417,0.666667,0.333333,0.510826,0.287682,1,0,0,0
2011-08-01 08:00:00,13.0,3.0,1,8,10.0,2.535677,0.812500,0.187500,0.594707,0.171850,1,0,0,0
2011-08-01 09:00:00,8.0,3.0,1,9,5.0,1.214777,0.727273,0.272727,0.546544,0.241162,0,0,0,0


In [679]:
# Cluster the August 2011 hours with DBSCAN.
# .copy() detaches the slice from df_flow_new, so adding columns to
# df_for_cluster later does not raise SettingWithCopyWarning.
df_for_cluster = df_flow_new.loc['2011-08'].copy()

# Columns left out of the distance computation; the remaining columns are the features.
NON_FEATURE_COLS = ['hourofday', 'in', 'out', 'log_in', 'log_out', 'net']
cluster_features = df_for_cluster.drop(columns=NON_FEATURE_COLS)

# L1 (Manhattan) distance; eps and min_samples are as tuned in this notebook.
model = DBSCAN(eps=0.9, min_samples=10, metric='l1').fit(cluster_features)

In [678]:
# Distinct cluster labels assigned by the fitted model.
# NOTE(review): scikit-learn documents -1 as DBSCAN's label for noise points - confirm for the installed version.
# NOTE(review): this cell's execution count (678) is lower than the fit cell's (679),
# so the recorded output may come from an earlier fit - re-run after fitting.
np.unique(model.labels_)

array([-1,  0])

In [680]:
# Attach each hour's cluster label. assign() returns a new frame, so this never
# writes into a slice of another DataFrame (the cause of SettingWithCopyWarning).
df_for_cluster = df_for_cluster.assign(label=model.labels_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [681]:
# Rows per cluster label. count() reports non-null entries per column, so when a
# label's columns have no missing values every column shows the same number.
df_for_cluster.groupby('label').count()

Unnamed: 0_level_0,in,out,workday,hourofday,net,netscaled,pct_in,pct_out,log_in,log_out,am_rush,pm_rush,evening,daytime_nwd
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
-1,42,42,42,42,42,42,42,42,42,42,42,42,42,42
0,115,115,115,115,115,115,115,115,115,115,115,115,115,115
1,68,68,68,68,68,68,68,68,68,68,68,68,68,68
2,160,160,160,160,160,160,160,160,160,160,160,160,160,160
3,65,65,65,65,65,65,65,65,65,65,65,65,65,65
4,115,115,115,115,115,115,115,115,115,115,115,115,115,115
5,21,21,21,21,21,21,21,21,21,21,21,21,21,21
6,42,42,42,42,42,42,42,42,42,42,42,42,42,42
7,18,18,18,18,18,18,18,18,18,18,18,18,18,18
8,57,57,57,57,57,57,57,57,57,57,57,57,57,57


In [470]:
# Export per-group row counts, grouped by label, the time-window flags, workday and hour of day.
stats_keys = ['label', 'am_rush', 'pm_rush', 'evening', 'daytime_nwd', 'workday', 'hourofday']
cluster_stats = df_for_cluster.groupby(stats_keys).count()
cluster_stats.to_csv('cluster_stats.csv')

In [682]:
# Rows whose cluster label is -1.
df_for_cluster.loc[df_for_cluster['label'].eq(-1)]

Unnamed: 0,in,out,workday,hourofday,net,netscaled,pct_in,pct_out,log_in,log_out,am_rush,pm_rush,evening,daytime_nwd,label
2011-08-05 17:00:00,3.0,23.0,1,17,-20.0,-5.389723,0.115385,0.884615,0.109199,0.633724,0,1,0,0,-1
2011-08-06 08:00:00,0.0,4.0,0,8,-4.0,-1.162843,0.0,1.0,0.0,0.693147,0,0,0,0,-1
2011-08-06 14:00:00,27.0,12.0,0,14,15.0,3.856577,0.692308,0.307692,0.526093,0.268264,0,0,0,1,-1
2011-08-06 15:00:00,11.0,19.0,0,15,-8.0,-2.219563,0.366667,0.633333,0.312375,0.490623,0,0,0,1,-1
2011-08-06 16:00:00,4.0,11.0,0,16,-7.0,-1.955383,0.266667,0.733333,0.236389,0.550046,0,0,0,1,-1
2011-08-06 20:00:00,5.0,0.0,0,20,5.0,1.214777,1.0,0.0,0.693147,0.0,0,0,1,0,-1
2011-08-07 12:00:00,0.0,7.0,0,12,-7.0,-1.955383,0.0,1.0,0.0,0.693147,0,0,0,1,-1
2011-08-07 21:00:00,11.0,0.0,0,21,11.0,2.799857,1.0,0.0,0.693147,0.0,0,0,1,0,-1
2011-08-08 20:00:00,17.0,5.0,1,20,12.0,3.064037,0.772727,0.227273,0.572519,0.204794,0,0,1,0,-1
2011-08-08 22:00:00,2.0,15.0,1,22,-13.0,-3.540463,0.117647,0.882353,0.111226,0.632523,0,0,1,0,-1


In [632]:
# Hour-17 rows where the workday flag is 0, shown with their cluster labels.
is_hour_17 = df_for_cluster['hourofday'].eq(17)
is_non_workday = df_for_cluster['workday'].eq(0)
df_for_cluster.loc[is_hour_17 & is_non_workday]

Unnamed: 0,in,out,workday,hourofday,net,netscaled,log_in,log_out,am_rush,pm_rush,evening,daytime_nwd,label
2011-08-06 17:00:00,13.0,2.0,0,17,11.0,2.799857,0.624154,0.125163,0,0,0,1,0
2011-08-07 17:00:00,1.0,2.0,0,17,-1.0,-0.370303,0.287682,0.510826,0,0,0,1,0
2011-08-13 17:00:00,13.0,9.0,0,17,4.0,0.950597,0.464306,0.342945,0,0,0,1,0
2011-08-14 17:00:00,7.0,5.0,0,17,2.0,0.422237,0.459532,0.348307,0,0,0,1,0
2011-08-20 17:00:00,9.0,27.0,0,17,-18.0,-4.861363,0.223144,0.559616,0,0,0,1,-1
2011-08-21 17:00:00,3.0,3.0,0,17,0.0,-0.106123,0.405465,0.405465,0,0,0,1,0
2011-08-27 17:00:00,0.0,0.0,0,17,0.0,-0.106123,0.0,0.0,0,0,0,1,0
2011-08-28 17:00:00,15.0,15.0,0,17,0.0,-0.106123,0.405465,0.405465,0,0,0,1,0


In [633]:
# Hour-17 rows where the workday flag is 1, shown with their cluster labels.
is_hour_17 = df_for_cluster['hourofday'].eq(17)
is_workday = df_for_cluster['workday'].eq(1)
df_for_cluster.loc[is_hour_17 & is_workday]

Unnamed: 0,in,out,workday,hourofday,net,netscaled,log_in,log_out,am_rush,pm_rush,evening,daytime_nwd,label
2011-08-01 17:00:00,10.0,13.0,1,17,-3.0,-0.898663,0.361013,0.448025,0,1,0,0,0
2011-08-02 17:00:00,0.0,14.0,1,17,-14.0,-3.804643,0.0,0.693147,0,1,0,0,0
2011-08-03 17:00:00,6.0,6.0,1,17,0.0,-0.106123,0.405465,0.405465,0,1,0,0,0
2011-08-04 17:00:00,4.0,19.0,1,17,-15.0,-4.068823,0.160343,0.602175,0,1,0,0,0
2011-08-05 17:00:00,3.0,23.0,1,17,-20.0,-5.389723,0.109199,0.633724,0,1,0,0,-1
2011-08-08 17:00:00,9.0,16.0,1,17,-7.0,-1.955383,0.307485,0.494696,0,1,0,0,0
2011-08-09 17:00:00,3.0,21.0,1,17,-18.0,-4.861363,0.117783,0.628609,0,1,0,0,0
2011-08-10 17:00:00,2.0,10.0,1,17,-8.0,-2.219563,0.154151,0.606136,0,1,0,0,0
2011-08-11 17:00:00,2.0,14.0,1,17,-12.0,-3.276283,0.117783,0.628609,0,1,0,0,0
2011-08-12 17:00:00,6.0,10.0,1,17,-4.0,-1.162843,0.318454,0.485508,0,1,0,0,0
