In [108]:
import pandas as pd
import numpy as np
import pickle as pkl
import geopandas as gpd

In [157]:
df = pd.read_csv('data/processed/processed_data.csv')

In [158]:
df.head()

Unnamed: 0,zone,bin,day,month,time_period,demand,air_temp,air_temp_feel,rain,snow,residential,education,recreational,commercial,health,subway_stations,bus_stops,avg_demand_five,historical_average
0,4,0,3,6,off_peak,7.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,6.052632
1,4,1,3,6,off_peak,5.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,4.315789
2,4,2,3,6,off_peak,1.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,4.473684
3,4,3,3,6,off_peak,3.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,3.631579
4,4,4,3,6,off_peak,1.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,3.4,4.631579


In [159]:
zone_id = df['zone'].unique()

### Demand

In [3]:
df_demand = df[['zone','month','day','bin','demand']]
df_demand = df_demand.sort_values(by=['month','day','bin','zone'])
df_demand.head(10)

Unnamed: 0,zone,month,day,bin,demand
0,4,6,3,0,7.0
5760,12,6,3,0,0.0
11520,13,6,3,0,1.0
17280,24,6,3,0,3.0
23040,41,6,3,0,10.0
28800,42,6,3,0,9.0
34560,43,6,3,0,2.0
40320,45,6,3,0,2.0
46080,48,6,3,0,20.0
51840,50,6,3,0,9.0


In [68]:
df_demand = df_demand.reset_index(drop=True)
df_demand.to_csv('data/processed/demand.csv',index_label='index')
df_demand.head()

Unnamed: 0,zone,month,day,bin,demand
0,4,6,3,0,7.0
1,12,6,3,0,0.0
2,13,6,3,0,1.0
3,24,6,3,0,3.0
4,41,6,3,0,10.0


In [78]:
df_demand = pd.read_csv('data/processed/demand.csv')
df_demand.head()

Unnamed: 0,index,zone,month,day,bin,demand
0,0,4,6,3,0,7.0
1,1,12,6,3,0,0.0
2,2,13,6,3,0,1.0
3,3,24,6,3,0,3.0
4,4,41,6,3,0,10.0


### Adjancent matrix

In [5]:
zipfile = 'data/taxi_zones.shp'
zones = gpd.read_file(zipfile)
zones = zones.sort_values(['LocationID']).reset_index(drop=True)

In [12]:
Manhattan = zones[zones['LocationID'].isin(zone_id)]
Manhattan.head(10)

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
11,12,0.036661,4.2e-05,Battery Park,12,Manhattan,"POLYGON ((979908.772 196066.565, 979980.852 19..."
12,13,0.050281,0.000149,Battery Park City,13,Manhattan,"POLYGON ((980801.310 201248.869, 980697.386 20..."
23,24,0.047,6.1e-05,Bloomingdale,24,Manhattan,"POLYGON ((995453.114 230274.267, 995312.583 23..."
40,41,0.052793,0.000143,Central Harlem,41,Manhattan,"POLYGON ((998716.913 234240.397, 999458.736 23..."
41,42,0.092709,0.000264,Central Harlem North,42,Manhattan,"POLYGON ((1002413.191 243934.560, 1002388.021 ..."
42,43,0.099739,0.00038,Central Park,43,Manhattan,"POLYGON ((991852.973 217950.807, 991725.244 21..."
44,45,0.045907,9.1e-05,Chinatown,45,Manhattan,"POLYGON ((984941.821 199431.157, 984958.446 19..."
47,48,0.043747,9.4e-05,Clinton East,48,Manhattan,"POLYGON ((986694.313 214463.846, 986568.184 21..."
49,50,0.055748,0.000173,Clinton West,50,Manhattan,"POLYGON ((985170.372 221087.389, 985817.170 22..."


In [13]:
zones_centroid = Manhattan['geometry'].centroid
zones_polygon = Manhattan['geometry']

#### Eculiden

In [14]:
euc_dist = zones_centroid.apply(lambda x:zones_centroid.distance(x)).to_numpy().flatten()

In [16]:
census_adj = pd.DataFrame(np.array([np.repeat(Manhattan['LocationID'].to_numpy(), len(Manhattan)),
                                    np.tile(Manhattan['LocationID'].to_numpy(), len(Manhattan)),
                                    euc_dist]).T, columns=['start_id','end_id','euc'])

In [19]:
census_adj['start_id'] = census_adj['start_id'].astype(int)
census_adj['end_id'] = census_adj['end_id'].astype(int)

Unnamed: 0,start_id,end_id,euc
0,4,4,0.0
1,4,12,13112.689481
2,4,13,11651.582885
3,4,24,28674.720866
4,4,41,30207.906231


### Connectivity

In [20]:
touch = zones_polygon.apply(lambda x: zones_polygon.touches(x)).to_numpy(dtype=np.float32).flatten()

In [21]:
census_adj['con'] = touch

In [22]:
census_adj.head()

Unnamed: 0,start_id,end_id,euc,con
0,4,4,0.0,0.0
1,4,12,13112.689481,0.0
2,4,13,11651.582885,0.0
3,4,24,28674.720866,0.0
4,4,41,30207.906231,0.0


### Functionality

In [52]:
spatial = df[['zone','residential','education','recreational','commercial','health']].drop_duplicates()
s = spatial[['residential','education','recreational','commercial','health']]
for i in s.columns:
    s.loc[:,i] = s[i]/s[i].max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s.loc[:,i] = s[i]/s[i].max()


In [53]:
n = len(s)
s1 = np.transpose(np.tile(s, (n,1,1)),(1,0,2))
s2 = np.tile(s, (n,1,1))
s3 = np.sum(np.sqrt((s1-s2)*(s1-s2)), axis=2)
s4 = s3[(s3 > 0)].min()/s3
s4 = s4.flatten()
s4[~np.isfinite(s4)] = 0

  s4 = s3[(s3 > 0)].min()/s3


In [58]:
func_adj = pd.DataFrame(np.array([np.repeat(spatial['zone'].to_numpy(), n), 
                       np.tile(spatial['zone'].to_numpy(), n),
                       s4]).T, columns=['start_id','end_id','func'])


In [59]:
census_adj = pd.merge(census_adj, func_adj, on=['start_id', 'end_id'], how='left').fillna(0)

In [60]:
census_adj.head()

Unnamed: 0,start_id,end_id,euc,con,func
0,4,4,0.0,0.0,0.0
1,4,12,13112.689481,0.0,0.042759
2,4,13,11651.582885,0.0,0.037645
3,4,24,28674.720866,0.0,0.046002
4,4,41,30207.906231,0.0,0.046256


In [61]:
census_adj.to_csv('data/processed/adjlist.csv',index=False)

### History

In [62]:
df.head()

Unnamed: 0,zone,bin,day,month,time_period,demand,air_temp,air_temp_feel,rain,snow,residential,education,recreational,commercial,health,subway_stations,bus_stops,avg_demand_five,historical_average
0,4,0,3,6,off_peak,7.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,6.052632
1,4,1,3,6,off_peak,5.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,4.315789
2,4,2,3,6,off_peak,1.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,4.473684
3,4,3,3,6,off_peak,3.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,,3.631579
4,4,4,3,6,off_peak,1.0,73.18,73.31,0.0,0.0,57,11,30,0,1,0.0,11.0,3.4,4.631579


In [64]:
history = df[['zone','month','day','bin','historical_average']]
history = history.sort_values(by=['month','day','bin','zone'])
history.head()

Unnamed: 0,zone,month,day,bin,historical_average
0,4,6,3,0,6.052632
5760,12,6,3,0,0.105263
11520,13,6,3,0,5.368421
17280,24,6,3,0,2.421053
23040,41,6,3,0,9.842105


In [70]:
history = history.reset_index(drop=True)
history.to_csv('data/processed/history.csv',index_label='index')
history.head()

Unnamed: 0,zone,month,day,bin,historical_average
0,4,6,3,0,6.052632
1,12,6,3,0,0.105263
2,13,6,3,0,5.368421
3,24,6,3,0,2.421053
4,41,6,3,0,9.842105


### Level of service

In [72]:
los = df[['zone','month','day','bin','subway_stations','bus_stops']]
los = los.sort_values(by=['month','day','bin','zone'])
los['public'] = los['subway_stations'] + 0.2*los['bus_stops']
los.head()

Unnamed: 0,zone,month,day,bin,subway_stations,bus_stops,public
0,4,6,3,0,0.0,11.0,2.2
5760,12,6,3,0,0.0,0.0,0.0
11520,13,6,3,0,0.0,3.0,0.6
17280,24,6,3,0,1.0,13.0,3.6
23040,41,6,3,0,6.0,36.0,13.2


In [73]:
los = los[['zone','month','day','bin','public']]
los = los.reset_index(drop=True)
los.to_csv('data/processed/public.csv',index_label='index')
los.head()

Unnamed: 0,zone,month,day,bin,public
0,4,6,3,0,2.2
1,12,6,3,0,0.0
2,13,6,3,0,0.6
3,24,6,3,0,3.6
4,41,6,3,0,13.2


### Weather

In [74]:
weather = df[['zone','month','day','bin','air_temp_feel','rain']]
weather = weather.sort_values(by=['month','day','bin','zone'])
weather.head()

Unnamed: 0,zone,month,day,bin,air_temp_feel,rain
0,4,6,3,0,73.31,0.0
5760,12,6,3,0,73.31,0.0
11520,13,6,3,0,73.31,0.0
17280,24,6,3,0,73.31,0.0
23040,41,6,3,0,73.31,0.0


In [75]:
weather = weather.reset_index(drop=True)
weather.to_csv('data/processed/weather.csv',index_label='index')
weather.head()

Unnamed: 0,zone,month,day,bin,air_temp_feel,rain
0,4,6,3,0,73.31,0.0
1,12,6,3,0,73.31,0.0
2,13,6,3,0,73.31,0.0
3,24,6,3,0,73.31,0.0
4,41,6,3,0,73.31,0.0


### Prepare into train format

In [171]:
df_demand

Unnamed: 0,index,zone,month,day,bin,demand,timestamp
0,0,4,6,3,0,7.0,54432
1,1,12,6,3,0,0.0,54432
2,2,13,6,3,0,1.0,54432
3,3,24,6,3,0,3.0,54432
4,4,41,6,3,0,10.0,54432
...,...,...,...,...,...,...,...
362875,362875,246,6,28,287,47.0,61919
362876,362876,249,6,28,287,42.0,61919
362877,362877,261,6,28,287,13.0,61919
362878,362878,262,6,28,287,2.0,61919


In [211]:
def combine(df_demand,df,timestamp,x,y,lookback=2):
    df = df.sort_values('zone')
    flag = True
    if df['timestamp'].values[0]-lookback>=54432:
        for _,row in df.iterrows():
                temp_x = []
                temp_y = []
                for i in reversed(range(1,lookback+1)):
                    pre_demand = df_demand.loc[(df_demand['zone']==row['zone'])&(df_demand['timestamp']==(row['timestamp']-i))]['demand']
                    try:
                        temp_x.append(pre_demand.values[0])
                        temp_y.append(row['demand'])
                    except:
                        flag = False
                        break
        
        if flag:
            timestamp.append(df['timestamp'].values[0])
            x.append(temp_x)
            y.append(temp_y) 


In [212]:
df_demand['timestamp'] = df_demand['bin'] + 288*df_demand['day'] + 31*288*df_demand['month']
timestamp = []
x = []
y = []
_ = df_demand.groupby('timestamp').apply(lambda df:combine(df_demand,df,timestamp,x,y))
    

KeyboardInterrupt: 