In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

  from pandas.core import datetools


In [2]:
# Path variables (fixed): the data directory plus the CSV file names used in this notebook.

dataLoc = "../_data/"

# Training CSVs (the one loaded below and the older one).
trainfile = "train_03_997.csv"
old_trainfile = "train.csv"

# Test CSVs.
testfile = "test_clean.csv"
old_testfile = "test.csv"

# Table with only the dow / hr / pas / L2 / dur columns.
train_171016 = "171016_dow_hr_pas_L2_dur_only.csv"

In [3]:
# Read train_03_997.csv and convert its 'p_dt' column from text to datetime
# so the .dt accessor can be used on it later.
taxi_df = (
    pd.read_csv(dataLoc + trainfile)
    .assign(p_dt=lambda frame: pd.to_datetime(frame['p_dt']))
)
taxi_df.head()

Unnamed: 0,p_dt,pas,p_lng,p_lat,d_lng,d_lat,dur
0,2016-04-30 23:59:00,1,-73.987793,40.724792,-73.975616,40.656445,1454
1,2016-04-30 23:59:00,1,-73.957596,40.71777,-73.951424,40.77523,1409
2,2016-04-30 23:59:00,2,-74.000954,40.742031,-73.947708,40.7822,1081
3,2016-04-30 23:58:00,1,-73.985733,40.738258,-73.993179,40.75489,800
4,2016-04-30 23:58:00,1,-74.006615,40.74065,-73.985619,40.723362,1151


In [4]:
# Read 171016_dow_hr_pas_L2_dur_only.csv (columns: dow, hr, pas, L2, dur).
main_csv_path = dataLoc + train_171016
main_df = pd.read_csv(main_csv_path)
main_df.head()

Unnamed: 0,dow,hr,pas,L2,dur
0,Sa,23,1,0.069424,1454
1,Sa,23,1,0.057791,1409
2,Sa,23,2,0.066698,1081
3,Sa,23,1,0.018223,800
4,Sa,23,1,0.027198,1151


In [5]:
# Work on a deep copy so the loaded frame stays untouched; put the name of
# whichever DataFrame needs analysing on the right-hand side.
copy_df = taxi_df.copy(deep=True)
copy_df.head()

Unnamed: 0,p_dt,pas,p_lng,p_lat,d_lng,d_lat,dur
0,2016-04-30 23:59:00,1,-73.987793,40.724792,-73.975616,40.656445,1454
1,2016-04-30 23:59:00,1,-73.957596,40.71777,-73.951424,40.77523,1409
2,2016-04-30 23:59:00,2,-74.000954,40.742031,-73.947708,40.7822,1081
3,2016-04-30 23:58:00,1,-73.985733,40.738258,-73.993179,40.75489,800
4,2016-04-30 23:58:00,1,-74.006615,40.74065,-73.985619,40.723362,1151


In [6]:
# Add the hour component of 'p_dt' as a new column 'hr' at position 1.
# Not re-runnable: DataFrame.insert raises if 'hr' is already present.
pickup_hour = copy_df['p_dt'].dt.hour
copy_df.insert(1, 'hr', pickup_hour)

In [7]:
# Map pandas' day-of-week numbers (Monday=0 ... Sunday=6) to short labels.
dow_conv = dict(enumerate(['M', 'T', 'W', 'R', 'F', 'Sa', 'Su']))

# Add the numeric day of week of 'p_dt' as column 'dow' at position 1.
copy_df.insert(1, 'dow', copy_df['p_dt'].dt.dayofweek)

In [8]:
# Swap the numeric codes in 'dow' for their labels; the nested dict limits
# the replacement to that one column.
copy_df = copy_df.replace(to_replace={'dow': dow_conv})

In [9]:
# Can only be run once: a second run fails because 'p_dt' is already gone.
copy_df.drop('p_dt', axis=1, inplace=True)
copy_df.head()

Unnamed: 0,dow,hr,pas,p_lng,p_lat,d_lng,d_lat,dur
0,Sa,23,1,-73.987793,40.724792,-73.975616,40.656445,1454
1,Sa,23,1,-73.957596,40.71777,-73.951424,40.77523,1409
2,Sa,23,2,-74.000954,40.742031,-73.947708,40.7822,1081
3,Sa,23,1,-73.985733,40.738258,-73.993179,40.75489,800
4,Sa,23,1,-74.006615,40.74065,-73.985619,40.723362,1151


In [10]:
def dist(plng, plat, dlng, dlat):
    """Return the straight-line (L2) distance between two coordinate pairs.

    The result is in the same units as the inputs; no geodesic correction
    is applied.

    Parameters
    ----------
    plng, plat : first point (x, y).
    dlng, dlat : second point (x, y).

    Returns
    -------
    sqrt((dlng - plng)^2 + (dlat - plat)^2)
    """
    delta_lng = dlng - plng
    delta_lat = dlat - plat
    squared_sum = delta_lng ** 2 + delta_lat ** 2
    return squared_sum ** 0.5

In [11]:
# Add the pickup -> drop-off L2 distance as column 'L2' at position 7.
# dist() is pure element-wise arithmetic, so it is applied to whole columns
# at once; wrapping it in np.vectorize would only add a Python-level loop
# over every row.
# Not re-runnable: DataFrame.insert raises if 'L2' is already present.
copy_df.insert(
    loc=7,
    column='L2',
    value=dist(copy_df['p_lng'], copy_df['p_lat'],
               copy_df['d_lng'], copy_df['d_lat']),
)

copy_df.head()

Unnamed: 0,dow,hr,pas,p_lng,p_lat,d_lng,d_lat,L2,dur
0,Sa,23,1,-73.987793,40.724792,-73.975616,40.656445,0.069424,1454
1,Sa,23,1,-73.957596,40.71777,-73.951424,40.77523,0.057791,1409
2,Sa,23,2,-74.000954,40.742031,-73.947708,40.7822,0.066698,1081
3,Sa,23,1,-73.985733,40.738258,-73.993179,40.75489,0.018223,800
4,Sa,23,1,-74.006615,40.74065,-73.985619,40.723362,0.027198,1151


In [12]:
# Remove all four coordinate columns.
todel = ['p_lng', 'p_lat', 'd_lng', 'd_lat']

copy_df.drop(todel, axis=1, inplace=True)

In [13]:
# Preview the first five rows of the reduced frame.
copy_df.head(n=5)

Unnamed: 0,dow,hr,pas,L2,dur
0,Sa,23,1,0.069424,1454
1,Sa,23,1,0.057791,1409
2,Sa,23,2,0.066698,1081
3,Sa,23,1,0.018223,800
4,Sa,23,1,0.027198,1151


In [None]:
#copy2_df.to_csv("../_data/171016_dow_hr_pas_L2_dur_only.csv", index=False)

In [None]:
# zoom in on JFK Airport
# NOTE(review): allLat / allLong are not defined in any cell shown before
# this one; they must come from elsewhere in the kernel. The ranges below do
# not look like raw degrees - confirm which units they are in.

imageSizeJFK = (600, 750)  # (vertical, horizontal)

latRangeJFK = [-15, -10]  # vertical
longRangeJFK = [14, 18]   # horizontal

# Keep only the points strictly inside the lat/long window.
insideLatJFK = (allLat > latRangeJFK[0]) & (allLat < latRangeJFK[1])
insideLongJFK = (allLong > longRangeJFK[0]) & (allLong < longRangeJFK[1])
indToKeep = insideLatJFK & insideLongJFK
allLatJFK = allLat[indToKeep]
allLongJFK = allLong[indToKeep]

# Scale each coordinate to an integer pixel index. The row index is flipped
# so that larger latitude values land nearer the top of the image.
rowsJFK, colsJFK = imageSizeJFK
latSpanJFK = latRangeJFK[1] - latRangeJFK[0]
longSpanJFK = longRangeJFK[1] - longRangeJFK[0]
allLatIndsJFK = (rowsJFK - 1) - (rowsJFK * (allLatJFK - latRangeJFK[0]) / latSpanJFK).astype(int)
allLongIndsJFK = (colsJFK * (allLongJFK - longRangeJFK[0]) / longSpanJFK).astype(int)

# Count the points per pixel; np.add.at accumulates repeated index pairs.
locationDensityImageJFK = np.zeros(imageSizeJFK)
np.add.at(
    locationDensityImageJFK,
    (np.asarray(allLatIndsJFK), np.asarray(allLongIndsJFK)),
    1,
)

# Show log(count + 1) so sparse pixels remain visible next to dense ones.
fig, ax = plt.subplots(figsize=(12, 18))
ax.imshow(np.log(locationDensityImageJFK + 1), cmap='hot')
ax.set_axis_off()


In [None]:
# zoom in on LaGuardia Airport
# NOTE(review): allLat / allLong are not defined in any cell shown before
# this one; they must come from elsewhere in the kernel. The ranges below do
# not look like raw degrees - confirm which units they are in.

imageSizeLGA = (300, 600)  # (vertical, horizontal)

latRangeLGA = [1, 3]     # vertical
longRangeLGA = [7, 11]   # horizontal

# Keep only the points strictly inside the lat/long window.
insideLatLGA = (allLat > latRangeLGA[0]) & (allLat < latRangeLGA[1])
insideLongLGA = (allLong > longRangeLGA[0]) & (allLong < longRangeLGA[1])
indToKeep = insideLatLGA & insideLongLGA
allLatLGA = allLat[indToKeep]
allLongLGA = allLong[indToKeep]

# Scale each coordinate to an integer pixel index. The row index is flipped
# so that larger latitude values land nearer the top of the image.
rowsLGA, colsLGA = imageSizeLGA
latSpanLGA = latRangeLGA[1] - latRangeLGA[0]
longSpanLGA = longRangeLGA[1] - longRangeLGA[0]
allLatIndsLGA = (rowsLGA - 1) - (rowsLGA * (allLatLGA - latRangeLGA[0]) / latSpanLGA).astype(int)
allLongIndsLGA = (colsLGA * (allLongLGA - longRangeLGA[0]) / longSpanLGA).astype(int)


In [None]:
#JFK

# 1. whole
longitude = -73.79139709 ~ -73.77679442 #가로. 왼쪽 ~ 오른쪽
latitude = 40.649094 ~ 40.64365 #세로. 위 ~ 아래

# 2. partial
#왼쪽 위 부터 시계 방향으로

# 2-1. 
longitude = -73.79139709 ~ -73.78795624 #가로. 왼쪽 ~ 오른쪽
latitude = 40.647731 ~ 40.646172 #세로. 위 ~ 아래

# 2-2.
longitude = -73.785692 ~ -73.781648 #가로. 왼쪽 ~ 오른쪽
latitude = 40.649094 ~ 40.647995 #세로. 위 ~ 아래

# 2-3.
longitude = -73.779073 ~ -73.77679442 #가로. 왼쪽 ~ 오른쪽. 보내주신 함수를 통해서 확인한 제일 오른쪽 경도 입니다
latitude = 40.646994 ~ 40.643787 #세로. 위 ~ 아래

# 2-4.
longitude = -73.785295 ~ -73.779835 #가로. 왼쪽 ~ 오른쪽
latitude = 40.64698 ~ 40.64294 #세로. 위 ~ 아래

# 2-5.
longitude = -73.791168 ~ -73.787549 #가로. 왼쪽 ~ 오른쪽
latitude = 40.644526 ~ 40.64365 #세로. 위 ~ 아래. 보내주신 함수를 통해서 확인한 제일 아래쪽 위도 입니다


In [None]:
# JFK bounding boxes.
#
# lng : [left, right]
# ltt : [top, bottom]

JFK_whole = {'lng': [-73.79139709, -73.77679442], 'ltt': [40.649094, 40.64365]}

# High-density sub-areas, clockwise starting from the top-left.

# The left longitude is negative, like every other JFK longitude here.
JFK_area1 = {'lng': [-73.79139709, -73.78795624], 'ltt': [40.647731, 40.646172]}
JFK_area2 = {'lng': [-73.785692, -73.781648], 'ltt': [40.649094, 40.647995]}
JFK_area3 = {'lng': [-73.779073, -73.77679442], 'ltt': [40.646994, 40.643787]}
JFK_area4 = {'lng': [-73.785295, -73.779835], 'ltt': [40.64698, 40.64294]}
# NOTE(review): the bottom edge was written as 40.4064365; it is taken to be
# a typo for the 40.64365 bottom edge of JFK_whole - confirm.
JFK_area5 = {'lng': [-73.791168, -73.787549], 'ltt': [40.644526, 40.64365]}


In [None]:
# LaGuardia Airport

# 1. whole
longitude = -73.8862 ~ -73.8617 #가로. 왼쪽 ~ 오른쪽
latitude = 40.774811 ~ 40.767603 #세로. 위 ~ 아래

# 2. partial
#왼쪽 부터 오른쪽 방향으로

# 2-1. 
longitude = -73.8862 ~ -73.884859 #가로. 왼쪽 ~ 오른쪽. 함수를 통해 값 확인
latitude = 40.773526 ~ 40.772366 #세로. 위 ~ 아래

# 2-2.
longitude = -73.876338 ~ -73.870159 #가로. 왼쪽 ~ 오른쪽
latitude = 40.774811 ~ 40.773353 #세로. 위 ~ 아래

# 2-3.
longitude = -73.866019 ~ -73.8617 #가로. 왼쪽 ~ 오른쪽. 함수를 통해 값 확인
latitude = 40.771225 ~ 40.767603 #세로. 위 ~ 아래


In [None]:
# LaGuardia Airport bounding boxes.
#
# lng : [left, right]
# ltt : [top, bottom]

LGA_whole = dict(lng=[-73.8862, -73.8617], ltt=[40.774811, 40.767603])

# High-density sub-areas, ordered from left to right.

LGA_area1 = dict(lng=[-73.8862, -73.884859], ltt=[40.773526, 40.772366])
LGA_area2 = dict(lng=[-73.876338, -73.870159], ltt=[40.774811, 40.773353])
LGA_area3 = dict(lng=[-73.866019, -73.8617], ltt=[40.771225, 40.767603])


In [137]:
# Helper for finding longitude / latitude bounds by checking how many rows
# fall inside a candidate box. The box currently set is the LGA_area3 one.

lng_lower, lng_upper = -73.866019, -73.8617
lat_lower, lat_upper = 40.767603, 40.771225

def zone_filter(df, lng_upper, lng_lower, lat_upper, lat_lower):
    """Return the rows of ``df`` whose pickup point lies inside a box.

    A row is kept when its 'p_lng' is strictly between ``lng_lower`` and
    ``lng_upper`` and its 'p_lat' is strictly between ``lat_lower`` and
    ``lat_upper``; points exactly on an edge are excluded.

    Parameters
    ----------
    df : DataFrame with 'p_lng' and 'p_lat' columns.
    lng_upper, lng_lower : exclusive longitude bounds.
    lat_upper, lat_lower : exclusive latitude bounds.

    Returns
    -------
    DataFrame holding the matching rows with all of ``df``'s columns.
    """
    lng_values = df['p_lng']
    lat_values = df['p_lat']
    inside_lng = (lng_values < lng_upper) & (lng_values > lng_lower)
    inside_lat = (lat_values < lat_upper) & (lat_values > lat_lower)
    return df[inside_lng & inside_lat]

# Print (rows, columns) for the taxi_df rows that fall inside the candidate box.
zone_rows = zone_filter(taxi_df, lng_upper, lng_lower, lat_upper, lat_lower)
print(zone_rows.shape)


(6695, 7)
