In [16]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
# Plot styling for the whole notebook: start from seaborn's default theme,
# then switch the axes style to "whitegrid".
sns.set()
sns.set_style("whitegrid")
# Let matplotlib's single-letter color shorthands ("b", "g", "r", ...) resolve
# to the seaborn palette colors.
sns.set_color_codes()

In [17]:
# Column selections (and their short aliases) used when reading the CSV files.
# Each short_* list lines up position-for-position with its cols_* list.

# Columns read from test.csv.
cols_test = [
    'pickup_datetime', 'passenger_count',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
# Columns read from train.csv without dropoff_datetime:
# the test.csv columns followed by trip_duration.
cols_list = cols_test + ['trip_duration']
# Columns read from train.csv with dropoff_datetime,
# which sits right after pickup_datetime.
cols_list2 = cols_list[:1] + ['dropoff_datetime'] + cols_list[1:]

# Short names for cols_test.
short_test = ['p_dt', 'pas', 'p_lng', 'p_lat', 'd_lng', 'd_lat']
# Short names for cols_list.
short_cols = short_test + ['dur']
# Short names for cols_list2.
short_cols2 = short_cols[:1] + ['d_dt'] + short_cols[1:]

In [18]:
# Fixed input locations: the two file names and the directory that holds them.
# The load cells build each path by plain string concatenation
# (dataLoc + file name), so dataLoc must keep its trailing slash.
testfile = 'test.csv'
trainfile = 'train.csv'
dataLoc = '../_data/'

In [19]:
# Load train.csv into taxi_df.
# Without dropoff_datetime: usecols=cols_list,  columns=short_cols
# With dropoff_datetime:    usecols=cols_list2, columns=short_cols2

# Fast version: the date column is left as plain strings.
# read_csv ignores the order of `usecols` and returns the columns in file
# order, so the frame is re-selected with cols_list before the positional
# rename below. Without that step a file whose columns are ordered differently
# from cols_list would end up with silently mislabeled columns.
taxi_df = pd.read_csv(dataLoc + trainfile, usecols=cols_list)[cols_list]
taxi_df.columns = short_cols
taxi_df.head()

# Slow version: parse the dates properly while reading.
# NOTE: infer_datetime_format is deprecated as of pandas 2.0; drop that
# argument when running on a newer pandas.
# taxi_df = pd.read_csv(dataLoc + trainfile,
#                       usecols=cols_list,
#                       infer_datetime_format=True,
#                       # when dropoff is included, add 'dropoff_datetime' to this list too
#                       parse_dates=['pickup_datetime'])[cols_list]
# taxi_df.columns = short_cols
# taxi_df.head()

Unnamed: 0,p_dt,pas,p_lng,p_lat,d_lng,d_lat,dur
0,4/30/16 23:59,1,-73.987793,40.724792,-73.975616,40.656445,1454
1,4/30/16 23:59,1,-73.957596,40.71777,-73.951424,40.77523,1409
2,4/30/16 23:59,2,-74.000954,40.742031,-73.947708,40.7822,1081
3,4/30/16 23:58,1,-73.985733,40.738258,-73.993179,40.75489,800
4,4/30/16 23:58,1,-74.006615,40.74065,-73.985619,40.723362,1151


In [20]:
# Load test.csv into test_df.

# Fast version: the date column is left as plain strings.
# read_csv ignores the order of `usecols` and returns the columns in file
# order, so the frame is re-selected with cols_test before the positional
# rename below. Without that step a file whose columns are ordered differently
# from cols_test would end up with silently mislabeled columns.
test_df = pd.read_csv(dataLoc + testfile, usecols=cols_test)[cols_test]
test_df.columns = short_test
test_df.head()

# Slow version: parse the dates properly while reading.
# NOTE: infer_datetime_format is deprecated as of pandas 2.0; drop that
# argument when running on a newer pandas.
# test_df = pd.read_csv(dataLoc + testfile,
#                       usecols=cols_test,
#                       infer_datetime_format=True,
#                       parse_dates=['pickup_datetime'])[cols_test]
# test_df.columns = short_test
# test_df.head()

Unnamed: 0,p_dt,pas,p_lng,p_lat,d_lng,d_lat
0,6/30/16 23:59,5,-73.998291,40.722618,-73.971779,40.76247
1,6/30/16 23:59,2,-73.955109,40.689564,-73.978203,40.685802
2,6/30/16 23:59,1,-73.978416,40.791576,-73.97316,40.675968
3,6/30/16 23:59,2,-73.873093,40.774097,-73.926704,40.856739
4,6/30/16 23:58,1,-73.979416,40.755211,-74.003548,40.738224


In [9]:
# The 0th percentile is the smallest value in the pickup-latitude column.
# NOTE(review): a latitude near -78.5 is far from the ~40.7 values in the
# previews, and this cell's execution count predates the load cells —
# re-run and confirm the saved output is not stale.
np.percentile(taxi_df.p_lat, q=0)

-78.547401429999994

In [11]:
# (number of rows, number of columns) of the frame loaded from train.csv.
taxi_df.shape

(701778, 7)

In [22]:
# Rows whose pickup latitude is strictly greater than 40.7.
taxi_df.loc[taxi_df['p_lat'].gt(40.7)]

Unnamed: 0,p_dt,pas,p_lng,p_lat,d_lng,d_lat,dur
0,4/30/16 23:59,1,-73.987793,40.724792,-73.975616,40.656445,1454
1,4/30/16 23:59,1,-73.957596,40.717770,-73.951424,40.775230,1409
2,4/30/16 23:59,2,-74.000954,40.742031,-73.947708,40.782200,1081
3,4/30/16 23:58,1,-73.985733,40.738258,-73.993179,40.754890,800
4,4/30/16 23:58,1,-74.006615,40.740650,-73.985619,40.723362,1151
5,4/30/16 23:58,2,-73.997253,40.719234,-73.993835,40.762051,1619
6,4/30/16 23:58,1,-73.994537,40.753037,-73.961098,40.687229,2209
7,4/30/16 23:58,1,-73.873055,40.774086,-74.006378,40.738487,2458
8,4/30/16 23:58,2,-73.980354,40.780540,-73.943390,40.835976,1584
10,4/30/16 23:58,1,-73.997414,40.720810,-74.004639,40.716228,157
