# 12wk-2: NYCTaxi 자료 분석 (2)

최규빈  
2023-11-22

# 1. 강의영상

<https://youtu.be/playlist?list=PLQqh36zP38-xXZmSbTDYrghHNRkOBj-Es&si=iQeuCDt8Z_D3uzoq>

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

In [2]:
# Send pandas .plot() calls to plotly and make "plotly_white" the default figure template.
pd.set_option("plotting.backend", "plotly")
pio.templates.default = "plotly_white"

# 3. 데이터준비

In [3]:
# Load the raw taxi-trip table from the course repository.
df = pd.read_csv("https://raw.githubusercontent.com/guebin/DV2023/main/posts/NYCTaxi.csv")
# Features are derived in two passes: the second assign() reads columns that the
# first one creates (dist) or converts (the datetime columns).
df_feature = df.assign(
    log_trip_duration = np.log(df.trip_duration),
    # Parse each timestamp column in one vectorized call instead of calling
    # pd.to_datetime once per row through Series.apply.
    pickup_datetime = pd.to_datetime(df.pickup_datetime),
    dropoff_datetime = pd.to_datetime(df.dropoff_datetime),
    # Euclidean distance taken directly on the latitude/longitude differences
    # (a planar approximation, not a geodesic distance).
    dist = np.sqrt((df.pickup_latitude-df.dropoff_latitude)**2 + (df.pickup_longitude-df.dropoff_longitude)**2),
    #---#
    # Relabel vendor codes 1/2 as 'A'/'B'; any other code becomes NaN.
    vendor_id = df.vendor_id.map({1:'A',2:'B'})
).assign(
    # `d` is the intermediate frame returned by the first assign(); a distinct
    # name avoids shadowing the raw `df` above.
    speed = lambda d: d.dist / d.trip_duration,
    pickup_hour = lambda d: d.pickup_datetime.dt.hour,
    dropoff_hour = lambda d: d.dropoff_datetime.dt.hour,
    dayofweek = lambda d: d.pickup_datetime.dt.dayofweek
)

# 4. 시각화3 – 애니메이션

## A. scatter / (vendor_id,hour)

`-` 시각화

-   B가 전체적으로 동그라미가 더 큰 것 같지 않아..?
-   시간대별로 확실히 빈도가 다르다

`-` 추가시각화1 – `vendor_id`별 `passenger_count`를 barplot으로 시각화

`-` 추가시각화2 – `vendor_id`별 `passenger_count`를 boxplot으로 시각화

`-` 추가시각화3 – `vendor_id`별 `passenger_count`를 histogram으로 시각화

`-` 추가시각화4 – `pickup_hour`별 `count`를 barplot으로 시각화

`-` 추가시각화5 – (`pickup_hour`,`vendor_id`)별 `count`를 barplot으로
시각화

`-` 추가시각화6 – (`pickup_hour`,`vendor_id`)별 `count`를 areaplot으로
시각화

`-` 추가시각화7 – (`pickup_hour`,`vendor_id`)별 `count`를 lineplot으로
시각화

## B. scatter / (vendor_id,day_of_week)

# 5. 시각화4 – `heatmap`

## A. (요일,시간)에 따른 `count` 시각화

## B. (요일,시간)에 따른 `dist` 시각화

## C. (요일,시간)에 따른 `speed` 시각화

# 6. 시각화5 – 경로시각화

`-` 이거는 너무 무거워서 좀 작은 데이터로 실습합니다.

In [26]:
# Keep every 100th row (positions 0, 100, 200, ...) and renumber the index from 0.
df_feature_small = df_feature.iloc[::100].reset_index(drop=True)

## A. 예비학습

`-` 경로그리기

In [27]:
# Toy data for the warm-up: path 'A' has two waypoints, path 'B' has three.
df_sample = pd.DataFrame(
    data=[
        ('A', -73.986420, 40.756569),
        ('A', -73.995300, 40.740059),
        ('B', -73.975922, 40.754192),
        ('B', -73.988922, 40.762859),
        ('B', -73.962654, 40.772449),
    ],
    columns=['path', 'lon', 'lat'],
)

`-` 산점도로 그리기

In [28]:
# Draw the sample waypoints as markers on a map, one color per path.
_fig = px.scatter_mapbox(
    df_sample,
    lon="lon",
    lat="lat",
    color="path",
    #---#
    zoom=12,
    mapbox_style="carto-positron",
    height=600,
    width=750,
)
# Scroll-wheel zoom is switched off for the rendered figure.
_fig.show(config=dict(scrollZoom=False))

`-` 합치기

In [29]:
_fig.data  # inspect the scatter figure's traces; they are reused in the next cell

In [30]:
# Connect the waypoints of each path with a line, one color per path.
fig = px.line_mapbox(
    df_sample,
    lon="lon",
    lat="lat",
    color="path",
    line_group="path",
    #---#
    zoom=12,
    mapbox_style="carto-positron",
    height=600,
    width=750,
)
# Overlay the first two marker traces of the scatter figure on top of the lines.
fig.add_traces(_fig.data[:2])
fig.show(config=dict(scrollZoom=False))

## B. 전처리

In [31]:
# Column triples (timestamp, longitude, latitude) for the pickup and dropoff ends of a trip.
pcols = [f'pickup_{field}' for field in ('datetime', 'longitude', 'latitude')]
dcols = [f'dropoff_{field}' for field in ('datetime', 'longitude', 'latitude')]

In [32]:
def transform(df):
    """Stack the pickup and dropoff records of ``df`` into long format.

    Every input row produces two output rows with the same ``id``: one
    taken from the ``pcols`` columns and tagged ``state == "pickup"``, and
    one taken from the ``dcols`` columns and tagged ``state == "dropoff"``.
    In both, the selected columns are renamed to ``datetime``, ``lon`` and
    ``lat``. All pickup rows come first, then all dropoff rows, under a
    fresh 0..n-1 index.
    """
    long_names = ['id', 'datetime', 'lon', 'lat']
    halves = []
    for cols, label in ((pcols, 'pickup'), (dcols, 'dropoff')):
        half = df[['id'] + cols].copy()
        half.columns = long_names
        half['state'] = label
        halves.append(half)
    return pd.concat(halves, ignore_index=True)

In [33]:
# Reshape to long format (one pickup row and one dropoff row per trip) in a single
# call, instead of calling transform() once per id group and concatenating the pieces.
# transform() emits all pickup rows followed by all dropoff rows, so a stable sort on
# 'id' yields the same order as the per-group version: trips in ascending id order,
# each trip's pickup row ahead of its dropoff row. (Assumes no missing ids.)
df_left = transform(df_feature_small).sort_values('id', kind='stable').reset_index(drop=True)
# Trip-level attributes, without the endpoint columns that df_left now carries.
df_right = df_feature_small.drop(columns=pcols+dcols)
# 'id' is the shared key; naming it explicitly keeps the join from silently widening
# if another column name ever coincides between the two frames.
df_feature_small2 = pd.merge(df_left, df_right, on='id')

In [34]:
df_feature_small2  # display the merged long-format table

## C. `vendor_id` 별 시각화

In [35]:
# One line per trip id (rows sharing an id are joined), colored by vendor.
fig = px.line_mapbox(
    df_feature_small2,
    lon="lon",
    lat="lat",
    line_group='id',
    color='vendor_id',
    center=dict(lat=40.7322, lon=-73.9052),
    #---#
    zoom=10,
    mapbox_style='carto-positron',
    height=600,
    width=750,
)
# Marker layer on the same points: marker size follows trip_duration, capped at 10.
trace_data = px.scatter_mapbox(
    df_feature_small2,
    lon='lon',
    lat='lat',
    color='vendor_id',
    size='trip_duration',
    size_max=10,
    center=dict(lat=40.7322, lon=-73.9052),
).data
fig.add_traces(trace_data)
# Thin lines and slight transparency, applied to every trace in the figure.
fig.update_traces(line_width=1, opacity=0.8)
fig.show(config=dict(scrollZoom=False))

## D. `dayofweek`별 시각화

In [36]:
# Cast dayofweek to strings so plotly colors it as discrete categories, then sort
# by it (presumably so the legend entries appear in weekday order).
tidydata = (
    df_feature_small2
    .assign(dayofweek=lambda frame: frame['dayofweek'].astype(str))
    .sort_values('dayofweek')
)

# One line per trip id (rows sharing an id are joined), colored by day of week.
fig = px.line_mapbox(
    tidydata,
    lon="lon",
    lat="lat",
    line_group='id',
    color='dayofweek',
    center=dict(lat=40.7322, lon=-73.9052),
    #---#
    zoom=10,
    mapbox_style='carto-positron',
    height=600,
    width=750,
)
# Marker layer on the same points: marker size follows trip_duration, capped at 10.
trace_data = px.scatter_mapbox(
    tidydata,
    lon='lon',
    lat='lat',
    color='dayofweek',
    size='trip_duration',
    size_max=10,
    center=dict(lat=40.7322, lon=-73.9052),
).data
fig.add_traces(trace_data)
# Thin lines and slight transparency, applied to every trace in the figure.
fig.update_traces(line_width=1, opacity=0.8)
fig.show(config=dict(scrollZoom=False))

## E. `speed`별 시각화

In [37]:
# Preview: bin speed into four equal-frequency intervals (quartiles) as a new column.
df_feature_small2.assign(speed_cut=lambda frame: pd.qcut(frame['speed'], q=4))

In [38]:
# Bin speed into four equal-frequency intervals and sort by bin
# (presumably so the legend entries run from slowest to fastest).
tidydata = (
    df_feature_small2
    .assign(speed_cut=lambda frame: pd.qcut(frame['speed'], q=4))
    .sort_values('speed_cut')
)
# One line per trip id (rows sharing an id are joined), colored by speed bin.
fig = px.line_mapbox(
    tidydata,
    lon="lon",
    lat="lat",
    line_group='id',
    color='speed_cut',
    center=dict(lat=40.7322, lon=-73.9052),
    #---#
    zoom=10,
    mapbox_style='carto-positron',
    height=600,
    width=750,
)
# Marker layer on the same points: marker size follows trip_duration, capped at 10.
trace_data = px.scatter_mapbox(
    tidydata,
    lon='lon',
    lat='lat',
    color='speed_cut',
    size='trip_duration',
    size_max=10,
    center=dict(lat=40.7322, lon=-73.9052),
).data
fig.add_traces(trace_data)
# Thin lines and slight transparency, applied to every trace in the figure.
fig.update_traces(line_width=1, opacity=0.8)
fig.show(config=dict(scrollZoom=False))




