# 12wk-2: NYCTaxi 자료 분석 (2)

최규빈  
2023-11-22

# 1. 강의영상

<https://youtu.be/playlist?list=PLQqh36zP38-wFUcEr7JtSb7MWUYl6QrW_&si=CmWA765p0mnwWTGA>

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

In [2]:
# Use the white Plotly theme as the default template for figures.
pio.templates.default = "plotly_white"
# Route DataFrame.plot / Series.plot through Plotly.
pd.set_option("plotting.backend", "plotly")

# 3. 데이터준비

In [3]:
# Load the raw trips and keep every 100th row, renumbering the index from 0.
df = pd.read_csv("NYCTaxi.csv")[::100].reset_index(drop=True)

# Derived features. The first assign() only needs raw columns; the second one
# uses callables because it reads columns created by the first.
df_feature = df.assign(
    log_trip_duration=np.log(df.trip_duration),
    # Parse each timestamp column with one vectorized call instead of
    # invoking pd.to_datetime once per row via Series.apply.
    pickup_datetime=pd.to_datetime(df.pickup_datetime),
    dropoff_datetime=pd.to_datetime(df.dropoff_datetime),
    # Straight-line distance between pickup and dropoff in raw
    # latitude/longitude units (not a geodesic distance).
    dist=np.sqrt(
        (df.pickup_latitude - df.dropoff_latitude) ** 2
        + (df.pickup_longitude - df.dropoff_longitude) ** 2
    ),
    # Relabel vendor codes 1/2 as 'A'/'B'; any other code maps to NaN.
    vendor_id=df.vendor_id.map({1: 'A', 2: 'B'}),
).assign(
    # `d` is the intermediate frame produced by the first assign(); it is named
    # differently from the global `df` to avoid shadowing it.
    speed=lambda d: d.dist / d.trip_duration,
    pickup_hour=lambda d: d.pickup_datetime.dt.hour,
    dropoff_hour=lambda d: d.dropoff_datetime.dt.hour,
    dayofweek=lambda d: d.pickup_datetime.dt.dayofweek,
)

# 4. 시각화3 – 애니메이션

## A. scatter / (vendor_id,hour)

`-` 시각화

In [4]:
# Pickup locations on a map, animated with one frame per pickup hour.
# Rows are sorted by hour first, presumably so the frames appear in hour order.
pickups_by_hour = df_feature.sort_values('pickup_hour')
map_center = {'lat':40.7322, 'lon':-73.9052}
fig = px.scatter_mapbox(
    data_frame=pickups_by_hour,
    # Position and framing of the map.
    lat='pickup_latitude',
    lon='pickup_longitude',
    center=map_center,
    zoom=10,
    mapbox_style='carto-positron',
    # Marker encoding: colour by vendor, size by passenger count.
    color='vendor_id',
    size='passenger_count',
    size_max=5,
    animation_frame='pickup_hour',
    # Figure size in pixels.
    width=750,
    height=600,
)
fig.show(config={'scrollZoom': False})

-   B가 전체적으로 동그라미가 더 큰 것 같지 않아..?
-   시간대별로 확실히 빈도가 다르다

`-` 추가시각화1 – `vendor_id`별 `passenger_count`를 barplot으로 시각화

In [5]:
# Mean passenger_count per vendor, as a two-column frame (vendor_id, passenger_count).
mean_passengers = (
    df_feature
    .groupby('vendor_id')
    .agg({'passenger_count':'mean'})
    .reset_index()
)
# Bar chart with the vendor on the y-axis and the mean on the x-axis.
mean_passengers.plot.bar(y='vendor_id',x='passenger_count',color='vendor_id')

-   B가 평균적으로 승객수가 더 많다. (B는 대형차량 위주로 빌려주는
    회사인가?)

`-` 추가시각화2 – `vendor_id`별 `passenger_count`를 boxplot으로 시각화

In [6]:
# Box plot of passenger_count for each vendor, coloured by vendor.
box_options = {'x': 'vendor_id', 'y': 'passenger_count', 'color': 'vendor_id'}
df_feature.plot.box(**box_options)

`-` 추가시각화3 – `vendor_id`별 `passenger_count`를 histogram으로 시각화

In [7]:
# Histogram of passenger_count, one facet column (and colour) per vendor.
hist_options = {
    'x': 'passenger_count',
    'color': 'vendor_id',
    'facet_col': 'vendor_id',
}
df_feature.plot.hist(**hist_options)

-   1~4인 손님의 경우 A,B의 모양이 비슷한데, B는 5인 이상의 손님이
    특이하게 많은 편이다. (B가 A에 비하여 1인 손님이 좀 더 적고, 2인
    손님은 좀 더 많긴 함)

`-` 추가시각화4 – `pickup_hour`별 `count`를 barplot으로 시각화

In [8]:
# Number of trips per pickup hour, ordered by hour rather than by frequency.
trips_per_hour = df_feature['pickup_hour'].value_counts().sort_index()
trips_per_hour.plot.bar()

`-` 추가시각화5 – (`pickup_hour`,`vendor_id`)별 `count`를 barplot으로
시각화

In [9]:
# Trip count for every (vendor_id, pickup_hour) pair in long format.
counts_by_vendor_hour = (
    df_feature
    .groupby(['vendor_id','pickup_hour'])
    .size()
    .reset_index(name='count')
)
# One bar-chart facet per vendor.
counts_by_vendor_hour.plot.bar(
    x='pickup_hour',
    y='count',
    color='vendor_id',
    facet_col='vendor_id',
)

`-` 추가시각화6 – (`pickup_hour`,`vendor_id`)별 `count`를 areaplot으로
시각화

In [10]:
# Trip count for every (vendor_id, pickup_hour) pair in long format.
counts_by_vendor_hour = (
    df_feature
    .groupby(['vendor_id','pickup_hour'])
    .size()
    .reset_index(name='count')
)
# Area chart of the hourly counts, one coloured area per vendor.
counts_by_vendor_hour.plot.area(x='pickup_hour',y='count',color='vendor_id')

`-` 추가시각화7 – (`pickup_hour`,`vendor_id`)별 `count`를 lineplot으로
시각화

In [11]:
# Trip count for every (vendor_id, pickup_hour) pair in long format.
counts_by_vendor_hour = (
    df_feature
    .groupby(['vendor_id','pickup_hour'])
    .size()
    .reset_index(name='count')
)
# Line chart of the hourly counts, one coloured line per vendor.
counts_by_vendor_hour.plot.line(x='pickup_hour',y='count',color='vendor_id')

## B. scatter / (vendor_id,dayofweek)

In [12]:
# Pickup locations on a map, animated with one frame per day of the week.
# Rows are sorted by dayofweek first, presumably so the frames appear in weekday order.
pickups_by_weekday = df_feature.sort_values('dayofweek')
map_center = {'lat':40.7322, 'lon':-73.9052}
fig = px.scatter_mapbox(
    data_frame=pickups_by_weekday,
    # Position and framing of the map.
    lat='pickup_latitude',
    lon='pickup_longitude',
    center=map_center,
    zoom=10,
    mapbox_style='carto-positron',
    # Marker encoding: colour by vendor, size by passenger count.
    color='vendor_id',
    size='passenger_count',
    size_max=5,
    animation_frame='dayofweek',
    # Figure size in pixels.
    width=750,
    height=600,
)
fig.show(config={'scrollZoom': False})

-   생각보다 요일별 특징은 그다지 뚜렷하지 않음

# 5. 시각화4 – `heatmap`

## A. (요일,시간)에 따른 `count` 시각화

In [13]:
# Trip counts per (pickup_hour, dayofweek): build an hour x weekday table,
# then stack it back to long format with one row per cell.
count_table = df_feature.pivot_table(
    index='pickup_hour',
    columns='dayofweek',
    aggfunc='size',
)
tidydata = count_table.stack().reset_index(name='count')
# Heatmap of the counts; 24 x-bins (hours) and 7 y-bins (weekdays) are requested.
px.density_heatmap(
    data_frame=tidydata,
    x='pickup_hour',
    y='dayofweek',
    z='count',
    nbinsx=24,
    nbinsy=7,
    height=500,
)

-   노란색: 불금? 피크타임?

In [14]:
# NOTE(review): scratch arithmetic (evaluates to 720); its relation to the
# surrounding analysis is not evident from this notebook — confirm before keeping.
240*3

## B. (요일,시간)에 따른 `dist` 시각화

In [15]:
# Mean `dist` per (pickup_hour, dayofweek): build an hour x weekday table,
# then stack it back to long format with one row per cell.
mean_dist_table = df_feature.pivot_table(
    index='pickup_hour',
    columns='dayofweek',
    values='dist',
    aggfunc='mean',
)
tidydata = mean_dist_table.stack().reset_index(name='dist')
# Heatmap of the means; 24 x-bins (hours) and 7 y-bins (weekdays) are requested.
px.density_heatmap(
    data_frame=tidydata,
    x='pickup_hour',
    y='dayofweek',
    z='dist',
    nbinsx=24,
    nbinsy=7,
    height=500,
)

-   노란색: 일요일 아침부터 장거리.. (관광객? 공항가는 사람일까?
    놀러가는 사람일까?)

## C. (요일,시간)에 따른 `speed` 시각화

In [16]:
# Mean `speed` per (pickup_hour, dayofweek): build an hour x weekday table,
# then stack it back to long format with one row per cell.
mean_speed_table = df_feature.pivot_table(
    index='pickup_hour',
    columns='dayofweek',
    values='speed',
    aggfunc='mean',
)
tidydata = mean_speed_table.stack().reset_index(name='speed')
# Heatmap of the means; 24 x-bins (hours) and 7 y-bins (weekdays) are requested.
px.density_heatmap(
    data_frame=tidydata,
    x='pickup_hour',
    y='dayofweek',
    z='speed',
    nbinsx=24,
    nbinsy=7,
    height=500
)

-   남색 = 평균 속력이 낮은, 즉 교통체증이 심한 (요일,시간)대

# 6. 시각화5 – 경로시각화

`-` 이거는 너무 무거워서 좀 작은 데이터로 실습합니다.

In [17]:
# Thin the feature table further: keep every 100th row and renumber from 0.
every_100th = df_feature.iloc[::100]
df_feature_small = every_100th.reset_index(drop=True)

## A. 예비학습

## B. 전처리

In [18]:
# Column names describing each end of a trip, in (datetime, longitude, latitude) order.
_ENDPOINT_FIELDS = ('datetime', 'longitude', 'latitude')
pcols = [f'pickup_{field}' for field in _ENDPOINT_FIELDS]
dcols = [f'dropoff_{field}' for field in _ENDPOINT_FIELDS]

In [19]:
def transform(df):
    """Reshape trips into long format: one pickup row and one dropoff row per trip.

    The pickup columns (`pcols`) and dropoff columns (`dcols`) are each
    selected together with `id`, renamed to the common names
    `id, datetime, lon, lat`, and tagged with a `state` column holding
    "pickup" or "dropoff". All pickup rows come first, followed by all
    dropoff rows, and the result gets a fresh 0-based index.
    """
    long_names = ['id', 'datetime', 'lon', 'lat']
    endpoints = [
        df.loc[:, ['id'] + cols].set_axis(long_names, axis=1).assign(state=label)
        for cols, label in ((pcols, 'pickup'), (dcols, 'dropoff'))
    ]
    return pd.concat(endpoints, axis=0).reset_index(drop=True)

In [20]:
# Long-format endpoints: for each trip id, its pickup row followed by its
# dropoff row, with ids in ascending order.
# transform() runs once on the whole frame (all pickups, then all dropoffs);
# a stable sort on `id` then yields the same per-id ordering without looping
# over groups in Python, and also works on an empty frame.
# NOTE(review): assumes every row has a non-null `id` — confirm.
df_left = (
    transform(df_feature_small)
    .sort_values('id', kind='stable')
    .reset_index(drop=True)
)
# Trip-level attributes: everything except the endpoint columns now held in df_left.
df_right = df_feature_small.drop(pcols+dcols,axis=1)
# Join on the trip id explicitly instead of on whatever columns happen to overlap.
# NOTE(review): assumes the CSV has no columns named datetime/lon/lat/state,
# so `id` is the only shared column — confirm.
df_feature_small2 = pd.merge(df_left,df_right,on='id')

In [21]:
# Show the merged long-format table as the cell output for inspection.
df_feature_small2

## C. `vendor_id` 별 시각화

In [22]:
# Trips drawn as lines on the map: one line per trip id, coloured by vendor.
map_center = {'lat':40.7322, 'lon':-73.9052}
fig = px.line_mapbox(
    data_frame=df_feature_small2,
    lat="lat",
    lon="lon",
    line_group='id',
    color='vendor_id',
    center=map_center,
    zoom=10,
    mapbox_style='carto-positron',
    width=750,
    height=600,
)
# Build endpoint markers (sized by trip_duration) in a separate scatter figure
# and copy only its traces onto the line figure.
trace_data = px.scatter_mapbox(
    data_frame=df_feature_small2,
    lat='lat',
    lon='lon',
    color='vendor_id',
    size='trip_duration',
    size_max=10,
    center=map_center,
).data
for marker_trace in trace_data:
    fig.add_trace(marker_trace)
# Apply a line width of 1 and opacity 0.8 to every trace in the figure.
fig.update_traces(line={'width': 1}, opacity=0.8)
fig.show(config={'scrollZoom': False})

## D. `dayofweek`별 시각화

In [23]:
# Cast dayofweek to strings (presumably so it is treated as a discrete colour
# category) and order the rows by that label.
weekday_labels = df_feature_small2.dayofweek.astype(str)
tidydata = df_feature_small2.assign(dayofweek=weekday_labels).sort_values('dayofweek')

# Trips drawn as lines on the map: one line per trip id, coloured by day of week.
map_center = {'lat':40.7322, 'lon':-73.9052}
fig = px.line_mapbox(
    data_frame=tidydata,
    lat="lat",
    lon="lon",
    line_group='id',
    color='dayofweek',
    center=map_center,
    zoom=10,
    mapbox_style='carto-positron',
    width=750,
    height=600,
)
# Build endpoint markers (sized by trip_duration) in a separate scatter figure
# and copy only its traces onto the line figure.
trace_data = px.scatter_mapbox(
    data_frame=tidydata,
    lat='lat',
    lon='lon',
    color='dayofweek',
    size='trip_duration',
    size_max=10,
    center=map_center,
).data
for marker_trace in trace_data:
    fig.add_trace(marker_trace)
# Apply a line width of 1 and opacity 0.8 to every trace in the figure.
fig.update_traces(line={'width': 1}, opacity=0.8)
fig.show(config={'scrollZoom': False})

## E. `speed`별 시각화

In [24]:
# Preview: bin `speed` into 4 quantile-based intervals and show the table
# with the bins attached as `speed_cut` (the result is not stored).
speed_quartiles = pd.qcut(df_feature_small2.speed, 4)
df_feature_small2.assign(speed_cut=speed_quartiles)

In [25]:
# Bin `speed` into 4 quantile-based intervals and order the rows by bin.
speed_quartiles = pd.qcut(df_feature_small2.speed, 4)
tidydata = df_feature_small2.assign(speed_cut=speed_quartiles).sort_values('speed_cut')

# Trips drawn as lines on the map: one line per trip id, coloured by speed bin.
map_center = {'lat':40.7322, 'lon':-73.9052}
fig = px.line_mapbox(
    data_frame=tidydata,
    lat="lat",
    lon="lon",
    line_group='id',
    color='speed_cut',
    center=map_center,
    zoom=10,
    mapbox_style='carto-positron',
    width=750,
    height=600,
)
# Build endpoint markers (sized by trip_duration) in a separate scatter figure
# and copy only its traces onto the line figure.
trace_data = px.scatter_mapbox(
    data_frame=tidydata,
    lat='lat',
    lon='lon',
    color='speed_cut',
    size='trip_duration',
    size_max=10,
    center=map_center,
).data
for marker_trace in trace_data:
    fig.add_trace(marker_trace)
# Apply a line width of 1 and opacity 0.8 to every trace in the figure.
fig.update_traces(line={'width': 1}, opacity=0.8)
fig.show(config={'scrollZoom': False})




