## Causal Inference
with the `dowhy` package

In [25]:
from dowhy import CausalModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib setup for Korean labels: select a Hangul-capable font family and
# draw negative numbers with an ASCII hyphen instead of the Unicode minus sign.
# NOTE(review): 'AppleGothic' is presumably only installed on macOS — confirm
# before running this notebook on another platform.
plt.rcParams.update({
    'font.family': 'AppleGothic',
    'axes.unicode_minus': False,
})

# Pandas display: do not truncate the column list when a frame is rendered.
pd.set_option('display.max_columns', None)

In [29]:
# Dataset
# Columns removed right after loading: the aggregate crime column and the
# per-sex / per-age-group / per-time-slot breakdowns of the living population
# (going by the column names).
cols_unused = [
    '전체범죄',
    '남성_생활인구', '여성_생활인구',
    '연령대_10_생활인구', '연령대_20_생활인구', '연령대_30_생활인구',
    '연령대_40_생활인구', '연령대_50_생활인구', '연령대_60_이상_생활인구',
    '시간대_1_생활인구_수', '시간대_2_생활인구_수', '시간대_3_생활인구_수',
    '시간대_4_생활인구_수', '시간대_5_생활인구_수', '시간대_6_생활인구_수',
]

# Per-crime columns; each is converted to an ordered categorical below.
cols_crime = ['살인', '강도', '강간', '절도', '폭력', '방화', '마약', '약취', '도박']

# Ordered levels 1 < 2 < 3 < 4 < 5. Values outside this set become NaN when
# a column is cast to this dtype.
t = pd.CategoricalDtype(categories=[1,2,3,4,5], ordered=True)

df = (
    pd.read_csv('df_preprocessed.csv')
    .drop(columns=cols_unused)
    .astype({crime_col: t for crime_col in cols_crime})
    # Rows without a total living-population value are discarded.
    .dropna(axis=0, subset=['총_생활인구'])
)

In [30]:
# Reconstruct categorical
# Collapse the one-hot '상권타입*' indicator columns of `df` back into a single
# unordered categorical column named '상권타입'. The indicator columns are
# dropped afterwards, so this cell is meant to run once per freshly loaded `df`.
onehot_cols = df.columns[df.columns.str.contains('상권타입')]
commercial_flags = df[onehot_cols].copy()
# Labels are attached by position, so the indicator columns must appear in
# exactly this order (a different column count raises here).
# NOTE(review): confirm the order against the preprocessing step.
commercial_flags.columns = ['골목상권','관광특구','발달상권','전통시장']

# Every row must carry exactly one active indicator. Without this check a row
# with zero or several non-zero flags would make the label sequence a different
# length than `df`, or shift labels onto the wrong rows when the counts cancel.
active = (commercial_flags != 0).to_numpy()
active_per_row = active.sum(axis=1)
if not (active_per_row == 1).all():
    n_bad = int((active_per_row != 1).sum())
    raise ValueError(
        f"Cannot rebuild '상권타입': {n_bad} row(s) do not have exactly one "
        "active indicator column"
    )

# One label per row, in row order (position of the single active flag).
commercial_type = pd.Series(commercial_flags.columns[active.argmax(axis=1)])

df = df.drop(onehot_cols, axis=1)
# pd.Categorical is assigned positionally, so the gaps left in df's index by
# earlier row drops do not matter. Categories keep first-appearance order.
df['상권타입'] = pd.Categorical(commercial_type, categories=commercial_type.unique(), ordered=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1668 entries, 0 to 1670
Data columns (total 22 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Bus        1668 non-null   float64 
 1   Subway     1668 non-null   float64 
 2   유흥업소       1668 non-null   float64 
 3   살인         1668 non-null   category
 4   강도         1668 non-null   category
 5   강간         1668 non-null   category
 6   절도         1668 non-null   category
 7   폭력         1668 non-null   category
 8   방화         1668 non-null   category
 9   마약         1668 non-null   category
 10  약취         1668 non-null   category
 11  도박         1668 non-null   category
 12  112신고      1668 non-null   float64 
 13  총_생활인구     1668 non-null   float64 
 14  아파트_단지수    1486 non-null   float64 
 15  아파트_평균_시가  1486 non-null   float64 
 16  개업율        1662 non-null   float64 
 17  폐업율        1662 non-null   float64 
 18  프랜차이즈_침투율  1662 non-null   float64 
 19  매출액        1648 non-null   

In [3]:
from graphviz import Digraph
import pygraphviz

# Causal graph in Graphviz DOT syntax; node ids mirror the dataframe column
# names, with optional display labels.
# - "112신고" is double-quoted because a bare DOT identifier may not start with
#   a digit; unquoted, Graphviz reads it as the numeral 112 followed by a
#   second token, so the node would no longer match the column name.
# - `U->{}` is a placeholder: it declares no outgoing edges from the
#   unobserved-confounder node yet, and no other edges are defined either.
#   TODO: fill in the causal edges.
causal_graph = """digraph {
Bus[label="버스정류장 수"];
Subway[label="지하철역 수"];
유흥업소[label="유흥업소 개수"];
살인;
강도;
강간;
절도;
폭력;
방화;
마약;
약취;
도박;
"112신고"[label="112신고건수"];
총_생활인구[label="생활인구"];
아파트_단지수[label="아파트 단지 수"];
아파트_평균_시가[label="아파트 평균시가"];
개업율;
폐업율;
프랜차이즈_침투율[label="프랜차이즈 침투율"];
매출액;
매출건수;
상권타입;
U[label="Unobserved Confounders", observed="no"];
U->{}

}"""

SyntaxError: invalid syntax (868470948.py, line 4)

In [5]:
import pygraphviz

# Causal graph (Graphviz DOT) for the hotel-booking example. This rebinds
# `causal_graph`. The statements are listed one per DOT source line and joined
# with newlines, so the resulting text has no trailing newline.
causal_graph = "\n".join([
    'digraph {',
    # Nodes with display labels.
    'different_room_assigned[label="Different Room Assigned"];',
    'is_canceled[label="Booking Cancelled"];',
    'booking_changes[label="Booking Changes"];',
    'previous_bookings_not_canceled[label="Previous Booking Retentions"];',
    'days_in_waiting_list[label="Days in Waitlist"];',
    'lead_time[label="Lead Time"];',
    'market_segment[label="Market Segment"];',
    'country[label="Country"];',
    'U[label="Unobserved Confounders",observed="no"];',
    # Nodes declared without a label.
    'is_repeated_guest;',
    'total_stay;',
    'guests;',
    'meal;',
    'hotel;',
    # Edges. required_car_parking_spaces and total_of_special_requests are not
    # declared above; they only appear in edge statements.
    'U->{different_room_assigned,required_car_parking_spaces,guests,total_stay,total_of_special_requests};',
    'market_segment -> lead_time;',
    'lead_time->is_canceled; country -> lead_time;',
    'different_room_assigned -> is_canceled;',
    'country->meal;',
    'lead_time -> days_in_waiting_list;',
    'days_in_waiting_list ->{is_canceled,different_room_assigned};',
    'previous_bookings_not_canceled -> is_canceled;',
    'previous_bookings_not_canceled -> is_repeated_guest;',
    'is_repeated_guest -> {different_room_assigned,is_canceled};',
    'total_stay -> is_canceled;',
    'guests -> is_canceled;',
    'booking_changes -> different_room_assigned; booking_changes -> is_canceled; ',
    'hotel -> {different_room_assigned,is_canceled};',
    'required_car_parking_spaces -> is_canceled;',
    'total_of_special_requests -> {booking_changes,is_canceled};',
    'country->{hotel, required_car_parking_spaces,total_of_special_requests};',
    'market_segment->{hotel, required_car_parking_spaces,total_of_special_requests};',
    '}',
])

In [6]:
# Preview the DOT source on one line (newlines turned into spaces). Only the
# expression result is displayed; `causal_graph` itself is not modified.
" ".join(causal_graph.split("\n"))

'digraph { different_room_assigned[label="Different Room Assigned"]; is_canceled[label="Booking Cancelled"]; booking_changes[label="Booking Changes"]; previous_bookings_not_canceled[label="Previous Booking Retentions"]; days_in_waiting_list[label="Days in Waitlist"]; lead_time[label="Lead Time"]; market_segment[label="Market Segment"]; country[label="Country"]; U[label="Unobserved Confounders",observed="no"]; is_repeated_guest; total_stay; guests; meal; hotel; U->{different_room_assigned,required_car_parking_spaces,guests,total_stay,total_of_special_requests}; market_segment -> lead_time; lead_time->is_canceled; country -> lead_time; different_room_assigned -> is_canceled; country->meal; lead_time -> days_in_waiting_list; days_in_waiting_list ->{is_canceled,different_room_assigned}; previous_bookings_not_canceled -> is_canceled; previous_bookings_not_canceled -> is_repeated_guest; is_repeated_guest -> {different_room_assigned,is_canceled}; total_stay -> is_canceled; guests -> is_cancel

In [9]:
# Hotel bookings CSV, read directly over HTTP from a raw GitHub URL
# (requires network access).
hotel_bookings_url = (
    'https://raw.githubusercontent.com/Sid-darthvader/'
    'DoWhy-The-Causal-Story-Behind-Hotel-Booking-Cancellations/'
    'master/hotel_bookings.csv'
)
dataset = pd.read_csv(hotel_bookings_url)
dataset.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
