# Feature Engineering - Expedia Hotel dataset

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import tensorflow as tf
import warnings
import math
from math import sqrt

import sys
import holidays

import datetime

In [None]:
# Load the first 150,000 training rows plus the destination feature table.
# (The comma is read_csv's default separator.)
df = pd.read_csv('../data/hotel_data/train.csv', nrows=150_000)
destinations = pd.read_csv('../data/hotel_data/destinations.csv')
df.shape

In [None]:
# Attach ten destination latent features (per the original note: the ten
# columns most correlated with the rating column). Inner join on the
# destination id, which is pd.merge's default.
destination_features = ['d33', 'd64', 'd52', 'd120', 'd72', 'd136', 'd7', 'd59', 'd50', 'd30']
df = df.merge(
    destinations[['srch_destination_id'] + destination_features],
    on='srch_destination_id',
    how='inner',
)

In [None]:
# Rename the two target-related columns: hotel_cluster -> item_id, is_booking -> rating.
column_aliases = {'hotel_cluster': 'item_id', 'is_booking': 'rating'}
df = df.rename(columns=column_aliases)

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# (rows, columns) of the working frame at this point.
df.shape

## Feature Engineering

## date_time

### Extract week and month

In [None]:
# Order rows by date_time and renumber the index from 0,
# discarding the old index instead of keeping it as a column.
df = df.sort_values("date_time").reset_index(drop=True)

In [None]:
# Parse date_time once; `infer_datetime_format` is deprecated in pandas 2.x
# (format inference is now the default), so it is no longer passed.
click_timestamps = pd.to_datetime(df["date_time"])
# Keep date_time as a 'YYYY-MM-DD' string: later cells call strptime on it.
df["date_time"] = click_timestamps.dt.strftime('%Y-%m-%d')
# Midnight-normalised timestamp, equal to re-parsing the date-only string.
df["date_time_timestamp"] = click_timestamps.dt.normalize()

In [None]:
# Number of rows (non-null ratings) per calendar day, drawn as a line chart.
daily_counts = df.groupby('date_time')['rating'].count()
ax = daily_counts.plot(color='blue', figsize=(20, 4))
ax.set(xlabel="Date", ylabel='Number', title="Number of Customers Visit")
plt.show()

In [None]:
# Shift every (date-only) date_time string forward by 14 days.
d = datetime.timedelta(days=14)
df['lagged_date_time'] = pd.to_datetime(df["date_time"], format="%Y-%m-%d") + d

#df["str_lagged_date_time"] = df.lagged_date_time.dt.strftime('%Y-%m-%d')
#df["lagged_date_time"][1].strftime("%B")
#df["date_time_timestamp"].apply(lambda x: x.strftime("%Y-%U-%w"))

In [None]:
# Constant helper column: one visit per row, counted per week in the next cell.
df['num_visit']=1

In [None]:
# Count visits per week of the lagged date (weekly bins anchored on Monday),
# then index the result by that date in ascending order.
weekly_grouper = pd.Grouper(key='lagged_date_time', freq='W-MON')
weekly_visits = df.groupby([weekly_grouper])['num_visit'].count()
df_test = (
    weekly_visits
    .reset_index()
    .set_index('lagged_date_time')
    .sort_values('lagged_date_time')
)

In [None]:
# Move the lagged_date_time index back into a regular column.
df_test=df_test.reset_index()

In [None]:
# Display the weekly visit counts.
df_test

In [None]:
def extract_week(feature,week,lag):
    """Add a continuous week-number column to the global ``df``.

    The date column ``feature`` is reduced to a 'YYYY-MM-DD' string (as before),
    optionally shifted forward by 14 days, and converted to an ISO week number
    that keeps increasing across years instead of restarting at 1.

    Parameters
    ----------
    feature : str
        Name of the date column in ``df`` to read (and normalise in place).
    week : str
        Name of the output column holding the continuous week number.
    lag : bool
        If truthy, the week is computed from the date shifted by 14 days;
        otherwise from the date itself.

    Side effects
    ------------
    Writes the helper columns 'lag_date_time', 'week' (ISO week) and 'year'
    (ISO year) plus the ``week`` output column on the global ``df``.
    """
    base_year = 2013   # weeks of this ISO year keep their plain number
    lag_days = 14 if lag else 0

    # Read the requested column (the original always read df.date_time).
    timestamps = pd.to_datetime(df[feature]).dt.normalize()
    df[feature] = timestamps.dt.strftime('%Y-%m-%d')
    df['lag_date_time'] = timestamps + pd.Timedelta(days=lag_days)

    # DatetimeIndex.week was removed in pandas 2; isocalendar() is its replacement.
    # Pairing the ISO week with the ISO *year* keeps dates such as 2013-12-30
    # (ISO week 1 of 2014) from being numbered as week 1 of 2013.
    iso = df['lag_date_time'].dt.isocalendar()
    df['week'] = iso['week'].astype('int64')
    df['year'] = iso['year'].astype('int64')

    # Continue week numbers into following years, 52 per year.
    # NOTE: ISO years occasionally have 53 weeks (e.g. 2015), so numbering
    # beyond that year would be off by one.
    df[week] = df['week'] + 52 * (df['year'] - base_year)

In [None]:
# Derive 'click_week' from date_time with the lag enabled.
extract_week('date_time','click_week',lag=True)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Calendar month (1-12) of the click date.
click_dates = pd.to_datetime(df['date_time'])
df['click_month'] = click_dates.dt.month

## checkin_month , checkout_month

In [None]:
# Parse each stay date once, then pull out month and year.
checkin_dates = pd.DatetimeIndex(df['srch_ci'])
checkout_dates = pd.DatetimeIndex(df['srch_co'])

df['checkin_month'] = checkin_dates.month
df['checkout_month'] = checkout_dates.month

df['checkin_year'] = checkin_dates.year
df['checkout_year'] = checkout_dates.year

## add holiday

In [None]:
# Define holidays in some countries
# Holiday calendars for Canada and the United States, from the `holidays` package.
ca_holidays = holidays.Canada()
us_holidays = holidays.UnitedStates()

In [None]:
# Flag check-in / check-out dates that fall on a US or Canadian holiday.
# `x in (us_holidays or ca_holidays)` only ever consulted ONE calendar, because
# `a or b` evaluates to a single operand; each calendar must be tested separately.
def _is_north_american_holiday(day):
    """Return 1 if `day` is a holiday in the US or Canadian calendar, else 0."""
    return 1 if (day in us_holidays or day in ca_holidays) else 0

df['north_am_ci'] = df['srch_ci'].apply(_is_north_american_holiday)
df['north_am_co'] = df['srch_co'].apply(_is_north_american_holiday)

In [None]:
# Remove the raw date columns and the temporary helper columns in one call.
date_helper_columns = [
    'date_time',
    'week',
    'year',
    'srch_ci',
    'srch_co',
    'lag_date_time',
    'date_time_timestamp',
    'lagged_date_time',
    'num_visit',
]
df = df.drop(columns=date_helper_columns)

In [None]:
# Columns remaining after the date helpers were dropped.
df.columns

## site_name
ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, …)

In [None]:
# Row count per site_name, bars ordered by site id.
site_ids_sorted = sorted(df['site_name'].unique())
sns.countplot(data=df, x='site_name', palette='hls', order=site_ids_sorted)
plt.show()

In [None]:
# Row count per site_name, bars ordered from most to least frequent.
site_ids_by_frequency = df['site_name'].value_counts().index
sns.countplot(data=df, x='site_name', palette='hls', order=site_ids_by_frequency)
plt.show()

## Check for poisson distribution

In [None]:
from statistics import variance, mean

# Mean vs. sample variance of site_name (roughly equal values would be
# consistent with a Poisson distribution). Vectorised pandas reductions replace
# the pure-Python statistics functions; Series.var() uses ddof=1, i.e. the same
# sample variance that statistics.variance computes.
df['site_name'].mean(), df['site_name'].var()


### orig_destination_distance
Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated

In [None]:
# Histogram of the raw distance. sns.distplot is deprecated; histplot is the
# replacement for the kde=False case (histplot draws no KDE by default).
plt.figure(figsize=(8, 4))
sns.histplot(df['orig_destination_distance'], bins=40, color='b')

In [None]:
# Base-10 log transform of the distance.
# 1 is added to the raw distance so that a value of zero maps to 0
# instead of sending the logarithm to negative infinity.
df['log_orig_destination_distance'] = np.log10(df['orig_destination_distance'] + 1)

In [None]:
# Histogram of the log-transformed distance. sns.distplot is deprecated;
# histplot is the replacement for the kde=False case.
plt.figure(figsize=(8, 4))
sns.histplot(df['log_orig_destination_distance'], bins=40, color='b')

In [None]:
# Keep only the log-scaled distance.
df = df.drop(columns='orig_destination_distance')

## posa_continent
ID of continent associated with site_name

In [None]:
# Row count per posa_continent value.
sns.countplot(data=df, x='posa_continent', color='#2E86C1')
plt.show()

## user_location_country
The ID of the country the customer is located in

In [None]:
# Row count per user_location_country value.
sns.countplot(data=df, x='user_location_country', color='#2E86C1')
plt.show()

## user_location_region
The ID of the region the customer is located in

In [None]:
# Row count per user_location_region value.
sns.countplot(data=df, x='user_location_region', color='#2E86C1')
plt.show()

## Create clusters

In [None]:
# Columns available as clustering input.
df.columns

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
from sklearn.cluster import KMeans
def create_cluster(feature, max_clusters=10):
    """Plot the elbow curve for K-Means fitted on the global ``df`` without ``feature``.

    For every cluster count from 1 to ``max_clusters`` a K-Means model is fitted
    on all columns of ``df`` except ``feature`` and its within-cluster sum of
    squares (``inertia_``) is recorded; the values are then plotted against the
    cluster count.

    Parameters
    ----------
    feature : str
        Column excluded from the clustering input.
    max_clusters : int, default 10
        Largest number of clusters to try (the original hard-coded 10).

    Raises
    ------
    ValueError
        If ``max_clusters`` is smaller than 1.
    """
    if max_clusters < 1:
        raise ValueError("max_clusters must be at least 1")

    X = df.drop(columns=feature)
    cluster_counts = range(1, max_clusters + 1)
    wcss = []
    for n_clusters in cluster_counts:
        # Fixed random_state keeps the curve reproducible between runs.
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
        kmeans.fit(X)
        wcss.append(kmeans.inertia_)

    plt.plot(cluster_counts, wcss)
    plt.title('The Elbow Method')
    plt.xlabel('number of clusters')
    plt.ylabel('wcss')
    plt.show()

In [None]:
# Elbow plot for K-Means on every column except user_location_region.
create_cluster('user_location_region')

In [None]:
# Fit a 2-cluster K-Means on every column except user_location_region and
# replace that column with the resulting cluster label.
X = df.drop(columns="user_location_region")
kmeansmodel = KMeans(n_clusters=2, init='k-means++', random_state=0)
y_kmeans = kmeansmodel.fit_predict(X)
df = (
    df.assign(kmeans_user_location_region=y_kmeans)
      .drop(columns='user_location_region')
)

## user_location_city
The ID of the city the customer is located in

In [None]:
# Row count per user_location_city value.
sns.countplot(data=df, x='user_location_city', color='#2E86C1')
plt.show()

In [None]:
# Elbow plot for K-Means on every column except user_location_city.
create_cluster('user_location_city')

In [None]:
# Fit a 2-cluster K-Means on every column except user_location_city and
# replace that column with the resulting cluster label.
X = df.drop(columns="user_location_city")
kmeansmodel = KMeans(n_clusters=2, init='k-means++', random_state=0)
y_kmeans = kmeansmodel.fit_predict(X)
df = (
    df.assign(kmeans_user_location_city=y_kmeans)
      .drop(columns='user_location_city')
)

In [None]:
# Preview the first rows of the frame.
df.head()

## channel:
ID of a marketing channel

In [None]:
# Row count per marketing channel id.
sns.countplot(data=df, x='channel', color='#2E86C1')
plt.show()

## srch_adults_cnt , srch_children_cnt
The number of adults specified in the hotel room

The number of (extra occupancy) children specified in the hotel room


In [None]:
# Row count per number of adults in the search.
sns.countplot(data=df, x='srch_adults_cnt', color='#2E86C1')
plt.show()

In [None]:
# Row count per number of children in the search.
sns.countplot(data=df, x='srch_children_cnt', color='#2E86C1')
plt.show()

In [None]:
# Derive a categorical family_status from the adult / child counts.
# np.select takes the FIRST matching condition, so the specific two-adult cases
# must stay ahead of the broader 'big_family' rule.
condlist = [(df['srch_adults_cnt']==0) & (df['srch_children_cnt']==0),
            (df['srch_adults_cnt']==2) & (df['srch_children_cnt']==0),
            (df['srch_adults_cnt']==2) & (df['srch_children_cnt']==1),
            (df['srch_adults_cnt']==2) & (df['srch_children_cnt']==2),
            (df['srch_adults_cnt']==1) & (df['srch_children_cnt']==0),
            (df['srch_adults_cnt']>1) & (df['srch_children_cnt']>0),
            (df['srch_adults_cnt']==1) & (df['srch_children_cnt'] > 0),
            (df['srch_adults_cnt']>2) & (df['srch_children_cnt'] == 0),
            (df['srch_adults_cnt']==0) & (df['srch_children_cnt'] > 0)]

choicelist = ['empty_room',
                'couple_with_no_children',
                'couple_with_one_child',
                'couple_with_two_children',
                'single',
                'big_family',
                'single_parent',
                'friends',
                'unsupervised_children']

# np.select's default fill value is the integer 0, which has no common dtype
# with the string choices (recent NumPy versions raise on that mix). A string
# default keeps the result a string array; for non-negative counts every row
# matches one of the conditions above, so the fallback label is not expected
# to appear.
df['family_status'] = np.select(condlist, choicelist, default='other')

In [None]:
# One-hot encode family_status (first category dropped) and swap the
# original column for the indicator columns.
dummies = pd.get_dummies(df['family_status'], drop_first=True)
df = pd.concat([df.drop(columns='family_status'), dummies], axis='columns')

In [None]:
# Drop the 'unsupervised_children' indicator. The column only exists when at
# least one row has that status, so a missing column is ignored rather than
# raising a KeyError.
df = df.drop(columns="unsupervised_children", errors="ignore")

## srch_rm_cnt
The number of hotel rooms specified in the search

In [None]:
# Row count per number of rooms in the search.
sns.countplot(data=df, x='srch_rm_cnt', color='#2E86C1')
plt.show()

## srch_destination_id
ID of the destination where the hotel search was performed

In [None]:
# Number of distinct destinations. This cell sits under the srch_destination_id
# heading but counted the unique values of 'cnt'.
df["srch_destination_id"].nunique()

## srch_destination_type_id
Type of destination

## cnt
Number of similar events in the context of the same user session

In [None]:
from statistics import variance, mean

# Mean vs. sample variance of cnt (roughly equal values would be consistent
# with a Poisson distribution). Vectorised pandas reductions replace the
# pure-Python statistics functions; Series.var() uses ddof=1, i.e. the same
# sample variance that statistics.variance computes.
df['cnt'].mean(), df['cnt'].var()

In [None]:
# Row count per cnt value.
sns.countplot(data=df, x='cnt', color='#2E86C1')
plt.show()

## z-score normalizing

In [None]:
# Standardise cnt: subtract its mean and divide by its (sample) standard deviation.
cnt_mean = df['cnt'].mean()
cnt_std = df['cnt'].std()
df['cnt'] = (df['cnt'] - cnt_mean) / cnt_std

In [None]:
# No `scaler` object is defined in the visible notebook, so the original
# `scaler.transform(df['cnt'])` raised a NameError on a fresh run; cnt has
# also already been standardised by the z-score cell. Show its mean and
# standard deviation instead (expected to be about 0 and 1).
df['cnt'].agg(['mean', 'std'])

In [None]:
# Row count per (now standardised) cnt value.
sns.countplot(data=df, x='cnt', color='#2E86C1')
plt.show()

## check for poisson distribution

In [None]:
# Mean and sample variance of cnt, computed with vectorised pandas reductions
# (Series.var() uses ddof=1, matching statistics.variance).
df['cnt'].mean(), df['cnt'].var()

- `hotel_continent`: Hotel continent
- `hotel_country`: Hotel country
- `hotel_market`: Hotel market

In [None]:
# Preview the first rows of the engineered frame.
df.head()