In [2]:
import numpy as np
import pandas as pd
import random

In [3]:
test = pd.read_csv('./test.csv')
test_nolabel = pd.read_csv('./test_nolabel.csv')
train = pd.read_csv('./train.csv')
train_label = pd.read_csv('./train_label.csv')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91531 entries, 0 to 91530
Data columns (total 33 columns):
ID                                91531 non-null int64
hotel                             91531 non-null object
is_canceled                       91531 non-null int64
lead_time                         91531 non-null int64
arrival_date_year                 91531 non-null int64
arrival_date_month                91531 non-null object
arrival_date_week_number          91531 non-null int64
arrival_date_day_of_month         91531 non-null int64
stays_in_weekend_nights           91531 non-null int64
stays_in_week_nights              91531 non-null int64
adults                            91531 non-null int64
children                          91527 non-null float64
babies                            91531 non-null int64
meal                              91531 non-null object
country                           91063 non-null object
market_segment                    91531 non-null object
dist

In [5]:
train.isna().sum()

ID                                    0
hotel                                 0
is_canceled                           0
lead_time                             0
arrival_date_year                     0
arrival_date_month                    0
arrival_date_week_number              0
arrival_date_day_of_month             0
stays_in_weekend_nights               0
stays_in_week_nights                  0
adults                                0
children                              4
babies                                0
meal                                  0
country                             468
market_segment                        0
distribution_channel                  0
is_repeated_guest                     0
previous_cancellations                0
previous_bookings_not_canceled        0
reserved_room_type                    0
assigned_room_type                    0
booking_changes                       0
deposit_type                          0
agent                             13217


In [39]:
df = train.drop(['ID', 'is_canceled', 'adr', 'reservation_status', 'reservation_status_date'], axis=1)

In [40]:
def preprocess(df):
    df['children'] = df['children'].fillna(-1)
    df['company'] = df['company'].fillna(-1).astype(object)
    df['agent'] = df['agent'].fillna(-1).astype(object)
    df['country'] = df['country'].fillna('NA')
    df['arrival_date_year'] = df['arrival_date_year'].astype(object)
    df['arrival_date_week_number'] = df['arrival_date_year'].astype(object)
    df['arrival_date_day_of_month'] = df['arrival_date_day_of_month'].astype(object)
    df['children'] = df['children'].astype(np.int64)
preprocess(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91531 entries, 0 to 91530
Data columns (total 28 columns):
hotel                             91531 non-null object
lead_time                         91531 non-null int64
arrival_date_year                 91531 non-null object
arrival_date_month                91531 non-null object
arrival_date_week_number          91531 non-null object
arrival_date_day_of_month         91531 non-null object
stays_in_weekend_nights           91531 non-null int64
stays_in_week_nights              91531 non-null int64
adults                            91531 non-null int64
children                          91531 non-null int64
babies                            91531 non-null int64
meal                              91531 non-null object
country                           91531 non-null object
market_segment                    91531 non-null object
distribution_channel              91531 non-null object
is_repeated_guest                 91531 non-null int64
pr

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, mean_squared_error

In [11]:
y = train['adr']
X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state = 50)

features_to_encode = df.columns[df.dtypes==object].tolist()
col_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), features_to_encode), remainder='passthrough')

In [47]:
regr = make_pipeline(col_trans, svm.SVR(kernel='linear', C=1, epsilon=0.1, gamma='auto'))

In [48]:
regr.fit(df, y)

print(mean_squared_error(y, regr.predict(df)))
y_pred = regr.predict(df_test)

print(y_pred)
adr_result = pd.DataFrame(data=y_pred)
adr_result.to_csv('adr.csv', index=False)

1285.7357723832145
[109.19079673  55.8804597   79.08705052 ... 113.68812233  76.52804341
 136.91852942]


In [53]:
y = train['reservation_status']

In [66]:
clf = make_pipeline(col_trans, svm.SVC(kernel='linear', C=1, gamma='auto'))

In [69]:
clf.fit(df, y)

print(clf.score(df, y))
y_pred = clf.predict(df_test)

print(y_pred)
rs_result = pd.DataFrame(data=y_pred)
rs_result.to_csv('rs.csv', index=False)

0.8321989271394391
['Check-Out' 'Canceled' 'Check-Out' ... 'Check-Out' 'Check-Out'
 'Check-Out']


In [70]:
test_nolabel

Unnamed: 0,arrival_date
0,2017-04-01
1,2017-04-02
2,2017-04-03
3,2017-04-04
4,2017-04-05
5,2017-04-06
6,2017-04-07
7,2017-04-08
8,2017-04-09
9,2017-04-10


In [77]:
rs_result

Unnamed: 0,0
0,Check-Out
1,Canceled
2,Check-Out
3,Canceled
4,Canceled
5,Check-Out
6,Check-Out
7,Check-Out
8,Check-Out
9,Canceled


In [85]:
import math
import datetime

In [78]:
def foo(X_test, y_test, y_test_2):
    df = X_test
    df['adr'] = y_test
    df['reservation_status'] = y_test_2
    df = df[df['reservation_status']!='Canceled']
    df = df.drop(df[(df['reservation_status']=='No show') & (df['deposit_type']=='No Deposit')].index)
    df['stays_in_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
    df['label'] = df['adr']*df['stays_in_nights']
    df['arrival_date'] = df[['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']].agg('-'.join, axis=1)
    result = df[['arrival_date', 'label']]
    result = np.floor(result.groupby('arrival_date').sum()/10000)
    return result

In [79]:
result = foo(test, adr_result, rs_result)
result

TypeError: ('sequence item 0: expected str instance, int found', 'occurred at index 0')

In [99]:
df = test
df['adr'] = adr_result
df['reservation_status'] = rs_result
df = df[df['reservation_status']!='Canceled']
df = df.drop(df[(df['reservation_status']=='No show') & (df['deposit_type']=='No Deposit')].index)
df['stays_in_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
df['label'] = df['adr']*df['stays_in_nights']
df['arrival_date']=pd.to_datetime(df['arrival_date_year'].astype(int).astype(str)  + df['arrival_date_month'] + df['arrival_date_day_of_month'].astype(int).astype(str),format='%Y%B%d')
result = df[['arrival_date', 'label']]
result = np.floor(result.groupby('arrival_date').sum()/10000)
result

Unnamed: 0_level_0,label
arrival_date,Unnamed: 1_level_1
2017-04-01,3.0
2017-04-02,3.0
2017-04-03,4.0
2017-04-04,2.0
2017-04-05,3.0
2017-04-06,2.0
2017-04-07,2.0
2017-04-08,4.0
2017-04-09,6.0
2017-04-10,4.0


In [104]:
result['label'].to_csv ('result.csv', index = True, header=True)