In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from joblib import dump, load


In [3]:
# Fit the preprocessing artefacts (one-hot encoder + standard scaler) on the
# training data and persist them with joblib so preprocess_data() can apply
# the same transformation to other frames.
SEED = 42  # pins the split below, so the scaler is always fitted on the same rows

df = pd.read_csv('./train.csv')
# The first two columns are dropped by position, then every row with a missing
# value. The index is reset (without keeping the old one as a column) because
# the encoder output is concatenated by index further down.
df = df.drop(df.columns[0:2], axis=1).dropna(axis=0).reset_index(drop=True)

dfc = df.copy()
# Keep the ten most frequent nationalities; everything else becomes "Other".
# NOTE: preprocess_data() carries its own hard-coded list of codes - keep the
# two in sync if the training data changes.
top_c = dfc["Nationality"].value_counts().index[0:10]
dfc["new_nationality"] = dfc["Nationality"].where(dfc["Nationality"].isin(top_c), "Other")
# Target: 1 when the row has zero checked-in bookings, 0 otherwise.
dfc["has_not_appeared"] = np.where(dfc["BookingsCheckedIn"] == 0, 1, 0)

# 'BookingsCheckedIn' defines the target, so it is removed from the features
# together with the other booking counters and the raw nationality.
drop_cols = ['Nationality', 'BookingsCanceled', 'BookingsNoShowed', 'BookingsCheckedIn']
cat_cols = ["new_nationality", "MarketSegment", "DistributionChannel"]

ohe = OneHotEncoder(handle_unknown='ignore')
encoded_train = ohe.fit_transform(dfc[cat_cols]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
feature_names_fn = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
ohdf = pd.DataFrame(encoded_train, columns=feature_names_fn(cat_cols))

# Append the indicator columns, then drop the raw categorical / leaking columns.
dfc = pd.concat([dfc, ohdf], axis=1).drop(drop_cols + cat_cols, axis=1)

sc = StandardScaler()

X = dfc.drop("has_not_appeared", axis=1)
y = pd.get_dummies(dfc["has_not_appeared"]).values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=SEED
)

# The scaler only sees the training part of the split.
sc.fit(X_train)

dump(sc, 'scaler.joblib')
dump(ohe, 'encoder.joblib')


['encoder.joblib']

In [4]:

def preprocess_data(df, encoder=None, scaler=None):
    """Turn a raw bookings frame into a scaled feature matrix and one-hot labels.

    The input frame is not modified. Its first two columns are dropped by
    position, rows containing missing values are removed, nationalities
    outside a fixed list of ten codes are bucketed as "Other", the categorical
    columns are one-hot encoded and the resulting features are scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain 'Nationality', 'BookingsCanceled',
        'BookingsNoShowed', 'BookingsCheckedIn', 'MarketSegment' and
        'DistributionChannel'.
    encoder : optional
        Fitted encoder exposing ``transform`` and ``get_feature_names_out``
        (or the older ``get_feature_names``). Loaded from 'encoder.joblib'
        when omitted.
    scaler : optional
        Fitted scaler exposing ``transform``. Loaded from 'scaler.joblib'
        when omitted.

    Returns
    -------
    X
        Whatever ``scaler.transform`` returns for the feature frame.
    y : numpy.ndarray
        Array of shape (n_rows, 2) and dtype uint8. Column 0 is set when
        'BookingsCheckedIn' is non-zero, column 1 when it is zero. Both
        columns are always present, even if the frame holds a single class.
    """
    top_c = ['BRA', 'PRT', 'FRA', 'DEU', 'ITA', 'GBR', 'ESP', 'USA', 'NLD', 'CHE']
    drop_cols = ['Nationality', 'BookingsCanceled', 'BookingsNoShowed', 'BookingsCheckedIn']
    cat_cols = ["new_nationality", "MarketSegment", "DistributionChannel"]

    # Fall back to the persisted artefacts only when the caller supplies none,
    # so repeated calls can share already-loaded objects.
    if encoder is None:
        encoder = load('encoder.joblib')
    if scaler is None:
        scaler = load('scaler.joblib')

    # reset_index(drop=True) keeps the rows aligned with the encoder output,
    # which is concatenated by index below.
    dfc = (
        df.drop(df.columns[0:2], axis=1)
        .dropna(axis=0)
        .reset_index(drop=True)
    )
    dfc["new_nationality"] = dfc["Nationality"].where(dfc["Nationality"].isin(top_c), "Other")

    # 1 when the row has zero checked-in bookings, 0 otherwise.
    labels = (dfc["BookingsCheckedIn"] == 0).to_numpy().astype(int)

    encoded = encoder.transform(dfc[cat_cols]).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older installations.
    feature_names_fn = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
    ohdf = pd.DataFrame(encoded, columns=feature_names_fn(cat_cols))

    features = pd.concat([dfc, ohdf], axis=1).drop(drop_cols + cat_cols, axis=1)
    X = scaler.transform(features)

    # Index an identity matrix instead of using get_dummies: get_dummies emits
    # one column per class *present*, which yields a single column when the
    # frame contains only one class.
    y = np.eye(2, dtype="uint8")[labels]

    return X, y

Index(['Unnamed: 0', 'ID', 'Nationality', 'Age', 'DaysSinceCreation',
       'AverageLeadTime', 'LodgingRevenue', 'OtherRevenue', 'BookingsCanceled',
       'BookingsNoShowed', 'BookingsCheckedIn', 'PersonsNights', 'RoomNights',
       'DaysSinceLastStay', 'DaysSinceFirstStay', 'DistributionChannel',
       'MarketSegment', 'SRHighFloor', 'SRLowFloor', 'SRAccessibleRoom',
       'SRMediumFloor', 'SRBathtub', 'SRShower', 'SRCrib', 'SRKingSizeBed',
       'SRTwinBed', 'SRNearElevator', 'SRAwayFromElevator',
       'SRNoAlcoholInMiniBar', 'SRQuietRoom'],
      dtype='object')

In [7]:
# Build the training matrices through preprocess_data() and hold out 20% of the
# rows (stratified on the label) for validation.
df = pd.read_csv('./train.csv')

X, y = preprocess_data(df)

# random_state pins the split so every rerun trains and validates on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [8]:
# Load the test file and run it through the same preprocessing as the training data.
# NOTE(review): this rebinds df, X and y; X_train / X_test / y_train / y_test are
# not touched. The evaluation cell further down reads test.csv again into Xt / yt,
# so this cell looks redundant - confirm before removing.
df = pd.read_csv('./test.csv')

X, y = preprocess_data(df)

In [9]:
# Display the column names of whichever frame `df` currently refers to
# (test.csv when the cells are executed top to bottom).
df.columns

Index(['Unnamed: 0', 'ID', 'Nationality', 'Age', 'DaysSinceCreation',
       'AverageLeadTime', 'LodgingRevenue', 'OtherRevenue', 'BookingsCanceled',
       'BookingsNoShowed', 'BookingsCheckedIn', 'PersonsNights', 'RoomNights',
       'DaysSinceLastStay', 'DaysSinceFirstStay', 'DistributionChannel',
       'MarketSegment', 'SRHighFloor', 'SRLowFloor', 'SRAccessibleRoom',
       'SRMediumFloor', 'SRBathtub', 'SRShower', 'SRCrib', 'SRKingSizeBed',
       'SRTwinBed', 'SRNearElevator', 'SRAwayFromElevator',
       'SRNoAlcoholInMiniBar', 'SRQuietRoom'],
      dtype='object')

In [None]:
# Small feed-forward classifier: one 2-unit ReLU hidden layer followed by a
# 2-unit softmax over the one-hot classes.
model = Sequential()
# The input width is taken from the data instead of being hard-coded: the number
# of feature columns depends on how many categories the encoder learned.
model.add(Dense(2, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(2, activation='softmax'))
# One-hot targets with a softmax output pair with categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The held-out split is used as validation data during training.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=7)


In [None]:
# Accuracy on the training split: compare the arg-max class of every prediction
# with the arg-max of the corresponding one-hot label.
preds = model.predict(X_train)
preds_val = list(preds.argmax(axis=1))
y_train_vals = list(y_train.argmax(axis=1))
# accuracy_score takes (y_true, y_pred); same order as the test-score cell.
print(f"train score - {accuracy_score(y_train_vals, preds_val)}")

In [None]:

# Accuracy on the test file: preprocess it like the training data, predict, and
# compare arg-max classes against the arg-max of the one-hot labels.
df = pd.read_csv('./test.csv')
Xt, yt = preprocess_data(df)
test_preds = model.predict(Xt)
# Row-wise arg-max in one vectorised call; list() keeps the values as a list.
preds_val = list(test_preds.argmax(axis=1))
y_test_vals = list(yt.argmax(axis=1))
print(f"test score - {accuracy_score(y_test_vals,preds_val)}")

In [14]:
# Persist the trained model under the 'nn/' directory (relative to the working
# directory).
model.save('nn/')

INFO:tensorflow:Assets written to: nn/assets


0.9741468459152016