In [1]:
# Decide where the notebook is running by looking for 'google.colab' in the
# string form of the active IPython shell, then report the result.
run_in_colab = 'google.colab' in str(get_ipython())
print('Running on CoLab' if run_in_colab else 'Running locally on Jupyter')

Running on CoLab


In [2]:
# Local Jupyter: point data_path at the data folder.
# Colab: mount Google Drive instead.
if not run_in_colab:
    data_path = "path/to/data_folder"  # Set local path
else:
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [276]:
# Colab only: ask for the CSV files and keep the result in `uploaded`.
# Later cells read a file with io.BytesIO(uploaded["<file name>"]).
if run_in_colab:
    from google.colab import files
    uploaded = files.upload()

Saving 207047259_313450876_208346320.csv to 207047259_313450876_208346320 (1).csv
Saving agoda_cancellation_train.csv to agoda_cancellation_train (1).csv
Saving test_set_labels_week_2.csv to test_set_labels_week_2 (1).csv
Saving test_set_week_1_labels.csv to test_set_week_1_labels (1).csv
Saving test_set_week_1.csv to test_set_week_1 (1).csv
Saving test_set_week_2.csv to test_set_week_2 (1).csv
Saving test_set_week_3_labels.csv to test_set_week_3_labels (1).csv
Saving test_set_week_3.csv to test_set_week_3 (1).csv
Saving test_set_week_4.csv to test_set_week_4.csv


In [4]:
# from challenge.agoda_cancellation_estimator import AgodaCancellationEstimator
# from IMLearn.utils import split_train_test
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import normalize
from datetime import datetime
import re
import io
import plotly
import plotly.express as px
import numpy as np
import pandas as pd


In [5]:
# Lookup table: two-letter country code -> continent name.
# counry_code_to_continent() indexes this table to build the "continent" column.
# The last two entries map the code "A1" and np.nan to "Unknown".
COUNTRY_ALPHA2_TO_CONTINENT = {
    "TL": "Asia",
    'AB': 'Asia',
    'AD': 'Europe',
    'AE': 'Asia',
    'AF': 'Asia',
    'AG': 'North America',
    'AI': 'North America',
    'AL': 'Europe',
    'AM': 'Asia',
    'AO': 'Africa',
    'AR': 'South America',
    'AS': 'Oceania',
    'AT': 'Europe',
    'AU': 'Oceania',
    'AW': 'North America',
    'AX': 'Europe',
    'AZ': 'Asia',
    'BA': 'Europe',
    'BB': 'North America',
    'BD': 'Asia',
    'BE': 'Europe',
    'BF': 'Africa',
    'BG': 'Europe',
    'BH': 'Asia',
    'BI': 'Africa',
    'BJ': 'Africa',
    'BL': 'North America',
    'BM': 'North America',
    'BN': 'Asia',
    'BO': 'South America',
    'BQ': 'North America',
    'BR': 'South America',
    'BS': 'North America',
    'BT': 'Asia',
    'BV': 'Antarctica',
    'BW': 'Africa',
    'BY': 'Europe',
    'BZ': 'North America',
    'CA': 'North America',
    'CC': 'Asia',
    'CD': 'Africa',
    'CF': 'Africa',
    'CG': 'Africa',
    'CH': 'Europe',
    'CI': 'Africa',
    'CK': 'Oceania',
    'CL': 'South America',
    'CM': 'Africa',
    'CN': 'Asia',
    'CO': 'South America',
    'CR': 'North America',
    'CU': 'North America',
    'CV': 'Africa',
    'CW': 'North America',
    'CX': 'Asia',
    'CY': 'Asia',
    'CZ': 'Europe',
    'DE': 'Europe',
    'DJ': 'Africa',
    'DK': 'Europe',
    'DM': 'North America',
    'DO': 'North America',
    'DZ': 'Africa',
    'EC': 'South America',
    'EE': 'Europe',
    'EG': 'Africa',
    'ER': 'Africa',
    'ES': 'Europe',
    'ET': 'Africa',
    'FI': 'Europe',
    'FJ': 'Oceania',
    'FK': 'South America',
    'FM': 'Oceania',
    'FO': 'Europe',
    'FR': 'Europe',
    'GA': 'Africa',
    'GB': 'Europe',
    'GD': 'North America',
    'GE': 'Asia',
    'GF': 'South America',
    'GG': 'Europe',
    'GH': 'Africa',
    'GI': 'Europe',
    'GL': 'North America',
    'GM': 'Africa',
    'GN': 'Africa',
    'GP': 'North America',
    'GQ': 'Africa',
    'GR': 'Europe',
    'GS': 'South America',
    'GT': 'North America',
    'GU': 'Oceania',
    'GW': 'Africa',
    'GY': 'South America',
    'HK': 'Asia',
    'HM': 'Antarctica',
    'HN': 'North America',
    'HR': 'Europe',
    'HT': 'North America',
    'HU': 'Europe',
    'ID': 'Asia',
    'IE': 'Europe',
    'IL': 'Asia',
    'IM': 'Europe',
    'IN': 'Asia',
    'IO': 'Asia',
    'IQ': 'Asia',
    'IR': 'Asia',
    'IS': 'Europe',
    'IT': 'Europe',
    'JE': 'Europe',
    'JM': 'North America',
    'JO': 'Asia',
    'JP': 'Asia',
    'KE': 'Africa',
    'KG': 'Asia',
    'KH': 'Asia',
    'KI': 'Oceania',
    'KM': 'Africa',
    'KN': 'North America',
    'KP': 'Asia',
    'KR': 'Asia',
    'KW': 'Asia',
    'KY': 'North America',
    'KZ': 'Asia',
    'LA': 'Asia',
    'LB': 'Asia',
    'LC': 'North America',
    'LI': 'Europe',
    'LK': 'Asia',
    'LR': 'Africa',
    'LS': 'Africa',
    'LT': 'Europe',
    'LU': 'Europe',
    'LV': 'Europe',
    'LY': 'Africa',
    'MA': 'Africa',
    'MC': 'Europe',
    'MD': 'Europe',
    'ME': 'Europe',
    'MF': 'North America',
    'MG': 'Africa',
    'MH': 'Oceania',
    'MK': 'Europe',
    'ML': 'Africa',
    'MM': 'Asia',
    'MN': 'Asia',
    'MO': 'Asia',
    'MP': 'Oceania',
    'MQ': 'North America',
    'MR': 'Africa',
    'MS': 'North America',
    'MT': 'Europe',
    'MU': 'Africa',
    'MV': 'Asia',
    'MW': 'Africa',
    'MX': 'North America',
    'MY': 'Asia',
    'MZ': 'Africa',
    'NA': 'Africa',
    'NC': 'Oceania',
    'NE': 'Africa',
    'NF': 'Oceania',
    'NG': 'Africa',
    'NI': 'North America',
    'NL': 'Europe',
    'NO': 'Europe',
    'NP': 'Asia',
    'NR': 'Oceania',
    'NU': 'Oceania',
    'NZ': 'Oceania',
    'OM': 'Asia',
    'OS': 'Asia',
    'PA': 'North America',
    'PE': 'South America',
    'PF': 'Oceania',
    'PG': 'Oceania',
    'PH': 'Asia',
    'PK': 'Asia',
    'PL': 'Europe',
    'PM': 'North America',
    'PR': 'North America',
    'PS': 'Asia',
    'PT': 'Europe',
    'PW': 'Oceania',
    'PY': 'South America',
    'QA': 'Asia',
    'RE': 'Africa',
    'RO': 'Europe',
    'RS': 'Europe',
    'RU': 'Europe',
    'RW': 'Africa',
    'SA': 'Asia',
    'SB': 'Oceania',
    'SC': 'Africa',
    'SD': 'Africa',
    'SE': 'Europe',
    'SG': 'Asia',
    'SH': 'Africa',
    'SI': 'Europe',
    'SJ': 'Europe',
    'SK': 'Europe',
    'SL': 'Africa',
    'SM': 'Europe',
    'SN': 'Africa',
    'SO': 'Africa',
    'SR': 'South America',
    'SS': 'Africa',
    'ST': 'Africa',
    'SV': 'North America',
    'SY': 'Asia',
    'SZ': 'Africa',
    'TC': 'North America',
    'TD': 'Africa',
    'TG': 'Africa',
    'TH': 'Asia',
    'TJ': 'Asia',
    'TK': 'Oceania',
    'TM': 'Asia',
    'TN': 'Africa',
    'TO': 'Oceania',
    'TP': 'Asia',
    'TR': 'Asia',
    'TT': 'North America',
    'TV': 'Oceania',
    'TW': 'Asia',
    'TZ': 'Africa',
    'UA': 'Europe',
    'UG': 'Africa',
    'US': 'North America',
    'UY': 'South America',
    'UZ': 'Asia',
    'VC': 'North America',
    'VE': 'South America',
    'VG': 'North America',
    'VI': 'North America',
    'VN': 'Asia',
    'VU': 'Oceania',
    'WF': 'Oceania',
    'WS': 'Oceania',
    'XK': 'Europe',
    'YE': 'Asia',
    'YT': 'Africa',
    'ZA': 'Africa',
    'ZM': 'Africa',
    'ZW': 'Africa',
    "A1": 'Unknown',
    np.nan: "Unknown"
}
# Categorical columns that data_preprocessing replaces with one-hot dummies.
has_unique = [
    'charge_option',
    'original_payment_type',
    'continent',
    "accommadation_type_name",
]

# Boolean columns that data_preprocessing casts to int.
bool_cols = [
    'is_user_logged_in',
    'is_first_booking',
]

# Names of the non-numeric columns of the raw table.
names_of_non_numeric_cols = [
    'hotel_country_code',
    'accommadation_type_name',
    'charge_option',
    'customer_nationality',
    'guest_nationality_country_name',
    'origin_country_code',
    'language',
    'original_payment_method',
    'original_payment_type',
    'original_payment_currency',
    'cancellation_policy_code',
]

# Date columns that data_preprocessing converts to unix seconds.
date_time_cols = [
    'booking_datetime',
    'checkin_date',
    'checkout_date',
    'hotel_live_date',
]


In [6]:
#not in use
def compute_z_score(df):
    """Standardize ``df``: subtract ``df.mean()`` and divide by ``df.std()``."""
    centered = df - df.mean()
    return centered / df.std()

def fillter_to_binary(val):
    """Return True when ``val`` equals 0 or 1, or is NaN; otherwise False."""
    is_zero_or_one = val in (0, 1)
    # np.isnan is only evaluated when the value is neither 0 nor 1.
    return True if is_zero_or_one or np.isnan(val) else False

#not in use
def match_to_test_dat(df):
    """Drop rows charged at check-in, then rows whose accommodation type is excluded.

    Returns the remaining rows of ``df`` with their original index.
    """
    excluded_types = ['Pay at Check-in','Chalet','Holiday Park / Caravan Park','Homestay','Inn', 'Lodge', 'Love Hotel']
    not_pay_at_checkin = df["charge_option"] != 'Pay at Check-in'
    remaining = df[not_pay_at_checkin]
    allowed_type = ~remaining["accommadation_type_name"].isin(excluded_types)
    return remaining[allowed_type]
# parse the policy string into 2 numeric features
def prase_to_vec(lst,days):
    """Turn policy fragments into a two-element vector.

    ``lst`` is a list of strings that are joined with spaces and scanned for
    numbers directly followed by "D", "N" or "P".

    * element 0: mean of the numbers before "D" (0 if none)
    * element 1: mean of the numbers before "N" (0 if none), plus the mean of
      ``P * days / 100`` over the numbers before "P"

    Returns a float ndarray of length 2, or the plain list ``[0, 0]`` when
    ``lst`` is empty.
    """
    if not lst:
        return [0,0]

    joined = " ".join(lst)
    day_hits = re.findall(r"(\d+)D", joined)
    night_hits = re.findall(r"(\d+)N", joined)
    percent_hits = re.findall(r"(\d+)P", joined)

    vec = np.zeros(2)
    if day_hits:
        vec[0] = np.array(day_hits).astype(int).mean()
    if night_hits:
        vec[1] = np.array(night_hits).astype(int).mean()
    if percent_hits:
        percents = np.array(percent_hits).astype(int)
        vec[1] += ((percents * days) / 100).astype(float).mean()
    return vec
# Used by data_preprocessing to derive the "continent" column.
def counry_code_to_continent(contry):
    """Return the continent name for a two-letter country code.

    Codes missing from COUNTRY_ALPHA2_TO_CONTINENT map to "Unknown" instead of
    raising KeyError. This also covers NaN values that are not the very
    ``np.nan`` object used as a key in the table (NaN never compares equal,
    so such values cannot be found by a plain dictionary lookup).
    """
    return COUNTRY_ALPHA2_TO_CONTINENT.get(contry, "Unknown")

def remove_not_showing(lst):
    """Return the entries of ``lst`` that contain the letter "D", in order."""
    kept = []
    for part in lst:
        if "D" in part:
            kept.append(part)
    return kept

In [265]:
def data_preprocessing(full_data,train : bool):
  """Engineer model features from a raw bookings table.

  Parameters
  ----------
  full_data : pandas.DataFrame
      Raw bookings table. It is copied first, so the caller's frame is not
      modified.
  train : bool
      When True, ``cancellation_datetime`` is converted to a 0/1 label
      (missing -> 0, any other truthy value -> 1).

  Returns
  -------
  (pandas.DataFrame, list)
      The processed frame, and a list holding one ``Index`` of dummy-column
      names for every column in ``has_unique``.
  """
  # Work on a copy: assigning the label / new columns directly would silently
  # change the frame owned by the caller.
  full_data = full_data.copy()
  features = []
  # convert cancellation_datetime to a binary label
  if train:
    full_data["cancellation_datetime"] = full_data["cancellation_datetime"].fillna(0).astype(bool).astype(int)
  # remove h_booking_id
  if "h_booking_id" in full_data.columns:
    full_data = full_data.drop(columns=["h_booking_id"])
  # add the continent of each origin country
  full_data["continent"] = full_data.origin_country_code.apply(counry_code_to_continent)
  # calendar features (computed before the date columns become numbers)
  checkin_dt = pd.to_datetime(full_data.checkin_date)
  booking_dt = pd.to_datetime(full_data.booking_datetime)
  full_data["day_year_checkin"] = checkin_dt.dt.dayofyear
  full_data["day_year_booked"] = booking_dt.dt.dayofyear
  full_data["days_before_checkin"] = (checkin_dt - booking_dt).dt.days.abs()

  # convert all date strings to unix seconds
  # NOTE(review): Series.view is deprecated in recent pandas releases; before
  # switching, confirm the replacement treats missing dates the same way.
  for date_time_col_name in date_time_cols:
    full_data[date_time_col_name] = (pd.to_datetime(full_data[date_time_col_name]).view(np.int64))/1000000000
  # convert all categorical variables to dummies
  for has_unique_col_name in has_unique:
    one_hot = pd.get_dummies(full_data[has_unique_col_name])
    features.append(one_hot.columns)
    full_data = full_data.drop(has_unique_col_name, axis=1)
    full_data = full_data.join(one_hot)
  for bool_col_name in bool_cols:
    full_data[bool_col_name] = full_data[bool_col_name].astype(int)
  # length of stay in days (the date columns hold unix seconds at this point)
  full_data['num_of_booked_days'] = (full_data['checkout_date'] - full_data['checkin_date'])/(60*60*24)
  # NOTE(review): a zero-length stay yields inf here - confirm whether such rows can occur.
  full_data["price_per_night"] = full_data.original_selling_amount/full_data.num_of_booked_days
  # two features derived from the cancellation policy code
  policy_parts = full_data.cancellation_policy_code.str.split("_").apply(remove_not_showing)
  parsed = [prase_to_vec(parts, days)
            for parts, days in zip(policy_parts, full_data.num_of_booked_days)]
  # Reuse full_data's index so the assignments below align row by row even
  # when the input does not carry a default 0..n-1 index.
  df1 = pd.DataFrame(parsed, columns=["D","N"], index=full_data.index)
  full_data["payment_late_cancellation"] = df1["N"]*full_data["price_per_night"]
  scale_df1 = df1/(365,30)
  full_data["norm_of_cancellation_policy"] = np.linalg.norm(scale_df1,ord=1,axis=1)
  full_data["days_befor_pay_cancellation"] = df1.D
  # Count the non-zero request_* flags. Columns are picked by name: the former
  # positional slice (27:34) was evaluated after columns had been dropped, so
  # it covered a different set of columns for train and test and, in training,
  # included the cancellation label itself.
  request_cols = [col for col in full_data.columns if str(col).startswith("request_")]
  full_data["num_of_request"] = (full_data[request_cols].fillna(0)!=0).sum(axis=1)
  return full_data,features

In [None]:
# full_data2 = pd.read_csv(io.BytesIO(uploaded["test_set_week_2.csv"]))
# match_to_test_dat(full_data2)
# full_data2.loc[(full_data2.accommadation_type_name=='Chalet')]

Unnamed: 0,h_booking_id,booking_datetime,checkin_date,checkout_date,hotel_id,hotel_country_code,hotel_live_date,hotel_star_rating,accommadation_type_name,charge_option,...,request_latecheckin,request_highfloor,request_largebed,request_twinbeds,request_airport,request_earlycheckin,hotel_area_code,hotel_brand_code,hotel_chain_code,hotel_city_code


In [330]:

def load_data(filename: str,train : bool,tree : bool):
    """
    Load and preprocess an Agoda booking cancellation dataset

    Parameters
    ----------
    filename: str
        Colab: name of an uploaded file (a key of ``uploaded``). If it is not
        an uploaded file name, the previous defaults are used:
        "agoda_cancellation_train.csv" when ``train`` is True and
        "test_set_week_4.csv" otherwise.
        Local: file name inside ``data_path``.
    train: bool
        True when the file carries the ``cancellation_datetime`` column, which
        is then returned as the response vector.
    tree: bool
        Kept for backward compatibility; it does not change the result.

    Returns
    -------
    train=True:  tuple (features, labels) - pandas.DataFrame without the label
                 column, and the 0/1 label Series
    train=False: features only (pandas.DataFrame)
    Missing values are replaced by 0 in both cases.
    """
    if run_in_colab:
        default_name = "agoda_cancellation_train.csv" if train else "test_set_week_4.csv"
        chosen_name = filename if filename in uploaded else default_name
        full_data = pd.read_csv(io.BytesIO(uploaded[chosen_name]))
    else:
        # Insert a separator only when neither side provides one; plain
        # concatenation of e.g. "path/to/data_folder" + "train.csv" is not a
        # valid path.
        needs_separator = (
            bool(data_path)
            and not data_path.endswith(("/", "\\"))
            and not filename.startswith(("/", "\\"))
        )
        full_data = pd.read_csv(data_path + ("/" if needs_separator else "") + filename)

    full_data, _one_hot_columns = data_preprocessing(full_data,train)

    # Notes kept from feature exploration:
    # - number of rooms is highly correlated with number of adults, so one of
    #   them is enough
    # - payment method does not look informative because it correlates with
    #   the more common features
    full_data = full_data.fillna(0)
    if train:
        labels = full_data["cancellation_datetime"]
        # The response must not stay inside the design matrix, otherwise any
        # model fitted on all returned columns sees the answer.
        features = full_data.drop(columns=["cancellation_datetime"])
        return features, labels
    return full_data




In [None]:
# full_data = pd.read_csv(io.BytesIO(uploaded["test_set_week_2.csv"]))
# for has_unique_col_name in has_unique:
#   print(full_data[has_unique_col_name].unique())

In [331]:
# Build the training matrix with its labels, and the matrix to predict on.
# "im on colab" is a placeholder string, not a real file name.
train_x, train_y = load_data("im on colab", train=True, tree=True)
test_x = load_data("im on colab", train=False, tree=True)

In [301]:

# full_data = pd.read_csv(io.BytesIO(uploaded["agoda_cancellation_train.csv"]))
# test_weak3 = pd.read_csv(io.BytesIO(uploaded["test_set_week_3.csv"]))
# test_weak3,one_hot_feature = data_preprocessing(test_weak3,False)
# test_weak3.fillna(0,inplace=True)

In [296]:
# test_weak2 = pd.read_csv(io.BytesIO(uploaded["test_set_week_2.csv"]))
# test_weak2,one_hot_feature = data_preprocessing(test_weak2,False)
# test_weak2.fillna(0,inplace=True)

In [379]:
# Columns the gradient boosting model is fitted on.
small_good_features = [
    'norm_of_cancellation_policy',
    'payment_late_cancellation',
    'price_per_night',
    'num_of_booked_days',
    'days_befor_pay_cancellation',
    'booking_datetime',
    'checkin_date',
    'hotel_star_rating',
    'day_year_checkin',
    'original_selling_amount',
    'hotel_area_code',
    'Pay Later',
    'Pay Now',
    'Asia',
    'Hotel',
    'Resort',
]
# Currently left out: 'Credit Card', 'Gift Card', 'Invoice', 'Europe', 'North America', 'Oceania', 'hotel_id'

In [380]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit gradient boosting on the selected feature columns only.
# verbose=1 prints the per-iteration training-loss table.
model = GradientBoostingClassifier(n_estimators=100, verbose=1, random_state=0)
model = model.fit(train_x[small_good_features], train_y)


      Iter       Train Loss   Remaining Time 
         1           1.1215           34.86s
         2           1.0878           30.14s
         3           1.0605           29.67s
         4           1.0383           28.53s
         5           1.0200           28.42s
         6           1.0047           27.60s
         7           0.9920           25.81s
         8           0.9811           24.41s
         9           0.9721           23.18s
        10           0.9642           22.22s
        20           0.9253           16.71s
        30           0.9025           13.70s
        40           0.8898           11.34s
        50           0.8808            9.25s
        60           0.8747            7.30s
        70           0.8703            5.42s
        80           0.8665            3.58s
        90           0.8629            1.78s
       100           0.8604            0.00s


In [381]:
from sklearn.metrics import f1_score

# Rebuild the week-2 evaluation set in this cell. The cells that used to
# create `test_weak2` and `labels2` are commented out, so on a fresh kernel
# this cell would otherwise stop with NameError.
# NOTE(review): this reads from `uploaded`, so it only runs in Colab - confirm
# whether a local-path variant is needed.
test_weak2 = pd.read_csv(io.BytesIO(uploaded["test_set_week_2.csv"]))
test_weak2, _week2_one_hot_columns = data_preprocessing(test_weak2, False)
test_weak2 = test_weak2.fillna(0)

labels2 = pd.read_csv(io.BytesIO(uploaded["test_set_labels_week_2.csv"]))
# The label is read as the last character of each entry in the first column.
labels2 = labels2.iloc[:, 0].map(lambda x: x[-1]).astype(int)

# f1_score takes the ground truth first and the predictions second.
week2_predictions = model.predict(test_weak2[small_good_features])
print(f1_score(labels2, week2_predictions, average='macro'))


0.5087949101796407


In [382]:
# Write the predictions for the test set to the submission file.
# The file is always written; the browser download is only offered in Colab,
# because importing google.colab fails in a local Jupyter session.
submission_name = "207047259_313450876_208346320.csv"
pd.DataFrame(model.predict(test_x[small_good_features]),columns=["predicted_values"]).to_csv(submission_name,index=False)
if run_in_colab:
    from google.colab import files
    files.download(submission_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [298]:
# labels3 = pd.read_csv(io.BytesIO(uploaded["test_set_week_3_labels.csv"]))
# labels3 = (labels3.iloc[:,0]).map(lambda x: x[-1]).astype(int)
# labels2 = pd.read_csv(io.BytesIO(uploaded["test_set_labels_week_2.csv"]))
# labels2 = (labels2.iloc[:,0]).map(lambda x: x[-1]).astype(int)

In [307]:
# play_week_3 =  pd.concat((test_weak3,labels),axis=1)
# play_week_3[small_good_features].describe()

Unnamed: 0,norm_of_cancellation_policy,payment_late_cancellation,price_per_night,num_of_booked_days,days_befor_pay_cancellation,booking_datetime,checkin_date,hotel_star_rating,hotel_id,day_year_checkin,Credit Card,Gift Card,Invoice,original_selling_amount,hotel_area_code,hotel_chain_code,Pay Later,Pay Now,Asia,Hotel
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.277274,236.684815,135.304153,2.45,79.227857,1542473000.0,1548835000.0,3.277857,1600243.0,167.982857,0.987143,0.002857,0.01,315.4972,2950.665714,111.491429,0.461429,0.538571,0.802857,0.722857
std,0.407856,316.095458,138.752218,1.964364,143.056133,753773.2,4447349.0,1.083896,1593363.0,154.289427,0.112739,0.053414,0.09957,393.51953,1725.082753,203.783803,0.498866,0.498866,0.398126,0.447908
min,0.0,0.0,6.08,1.0,0.0,1541036000.0,1544918000.0,0.0,178.0,1.0,0.0,0.0,0.0,11.3,25.0,0.0,0.0,0.0,0.0,0.0
25%,0.038813,70.2375,58.042143,1.0,1.0,1541840000.0,1545955000.0,3.0,267602.0,28.0,1.0,0.0,0.0,95.215,1411.0,0.0,0.0,0.0,1.0,0.0
50%,0.069406,136.595,99.37,2.0,3.0,1542537000.0,1547251000.0,3.0,931824.0,86.0,1.0,0.0,0.0,185.215,2966.5,0.0,0.0,1.0,1.0,1.0
75%,0.199144,265.41,155.475,3.0,31.0,1543181000.0,1550102000.0,4.0,2697294.0,356.0,1.0,0.0,0.0,364.45,4463.0,119.0,1.0,1.0,1.0,1.0
max,1.4,2695.59,1045.72,19.0,365.0,1543620000.0,1572912000.0,5.0,5929344.0,365.0,1.0,1.0,1.0,3826.43,5891.0,679.0,1.0,1.0,1.0,1.0


In [374]:
# Sum of the 'Hotel' dummy column in the week-2 frame.
test_weak2["Hotel"].sum()

468

In [149]:
# from sklearn.feature_selection import SelectFromModel
# sel = SelectFromModel(GradientBoostingClassifier(n_estimators = 50))
# sel.fit(train_x.select_dtypes(exclude=['object']), train_y)

SelectFromModel(estimator=GradientBoostingClassifier(n_estimators=50))

In [367]:
# Show the fitted model's feature importances (the model was fitted on
# train_x[small_good_features]).
model.feature_importances_

array([1.47673195e-01, 4.48952800e-03, 6.93780695e-03, 1.70385154e-02,
       3.76137506e-02, 1.80236879e-01, 2.26668776e-02, 1.48785833e-02,
       2.47081431e-02, 2.22252172e-02, 2.34782494e-03, 1.70511496e-02,
       4.88887796e-01, 9.50539405e-03, 3.35636494e-03, 3.82975208e-04])

In [368]:
# Name of the last entry in the selected feature list.
small_good_features[-1]

'Resort'

In [None]:
# train_x.columns.size

In [None]:
# ["newton-cg","lbfgs", "liblinear", "sag", "saga"]
# lr_clf = LogisticRegression(solver="lbfgs")
# dec_tree = DecisionTreeClassifier(min_samples_split=4)
# tree = dec_tree.fit(train_x, train_y)
# lr_clf.fit(train_x, train_y)


In [None]:
# forest_cl = clf = RandomForestClassifier(n_estimators=50)
# forest = forest_cl.fit(X=train_x,y=train_y)
# forest.score(test_x,labels)

0.8714285714285714

In [None]:
# f1_score(forest.predict(test_x), labels, average='macro')

0.486820240461373

In [None]:
# forest_features = train_x.columns[forest.feature_importances_>= 0.01]
# forest_features

NameError: ignored

In [None]:
# from sklearn.model_selection import train_test_split

# X_train2, X_test2, y_train2, y_test2 = train_test_split(
# train_x, train_y, test_size=0.3, random_state=42)


In [None]:
# dec_tree2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5)
# dec_tree2.fit(X_train2, y_train2)
# lr_clf.fit(X_train2, y_train2)

In [None]:
# dec_tree2.score(X_test2,y_test2)

In [None]:
# lr_clf.score(X_test2,y_test2)

In [None]:


# param_grid = {
#     "max_depth": [3,5,10,15,20,None],
#     "min_samples_split": [2,5,7,10],
#     "min_samples_leaf": [1,2,5]
# }

# clf = DecisionTreeClassifier(random_state=42)
# tree_grid_cv = GridSearchCV(clf, param_grid, scoring="roc_auc",verbose=4 ,n_jobs=-1, cv=3).fit(train_x,train_y)

In [None]:
# ans = pd.DataFrame(grid_cv.cv_results_)
# ans.sort_values("rank_test_score").head(10)


In [None]:
# param_grid_2 = {
#     "max_depth": [5],
#     "min_samples_split": [2,5,7,10],
#     "min_samples_leaf": [1,2]
# }
# tree_grid_cv2 = GridSearchCV(clf, param_grid_2, scoring="roc_auc",verbose=4 ,n_jobs=-1, cv=8).fit(train_x,train_y)

In [None]:
# ans_2 = pd.DataFrame(tree_grid_cv2.cv_results_)
# ans_2.sort_values("rank_test_score").head(10)

In [None]:
# parameters = {
#     'penalty' : ['l1','l2'], 
#     'C'       : np.logspace(-3,3,7),
#     'solver'  : ["saga", 'liblinear'],
# }
# lr_clf = LogisticRegression(random_state=42)
# reg_grid_cv = GridSearchCV(lr_clf, parameters, scoring="roc_auc",verbose=4 ,n_jobs=-1, cv=3).fit(train_x,train_y)


In [None]:
# ans2 = pd.DataFrame(grid_cv2.cv_results_)
# ans2.sort_values("rank_test_score")


In [None]:
# best_featues_logostic = grid_cv2.best_estimator_
# best_featues_logostic

In [None]:
# lr_clf = LogisticRegression(penalty="l1",C=1.0,solver="liblinear",max_iter=1000,random_state=42)
# lr_clf.fit(X=train_x[forest_features],y=train_y)

In [None]:
# lr_clf.score(X=test_x[forest_features],y=labels)

In [None]:
# from google.colab import files
# pd.DataFrame(lr_clf.predict(test_x),columns=["predicted_values"]).to_csv("207047259.csv",index=False)
# files.download("207047259.csv")

In [None]:
# from sklearn.feature_selection import SelectFromModel
# sel = SelectFromModel(LogisticRegression(penalty="l1",C=1.0,solver="liblinear",max_iter=100,random_state=42))
# sel.fit(train_x, train_y)


SelectFromModel(estimator=LogisticRegression(penalty='l1', random_state=42,
                                             solver='liblinear'))

In [None]:
# reg_features = train_x.columns[sel.get_support()]

In [None]:
# lr_clf = LogisticRegression(penalty="l1",C=1.0,solver="liblinear",max_iter=100,random_state=42)
# lr_clf.fit(X=train_x,y=train_y)
# f1_score(lr_clf.predict(test_x), labels, average='macro')



0.4125516606915246

In [None]:
# lr_clf = LogisticRegression(random_state=42)
# parameters_2 = {
#     'penalty' : ['l1'], 
#     'C'       : [1.0,10.0,100.0],
#     'solver'  : ['liblinear'],
# }
# reg_grid_cv2 = GridSearchCV(lr_clf, parameters_2, scoring="roc_auc",verbose=4 ,n_jobs=-1, cv=8).fit(train_x,train_y)

In [24]:
# small_good_features = ['norm_of_cancellation_policy', 'payment_late_cancellation',
#        'booking_datetime', 'checkin_date', 'hotel_star_rating',
#        'original_selling_amount', 'hotel_area_code', 'hotel_chain_code',
#        'Pay Later', 'Pay Now', 'Africa', 'Asia', 'Europe', 'North America', 'Oceania',
#        'South America', 'Hotel']#,'Unknown', 'Credit Card', 'Gift Card',
       #'Invoice']

In [25]:
# from sklearn.ensemble import GradientBoostingClassifier
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(train_x[small_good_features], train_y)
# from sklearn.metrics import f1_score
# f1_score(clf.predict(test_x[small_good_features]), labels, average='macro')

In [26]:
# from sklearn.metrics import f1_score
# f1_score(clf.predict(test_x[small_good_features]), labels, average='macro')

0.4860499265785609

In [None]:
# from google.colab import files
# pd.DataFrame(model.predict(test_x[small_good_features]),columns=["predicted_values"]).to_csv("207047259.csv",index=False)
# files.download("207047259.csv")

Index(['norm_of_cancellation_policy', 'payment_late_cancellation',
       'booking_datetime', 'checkin_date', 'hotel_star_rating',
       'guest_is_not_the_customer', 'no_of_children', 'no_of_extra_bed',
       'no_of_room', 'original_selling_amount', 'is_user_logged_in',
       'is_first_booking', 'request_nonesmoke', 'request_latecheckin',
       'request_highfloor', 'request_twinbeds', 'hotel_area_code',
       'hotel_chain_code', 'Pay Later', 'Pay Now', 'Credit Card', 'Gift Card',
       'Invoice', 'Africa', 'Asia', 'Europe', 'North America', 'Oceania',
       'South America', 'Unknown', 'Apartment', 'Boat / Cruise', 'Bungalow',
       'Capsule Hotel', 'Guest House / Bed & Breakfast', 'Home', 'Hostel',
       'Hotel', 'Motel', 'Private Villa', 'Resort', 'Resort Villa', 'Ryokan',
       'Serviced Apartment', 'Tent', 'UNKNOWN'],
      dtype='object')

In [None]:
# from sklearn.model_selection import GridSearchCV
# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.2],
#     "min_samples_split": [0.1,0.5],
#     "min_samples_leaf":[0.1,0.5],
#     "max_depth":[3,5],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "mae"],
#     "n_estimators":[10]
#     }
# #passing the scoring function in the GridSearchCV
# clf = GridSearchCV(GradientBoostingClassifier(), parameters,verbose=4,scoring="f1_macro",refit=False,cv=2, n_jobs=-1).fit(train_x[small_good_features], train_y)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


In [None]:
# boost = pd.DataFrame(clf.cv_results_)
# boost.sort_values("rank_test_score").loc[0,"params"]


{'criterion': 'friedman_mse',
 'learning_rate': 0.01,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': 'log2',
 'n_estimators': 10}

In [None]:
# model = GradientBoostingClassifier(n_estimators=100,random_state=0).fit(train_x[small_good_features], train_y)
# for_sub2 = GradientBoostingClassifier(n_estimators=100,learning_rate=1.0,random_state=0).fit(train_x[small_good_features], train_y)
# from sklearn.metrics import f1_score
# print(f1_score(for_sub2.predict(test_x[small_good_features]), labels, average='macro'))
# f1_score(model.predict(test_x[small_good_features]), labels, average='macro')

0.4285864756688434


0.5241619964047796

In [None]:
# week2 = pd.read_csv(io.BytesIO(uploaded["test_set_week_2.csv"]))
# week2_labels = pd.read_csv(io.BytesIO(uploaded["test_set_labels_week_2.csv"]))
# week2_labels = (week2_labels.iloc[:,0]).map(lambda x: x[-1]).astype(int)

In [None]:
# full_data = pd.read_csv(io.BytesIO(uploaded["agoda_cancellation_train.csv"]))
# full_data.cancellation_datetime = full_data.cancellation_datetime.fillna(0).astype(bool).astype(int)