## Final Project Check-in

## Yolo


## Student Names
1. Dillon Quan
2. Shrikar Thodla
3. Mikio Tada

In [1]:
# import the necessary libraries
import warnings
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

## Load Data

In [3]:
#reading in the file into pandas dataframe
df = pd.read_csv('data/train.csv')

# separating the target variable and the predictor variables
df_X = df.drop(columns=['hotel_cluster'])
df_y = df['hotel_cluster']

In [4]:
# using stratified shuffle split to get 1% of the original data while keeping the distribution
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.01, random_state=0)
for train_idx, test_idx in sss.split(df_X, df_y):
    X_train, X_test = df_X.iloc[train_idx], df_X.iloc[test_idx]
    y_train, y_test = df_y.iloc[train_idx], df_y.iloc[test_idx]

In [5]:
#reset the index
sample_x = X_train.reset_index(drop=True)
sample_y = y_train.reset_index(drop=True)

# How many samples are greater than 2017? seems to be only one sample that has the year of 2557 in this 5% sample
#sample_x.loc[(sample_x['srch_ci'] > '2017')] # erroneous year
sample_x.loc[58538, 'srch_ci'] = '2014-11-14'
sample_x.loc[58538, 'srch_co'] = '2014-11-15'

# feature engineering into datetime
sample_x['date_time'] = pd.to_datetime(sample_x['date_time'], yearfirst=True)
sample_x['srch_ci'] = pd.to_datetime(sample_x['srch_ci'], yearfirst=True, errors='coerce')
sample_x['srch_co'] = pd.to_datetime(sample_x['srch_co'], yearfirst=True, errors='coerce')

# timedelta for days in advance feature
sample_x['advance'] = sample_x['srch_ci'] - sample_x['date_time']
sample_x['advance'] = sample_x.advance.dt.days

# duration of stays
sample_x['duration'] = sample_x['srch_co'] - sample_x['srch_ci']
sample_x['duration'] = sample_x.duration.dt.days

In [6]:
new_x = sample_x.drop(columns=['hotel_market', 'user_id', 'site_name',
                      'posa_continent', 'user_location_region', 'user_location_city',
                     'hotel_continent', 'date_time', 'srch_ci', 'srch_co']) 

In [7]:
# Train test split the 1% data
xtrain, xtest, ytrain, ytest = train_test_split(new_x, sample_y, test_size=.2 ,random_state=42)

In [8]:
# Building the pipeline
def make_pipeline():
    """This function here makes the pipeline but does not include the clf to prevent data leakage."""
    
    date_features = ['advance', 'duration']
    date_transformer = Pipeline(steps=[('imp', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0))])

    distance = ['orig_destination_distance']
    distance_transformer = Pipeline(steps=[('imp', SimpleImputer(missing_values=np.nan, strategy='mean'))])

    preprocessor = ColumnTransformer(transformers=[('date', date_transformer, date_features),
                                                   ('dist', distance_transformer, distance)],
                                    remainder='passthrough')

    pipeline = Pipeline(steps=[('preprocess', preprocessor)])
    
    return pipeline

In [9]:
# make the pipeline and transform the data
pipeline = make_pipeline()
trans_xtrain = pipeline.fit_transform(xtrain)
trans_xtest = pipeline.fit_transform(xtest) 

## Fit the model for Random Forest

In [23]:
# RandomForest
def make_random_cv_rf(cv, n_iter):
    """
    Steps:
    1. Define Estimator
    2. Define hyperparameter search space
    3. Instantiate RandomizedSearchCV
    """
    
    rf = RandomForestClassifier(random_state=42, oob_score=True)
    params_grid = {'n_estimators': [100,800, 1200, 2000],
                   'max_features': [0.25, 0.5, 0.75, 0.9],
                   'min_samples_leaf': [1000, 15000, 30000]}
    
    clf_random_cv_rf = RandomizedSearchCV(estimator=rf, scoring='f1_weighted',
                                       param_distributions=params_grid, n_iter=n_iter, cv=cv)
                                         
    return clf_random_cv_rf

In [None]:
# Removed cell that had the keyboard interruption
clf_rand_rf = make_random_cv_rf(2, 7) 
clf_rand_rf.fit(trans_xtrain, ytrain);

## Evaluation Metric for Random Forest

In [12]:
#RandomForest 
print(f"The train set accuracy - {accuracy_score(ytrain, clf_rand_rf.predict(trans_xtrain)):.2%}")
print(f"The train weighted-f1-Score: {f1_score(ytrain, clf_rand_rf.predict(trans_xtrain), average='micro')}")
      
print(f"The test set accuracy - {accuracy_score(ytest, clf_rand_rf.predict(trans_xtest)):.2%}")
print(f"The test weighted-f1-Score: {f1_score(ytest, clf_rand_rf.predict(trans_xtest), average='micro')}")

The train set accuracy - 11.36%
The train weighted-f1-Score: 0.11359797717687424
The test set accuracy - 11.30%
The test weighted-f1-Score: 0.11295310654225454
