## Final Project Check-in

## Yolo


## Student Names
1. Dillon Quan
2. Shrikar Thodla
3. Mikio Tada

In [None]:
# import the necessary libraries
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

## Load Data

In [None]:
# Load the full training set from disk into a DataFrame.
df = pd.read_csv('data/train.csv')

# Split the frame into the label column and everything else.
# `df` itself is left untouched.
df_y = df['hotel_cluster']
df_X = df.drop('hotel_cluster', axis=1)

In [None]:
# Draw a 5% sample whose hotel_cluster distribution follows the full data.
# Only one split is requested, so take the first (and only) pair of index
# arrays from the generator instead of looping over it.
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.05, random_state=0)
train_idx, test_idx = next(sss.split(df_X, df_y))

# "train" is the 5% sample; "test" is whatever the splitter assigned to the
# complementary side.
X_train = df_X.iloc[train_idx]
X_test = df_X.iloc[test_idx]
y_train = df_y.iloc[train_idx]
y_test = df_y.iloc[test_idx]

In [None]:
# Re-number the sampled rows 0..n-1 so that labels match positions.
sample_x, sample_y = (part.reset_index(drop=True) for part in (X_train, y_train))

# One row of this 5% sample carries an erroneous check-in year (2557).
# It was located with: sample_x.loc[sample_x['srch_ci'] > '2017']
# NOTE(review): label 558005 is tied to this exact sample (random_state=0 on
# the full train.csv) -- confirm it still points at the bad row if the data
# or the seed changes.
sample_x.loc[558005, 'srch_ci'] = '2014-11-14'
sample_x.loc[558005, 'srch_co'] = '2014-11-15'

# Parse the three date columns. Check-in / check-out values that cannot be
# parsed become NaT (errors='coerce'); date_time is parsed strictly.
sample_x = sample_x.assign(
    date_time=pd.to_datetime(sample_x['date_time'], yearfirst=True),
    srch_ci=pd.to_datetime(sample_x['srch_ci'], yearfirst=True, errors='coerce'),
    srch_co=pd.to_datetime(sample_x['srch_co'], yearfirst=True, errors='coerce'),
)

# advance: whole days between the search timestamp and the check-in date.
sample_x['advance'] = (sample_x['srch_ci'] - sample_x['date_time']).dt.days

# duration: whole days between check-in and check-out.
sample_x['duration'] = (sample_x['srch_co'] - sample_x['srch_ci']).dt.days

In [None]:
# Columns removed before modelling: identifiers / location codes that are not
# used as features, plus the raw date columns that 'advance' and 'duration'
# were derived from. These must exist, so a typo here still raises KeyError.
required_drops = ['hotel_market', 'user_id', 'site_name',
                  'posa_continent', 'user_location_region', 'user_location_city',
                  'hotel_continent', 'date_time', 'srch_ci', 'srch_co']

# 'y' and 'year' are not created anywhere in the preceding cells, so a strict
# drop raises KeyError when they are absent. Drop them only if present.
# NOTE(review): presumably left over from an earlier exploratory cell -- confirm.
optional_drops = ['y', 'year']

new_x = (sample_x
         .drop(columns=required_drops)
         .drop(columns=optional_drops, errors='ignore'))

In [None]:
# Hold out 20% of the 5% sample for testing; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    new_x,
    sample_y,
    test_size=.2,
    random_state=42,
)

In [None]:
# Building the pipeline
def make_pipeline():
    """Return an unfitted pipeline containing only the preprocessing step.

    No classifier is attached. The single 'preprocess' step is a
    ColumnTransformer that:
      * fills missing 'advance' / 'duration' values with the constant 0,
      * fills missing 'orig_destination_distance' values with the column mean,
      * passes every other column through unchanged.
    """
    zero_fill = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    mean_fill = SimpleImputer(missing_values=np.nan, strategy='mean')

    # (name, transformer, columns) triples consumed by ColumnTransformer.
    column_steps = [
        ('date', Pipeline(steps=[('imp', zero_fill)]), ['advance', 'duration']),
        ('dist', Pipeline(steps=[('imp', mean_fill)]), ['orig_destination_distance']),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps,
                                     remainder='passthrough')

    return Pipeline([('preprocess', preprocessor)])

In [None]:
# Build the preprocessing pipeline, learn its imputation values from the
# training features, and produce the transformed training matrix in one call.
pipeline = make_pipeline()
trans_xtrain = pipeline.fit_transform(X=xtrain)

## Fit scikit-learn model

In [None]:
def make_random_cv(cv, n_iter):
    """
    Build an unfitted randomized hyperparameter search over a random forest.

    Steps:
    1. Define Estimator
    2. Define hyperparameter search space
    3. Instantiate RandomizedSearchCV

    Parameters
    ----------
    cv : forwarded to RandomizedSearchCV's ``cv`` argument.
    n_iter : forwarded to RandomizedSearchCV's ``n_iter`` argument.

    Returns
    -------
    RandomizedSearchCV
        Unfitted search object scoring candidates with 'f1_weighted'.
    """

    # max_depth and oob_score are options of the forest itself;
    # RandomizedSearchCV does not accept them and raises TypeError if they
    # are passed to it, so they are set on the estimator.
    rf = RandomForestClassifier(max_depth=20, oob_score=True, random_state=42)

    # NOTE(review): every max_features value must be <= the number of columns
    # in the matrix passed to fit -- confirm 15 and 20 are valid for this data.
    params_grid = {'n_estimators': [100, 400, 800, 1200, 1600, 2000],
                   'max_features': [5, 10, 15, 20],
                   'min_samples_leaf': [1000, 5000, 15000, 30000]}

    # Seed the search as well so the sampled candidates are repeatable,
    # in line with the fixed seeds used for the forest.
    clf_random_cv = RandomizedSearchCV(estimator=rf, scoring='f1_weighted',
                                       param_distributions=params_grid,
                                       n_iter=n_iter, cv=cv, random_state=42)

    return clf_random_cv

In [None]:
# Run a 5-fold randomized search with 10 sampled configurations on the
# transformed training data.
randomize_clf = make_random_cv(cv=5, n_iter=10)
randomize_clf.fit(trans_xtrain, ytrain)

# Keep the estimator the search selected as best.
best_rf = randomize_clf.best_estimator_

## Evaluation Metric

In [None]:
# Training-set scores for the selected forest.
# RandomizedSearchCV refits the best configuration on the full training data
# (refit=True is its default), so best_rf is already fitted and fitting it
# again would only repeat that work.
train_pred = best_rf.predict(trans_xtrain)

# A forest exposes its out-of-bag accuracy as `oob_score_` (there is no
# `best_score_` on the estimator), and only when it was built with
# oob_score=True -- hence the guarded lookup.
oob_accuracy = getattr(best_rf, 'oob_score_', None)
if oob_accuracy is None:
    print("Out-of-Bag Accuracy: unavailable (forest was not built with oob_score=True)")
else:
    print(f"Out-of-Bag Accuracy: {oob_accuracy}")

# hotel_cluster is multiclass, so f1_score needs an explicit averaging mode;
# 'weighted' matches the label printed here and the search's scoring.
print(f"Training weighted-f1-Score: {f1_score(ytrain, train_pred, average='weighted')}")

In [None]:
# Transform the test data and evaluate the model on it.

# Use transform(), not fit_transform(): the imputation values were learned
# from the training split when the pipeline was fitted, and re-fitting on the
# test split would leak test statistics into the preprocessing.
trans_xtest = pipeline.transform(xtest)
test_pred = best_rf.predict(trans_xtest)

print(f"Test Accuracy: {accuracy_score(ytest, test_pred)}")
# Report the weighted F1 on the held-out data (the target is multiclass, so
# an explicit `average` is required).
print(f"Test weighted-f1-Score: {f1_score(ytest, test_pred, average='weighted')}")