## Instructions {-}

- This is the template for the code and report on the Prediction Problem assignments.

- Your code in steps 1, 3, 4, and 5 will be executed sequentially, and must produce the RMSE / accuracy claimed on Kaggle.

- Your code in step 2 will also be executed, and must produce the optimal hyperparameter values used to train the model.

## Read data

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
import time

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, ParameterGrid, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_squared_error

import os
os.environ["OMP_NUM_THREADS"] = "1"

In [4]:
raw_train = pd.read_csv('../Datasets/train_classification.csv') 
raw_test = pd.read_csv('../Datasets/test_classification.csv')

## 1) Data pre-processing

Put the data pre-processing code. You don't need to explain it. You may use the same code from last quarter.

In [5]:
# Create copies of the raw datasets
train = raw_train.copy()
test = raw_test.copy()

# Convert 'host_acceptance_rate' and 'host_response_rate' columns to float and scale by dividing by 100
train['acceptance_rate'] = train['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
train['response_rate'] = train['host_response_rate'].str.replace('%', '').astype(float) / 100

test['acceptance_rate'] = test['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
test['response_rate'] = test['host_response_rate'].str.replace('%', '').astype(float) / 100

# Drop unnecessary columns
train.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)
test.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)



# Extract numeric values from 'bathrooms_text' column and convert to float
train['bathrooms_num'] = train['bathrooms_text'].str.extract(r'(\d+)').astype(float)
test['bathrooms_num'] = test['bathrooms_text'].str.extract(r'(\d+)').astype(float)

# Fill missing values in 'bathrooms_num' where 'Half-bath' is mentioned in 'bathrooms_text' with 0.5
train.loc[train['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & train['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5
test.loc[test['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & test['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5


In [6]:
# Convert date columns to datetime format
# def strip_date(row):
#     if isinstance(row, str):
#         row = datetime.strptime(row, '%Y-%m-%d').date()
#     return row

# # Apply date conversion to train dataset
# train['host_since'] = train['host_since'].apply(strip_date)
# train['first_review'] = train['first_review'].apply(strip_date)
# train['last_review'] = train['last_review'].apply(strip_date)

# # Apply date conversion to test dataset
# test['host_since'] = test['host_since'].apply(strip_date)
# test['first_review'] = test['first_review'].apply(strip_date)
# test['last_review'] = test['last_review'].apply(strip_date)

# # ----- #

# # Calculate months since various dates for train dataset
# train['host_since_in_months'] = round(((datetime.now().date() - train['host_since']).dt.days) / 30, 2)
# train['first_review_in_months'] = round(((datetime.now().date() - train['first_review']).dt.days) / 30, 2)
# train['last_review_in_months'] = round(((datetime.now().date() - train['last_review']).dt.days) / 30, 2)

# # Calculate months since various dates for test dataset
# test['host_since_in_months'] = round(((datetime.now().date() - test['host_since']).dt.days) / 30,  2)
# test['first_review_in_months'] = round(((datetime.now().date() - test['first_review']).dt.days) / 30, 2)
# test['last_review_in_months'] = round(((datetime.now().date() - test['last_review']).dt.days) / 30, 2)


# Convert the relevant columns to datetime
train['host_since'] = pd.to_datetime(train['host_since'])
train['first_review'] = pd.to_datetime(train['first_review'])
train['last_review'] = pd.to_datetime(train['last_review'])
test['host_since'] = pd.to_datetime(test['host_since'])
test['first_review'] = pd.to_datetime(test['first_review'])
test['last_review'] = pd.to_datetime(test['last_review'])


# Calculate months since various dates for train dataset
now = datetime.now()
train['host_since_in_months'] = round((now - train['host_since']).dt.days / 30, 2)
train['first_review_in_months'] = round((now - train['first_review']).dt.days / 30, 2)
train['last_review_in_months'] = round((now - train['last_review']).dt.days / 30, 2)
test['host_since_in_months'] = round((now - test['host_since']).dt.days / 30, 2)
test['first_review_in_months'] = round((now - test['first_review']).dt.days / 30, 2)
test['last_review_in_months'] = round((now - test['last_review']).dt.days / 30, 2)

train_clean = train.drop(columns=['host_since', 'first_review', 'last_review'])
test_clean = test.drop(columns=['host_since', 'first_review', 'last_review'])

# train['review_scores_avg'] = train['review_scores_avg'].fillna(value=0)
# test['review_scores_avg'] = test['review_scores_avg'].fillna(value=0)

In [7]:
# Create a dictionary for response time category conversions
response_time_dict = {'within an hour': 1, 'within a few hours': 12, 'within a day': 24, 'a few days or more': 72}

def replace_response_time(row):
    if pd.notna(row):
        return response_time_dict.get(row)
    else:
        return None

train_clean['response_time'] = train_clean['host_response_time'].apply(replace_response_time)
test_clean['response_time'] = test_clean['host_response_time'].apply(replace_response_time)

Clean Transform

In [8]:
def clean_vars(row):
    # Check if 'shared' is in 'bathrooms_text' to identify shared bathrooms
    if 'shared' in str(row['bathrooms_text']):
        row['bathrooms_shared'] = "t"
        
    # Check if 'bathrooms_text' is empty and 'room_type' is 'Shared' to identify shared bathrooms
    elif pd.isna(row['bathrooms_text']):
        if 'Shared' in row['room_type']:
            row['bathrooms_shared'] = "t"              
        else:
            row['bathrooms_shared'] = "f"
    else: 
        row['bathrooms_shared'] = "f"
        
    # Convert 'Hotel room' room type to 'Private room'
    if row.loc['room_type'] == 'Hotel room':
        row['room_type'] = 'Private room'
        
    return row

# Apply the function to clean variables to train and test datasets
train_clean = train_clean.apply(clean_vars, axis=1)
test_clean = test_clean.apply(clean_vars, axis=1)


# create variables for rate of reviews for listing count and for host_since_months
train_clean['reviews_per_listing'] = train_clean['number_of_reviews']/train_clean['calculated_host_listings_count']
train_clean['reviews_per_month'] = train_clean['number_of_reviews']/train_clean['host_since_in_months']
train_clean['reviews_per_listing_per_month'] = train_clean['reviews_per_listing']/train_clean['host_since_in_months']

test_clean['reviews_per_listing'] = test_clean['number_of_reviews']/test_clean['calculated_host_listings_count']
test_clean['reviews_per_month'] = test_clean['number_of_reviews']/test_clean['host_since_in_months']
test_clean['reviews_per_listing_per_month'] = test_clean['reviews_per_listing']/test_clean['host_since_in_months']

train_clean.drop(columns=['bathrooms_text'], inplace=True)
test_clean.drop(columns=['bathrooms_text'], inplace=True)


clean neighbourhoods

In [9]:
# if neighbourhood has less than 150 occurances group them into 'Other'
neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()  
test_only_hoods = [i for i in test_clean['neighbourhood_cleansed'].unique() 
                   if i not in neighbourhood_counts 
                   and i != 'Other']


other_hoods = []
for i in neighbourhood_counts.index:
    if neighbourhood_counts[i] < 50:
        other_hoods.append(i)         
    
def clean_hoods(row):
    if row.loc['neighbourhood_cleansed'] in other_hoods or row.loc['neighbourhood_cleansed'] in test_only_hoods:
        row['neighbourhood_grouped'] = 'Other'
        
    else:    
        row['neighbourhood_grouped'] = row.loc['neighbourhood_cleansed']
        
    return row
   
    
train_clean = train_clean.apply(clean_hoods, axis=1)  
test_clean = test_clean.apply(clean_hoods, axis=1)  

In [10]:
# Clean filler words out of property types
words_to_remove = ['place', 'room', 'private', 'shared', 'entire', ' in', ' room', ' private', ' shared', ' entire', ' in',]

def remove_words(text):
    text=text.lower()
    for word in words_to_remove:
        word = word.lower()
        text = text.replace(word, '')
    return text.strip()

train_clean['property_type'] = train_clean['property_type'].apply(remove_words)
test_clean['property_type'] = test_clean['property_type'].apply(remove_words)


# group properties with less than 10 occurances into 'Other'
property_counts = train_clean['property_type'].value_counts()
keep = [i for i in property_counts.index if property_counts[i] >= 5]

def clean_property(row):
    if row not in keep or row == "":
        row = 'Other'
      
    return row

train_clean['property_type_cleansed'] = train_clean['property_type'].apply(clean_property)
test_clean['property_type_cleansed'] = test_clean['property_type'].apply(clean_property)

train_filter_1 = train_clean.copy()
test_filter_1 = test_clean.copy()

In [11]:
host_hood_counts = train_filter_1['host_neighbourhood'].value_counts()
keep_host_hood = host_hood_counts[host_hood_counts >= 5].index

train_filter_1['host_neighbourhood'] = train_filter_1['host_neighbourhood'].apply(lambda x: 'Other' if x not in keep_host_hood else x)
test_filter_1['host_neighbourhood'] = test_filter_1['host_neighbourhood'].apply(lambda x: 'Other' if x not in keep_host_hood else x)
# train_final[['host_neighbourhood']].value_counts()
# test_final[['host_neighbourhood']].value_counts()

# ----- #

host_loc_counts = train_filter_1['host_location'].value_counts()
keep_host_loc = host_loc_counts[host_loc_counts >= 10].index

train_filter_1['host_location'] = train_filter_1['host_location'].apply(lambda x: 'Other' if x not in keep_host_loc else x)
test_filter_1['host_location'] = test_filter_1['host_location'].apply(lambda x: 'Other' if x not in keep_host_loc else x)
# train_final['host_location'].value_counts()
# test_final['host_location'].value_counts()


In [12]:
try:
    train_filter_1['host_verifications'] = train_filter_1['host_verifications'].apply(ast.literal_eval)
except: pass

try:
    test_filter_1['host_verifications'] = test_filter_1['host_verifications'].apply(ast.literal_eval)
except: pass


In [13]:
train_filter_1['num_verifications'] = train_filter_1['host_verifications'].apply(len)
test_filter_1['num_verifications'] = test_filter_1['host_verifications'].apply(len)

In [14]:
def split_vers(df):
    def update_verification(row):
        ver_phone = 't' if 'phone' in row['host_verifications'] else 'f'
        ver_email = 't' if 'email' in row['host_verifications'] else 'f'
        ver_work_email = 't' if 'work_email' in row['host_verifications'] else 'f'
        return pd.Series({'ver_phone': ver_phone, 'ver_email': ver_email, 'ver_work_email': ver_work_email})

    df[['ver_phone', 'ver_email', 'ver_work_email']] = df.apply(update_verification, axis=1)

    return df


train_filter_2 = split_vers(train_filter_1).drop('host_verifications', axis=1)
test_filter_2 = split_vers(test_filter_1).drop('host_verifications', axis=1)

Fill missing values

In [15]:
# review scores are very correlated, average review scores to handle this
train_filter_2['review_scores_avg'] = train_filter_2[['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']].mean(axis=1, skipna=True)
test_filter_2['review_scores_avg'] = test_filter_2[['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']].mean(axis=1, skipna=True)

train_filter_2.drop(columns=['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness'], inplace=True)
test_filter_2.drop(columns=['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness'], inplace=True)

train_filter_2['host_is_superhost'] = train_filter_2['host_is_superhost'].map({"f":False, "t":True})

# train_final['review_scores_avg'] = train_final['review_scores_avg']  # .fillna(value=train_final['review_scores_avg'].median())
# test_final['review_scores_avg'] = test_final['review_scores_avg']  # .fillna(value=train_final['review_scores_avg'].median())


# Fill in remaining missing values with median for numerical columns
train_filter_3 = train_filter_2.fillna(train_filter_2.median(numeric_only=True))
test_filter_3 = test_filter_2.fillna(test_filter_2.median(numeric_only=True))


In [16]:
# import statsmodels.api as sm
# # from statsmodels.stats.outliers_influence import variance_inflation_factor

# # non_numeric_columns = train_filter_3.select_dtypes(exclude=[np.number]).columns
# # data_numeric = train_filter_3.drop(columns=non_numeric_columns)

# # X = data_numeric.drop(columns=['id', 'latitude', 'longitude'])
# # y = train_filter_3.host_is_superhost

# # vif = pd.DataFrame()
# # vif["Predictor"] = X.columns
# # vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# # vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

In [17]:
# train_filter_4 = train_filter_3.drop(columns=['maximum_nights_avg_ntm', 'minimum_nights_avg_ntm', 'calculated_host_listings_count', 'availability_60', 'host_listings_count'])  #'maximum_maximum_nights', 'minimum_maximum_nights'])
# test_filter_4 = test_filter_3.drop(columns=['maximum_nights_avg_ntm', 'minimum_nights_avg_ntm', 'calculated_host_listings_count', 'availability_60', 'host_listings_count'])  # 'maximum_maximum_nights', 'minimum_maximum_nights'])

In [18]:
# non_numeric_columns = train_filter_4.select_dtypes(exclude=[np.number]).columns
# data_numeric = train_filter_4.drop(columns=non_numeric_columns)

# X = data_numeric.drop(columns=['id', 'latitude', 'longitude'])
# y = train_filter_4.host_is_superhost

# vif = pd.DataFrame()
# vif["Predictor"] = X.columns
# vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

In [19]:
# train_filter_5 = train_filter_4.drop(columns=['response_rate', 'review_scores_avg']) #['availability_60', 'response_rate', 'availability_90', 'host_listings_count', 'review_scores_avg'])
# test_filter_5 = test_filter_4.drop(columns=['response_rate', 'review_scores_avg']) #['availability_60', 'response_rate', 'availability_90', 'host_listings_count', 'review_scores_avg'])

In [20]:
# non_numeric_columns = train_filter_5.select_dtypes(exclude=[np.number]).columns
# data_numeric = train_filter_5.drop(columns=non_numeric_columns)

# X = data_numeric.drop(columns=['id', 'latitude', 'longitude'])
# y = train_filter_5.host_is_superhost

# vif = pd.DataFrame()
# vif["Predictor"] = X.columns
# vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

In [21]:
# train_filter_6 = train_filter_5.drop(columns=['acceptance_rate', 'host_since_in_months', 'minimum_nights_avg_ntm', 'reviews_per_month'])
# test_filter_6 = test_filter_5.drop(columns=['acceptance_rate', 'host_since_in_months', 'minimum_nights_avg_ntm', 'reviews_per_month'])

# non_numeric_columns = train_filter_6.select_dtypes(exclude=[np.number]).columns
# data_numeric = train_filter_6.drop(columns=non_numeric_columns)

# X = data_numeric.drop(columns=['id', 'latitude', 'longitude'])
# y = train_filter_6.host_is_superhost

# vif = pd.DataFrame()
# vif["Predictor"] = X.columns
# vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

In [22]:
# Create final DataFrames
train_final = train_filter_3.copy()
test_final = test_filter_3.copy()

## 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

### Which tuning method did you use (grid search / Bayes search / etc.)?

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

### How many hours did you spend on hyperparameter tuning?

**Paste the hyperparameter tuning code below. You must show at least one hyperparameter tuning procedure.**

In [23]:
#Hyperparameter tuning code

**Paste the optimal hyperparameter values below.**

## 3) Model

Using the optimal model hyperparameters, train the model, and paste the code below.

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, GridSearchCV, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

In [25]:
numeric_columns = train_final.select_dtypes(include=['number']).drop(columns=['host_id', 'id']).columns

X_train = train_final.drop(columns=['host_id', 'host_is_superhost', 'id'])
X_test = test_final.drop(columns=['host_id', 'id'])
train_final
X_train_num = X_train[numeric_columns]
y_train = train_final.host_is_superhost

sc = StandardScaler()
sc.fit(X_train_num)

X_train_scaled = sc.transform(X_train[numeric_columns])
X_test_scaled = sc.transform(X_test[numeric_columns])

X_train_num_scaled = pd.DataFrame(X_train_scaled, columns=numeric_columns)
X_test_num_scaled = pd.DataFrame(X_test_scaled, columns=numeric_columns)


train_testing = train_final.drop(columns=['host_is_superhost']) 
test_testing = test_final

train_testing_cat = train_testing.select_dtypes(exclude=['number'])
test_testing_cat = test_testing.select_dtypes(exclude=['number'])



enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
enc.fit(train_testing_cat)

drop_enc = enc.transform(train_testing_cat)
drop_enc_test = enc.transform(test_testing_cat)

train_encoded_df = pd.DataFrame(drop_enc.toarray(), columns=enc.get_feature_names_out(train_testing_cat.columns))
test_encoded_df = pd.DataFrame(drop_enc_test.toarray(), columns=enc.get_feature_names_out(test_testing_cat.columns))

X_train_final = pd.concat([X_train_num_scaled, train_encoded_df], axis=1)
X_test_final = pd.concat([X_test_num_scaled, test_encoded_df], axis=1)



### Model Creation

### AdaBoost

#### GSCV

In [26]:
base_model = DecisionTreeClassifier(random_state=1)
ada_model = AdaBoostClassifier(estimator=base_model, random_state=1)

In [34]:
coarse_params = {
    'estimator__max_depth': range(10, 26, 5),
    'n_estimators': range(25, 205, 25), # range(350, 460, 25),
    'learning_rate': [0.01, 0.1, 1.0]
}

coarse_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
gscv_coarse_model = GridSearchCV(ada_model, coarse_params, cv=coarse_cv, verbose=2, scoring='accuracy', n_jobs=int(os.getenv("SLURM_NPROCS", 1))).fit(X_train_final, y_train)

cv_results = gscv_coarse_model.cv_results_
gscv_coarse_model.best_params_

Fitting 3 folds for each of 96 candidates, totalling 288 fits


{'estimator__max_depth': 20, 'learning_rate': 1.0, 'n_estimators': 200}

In [41]:
mid_params = {
    'estimator__max_depth': range(15, 26, 2),
    'n_estimators': range(60, 112, 10),
    'learning_rate': [0.5, 1.0, 5]
}

mid_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
gscv_mid_model = GridSearchCV(ada_model, mid_params, cv=mid_cv, verbose=2, scoring='accuracy', n_jobs=int(os.getenv("SLURM_NPROCS", 1))).fit(X_train_final, y_train)

cv_results = gscv_mid_model.cv_results_
gscv_mid_model.best_params_


Fitting 3 folds for each of 108 candidates, totalling 324 fits


{'estimator__max_depth': 21, 'learning_rate': 1.0, 'n_estimators': 100}

In [28]:
fine_params = {
    'estimator__max_depth': range(17, 22, 1),
    'n_estimators': range(105, 122, 5),
    'learning_rate': [0.75, 1.0, 2.5]
}

fine_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
gscv_fine_model = GridSearchCV(ada_model, fine_params, cv=fine_cv, verbose=2, scoring='accuracy', n_jobs=-1).fit(X_train_final, y_train)

cv_results = gscv_fine_model.cv_results_
gscv_fine_model.best_params_


Fitting 10 folds for each of 60 candidates, totalling 600 fits


{'estimator__max_depth': 19, 'learning_rate': 1.0, 'n_estimators': 110}

In [None]:
cross_val_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
ada_pred_probas = cross_val_predict(gscv_fine_model, X_train_final, y_train, cv=cross_val_cv, n_jobs=-1)[:, 1]

ada_scores = []
for thr in np.arange(0.05, 1.0, 0.05):
    ada_preds = ada_pred_probas > thr
    acc = accuracy_score(y_train, ada_preds)
    ada_scores.append[acc, thr]
    
ada_score_df = pd.DataFrame(ada_scores, columns=['acc', 'thr'])

In [None]:
ada_score_df

#### AdaBoosting Loop

In [29]:
coarse_loop_params = {
    'estimator__max_depth': range(10, 26, 5),
    'n_estimators': range(25, 205, 25),
    'learning_rate': [0.01, 0.1, 1.0],
    'thresholds': np.arange(0.3, 0.71, 0.05)
}

loop_results = [(list(coarse_loop_params.keys())+['accuracy'])]

for dpth in coarse_loop_params['estimator__max_depth']:
    base_model = DecisionTreeClassifier(max_depth=dpth, random_state=1)
    
    for n_est in coarse_loop_params['n_estimators']:
        print(dpth, "-", n_est)
        for lr_rt in coarse_loop_params['learning_rate']:
            ada_model = AdaBoostClassifier(estimator=base_model, n_estimators=n_est, learning_rate=lr_rt, random_state=1) #.fit(temp_X_train, temp_y_train)
            pred_probas = cross_val_predict(ada_model, X_train_final, y_train, cv=3, method='predict_proba', n_jobs=int(os.getenv("SLURM_NPROCS", 1)))[:, 1]
            
            for thr in coarse_loop_params['thresholds']:
                preds = pred_probas > thr
                acc = accuracy_score(y_train, preds)
                
                loop_results.append([dpth, n_est, lr_rt, thr, acc])

loop_results


10 - 25
10 - 50
10 - 75
10 - 100
10 - 125
10 - 150
10 - 175
10 - 200
15 - 25
15 - 50
15 - 75
15 - 100
15 - 125
15 - 150
15 - 175
15 - 200
20 - 25
20 - 50
20 - 75
20 - 100
20 - 125
20 - 150
20 - 175
20 - 200
25 - 25
25 - 50
25 - 75
25 - 100
25 - 125
25 - 150
25 - 175
25 - 200


[['estimator__max_depth',
  'n_estimators',
  'learning_rate',
  'thresholds',
  'accuracy'],
 [10, 25, 0.01, 0.3, 0.8175607795860961],
 [10, 25, 0.01, 0.35, 0.8161543098251959],
 [10, 25, 0.01, 0.39999999999999997, 0.814948764315853],
 [10, 25, 0.01, 0.44999999999999996, 0.8143459915611815],
 [10, 25, 0.01, 0.49999999999999994, 0.8131404460518384],
 [10, 25, 0.01, 0.5499999999999999, 0.812537673297167],
 [10, 25, 0.01, 0.5999999999999999, 0.8129395218002813],
 [10, 25, 0.01, 0.6499999999999999, 0.8135422945549527],
 [10, 25, 0.01, 0.7, 0.8131404460518384],
 [10, 25, 0.1, 0.3, 0.833634719710669],
 [10, 25, 0.1, 0.35, 0.832228249949769],
 [10, 25, 0.1, 0.39999999999999997, 0.8320273256982118],
 [10, 25, 0.1, 0.44999999999999996, 0.8324291742013261],
 [10, 25, 0.1, 0.49999999999999994, 0.8318264014466547],
 [10, 25, 0.1, 0.5499999999999999, 0.8312236286919831],
 [10, 25, 0.1, 0.5999999999999999, 0.8304199316857545],
 [10, 25, 0.1, 0.6499999999999999, 0.8286116134217401],
 [10, 25, 0.1, 0

In [30]:
loop_df = pd.DataFrame(loop_results, columns=loop_results[0]).loc[1:, :]
loop_df.sort_values('accuracy', ascending=False, inplace=True)

In [31]:
best_loop_param = loop_df.iloc[0, :]
print(best_loop_param)

best_loop_model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=best_loop_param['estimator__max_depth'], random_state=1), n_estimators=best_loop_param['n_estimators'], learning_rate=best_loop_param['learning_rate'], random_state=1).fit(X_train_final, y_train)

estimator__max_depth          15
n_estimators                 175
learning_rate                1.0
thresholds                   0.4
accuracy                0.892506
Name: 399, dtype: object


In [None]:
grid_2 = {
    'estimator__max_depth': np.arange(10, 21, 2),
    'n_estimators': np.arange(150, 201, 10),
    'learning_rate': [0.5, 1, 5]
}   

fine_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
test_gscv_model = GridSearchCV(ada_model, grid_2, cv=fine_cv, verbose=2, scoring='accuracy', n_jobs=int(os.getenv("SLURM_NPROCS", 1))).fit(X_train_final, y_train)

cv_results = test_gscv_model.cv_results_
test_gscv_model.best_params_

Fitting 15 folds for each of 108 candidates, totalling 1620 fits
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  36.2s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  35.4s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  26.2s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  29.2s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  28.0s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  27.4s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  27.8s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  29.7s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  26.6s
[CV] END estimator__max_depth=10, learning_rate=0.5, n_estimators=150; total time=  32.5s
[CV] END estimator__max_depth=10, l

[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  27.2s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  25.5s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  29.3s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  26.3s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  31.3s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  26.1s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  27.9s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  27.3s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  25.3s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  25.9s
[CV] END estimator__max_depth=10, learning_rate=1, n_estimators=150; total time=  27.5s
[CV] END estimator__max_depth=10

[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.3min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.2min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.3min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.3min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.3min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.2min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.4min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.3min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.1min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=150; total time= 1.1min
[CV] END estimator__max_depth=10, learning_rate=5, n_estimators=160; total time= 1.2min
[CV] END estimator__max_depth=10

In [None]:

#     'thresholds': (0.35, 0.45, 0.005)

### Gradient Boosting
#### GSCV

In [24]:
start_time = time.time()
grad_model = GradientBoostingClassifier(random_state=1)
coarse_grad_grid = {
    'n_estimators': [500, 700, 900, 1200, 1500, 3000],
    'learning_rate': [1.0], # [0.01, 0.1, 1.0],
    'max_leaf_nodes': [4], # [2, 3, 4, 5, 6, 7],
    'subsample': [0.8] # [0.5, 0.6, 0.8, 1.0]
}

coarse_grad_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
rscv_grad_coarse = GridSearchCV(estimator=grad_model, param_grid=coarse_grad_grid, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=coarse_grad_cv, verbose=True, scoring='accuracy').fit(X_train_final, y_train)

print("Best: %f using %s" % (rscv_grad_coarse.best_score_, rscv_grad_coarse.best_params_))
print("Time taken = ", time.time() - start_time, "seconds")

Fitting 3 folds for each of 6 candidates, totalling 18 fits


KeyboardInterrupt: 

In [None]:
subsamp = rscv_grad_coarse.best_params_['subsample']
learn_rt = rscv_grad_coarse.best_params_['learning_rate']
n_est = rscv_grad_coarse.best_params_['n_estimators']
max_leaf = rscv_grad_coarse.best_params_['max_leaf_nodes']

if subsamp > 0.8:
    subsamp_range = np.arange(subsamp-0.2, 1.05, 0.1)
else:
    subsamp_range = np.arange(subsamp-0.2, subsamp+0.25, 0.1)
    
learn_rt_range = np.arange(learn_rt-2*(learn_rt/5), learn_rt+3*(learn_rt/5), (learn_rt)/5)
n_est_range = np.arange(n_est-2*(n_est/5), n_est+3*(n_est/5), n_est/5).astype(int)
max_leaf_range = np.arange(max_leaf-2, max_leaf+3, 1)

start_time = time.time()
model = GradientBoostingClassifier(random_state=1, loss='log_loss')
coarse_grid_2 = {
        'n_estimators': n_est_range,
        'max_leaf_nodes': max_leaf_range,
        'learning_rate': learn_rt_range,
        'subsample': subsamp_range
}

cv = KFold(n_splits=3, shuffle=True, random_state=1)
rscv_coarse_grad_2 = RandomizedSearchCV(estimator=model, param_distributions=coarse_grid_2, n_iter=50, n_jobs=int(os.getenv("SLURM_NPROCS", 1)), cv=cv, verbose=True, scoring='accuracy').fit(X_train_final, y_train)

print(rscv_coarse_grad_2.best_params_)
print("Time taken = ",(time.time()-start_time)/60," minutes")

In [None]:
best_param = rscv_coarse_grad_2.best_params_
cross_val_ypred = cross_val_predict(GradientBoostingClassifier(random_state=1, max_leaf_nodes=best_param['max_leaf_nodes'], learning_rate=best_param['learning_rate'], subsample=best_param['subsample'], n_estimators=best_param['n_estimators']), 
                                    X_train_final, y_train, 
                                    cv=5, method='predict_proba')[:, 1]

In [None]:
threshes = np.arange(0.2, 0.81, 0.05)

accs = {} 
for thr in threshes:
    y_preds = cross_val_ypred > thr
    accs[thr] = accuracy_score(y_train, y_preds)
    
thr_df = pd.DataFrame(np.array(list(accs.items())), columns=['thr', 'acc']).sort_values('acc', ascending=False).reset_index(drop=True)
best_thr = thr_df.loc[np.argmax(thr_df['acc']), 'thr']

In [36]:
grad_loop_params = {
    'max_leaf_nodes': list(range(5, 10)),
    'n_estimators': list(range(100, 312, 40)),
    'subsample': [0.8, 0.9, 1.0],
    'learning_rate': [0.01, 0.1, 1.0],
    'thresholds': np.arange(0.3, 0.71, 0.1)
}

grad_loop_results = [(list(grad_loop_params.keys())+['accuracy'])]

grad_loop_params
for dpth in grad_loop_params['max_leaf_nodes']:
#     base_model = DecisionTreeClassifier(max_leaf_nodes=dpth, random_state=1)
    print(dpth)
    
    for n_est in grad_loop_params['n_estimators']:
        for sub_samp in grad_loop_params['subsample']:
            for lr_rt in grad_loop_params['learning_rate']:
                grad_model = GradientBoostingClassifier(max_leaf_nodes=dpth, n_estimators=n_est, learning_rate=lr_rt, subsample=sub_samp, random_state=1) #.fit(temp_X_train, temp_y_train)
                pred_probas = cross_val_predict(grad_model, X_train_final, y_train, cv=3, method='predict_proba', n_jobs=int(os.getenv("SLURM_NPROCS", 1)))[:, 1]

                for thr in grad_loop_params['thresholds']:
                    preds = pred_probas > thr
                    acc = accuracy_score(y_train, preds)

                    grad_loop_results.append([dpth, n_est, sub_samp, lr_rt, thr, acc])

# grad_loop_results
print("Done")

5


KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(grad_loop_results)
df = df.rename(columns=df.iloc[0]).iloc[1:, :]
df.sort_values('accuracy', ascending=False, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# grad_loop_params = {
#     'max_leaf_nodes': range(6, 11, 1),
#     'max_depth': 
#     'n_estimators': range(40, 102, 10),
#     'subsample': [0.7, 0.8, 0.9, 1.0],
#     'learning_rate': [0.01, 0.1, 1.0],
#     'thresholds': np.arange(0.3, 0.71, 0.1)
# }

# grad_loop_results = [(list(grad_loop_params.keys())+['accuracy'])]


In [140]:
best_grad_loop_model = GradientBoostingClassifier(max_leaf_nodes=8, n_estimators=100, learning_rate=1.0, subsample=1.0, random_state=1).fit(X_train_final, y_train)

### XGBoost

In [None]:
from xgboost import XGBRegressor, XGBClassifier

In [None]:
model = XGBClassifier(random_state = 12, # default value, can be taken out
                     objective = "binary:logistic",
                    scale_pos_weight = 9) # If the class imbalance is 90%-10%.

grid = {

                     'n_estimators': [...], # Same idea as Gradient Boosting
                     'max_depth': [...], # Same idea as Gradient Boosting
                     'subsample': [...], # Same idea as Gradient Boosting
                    'learning_rate': [...], # Same idea as Gradient Boosting
                    
                    # XGBoost hyperparams
                    'reg_lambda':[0.01, 0.1, 1], # Try different orders of magnitude - maybe 0.001
                    'gamma': [0, 0.1, 1], # Always try 0 and a couple of orders of magnitude (starting with only 0 can be useful)

                    'colsample_bytree': [0.5, 0.75, 1.0] # You can just keep it out as well.
}



### CatBoost

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier

### LightGBM

In [None]:
from lightgbm import LGBMRegressor, LGBMClassifier

### Predict

In [32]:
# # AdaBoost Prediction
# y_preds_test = pd.DataFrame(gscv_fine_model.predict(X_test_final)).rename({0:'predicted'}, axis=1)
# AdaBoost Loop predictions
y_preds_test = pd.DataFrame(best_loop_model.predict_proba(X_test_final)[:, 1] > best_loop_param['thresholds']).rename({0:'predicted'}, axis=1)


# # Gradient Boost RandomizedSearchCV Predictions
# y_preds_test = pd.DataFrame(rscv_grad_coarse.predict_proba(X_test_final)[:, 1] > best_thr).rename({0:'predicted'}, axis=1)
# # Gradient Boost Loop
# y_preds_test = pd.DataFrame(best_grad_loop_model.predict_proba(X_test_final)[:, 1] > 0.4).rename({0:'predicted'}, axis=1)



In [33]:
predicted_values = pd.concat([test_final[['id', 'host_id']], y_preds_test], axis=1)
overlapping_hosts = train_final[train_final['host_id'].isin(test_final['host_id'])].drop_duplicates('host_id')[['host_id', 'host_is_superhost']]

def overwrite(row):
    if row['host_id'] in overlapping_hosts['host_id'].values:
        row['predicted'] = overlapping_hosts[overlapping_hosts['host_id'] == row['host_id']]['host_is_superhost'].values[0]
    return row
        
predicted_values = predicted_values.apply(overwrite, axis=1)
predicted_values = predicted_values[['id', 'predicted']].set_index('id')
predicted_values

Unnamed: 0_level_0,predicted
id,Unnamed: 1_level_1
1543972437713169913,False
1710552057351883447,False
97075525,True
83734823,True
56722823,False
...,...
44798957,True
1929899281829298917,True
36015595,False
1472538040789213113,False


## 4) Put any ad-hoc steps for further improving model accuracy
For example, scaling up or scaling down the predictions, capping predictions, etc.

## 5) Export the predictions in the format required to submit on Kaggle

In [34]:
# predicted_values.to_csv('pred_csvs/boost_class_model_10.csv')