## Instructions {-}

- This is the template for the code and report on the Prediction Problem assignments.

- Your code in steps 1, 3, 4, and 5 will be executed sequentially, and must produce the RMSE / accuracy claimed on Kaggle.

- Your code in step 2 will also be executed, and must produce the optimal hyperparameter values used to train the model.

## Read data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, mean_squared_error
from scipy.stats import uniform
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from IPython import display

In [2]:
raw_train = pd.read_csv('../Datasets/train_classification.csv') # use reg data for class?
raw_test = pd.read_csv('../Datasets/test_classification.csv')

## 1) Data pre-processing

Put the data pre-processing code. You don't need to explain it. You may use the same code from last quarter.

In [3]:
# Create copies of the raw datasets
train = raw_train.copy()
test = raw_test.copy()

# Convert 'host_acceptance_rate' and 'host_response_rate' columns to float and scale by dividing by 100
train['acceptance_rate'] = train['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
train['response_rate'] = train['host_response_rate'].str.replace('%', '').astype(float) / 100

test['acceptance_rate'] = test['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
test['response_rate'] = test['host_response_rate'].str.replace('%', '').astype(float) / 100

# Drop unnecessary columns
train.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)
test.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)


# Extract numeric values from 'bathrooms_text' column and convert to float
train['bathrooms_num'] = train['bathrooms_text'].str.extract('(\d+)').astype(float)
test['bathrooms_num'] = test['bathrooms_text'].str.extract('(\d+)').astype(float)

# Fill missing values in 'bathrooms_num' where 'Half-bath' is mentioned in 'bathrooms_text' with 0.5
train.loc[train['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & train['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5
test.loc[test['bathrooms_text'].str.contains('Half-bath', case=False, na=False) & test['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5


In [4]:
# Convert date columns to datetime format
def strip_date(row):
    if isinstance(row, str):
        row = datetime.strptime(row, '%Y-%m-%d').date()
    return row

# Apply date conversion to train dataset
train['host_since'] = train['host_since'].apply(strip_date)
train['first_review'] = train['first_review'].apply(strip_date)
train['last_review'] = train['last_review'].apply(strip_date)

# Apply date conversion to test dataset
test['host_since'] = test['host_since'].apply(strip_date)
test['first_review'] = test['first_review'].apply(strip_date)
test['last_review'] = test['last_review'].apply(strip_date)

# ----- #

# Calculate months since various dates for train dataset
train['host_since_in_months'] = round(((datetime.now().date() - train['host_since']).dt.days) / 30, 2)
train['first_review_in_months'] = round(((datetime.now().date() - train['first_review']).dt.days) / 30, 2)
train['last_review_in_months'] = round(((datetime.now().date() - train['last_review']).dt.days) / 30, 2)

# Calculate months since various dates for test dataset
test['host_since_in_months'] = round(((datetime.now().date() - test['host_since']).dt.days) / 30,  2)
test['first_review_in_months'] = round(((datetime.now().date() - test['first_review']).dt.days) / 30, 2)
test['last_review_in_months'] = round(((datetime.now().date() - test['last_review']).dt.days) / 30, 2)


train_clean = train.drop(columns=['host_since', 'first_review', 'last_review'])
test_clean = test.drop(columns=['host_since', 'first_review', 'last_review'])


In [5]:
# Create a dictionary for response time category conversions
response_time_dict = {'within an hour': 1, 'within a few hours': 12, 'within a day': 24, 'a few days or more': 72}

def replace_response_time(row):
    if pd.notna(row):
        return response_time_dict.get(row)
    else:
        return None

train_clean['response_time'] = train_clean['host_response_time'].apply(replace_response_time)
test_clean['response_time'] = test_clean['host_response_time'].apply(replace_response_time)

Clean Transform

In [6]:
def clean_vars(row):
    # Check if 'shared' is in 'bathrooms_text' to identify shared bathrooms
    if 'shared' in str(row['bathrooms_text']):
        row['bathrooms_shared'] = "t"
        
    # Check if 'bathrooms_text' is empty and 'room_type' is 'Shared' to identify shared bathrooms
    elif pd.isna(row['bathrooms_text']):
        if 'Shared' in row['room_type']:
            row['bathrooms_shared'] = "t"              
        else:
            row['bathrooms_shared'] = "f"
    else: 
        row['bathrooms_shared'] = "f"
        
    # Convert 'Hotel room' room type to 'Private room'
    if row.loc['room_type'] == 'Hotel room':
        row['room_type'] = 'Private room'
        
    return row

# Apply the function to clean variables to train and test datasets
train_clean = train_clean.apply(clean_vars, axis=1)
test_clean = test_clean.apply(clean_vars, axis=1)


# create variables for rate of reviews for listing count and for host_since_months
train_clean['reviews_per_listing'] = train_clean['number_of_reviews']/train_clean['calculated_host_listings_count']
train_clean['reviews_per_month'] = train_clean['number_of_reviews']/train_clean['host_since_in_months']
train_clean['reviews_per_listing_per_month'] = train_clean['reviews_per_listing']/train_clean['host_since_in_months']

test_clean['reviews_per_listing'] = test_clean['number_of_reviews']/test_clean['calculated_host_listings_count']
test_clean['reviews_per_month'] = test_clean['number_of_reviews']/test_clean['host_since_in_months']
test_clean['reviews_per_listing_per_month'] = test_clean['reviews_per_listing']/test_clean['host_since_in_months']


clean neighbourhoods

In [7]:
# if neighbourhood has less than 150 occurances group them into 'Other'
neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()  
test_only_hoods = [i for i in test_clean['neighbourhood_cleansed'].unique() 
                   if i not in neighbourhood_counts 
                   and i != 'Other']


other_hoods = []
for i in neighbourhood_counts.index:
    if neighbourhood_counts[i] < 150:
        other_hoods.append(i)         
    
def clean_hoods(row):
    if row.loc['neighbourhood_cleansed'] in other_hoods or row.loc['neighbourhood_cleansed'] in test_only_hoods:
        row['neighbourhood_grouped'] = 'Other'
        
    else:    
        row['neighbourhood_grouped'] = row.loc['neighbourhood_cleansed']
        
    return row
   
    
train_clean = train_clean.apply(clean_hoods, axis=1)  
test_clean = test_clean.apply(clean_hoods, axis=1)  

In [8]:
# Clean filler words out of property types
words_to_remove = ['place', 'room', 'private', 'shared', 'entire', ' in', ' room', ' private', ' shared', ' entire', ' in',]

def remove_words(text):
    text=text.lower()
    for word in words_to_remove:
        word = word.lower()
        text = text.replace(word, '')
    return text.strip()

train_clean['property_type'] = train_clean['property_type'].apply(remove_words)
test_clean['property_type'] = test_clean['property_type'].apply(remove_words)


# group properties with less than 10 occurances into 'Other'
property_counts = train_clean['property_type'].value_counts()
keep = [i for i in property_counts.index if property_counts[i] > 10]

def clean_property(row):
    if row not in keep or row == "":
        row = 'Other'
      
    return row

train_clean['property_type_cleansed'] = train_clean['property_type'].apply(clean_property)
test_clean['property_type_cleansed'] = test_clean['property_type'].apply(clean_property)

train_filter_1 = train_clean.copy()
test_filter_1 = test_clean.copy()

In [9]:
host_hood_counts = train_filter_1['host_neighbourhood'].value_counts()
keep_host_hood = host_hood_counts[host_hood_counts >= 5].index

train_filter_1['host_neighbourhood'] = train_filter_1['host_neighbourhood'].apply(lambda x: 'Other' if x not in keep_host_hood else x)
test_filter_1['host_neighbourhood'] = test_filter_1['host_neighbourhood'].apply(lambda x: 'Other' if x not in keep_host_hood else x)
# train_final[['host_neighbourhood']].value_counts()
# test_final[['host_neighbourhood']].value_counts()

# ----- #

host_loc_counts = train_filter_1['host_location'].value_counts()
keep_host_loc = host_loc_counts[host_loc_counts >= 10].index

train_filter_1['host_location'] = train_filter_1['host_location'].apply(lambda x: 'Other' if x not in keep_host_loc else x)
test_filter_1['host_location'] = test_filter_1['host_location'].apply(lambda x: 'Other' if x not in keep_host_loc else x)
# train_final['host_location'].value_counts()
# test_final['host_location'].value_counts()


In [10]:
try:
    train_filter_1['host_verifications'] = train_filter_1['host_verifications'].apply(ast.literal_eval)
except: pass

try:
    test_filter_1['host_verifications'] = test_filter_1['host_verifications'].apply(ast.literal_eval)
except: pass


In [11]:
train_filter_1['num_verifications'] = train_filter_1['host_verifications'].apply(len)
test_filter_1['num_verifications'] = test_filter_1['host_verifications'].apply(len)

In [12]:
def split_vers(df):
    def update_verification(row):
        ver_phone = 't' if 'phone' in row['host_verifications'] else 'f'
        ver_email = 't' if 'email' in row['host_verifications'] else 'f'
        ver_work_email = 't' if 'work_email' in row['host_verifications'] else 'f'
        return pd.Series({'ver_phone': ver_phone, 'ver_email': ver_email, 'ver_work_email': ver_work_email})

    df[['ver_phone', 'ver_email', 'ver_work_email']] = df.apply(update_verification, axis=1)

    return df


train_filter_2 = split_vers(train_filter_1).drop('host_verifications', axis=1)
test_filter_2 = split_vers(test_filter_1).drop('host_verifications', axis=1)

Fill missing values

In [13]:
# Fill in remaining missing values with median for numerical columns
train_filter_2.fillna(train_filter_2.median(numeric_only=True), inplace=True)
test_filter_2.fillna(test_filter_2.median(numeric_only=True), inplace=True)


In [14]:
# review scores are very correlated, average review scores to handle this
train_filter_2['review_scores_avg'] = np.mean(train_filter_2[['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)
test_filter_2['review_scores_avg'] = np.mean(test_filter_2[['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)

train_filter_2.drop(columns=['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness'], inplace=True)
test_filter_2.drop(columns=['review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 'review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness'], inplace=True)

In [15]:
train_filter_2['host_is_superhost'] = train_filter_2['host_is_superhost'].replace({"f":False, "t":True})

In [16]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

non_numeric_columns = train_filter_2.select_dtypes(exclude=[np.number]).columns
data_numeric = train_filter_2.drop(columns=non_numeric_columns)

X = data_numeric.drop(columns=['host_id', 'id', 'latitude', 'longitude'])
y = train_filter_2.host_is_superhost

vif = pd.DataFrame()
vif["Predictor"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

Unnamed: 0,Predictor,VIF
11,maximum_nights_avg_ntm,414409900000.0
8,minimum_maximum_nights,220338100000.0
9,maximum_maximum_nights,43102220000.0
19,calculated_host_listings_count,37130.06
20,calculated_host_listings_count_entire_homes,36913.91
13,availability_60,255.8071
10,minimum_nights_avg_ntm,192.454
25,response_rate,163.6986
34,review_scores_avg,158.7833
14,availability_90,156.6422


In [17]:
train_filter_3 = train_filter_2.drop(columns=['availability_60', 'availability_90', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'minimum_minimum_nights', 'minimum_maximum_nights', 'maximum_minimum_nights', 'maximum_maximum_nights'])
test_filter_3 = test_filter_2.drop(columns=['availability_60', 'availability_90', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'minimum_minimum_nights', 'minimum_maximum_nights', 'maximum_minimum_nights', 'maximum_maximum_nights'])

In [18]:
non_numeric_columns = train_filter_3.select_dtypes(exclude=[np.number]).columns
data_numeric = train_filter_3.drop(columns=non_numeric_columns)

X = data_numeric.drop(columns=['host_id', 'id', 'latitude', 'longitude'])
y = train_filter_3.host_is_superhost

vif = pd.DataFrame()
vif["Predictor"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

Unnamed: 0,Predictor,VIF
17,response_rate,162.468319
26,review_scores_avg,158.29592
0,host_listings_count,114.321915
1,host_total_listings_count,48.292613
6,minimum_nights_avg_ntm,45.241613
16,acceptance_rate,34.067081
25,num_verifications,16.427689
15,reviews_per_month,12.251551
2,accommodates,11.671614
23,reviews_per_listing,11.617984


In [19]:
train_filter_4 = train_filter_3.drop(columns=['host_listings_count', 'response_rate'])
test_filter_4 = test_filter_3.drop(columns=['host_listings_count', 'response_rate'])

In [20]:
non_numeric_columns = train_filter_4.select_dtypes(exclude=[np.number]).columns
data_numeric = train_filter_4.drop(columns=non_numeric_columns)

X = data_numeric.drop(columns=['host_id', 'id', 'latitude', 'longitude'])
y = train_filter_4.host_is_superhost

vif = pd.DataFrame()
vif["Predictor"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif[vif['VIF'] >= 5].sort_values('VIF', ascending=False)

Unnamed: 0,Predictor,VIF
24,review_scores_avg,50.53296
15,acceptance_rate,30.401089
23,num_verifications,16.233927
5,minimum_nights_avg_ntm,15.105715
14,reviews_per_month,12.2205
1,accommodates,11.618679
21,reviews_per_listing,11.615309
2,beds,11.463107
9,number_of_reviews,11.19271
22,reviews_per_listing_per_month,10.56373


In [21]:
# Create final DataFrames
train_final = train_filter_4.copy()
test_final = test_filter_4.copy()

## 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

I attempted the model hyperparameters and tuning about 11 times.

### Which tuning method did you use (grid search / Bayes search / etc.)?

I used loops and added each combinations values to a dataframe.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

Originally I used GridSearchCV, but changed to loops so I could tune the threshold along with n_neighbors and weights.

### How many hours did you spend on hyperparameter tuning?

I spent approximately 5 hours tuning hyperparameters.

**Paste the hyperparameter tuning code below. You must show at least one hyperparameter tuning procedure.**

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, GridSearchCV, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [23]:
numeric_columns = train_final.select_dtypes(include=['number']).drop(columns=['id']).columns

X_train = train_final.drop(columns=['host_is_superhost', 'id'])
X_test = test_final.drop(columns=['id'])
train_final
X_train_num = X_train[numeric_columns]
y_train = train_final.host_is_superhost

sc = StandardScaler()
sc.fit(X_train_num)

X_train_scaled = sc.transform(X_train[numeric_columns])
X_test_scaled = sc.transform(X_test[numeric_columns])

X_train_num_scaled = pd.DataFrame(X_train_scaled, columns=numeric_columns)
X_test_num_scaled = pd.DataFrame(X_test_scaled, columns=numeric_columns)


train_testing = train_final.drop(columns=['host_is_superhost']) # 'host_location', 'host_neighbourhood', 
test_testing = test_final  #.drop(columns=['host_location', 'host_neighbourhood'])

train_testing_cat = train_testing.select_dtypes(exclude=['number'])
test_testing_cat = test_testing.select_dtypes(exclude=['number'])



enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
enc.fit(train_testing_cat)

drop_enc = enc.transform(train_testing_cat)
drop_enc_test = enc.transform(test_testing_cat)

train_encoded_df = pd.DataFrame(drop_enc.toarray(), columns=enc.get_feature_names_out(train_testing_cat.columns))
test_encoded_df = pd.DataFrame(drop_enc_test.toarray(), columns=enc.get_feature_names_out(test_testing_cat.columns))

X_train_final = pd.concat([X_train_num_scaled, train_encoded_df], axis=1)
X_test_final = pd.concat([X_test_num_scaled, test_encoded_df], axis=1)



In [24]:
def dist_power_2(distance):
    return 1/(1e-10+distance**2)
def dist_power_3(distance):
    return 1/(1e-10+distance**3)
def dist_power_4(distance):
    return 1/(1e-10+distance**4)
def dist_power_5(distance):
    return 1/(1e-10+distance**5)

def dist_power_6(distance):
    return 1/(1e-10+distance**6)
def dist_power_7(distance):
    return 1/(1e-10+distance**7)
def dist_power_8(distance):
    return 1/(1e-10+distance**8)
def dist_power_9(distance):
    return 1/(1e-10+distance**9)
def dist_power_10(distance):
    return 1/(1e-10+distance**10)

def dist_power_11(distance):
    return 1/(1e-10+distance**11)
def dist_power_12(distance):
    return 1/(1e-10+distance**12)
def dist_power_13(distance):
    return 1/(1e-10+distance**13)
def dist_power_14(distance):
    return 1/(1e-10+distance**14)
def dist_power_15(distance):
    return 1/(1e-10+distance**15)

def dist_power_16(distance):
    return 1/(1e-10+distance**16)
def dist_power_17(distance):
    return 1/(1e-10+distance**17)
def dist_power_18(distance):
    return 1/(1e-10+distance**18)
def dist_power_19(distance):
    return 1/(1e-10+distance**19)
def dist_power_20(distance):
    return 1/(1e-10+distance**20)


### Model Creation

In [25]:
first_k_step = 25
first_thr_step = 0.1

cv_settings = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

In [26]:
thresholds = np.arange(0.3, 0.8, first_thr_step)
Ks = np.arange(50, 525, first_k_step)
weight_options = [dist_power_6, dist_power_7, dist_power_8, dist_power_9, dist_power_10, dist_power_11, dist_power_12]

param_df = pd.DataFrame(columns = ['K', 'weight', 'thr', 'acc'])
counter = 0

# Loop through each parameter combination
for K in Ks:
    print(K)
    
    for weight in weight_options:
        model = KNeighborsClassifier(n_neighbors=K, weights=weight)
        y_pred_probas = cross_val_predict(model, X_train_final, y_train, cv=cv_settings, method='predict_proba')[:, 1]
        
        for thr in thresholds:
            param_df.loc[counter, 'acc'] = accuracy_score(y_train, (y_pred_probas > thr))
            param_df.loc[counter, 'K'] = K
            param_df.loc[counter, 'thr'] = thr
            param_df.loc[counter, 'weight'] = weight
        
            counter += 1
            

50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500


In [27]:
best_entry = param_df[param_df['acc'] == param_df['acc'].max()]

best_k, best_weight, best_thr = np.array(best_entry[['K', 'weight', 'thr']])[0].tolist()
best_thr = round(best_thr, 3)

print((best_k, best_weight, best_thr))

(125, <function dist_power_9 at 0x00000193519467A0>, 0.6)


### Model Tuning

In [28]:
next_k_range = int(np.ceil(first_k_step + first_k_step/5))
next_k_step = 5
next_thr_range = round(first_thr_step + first_thr_step/5, 3)
next_thr_step = 0.05

new_Ks = np.arange(best_k-next_k_range, best_k+next_k_step+next_k_range, next_k_step)
new_thresholds = np.arange(best_thr-next_thr_range, best_thr+next_thr_range+next_thr_step, next_thr_step)
new_weights = [dist_power_6, dist_power_7, dist_power_8, dist_power_9, dist_power_10]

print(best_k, next_k_range, next_k_step, best_thr, next_thr_range, next_thr_step)

param_df_2 = pd.DataFrame(columns = ['K', 'weight', 'thr', 'acc'])
counter = 0

# Loop through each parameter combination
for K in new_Ks:
    print(K)
    for weight in new_weights:
        model = KNeighborsClassifier(n_neighbors=K, weights=weight)
        y_pred_probas = cross_val_predict(model, X_train_final, y_train, cv=cv_settings, method='predict_proba')[:, 1]
        
        for thr in new_thresholds:
            param_df_2.loc[counter, 'K'] = K
            param_df_2.loc[counter, 'thr'] = thr
            param_df_2.loc[counter, 'weight'] = weight
            param_df_2.loc[counter, 'acc'] = accuracy_score(y_train, (y_pred_probas > thr))
        
            counter += 1
            

125 30 5 0.6 0.12 0.05
95
100
105
110
115
120
125
130
135
140
145
150
155


In [29]:
best_entry_2 = param_df_2[param_df_2['acc'] == param_df_2['acc'].max()]
    
best_k_2, best_weight_2, best_thr_2 = np.array(best_entry_2[['K', 'weight', 'thr']])[0].tolist()
best_thr_2 = round(best_thr_2, 4)

print((best_k_2, best_weight_2, best_thr_2))

(110, <function dist_power_8 at 0x0000019351946700>, 0.58)


In [30]:
next_k_range_2 = int(np.ceil(next_k_step + next_k_step/5))
next_k_step_2 = 1
next_thr_range_2 = round(next_thr_step + next_thr_step/5, 3)
next_thr_step_2 = 0.01

new_Ks_2 = np.arange(best_k_2-next_k_range_2, best_k_2+next_k_step_2+next_k_range_2, next_k_step_2)
new_thresholds_2 = np.arange(best_thr_2-next_thr_range_2, best_thr_2+next_thr_range_2+next_thr_step_2, next_thr_step_2)
new_weights_2 = [dist_power_5, dist_power_6, dist_power_7, dist_power_8, dist_power_9]

print(best_k_2, next_k_range_2, next_k_step_2, best_thr_2, next_thr_range_2, next_thr_step_2)

param_df_3 = pd.DataFrame(columns = ['K', 'weight', 'thr', 'acc'])
counter = 0

# Loop through each parameter combination
for K in new_Ks_2:
    print(K)
    for weight in new_weights_2:
        model = KNeighborsClassifier(n_neighbors=K, weights=weight)
        y_pred_probas = cross_val_predict(model, X_train_final, y_train, cv=cv_settings, method='predict_proba')[:, 1]
        
        for thr in new_thresholds_2:
            param_df_3.loc[counter, 'K'] = K
            param_df_3.loc[counter, 'thr'] = thr
            param_df_3.loc[counter, 'weight'] = weight
            param_df_3.loc[counter, 'acc'] = accuracy_score(y_train, (y_pred_probas > thr))
        
            counter += 1
            

110 6 1 0.58 0.06 0.01
104
105
106
107
108
109
110
111
112
113
114
115
116


In [31]:
best_entry_3 = param_df_3[param_df_3['acc'] == param_df_3['acc'].max()]
    
best_k_3, best_weight_3, best_thr_3 = np.array(best_entry_3[['K', 'weight', 'thr']])[0].tolist()
best_thr_3 = round(best_thr_3, 4)

**Paste the optimal hyperparameter values below.**

In [None]:
print((best_k_3, best_weight_3, best_thr_3))

n_neighbors=25, weights=dist_power_7

## 3) Model

### Create Optimal Model

In [33]:
model = KNeighborsClassifier(n_neighbors=best_k_3, weights=best_weight_3).fit(X_train_final, y_train)

### Predict

In [34]:
y_preds_test = pd.DataFrame((model.predict_proba(X_test_final)[:, 1] > best_thr_3)).rename({0:'predicted'}, axis=1)

In [35]:
predicted_values = pd.concat([test_final[['id', 'host_id']], y_preds_test], axis=1)
overlapping_hosts = train_final[train_final['host_id'].isin(test_final['host_id'])].drop_duplicates('host_id')[['host_id', 'host_is_superhost']]

def overwrite(row):
    if row['host_id'] in overlapping_hosts['host_id'].values:
        row['predicted'] = overlapping_hosts[overlapping_hosts['host_id'] == row['host_id']]['host_is_superhost'].values[0]
    return row
        
predicted_values = predicted_values.apply(overwrite, axis=1)
predicted_values = predicted_values[['id', 'predicted']].set_index('id')
predicted_values

Unnamed: 0_level_0,predicted
id,Unnamed: 1_level_1
1543972437713169913,False
1710552057351883447,False
97075525,True
83734823,True
56722823,False
...,...
44798957,True
1929899281829298917,True
36015595,False
1472538040789213113,False


## 4) Put any ad-hoc steps for further improving model accuracy

## 5) Export the predictions in the format required to submit on Kaggle

In [36]:
predicted_values.to_csv('pred_csvs/KNN_class_model.csv')