## Instructions {-}

- This is the template for the code and report on the Prediction Problem assignments.

- Your code in steps 1, 3, 4, and 5 will be executed sequentially, and must produce the RMSE / accuracy claimed on Kaggle.

- Your code in step 2 will also be executed, and must produce the optimal hyperparameter values used to train the model.

## Read data

In [1]:
import ast
import warnings
from datetime import datetime
from functools import partial

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, mean_squared_error
from scipy.stats import uniform
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from IPython import display

In [2]:
# Load the raw train and test CSVs for the classification task.
# NOTE(review): the original cell carried an open question about whether the
# regression data should be used instead — confirm these are the intended files.
raw_train, raw_test = map(
    pd.read_csv,
    ('../Datasets/train_classification.csv', '../Datasets/test_classification.csv'),
)

## 1) Data pre-processing

Put the data pre-processing code. You don't need to explain it. You may use the same code from last quarter.

In [3]:
# Work on copies so the raw frames stay untouched
train, test = raw_train.copy(), raw_test.copy()

# Source percentage column -> name of the converted float column
percent_columns = {'host_acceptance_rate': 'acceptance_rate', 'host_response_rate': 'response_rate'}

for frame in (train, test):
    for source_col, fraction_col in percent_columns.items():
        # Strip the '%' sign, cast to float and divide by 100
        frame[fraction_col] = frame[source_col].str.replace('%', '').astype(float) / 100
    # The original string columns are dropped once converted
    frame.drop(columns=list(percent_columns), inplace=True)



# Extract the leading bath count from 'bathrooms_text' and convert it to float.
# The pattern is a raw string (a plain '\d' is an invalid escape sequence) and
# accepts an optional decimal part so that '1.5 baths' is read as 1.5, not 1.
bath_count_pattern = r'(\d+(?:\.\d+)?)'

for frame in (train, test):
    frame['bathrooms_num'] = (
        frame['bathrooms_text'].str.extract(bath_count_pattern, expand=False).astype(float)
    )

    # Entries without any digits that mention 'Half-bath' are counted as 0.5
    mentions_half_bath = frame['bathrooms_text'].str.contains('Half-bath', case=False, na=False)
    frame.loc[mentions_half_bath & frame['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5



In [4]:
# Convert date columns to datetime format
def strip_date(row):
    """Parse a 'YYYY-MM-DD' string into a `datetime.date`.

    Any value that is not a string (for example a missing value) is returned unchanged.
    """
    if not isinstance(row, str):
        return row
    return datetime.strptime(row, '%Y-%m-%d').date()

# Apply the date conversion to the three date columns of the train and test datasets
date_columns = ['host_since', 'first_review', 'last_review']

for frame in (train, test):
    for date_col in date_columns:
        frame[date_col] = frame[date_col].apply(strip_date)

# ----- #

# Take the reference date once so that every column of both datasets is measured
# against the same day, even if the cell happens to run across midnight.
# NOTE(review): the features below still depend on the day the notebook is run;
# pin `reference_date` to a fixed date if the results must be reproducible.
reference_date = pd.Timestamp(datetime.now().date())

# Elapsed time since each date, expressed in 30-day "months" rounded to 2 decimals.
# pd.to_datetime gives a datetime64 column, so the subtraction produces a proper
# timedelta Series (with NaT for missing dates) that supports the `.dt` accessor.
for frame in (train, test):
    for date_col in ['host_since', 'first_review', 'last_review']:
        days_elapsed = (reference_date - pd.to_datetime(frame[date_col])).dt.days
        frame[f'{date_col}_in_months'] = round(days_elapsed / 30, 2)


# Snapshots that the later cleaning cells operate on
train_clean = train.copy()
test_clean = test.copy()


In [5]:
# Numeric code for each response-time category
# (presumably an approximate number of hours — TODO confirm)
response_time_dict = {
    'within an hour': 1,
    'within a few hours': 12,
    'within a day': 24,
    'a few days or more': 72,
}

def replace_response_time(row):
    """Return the numeric code for a response-time label.

    Missing values and labels not present in `response_time_dict` give None.
    """
    return response_time_dict.get(row) if pd.notna(row) else None

# NOTE(review): `train_clean` / `test_clean` are copied from `train` / `test` before
# this cell, so 'response_time' is only added to `train` / `test` — confirm whether
# it was meant to reach the frames used for modelling.
for frame in (train, test):
    frame['response_time'] = frame['host_response_time'].apply(replace_response_time)

Clean and transform

In [6]:
def clean_vars(row):
    """Flag shared bathrooms and fold hotel rooms into private rooms for one listing.

    Parameters
    ----------
    row : pandas.Series
        A listing row containing 'bathrooms_text' and 'room_type'.

    Returns
    -------
    pandas.Series
        The same row with 'bathrooms_shared' set to "t" or "f", and with
        'room_type' changed to 'Private room' when it was 'Hotel room'.
    """
    bath_text = row['bathrooms_text']
    room_type = row['room_type']

    if pd.isna(bath_text):
        # No bathroom description: fall back on the room type. str() keeps a
        # missing room_type from raising on the `in` test.
        is_shared = 'Shared' in str(room_type)
    else:
        # Compare case-insensitively so that 'Shared half-bath' is recognised
        # as well as '1 shared bath'.
        is_shared = 'shared' in str(bath_text).lower()

    row['bathrooms_shared'] = "t" if is_shared else "f"

    # Convert 'Hotel room' room type to 'Private room'
    if room_type == 'Hotel room':
        row['room_type'] = 'Private room'

    return row

# Run the row-level cleaning on the train and test datasets
train_clean, test_clean = [frame.apply(clean_vars, axis=1) for frame in (train_clean, test_clean)]


# Review-rate variables: reviews per host listing, reviews per month of hosting,
# and reviews per listing per month of hosting
for frame in (train_clean, test_clean):
    frame['reviews_per_listing'] = frame['number_of_reviews'] / frame['calculated_host_listings_count']
    frame['reviews_per_month'] = frame['number_of_reviews'] / frame['host_since_in_months']
    frame['reviews_per_listing_per_month'] = frame['reviews_per_listing'] / frame['host_since_in_months']


Clean neighbourhoods

In [7]:
# Neighbourhoods with fewer than 150 occurrences in train are grouped into 'Other'
neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()

# Neighbourhoods that appear in test but have no count in train
test_only_hoods = []
for hood in test_clean['neighbourhood_cleansed'].unique():
    if hood != 'Other' and hood not in neighbourhood_counts:
        test_only_hoods.append(hood)


# Neighbourhoods present in train but below the 150-occurrence threshold
other_hoods = [hood for hood in neighbourhood_counts.index if neighbourhood_counts[hood] < 150]
    
def clean_hoods(row):
    """Add 'neighbourhood_grouped' to a listing row.

    The value is 'Other' when the row's 'neighbourhood_cleansed' is listed in
    `other_hoods` or `test_only_hoods`; otherwise it is the neighbourhood itself.
    """
    hood = row.loc['neighbourhood_cleansed']
    goes_to_other = hood in other_hoods or hood in test_only_hoods
    row['neighbourhood_grouped'] = 'Other' if goes_to_other else hood
    return row


train_clean, test_clean = [frame.apply(clean_hoods, axis=1) for frame in (train_clean, test_clean)]

In [8]:
# Clean filler words out of property types.
# The words are removed in list order; the space-prefixed repeats at the end are
# kept exactly as they were so the cleaned labels do not change.
words_to_remove = ['place', 'room', 'private', 'shared', 'entire', ' in', ' room', ' private', ' shared', ' entire', ' in',]

def remove_words(text):
    """Lower-case a property-type label and strip every entry of `words_to_remove`.

    Parameters
    ----------
    text : str
        Raw property-type label. Any non-string value (e.g. a missing value)
        is treated as an empty label instead of raising on `.lower()`.

    Returns
    -------
    str
        The cleaned label with surrounding whitespace removed; '' for
        non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    for word in words_to_remove:
        text = text.replace(word.lower(), '')
    return text.strip()

for frame in (train_clean, test_clean):
    frame['property_type'] = frame['property_type'].apply(remove_words)


# Property types that occur 10 times or fewer in train are grouped into 'Other'
property_counts = train_clean['property_type'].value_counts()
keep = list(property_counts[property_counts > 10].index)

def clean_property(row):
    """Return the property type unchanged if it is in `keep` and non-empty, else 'Other'."""
    return 'Other' if (row == "" or row not in keep) else row

for frame in (train_clean, test_clean):
    frame['property_type_cleansed'] = frame['property_type'].apply(clean_property)

# Separate copies for the host-verification features derived next
train_filter_2, test_filter_2 = train_clean.copy(), test_clean.copy()

In [9]:
def _parse_verifications(value):
    """Return a 'host_verifications' cell as a list.

    Strings are parsed with `ast.literal_eval` (presumably the column holds the
    text form of a Python list, e.g. "['email', 'phone']" — TODO confirm).
    Values that are already lists are returned as-is, so the cell can be re-run.
    Missing values and strings that do not parse to a list/tuple/set give [].
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Not a valid Python literal: treat as "no verifications"
            return []
        return list(parsed) if isinstance(parsed, (list, tuple, set)) else []
    return []


# Previously a bare `except: pass` hid the NameError from the missing `ast` import,
# leaving the column as raw strings for the length and membership checks that follow.
train_filter_2['host_verifications'] = train_filter_2['host_verifications'].apply(_parse_verifications)
test_filter_2['host_verifications'] = test_filter_2['host_verifications'].apply(_parse_verifications)

In [10]:
# Length of each 'host_verifications' value (the number of entries when it holds a list)
for frame in (train_filter_2, test_filter_2):
    frame['num_verifications'] = frame['host_verifications'].apply(len)

In [11]:
def split_vers(df):
    """Add 't'/'f' flag columns for phone, email and work-email verification.

    Each flag is 't' when the method name is found (via `in`) in the row's
    'host_verifications' value, otherwise 'f'. `df` is modified in place and
    also returned.
    """
    flag_to_method = {'ver_phone': 'phone', 'ver_email': 'email', 'ver_work_email': 'work_email'}

    for flag_col, method in flag_to_method.items():
        df[flag_col] = [
            't' if method in verifications else 'f'
            for verifications in df['host_verifications']
        ]

    return df


# Add the verification flags, then drop the raw 'host_verifications' column
train_filter_2, test_filter_2 = [
    split_vers(frame).drop('host_verifications', axis=1)
    for frame in (train_filter_2, test_filter_2)
]

Fill missing values

In [12]:
# Fill in remaining missing values with the median for numerical columns.
# The medians are computed once from train and applied to BOTH datasets:
# previously train was filled twice and test was never filled, leaving NaNs in
# the test features. Using train medians for test avoids using test statistics.
train_numeric_medians = train_clean.median(numeric_only=True)
train_clean.fillna(train_numeric_medians, inplace=True)
test_clean.fillna(train_numeric_medians, inplace=True)

# Create final DataFrames
# NOTE(review): these are built from train_clean / test_clean, so the columns added
# to train_filter_2 / test_filter_2 (num_verifications, ver_*) are not carried
# forward — confirm whether that is intended.
train_final = train_clean.copy()
test_final = test_clean.copy()

In [13]:
# The review scores are highly correlated, so they are collapsed into a single row-wise average
review_score_columns = ['review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']

train_final['review_scores_avg'] = train_final[review_score_columns].mean(axis=1)
test_final['review_scores_avg'] = test_final[review_score_columns].mean(axis=1)

# Averages that are still missing are filled with the train median; the value used
# for test is taken after train's own gaps have been filled
train_final['review_scores_avg'] = train_final['review_scores_avg'].fillna(value=train_final['review_scores_avg'].median())
filled_train_median = train_final['review_scores_avg'].median()
test_final['review_scores_avg'] = test_final['review_scores_avg'].fillna(value=filled_train_median)

In [14]:
# Encode the target: "f" -> 0, "t" -> 1
superhost_codes = {"f":0, "t":1}
train_final['host_is_superhost'] = train_final['host_is_superhost'].replace(superhost_codes)

## 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

### Which tuning method did you use (grid search / Bayes search / etc.)?

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

### How many hours did you spend on hyperparameter tuning?

**Paste the hyperparameter tuning code below. You must show at least one hyperparameter tuning procedure.**

In [15]:
#Hyperparameter tuning code

**Paste the optimal hyperparameter values below.**

## 3) Model

Using the optimal model hyperparameters, train the model, and paste the code below.

In [16]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, GridSearchCV, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [24]:
# Features and target
X_train = train_final.drop(columns=['host_is_superhost', 'id'])
X_test = test_final.drop(columns=['id'])
y_train = train_final.host_is_superhost

# Numeric predictors: every numeric column of train except the target and the id
numeric_columns = train_final.select_dtypes(include=['number']).drop(columns=['host_is_superhost', 'id']).columns
X_train_num = X_train[numeric_columns]

# Standardise the numeric predictors; the scaler is fitted on train only
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train_num)
X_test_scaled = sc.transform(X_test[numeric_columns])

X_train_num_scaled = pd.DataFrame(X_train_scaled, columns=numeric_columns)
X_test_num_scaled = pd.DataFrame(X_test_scaled, columns=numeric_columns)


# Non-numeric predictors are one-hot encoded
train_testing = train_final.drop(columns=['host_is_superhost']) # 'host_location', 'host_neighbourhood',
test_testing = test_final  #.drop(columns=['host_location', 'host_neighbourhood'])

train_testing_cat = train_testing.select_dtypes(exclude=['number'])
test_testing_cat = test_testing.select_dtypes(exclude=['number'])


# The encoder is fitted on train only; categories unseen in train are ignored on transform
enc = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
drop_enc = enc.fit_transform(train_testing_cat)
drop_enc_test = enc.transform(test_testing_cat)

train_encoded_df = pd.DataFrame(drop_enc.toarray(), columns=enc.get_feature_names_out(train_testing_cat.columns))
test_encoded_df = pd.DataFrame(drop_enc_test.toarray(), columns=enc.get_feature_names_out(test_testing_cat.columns))

# Final matrices: scaled numeric columns followed by the one-hot columns
X_train_final = pd.concat([X_train_num_scaled, train_encoded_df], axis=1)
X_test_final = pd.concat([X_test_num_scaled, test_encoded_df], axis=1)



In [25]:
def _inverse_distance_power(distance, power):
    """Inverse-distance weight ``1 / (1e-10 + distance ** power)``.

    Parameters
    ----------
    distance : scalar or numpy array
        Distance(s) to weight; the result has the same shape.
    power : int
        Exponent applied to the distance.

    Returns
    -------
    scalar or numpy array
        The weight(s). The 1e-10 offset keeps the result finite when a
        distance is exactly 0.
    """
    return 1 / (1e-10 + distance ** power)


# One weight function per exponent, kept under the original names so the search
# grids can keep referring to dist_power_2 ... dist_power_20. Each is the shared
# helper above with `power` fixed, replacing nineteen copy-pasted definitions.
dist_power_2 = partial(_inverse_distance_power, power=2)
dist_power_3 = partial(_inverse_distance_power, power=3)
dist_power_4 = partial(_inverse_distance_power, power=4)
dist_power_5 = partial(_inverse_distance_power, power=5)
dist_power_6 = partial(_inverse_distance_power, power=6)
dist_power_7 = partial(_inverse_distance_power, power=7)
dist_power_8 = partial(_inverse_distance_power, power=8)
dist_power_9 = partial(_inverse_distance_power, power=9)
dist_power_10 = partial(_inverse_distance_power, power=10)
dist_power_11 = partial(_inverse_distance_power, power=11)
dist_power_12 = partial(_inverse_distance_power, power=12)
dist_power_13 = partial(_inverse_distance_power, power=13)
dist_power_14 = partial(_inverse_distance_power, power=14)
dist_power_15 = partial(_inverse_distance_power, power=15)
dist_power_16 = partial(_inverse_distance_power, power=16)
dist_power_17 = partial(_inverse_distance_power, power=17)
dist_power_18 = partial(_inverse_distance_power, power=18)
dist_power_19 = partial(_inverse_distance_power, power=19)
dist_power_20 = partial(_inverse_distance_power, power=20)


### Model Creation

In [None]:
# Coarse search: 3-fold stratified CV over k = 10, 20, ..., 100 and the
# inverse-distance exponents 2..16
cv_settings_naive = StratifiedKFold(n_splits=3, shuffle=True, random_state=12)

step_1 = 10
range_1 = 100

model = KNeighborsClassifier()
grid = {
    'n_neighbors': np.arange(step_1, step_1+range_1, step_1),
    'weights': [dist_power_2, dist_power_3, dist_power_4, dist_power_5, dist_power_6,
                dist_power_7, dist_power_8, dist_power_9, dist_power_10, dist_power_11,
                dist_power_12, dist_power_13, dist_power_14, dist_power_15, dist_power_16],
}

# Only 50 of the candidate combinations are sampled; random_state fixes which ones,
# so re-running the notebook yields the same best_params_.
rscv = RandomizedSearchCV(model, grid, n_iter=50, cv=cv_settings_naive, verbose=2, random_state=12)
rscv.fit(X_train_final, y_train)


In [27]:
# Best parameter combination found by the randomized search, then its CV score
print(rscv.best_params_, rscv.best_score_, sep='\n')

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

### Model Tuning

In [None]:
# Refine around the k chosen by the randomized search
prev_k = rscv.best_params_['n_neighbors']
gscv_step = 5
search_radii = int(step_1 + np.ceil(step_1/4))

cv_settings = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)

# Candidate k values in steps of `gscv_step` around prev_k. When prev_k is not larger
# than the search radius the window starts at or below zero, which is not a valid
# n_neighbors, so non-positive candidates are dropped.
k_candidates = np.arange(prev_k-search_radii, prev_k+search_radii+gscv_step, gscv_step)
k_candidates = k_candidates[k_candidates >= 1]

model = KNeighborsClassifier()
# NOTE(review): the exponents 11-13 are hard-coded rather than taken from
# rscv.best_params_['weights'] — confirm they match the randomized-search result.
grid = {'n_neighbors': k_candidates, 'weights': [dist_power_11, dist_power_12, dist_power_13]}

gscv = GridSearchCV(model, grid, cv=cv_settings, verbose=2)
gscv.fit(X_train_final, y_train)


In [None]:
# Best parameter combination found by the grid search, then its CV score
print(gscv.best_params_, gscv.best_score_, sep='\n')

### RepeatedKFold Tuning

In [None]:
# prev_k = gscv.best_params_['n_neighbors']
# rep_gscv_step = 1
# search_radii = int(gscv_step + np.ceil(gscv_step/4))

# print(prev_k, prev_k-search_radii, prev_k+search_radii+rep_gscv_step, rep_gscv_step)

# cv_settings = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=12) 

# model = KNeighborsClassifier()
# grid = {'n_neighbors':np.arange(prev_k-search_radii, prev_k+search_radii+rep_gscv_step, rep_gscv_step), 'weights':[dist_power_10, dist_power_11, dist_power_12]}

# rep_gscv = GridSearchCV(model, grid, cv=cv_settings, verbose=2)
# rep_gscv.fit(X_train_final, y_train)


In [None]:
# print(rep_gscv.best_params_)
# print(rep_gscv.best_score_)

### Predict

In [None]:
# Predict with the grid-search model. `rep_gscv` only exists in the commented-out
# RepeatedKFold cell, so referring to it here raised a NameError; `gscv` is the
# last search that is actually fitted.
# NOTE: these are predictions on the training rows themselves. With inverse-distance
# weights each row is its own nearest neighbour at distance 0, so the scores below
# are optimistic; gscv.best_score_ is the cross-validated estimate.
y_preds = pd.DataFrame(gscv.predict(X_train_final))

print(y_preds.value_counts())

acc_score = accuracy_score(y_train, y_preds)*100
precision = precision_score(y_train, y_preds)*100
recall = recall_score(y_train, y_preds)*100
conf_matrix = confusion_matrix(y_train, y_preds)

# Binary confusion matrix unpacked row by row: true negatives, false positives,
# false negatives, true positives
tn, fp, fn, tp = conf_matrix.ravel()
FPR = fp/(fp + tn)
FNR = fn/(fn + tp)


print(f"\nAccuracy: {round(acc_score, 4)}")
print(f"Precision: {round(precision, 4)}\tRecall: {round(recall, 4)}")
print(f"FPR: {round(FPR*100, 4)}\t\tFNR: {round(FNR*100, 4)}\n")

print(f"Confusion Matrix:\n{conf_matrix}\n")


In [None]:
# Test-set predictions from the fitted grid search. `rep_gscv` is only defined in
# commented-out code, so `gscv` is used instead to avoid a NameError.
y_preds_test = pd.DataFrame(gscv.predict(X_test_final)).rename({0:'predicted'}, axis=1)

In [None]:
predicted_values = pd.concat([test_final[['id', 'host_id']], y_preds_test], axis=1)

# Hosts that appear in both train and test, with the label of their first train row
overlapping_hosts = train_final[train_final['host_id'].isin(test_final['host_id'])].drop_duplicates('host_id')[['host_id', 'host_is_superhost']]

# For those hosts the known train label replaces the model prediction. A single
# vectorised lookup is used instead of a row-wise apply that scanned
# `overlapping_hosts` for every test row. Hosts without a known label map to NaN
# and keep their prediction.
known_labels = overlapping_hosts.set_index('host_id')['host_is_superhost']
predicted_values['predicted'] = (
    predicted_values['host_id']
    .map(known_labels)
    .fillna(predicted_values['predicted'])
    .astype(int)  # the NaN placeholders turn the column into floats; restore integer labels
)

predicted_values = predicted_values[['id', 'predicted']].set_index('id')
predicted_values


## 4) Put any ad-hoc steps for further improving model accuracy
For example, scaling up or scaling down the predictions, capping predictions, etc.

Put code below.

## 5) Export the predictions in the format required to submit on Kaggle
Put code below.

In [None]:
# predicted_values.to_csv('pred_csvs/KNN_class_model_5.csv')