In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.metrics import accuracy_score, recall_score, precision_score

from datetime import date, datetime

In [146]:
raw_train = pd.read_csv('datasets/train_classification.csv')
raw_test = pd.read_csv('datasets/test_classification.csv')

In [147]:
train = raw_train.copy()
test = raw_test.copy()


train['acceptance_rate'] = train['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
train['response_rate'] = train['host_response_rate'].str.replace('%', '').astype(float) / 100

test['acceptance_rate'] = test['host_acceptance_rate'].str.replace('%', '').astype(float) / 100
test['response_rate'] = test['host_response_rate'].str.replace('%', '').astype(float) / 100


In [148]:
response_time_dict = {'within an hour': 1, 'within a few hours': 12, 'within a day': 24, 'a few days or more': 72}

def replace_response_time(row):
    if pd.notna(row):
        return response_time_dict.get(row)
    else:
        return None

train['response_time'] = train['host_response_time'].apply(replace_response_time)
test['response_time'] = test['host_response_time'].apply(replace_response_time)


train['response_time'].value_counts()

1.0     3846
12.0     433
24.0     230
72.0      51
Name: response_time, dtype: int64

In [149]:
train.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)
test.drop(columns=['host_acceptance_rate', 'host_response_rate'], inplace=True)

In [150]:
# Convert date columns
def strip_date(row):
    if isinstance(row, str):
        row = datetime.strptime(row, '%Y-%m-%d').date()
    return row

# Apply date conversion to train dataset
train['host_since'] = train['host_since'].apply(strip_date)
train['first_review'] = train['first_review'].apply(strip_date)
train['last_review'] = train['last_review'].apply(strip_date)

# Apply date conversion to test dataset
test['host_since'] = test['host_since'].apply(strip_date)
test['first_review'] = test['first_review'].apply(strip_date)
test['last_review'] = test['last_review'].apply(strip_date)


# Create columns for individual aspects of date 
train['host_since_year'] = train['host_since'].apply(lambda x: x.year if pd.notnull(x) else None)
train['host_since_month'] = train['host_since'].apply(lambda x: x.month if pd.notnull(x) else None)
train['host_since_day'] = train['host_since'].apply(lambda x: x.day if pd.notnull(x) else None)

train['first_review_year'] = train['first_review'].apply(lambda x: x.year if pd.notnull(x) else None)
train['first_review_month'] = train['first_review'].apply(lambda x: x.month if pd.notnull(x) else None)
train['first_review_day'] = train['first_review'].apply(lambda x: x.day if pd.notnull(x) else None)

train['last_review_year'] = train['last_review'].apply(lambda x: x.year if pd.notnull(x) else None)
train['last_review_month'] = train['last_review'].apply(lambda x: x.month if pd.notnull(x) else None)
train['last_review_day'] = train['last_review'].apply(lambda x: x.day if pd.notnull(x) else None)


test['host_since_year'] = test['host_since'].apply(lambda x: x.year if pd.notnull(x) else None)
test['host_since_month'] = test['host_since'].apply(lambda x: x.month if pd.notnull(x) else None)
test['host_since_day'] = test['host_since'].apply(lambda x: x.day if pd.notnull(x) else None)

test['first_review_year'] = test['first_review'].apply(lambda x: x.year if pd.notnull(x) else None)
test['first_review_month'] = test['first_review'].apply(lambda x: x.month if pd.notnull(x) else None)
test['first_review_day'] = test['first_review'].apply(lambda x: x.day if pd.notnull(x) else None)

test['last_review_year'] = test['last_review'].apply(lambda x: x.year if pd.notnull(x) else None)
test['last_review_month'] = test['last_review'].apply(lambda x: x.month if pd.notnull(x) else None)
test['last_review_day'] = test['last_review'].apply(lambda x: x.day if pd.notnull(x) else None)


# Calculate months since various dates for train dataset
train['host_since_in_months'] = round(((datetime.now().date() - train['host_since']).dt.days) / 30, 2)
train['first_review_in_months'] = round(((datetime.now().date() - train['first_review']).dt.days) / 30, 2)
train['last_review_in_months'] = round(((datetime.now().date() - train['last_review']).dt.days) / 30, 2)

# Calculate months since various dates for test dataset
test['host_since_in_months'] = round(((datetime.now().date() - test['host_since']).dt.days) / 30,  2)
test['first_review_in_months'] = round(((datetime.now().date() - test['first_review']).dt.days) / 30, 2)
test['last_review_in_months'] = round(((datetime.now().date() - test['last_review']).dt.days) / 30, 2)


In [151]:
# t_f_vars = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable']
# t_f_vars_test = ['host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable']


# train[t_f_vars] = train[t_f_vars].replace({'f': 0, 't': 1})
# test[t_f_vars_test] = test[t_f_vars_test].replace({'f': 0, 't': 1})


In [152]:
train_clean = train.copy() 
test_clean = test.copy() 

### Clean/Transform Variables

In [153]:

neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()  
    
def clean_rooms(row):    
    if row.loc['room_type'] == 'Hotel room':
        row['room_type'] = 'Private room'
        
    if row.loc['room_type'] == 'Shared room':
        row['room_type'] = 'Entire home/apt'
        
    return row


 
test_only_hoods = [i for i in test_clean['neighbourhood_cleansed'].unique() 
                   if i not in neighbourhood_counts 
                   and i != 'Other']
    
    

In [154]:
train_clean['host_verifications_list'] = train_clean['host_verifications'].apply(lambda x: x.strip("[]").replace("'", "").split(', '))
test_clean['host_verifications_list'] = test_clean['host_verifications'].apply(lambda x: x.strip("[]").replace("'", "").split(', '))

train_clean['num_verifications']  = train_clean['host_verifications_list'].apply(len)
test_clean['num_verifications']  = test_clean['host_verifications_list'].apply(len)


In [155]:
train_clean['phone_verification']  = train_clean['host_verifications_list'].apply(lambda x: 1 if 'phone' in x else 0)
train_clean['email_verification']  = train_clean['host_verifications_list'].apply(lambda x: 1 if 'email' in x else 0)
train_clean['work_email_verification'] = train_clean['host_verifications_list'].apply(lambda x: 1 if 'work_email' in x else 0)
train_clean['any_email_verification'] = train_clean['host_verifications_list'].apply(lambda x: 1 if 'work_email' in x or 'email' in x else 0)
# print(train_clean['num_verifications'].value_counts(), '\n')

test_clean['phone_verification']  = test_clean['host_verifications_list'].apply(lambda x: 1 if 'phone' in x else 0)
test_clean['email_verification']  = test_clean['host_verifications_list'].apply(lambda x: 1 if 'email' in x else 0)
test_clean['work_email_verification'] = test_clean['host_verifications_list'].apply(lambda x: 1 if 'work_email' in x else 0)
test_clean['any_email_verification'] = test_clean['host_verifications_list'].apply(lambda x: 1 if 'work_email' in x or 'email' in x else 0)
# print(train_clean['num_verifications'].value_counts(), '\n')


In [156]:
bins = [0, 14, 30, 60, 90, 180, 365, float('inf')]
labels = ['0-14', '14-30', '30-60', '60-90', '90-180', '180-365', '365<']

train_clean['max_nights_cats'], max_bins = pd.cut(train_clean.maximum_nights, bins=bins, labels=labels, retbins=True, right=False, include_lowest=True)
test_clean['max_nights_cats'] = pd.cut(test_clean.maximum_nights, bins=max_bins, labels=labels, right=False, include_lowest=True)


In [157]:
# for host listing count outliers (>80) set to median
def clean_host_listing(row):
    if row > 80:
        row = None
    return row


train_clean['calculated_host_listings_count'] = train_clean['calculated_host_listings_count'].apply(clean_host_listing)
test_clean['calculated_host_listings_count'] = test_clean['calculated_host_listings_count'].apply(clean_host_listing)

train_clean['calculated_host_listings_count'].fillna(train_clean['calculated_host_listings_count'].median())
test_clean['calculated_host_listings_count'].fillna(test_clean['calculated_host_listings_count'].median())


0       44.0
1        3.0
2       51.0
3       14.0
4       19.0
        ... 
3319    22.0
3320    12.0
3321    12.0
3322     8.0
3323     2.0
Name: calculated_host_listings_count, Length: 3324, dtype: float64

In [158]:
train_clean['reviews_per_listing'] = train_clean['number_of_reviews']/train_clean['calculated_host_listings_count']
train_clean['reviews_per_month'] = train_clean['number_of_reviews']/train_clean['host_since_in_months']
train_clean['reviews_per_listing_per_month'] = train_clean['reviews_per_listing']/train_clean['host_since_in_months']

test_clean['reviews_per_listing'] = test_clean['number_of_reviews']/test_clean['calculated_host_listings_count']
test_clean['reviews_per_month'] = test_clean['number_of_reviews']/test_clean['host_since_in_months']
test_clean['reviews_per_listing_per_month'] = test_clean['reviews_per_listing']/test_clean['host_since_in_months']


#### Sophisticated Cleaning of Neighbourhoods

In [159]:
other_hoods = []
for i in neighbourhood_counts.index:
    if neighbourhood_counts[i] < 150:
        other_hoods.append(i)         
    
def clean_hoods(row):
    if row.loc['neighbourhood_cleansed'] in other_hoods or row.loc['neighbourhood_cleansed'] in test_only_hoods:
        row['neighbourhood_grouped'] = 'Other'
        
    else:    
        row['neighbourhood_grouped'] = row.loc['neighbourhood_cleansed']
        
    return row
       

In [160]:
train_clean = train_clean.apply(clean_hoods, axis=1)  

In [161]:
hood_df = pd.DataFrame()
hood_df.index = train_clean['neighbourhood_grouped'].unique()

In [162]:
avail_hoods = train_clean['neighbourhood_grouped'].unique()

def test_consistent_hoods(row):
    if row not in avail_hoods:
        row = 'Other'       
    return row

test_clean['neighbourhood_grouped'] = test_clean['neighbourhood_cleansed'].apply(test_consistent_hoods)
# test_clean['neighbourhood_grouped'].value_counts()


In [163]:
train_filter_2 = train_clean.copy() #.drop(columns=grab_vars)
test_filter_2 = test_clean.copy() #.drop(columns=grab_vars)


In [164]:
temp_data_train = train_filter_2.dropna(how='any')

for col in temp_data_train.select_dtypes(include='number').columns:
    if train_filter_2.isna().sum()[col] != 0:
        train_filter_2[col].fillna(value=train_filter_2[col].median(), inplace=True) 

train_final = train_filter_2.copy()

value_counts = train_final.isna().sum()
value_counts[value_counts != 0]

review_vars = [name for name in train_final.columns if 'review' in name]
review_corrs = train_final[review_vars].corr()
# review_corrs[(review_corrs != 1.0) & (review_corrs > 0.7)]


  review_corrs = train_final[review_vars].corr()


In [165]:
temp_data_test = test_filter_2.dropna(how='any')

for col in temp_data_test.select_dtypes(include='number').columns:
    if test_filter_2.isna().sum()[col] != 0:
        test_filter_2[col].fillna(value=test_filter_2[col].median(), inplace=True)  

test_final = test_filter_2.copy()


In [166]:
train_final['accommodates_bins'], bins = pd.cut(train_final['accommodates'], retbins=True, bins=6)
test_final['accommodates_bins'] = pd.cut(test['accommodates'], bins=bins)


In [167]:
train_final['latitude_bins'], lat_bins = pd.cut(train_final['latitude'], retbins=True, bins=12)
train_final['longitude_bins'], long_bins = pd.cut(train_final['longitude'], retbins=True, bins=12)

test_final['latitude_bins'] = pd.cut(test['accommodates'], bins=lat_bins)
test_final['longitude_bins'] = pd.cut(test['accommodates'], bins=long_bins)


In [168]:
train_final['accommodates_root'] = np.sqrt(train_final['accommodates'])
test_final['accommodates_root'] = np.sqrt(test_final['accommodates'])

## **Test Space**

In [169]:
# non_us = ['Italy', 'Tulum, Mexico', 'Toronto, Canada', 'United Kingdom', 'Cartagena, Colombia']
# non_us.append('United States')

# host_locals_count = train_final['host_location'].value_counts() #.drop(non_us)
# major_host_locals = host_locals_count[host_locals_count < 3]
# major_host_locals = list(major_host_locals.index)

# temp_host_data = train_final[~train_final['host_location'].isin(non_us)]
# # temp_host_data = temp_host_data[temp_host_data['host_location'] != 'Chicago, IL']

# print(temp_host_data['latitude'].max(), temp_host_data['latitude'].min(), temp_host_data['longitude'].max(), temp_host_data['longitude'].min())

# max_diff = max([temp_host_data['latitude'].max()-temp_host_data['latitude'].min(), temp_host_data['longitude'].min()-temp_host_data['longitude'].max()])
# print(max_diff)
# max_diff_half = max_diff/2






In [170]:
# sns.barplot(data=train_final, x='neighbourhood_grouped', y='host_is_superhost')

### Function to evaluate model

In [171]:
def evaluate_model(train_final, formula, last_rmse, last_mae, best_rmse, best_mae, best_diff, best_rmse_formula, best_mae_formula):
    try:
        model = smf.ols(formula=formula, data=train_final).fit()
        trying_pred = model.predict(train_final)
        trying_pred = np.exp(trying_pred)
        rmse = mean_squared_error(train_final['price'], trying_pred, squared=False)
        mae = mean_absolute_error(train_final['price'], trying_pred)
        mae_rmse_diff = rmse - mae 

        difference_rmse = (rmse - last_rmse)
        difference_mae = (mae - last_mae)   

        if difference_rmse > 0:
            print(f'''RMSE: {round(rmse, 3)}\nMAE: {round(mae, 3)}\nDiff: {mae_rmse_diff}\n''')
            print(f"RMSE Increase by {difference_rmse}")

        elif difference_rmse < 0:
            print(f'''RMSE: {round(rmse, 3)}\nMAE: {round(mae, 3)}\nDiff: {mae_rmse_diff}\n''')
            print(f"RMSE Decreased: {difference_rmse}")
        else: 
            print(f'''\t- No change -\nRMSE: {round(rmse, 3)}\nMAE: {round(mae, 3)}\nDiff: {mae_rmse_diff}\n''')

        if rmse <= best_rmse:
            best_rmse = rmse
            best_rmse_formula = formula
            print(f"Best RMSE!")

        if mae <= best_mae:
            best_mae = mae
            best_mae_formula = formula
            print(f"Best MAE!")    

        if mae_rmse_diff <= best_diff:
            best_diff = mae_rmse_diff
            print(f"Best Diff!\t {best_diff}")

        print(f"\nlast rmse {last_rmse}; last mae {last_mae}")    
        if best_rmse != rmse:
            print(f"Best RMSE {best_rmse}; Best MAE {best_mae}")    

        last_rmse = rmse
        last_mae = mae    

    except Exception as e:
        print("An error occurred:", e)
    

In [172]:
train_final['review_scores_avg'] = np.sum(train_final[['review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)
test_final['review_scores_avg'] = np.sum(test_final[['review_scores_rating', 'review_scores_value', 'review_scores_location', 'review_scores_cleanliness']], axis=1)

In [193]:
train_final['host_is_superhost'] = train_final['host_is_superhost'].replace({"f":0, "t":1})

In [270]:
formula= '''host_is_superhost ~
C(response_time) + 

I(host_since_in_months**2) + I(host_total_listings_count**2) +
I(number_of_reviews_ltm**2) +
host_total_listings_count*number_of_reviews_ltm + 
host_since_in_months*host_total_listings_count + 
host_since_in_months*reviews_per_listing + 
host_total_listings_count*reviews_per_month +

response_rate*number_of_reviews_ltm + acceptance_rate*number_of_reviews_ltm +

review_scores_avg*number_of_reviews_ltm + 
review_scores_avg + I(review_scores_cleanliness**3) + 

C(neighbourhood_grouped, Treatment('West Town'))'''

# host_since_months*number_of_reviews_ltm

model = smf.logit(formula=formula, data=train_final).fit()

model.summary()

Optimization terminated successfully.
         Current function value: 0.461256
         Iterations 18


  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


0,1,2,3
Dep. Variable:,host_is_superhost,No. Observations:,4977.0
Model:,Logit,Df Residuals:,4946.0
Method:,MLE,Df Model:,30.0
Date:,"Fri, 08 Mar 2024",Pseudo R-squ.:,0.3273
Time:,10:53:34,Log-Likelihood:,-2295.7
converged:,True,LL-Null:,-3412.4
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.4402,1.437,-3.090,0.002,-7.257,-1.623
C(response_time)[T.12.0],-0.0719,0.125,-0.576,0.565,-0.317,0.173
C(response_time)[T.24.0],-0.6064,0.200,-3.039,0.002,-0.997,-0.215
C(response_time)[T.72.0],-2.0913,1.082,-1.933,0.053,-4.212,0.029
"C(neighbourhood_grouped, Treatment('West Town'))[T.Lake View]",0.2990,0.179,1.674,0.094,-0.051,0.649
"C(neighbourhood_grouped, Treatment('West Town'))[T.Lincoln Park]",0.4459,0.223,1.998,0.046,0.009,0.883
"C(neighbourhood_grouped, Treatment('West Town'))[T.Logan Square]",-0.1232,0.197,-0.624,0.533,-0.510,0.264
"C(neighbourhood_grouped, Treatment('West Town'))[T.Loop]",0.3815,0.236,1.613,0.107,-0.082,0.845
"C(neighbourhood_grouped, Treatment('West Town'))[T.Near North Side]",-0.2272,0.166,-1.368,0.171,-0.553,0.098


In [271]:
def get_acc(thresholds, model, df):
    y_pred_prob = np.array(model.predict(df))
    y_true = df['host_is_superhost'].values
    
    y_pred = y_pred_prob > thresholds[:, np.newaxis]
    
    y_pred_reshaped = np.repeat(y_pred[:, np.newaxis, :], len(y_true), axis=1)
    
    # Manually calculate accuracy so can be vectorized
    accuracies = np.mean(y_pred_reshaped == y_true, axis=1)
    
    max_acc_index = np.argmax(accuracies)
    max_acc = accuracies[max_acc_index]
    max_acc_threshold = thresholds[max_acc_index]
    
    return max_acc, max_acc_threshold, max_acc_index



thresholds_a = np.linspace(0.4, 0.6, num=101)

max_acc_a, max_acc_threshold_a, max_acc_idx_a = get_acc(thresholds_a, model, train_final)


  return 1/(1+np.exp(-X))


In [272]:
print(f'Threshold: {max_acc_threshold_a}\tIndex: max_acc_idx_a')

print('Difference', round(np.abs(thresholds_a[max_acc_idx_a]-thresholds_a[max_acc_idx_a+1]), 5))
print(f'\nRange of Optimal Thresh [{thresholds_a[max_acc_idx_a-1]}, {thresholds_a[max_acc_idx_a+1]}]')

Threshold: 0.4	Index: max_acc_idx_a
Difference 0.002

Range of Optimal Thresh [0.6, 0.402]


In [273]:
thresholds_b = np.linspace(thresholds_a[max_acc_idx_a-1], thresholds_a[max_acc_idx_a+1], num=101)

max_acc_b, max_acc_threshold_b, max_acc_idx_b = get_acc(thresholds_b, model, train_final)


  return 1/(1+np.exp(-X))


In [274]:
print(f'Optimal Threshold: {max_acc_threshold_b}\tIndex: {max_acc_idx_a}')

Optimal Threshold: 0.6	Index: 0


In [275]:
optimal_threshold = max_acc_threshold_b

y_pred = model.predict(train_final) > optimal_threshold

print(y_pred.value_counts())

False    3421
True     1556
dtype: int64


  return 1/(1+np.exp(-X))


In [276]:
# best_score = 75.9493670886076

In [277]:
acc_score = accuracy_score(train_final.host_is_superhost, y_pred)*100
precision = precision_score(train_final.host_is_superhost, y_pred)*100
recall = recall_score(train_final.host_is_superhost, y_pred)*100


try:
    difference = (acc_score-last_acc_score)
        
    if difference > 0:
        print(acc_score)
        print(f"Improved by {difference}")
        print(f"(last acc_score {last_acc_score})")
            
    elif difference < 0:
        print(acc_score)
        print(f"Decreased: {difference}")
        print(f"(last acc_score {last_acc_score})") 

    else: 
        print(acc_score, "(No change)")
        
except:
    print(acc_score)


# if acc_score >= best_score:
#     best_score = acc_score
#     best_formula = formula
    
#     print("\nBest Score!\n")
    
# try:    
#     print(f"\nBest Score: {best_score}\tLast Score: {last_score}")    
# except:
#     print(f"\nBest Score: {best_score}")

    
print(f"Precision: {precision}\tRecall: {recall}")
    
last_acc_score = acc_score 

76.25075346594335
Improved by 0.9041591320072513
(last acc_score 75.3465943339361)
Precision: 82.19794344473009	Recall: 58.562271062271066


In [279]:
test_pred = (model.predict(test_final) > optimal_threshold).replace({False:0, True:1})

overlapping_hosts = train_final[train_final['host_id'].isin(test_final['host_id'])].drop_duplicates('host_id')[['host_id', 'host_is_superhost']]

predicted_values = pd.concat([test_final[['id', 'host_id']], test_pred.rename('predicted')], axis=1)


def overwrite(row):
    if row['host_id'] in overlapping_hosts['host_id'].values:
        row['predicted'] = overlapping_hosts[overlapping_hosts['host_id'] == row['host_id']]['host_is_superhost'].values[0]
    return row
        
          
predicted_values = predicted_values.apply(overwrite, axis=1)
predicted_values = predicted_values[['id', 'predicted']].set_index('id')
predicted_values

# predicted_values.to_csv('classification_model_take_11.csv') 