In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from datetime import date, datetime

## 1) Exploratory Data Analysis

The acceptance-rate and response-rate columns contained a trailing '%' character, so in my cleaning I removed it and converted the values to numeric. Along with that, I transformed the date columns to numeric by calculating "years since" each respective date. True/False columns were converted to 1/0. There were many distinct values for host location, so I grouped any location with fewer than 350 observations into 'Other'.

## 2) Data Cleaning/Preparation

Mention the data cleaning/preparation steps you took to prepare your data. This may include imputing missing values, creating dummy variables, combining levels of categorical variable(s), discarding predictors that are not useful, etc.

In [2]:
# Read the raw training and test splits from CSV files in the working directory.
raw_train, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_classification.csv', 'test_classification.csv')
)

In [3]:
# Clean and process the training data (work on a copy so raw_train stays intact)
train = raw_train.copy()

# Rates are stored as strings with a '%' suffix: strip it and parse as float
for rate_col in ['host_acceptance_rate', 'host_response_rate']:
    train[rate_col] = train[rate_col].str.replace('%', '').astype(float)

# Process bathroom column into numeric column.
# Raw string avoids the invalid '\d' escape warning. Only the first run of
# digits is captured, so '1.5 baths' becomes 1 and text without digits is NaN.
train['bathrooms_num'] = train['bathrooms_text'].str.extract(r'(\d+)', expand=False).astype(float)

# Convert date columns to "years since", measured from one shared timestamp so
# every column uses exactly the same reference point.
reference_time = datetime.now()
for date_col in ['host_since', 'first_review', 'last_review']:
    train[f'{date_col}_years'] = (reference_time - pd.to_datetime(train[date_col])).dt.days / 365

# Encode 't'/'f' flags as 1/0. infer_objects() makes the numeric dtype explicit
# instead of relying on replace() silently downcasting object columns.
t_f_vars = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable']
train[t_f_vars] = train[t_f_vars].replace({'f': 0, 't': 1}).infer_objects()


def is_shared(x):
    """Return 1 if a bathroom description mentions a shared bathroom, else 0.

    Parameters
    ----------
    x : str or missing value
        Free-text bathroom description, e.g. '1 shared bath'.

    Returns
    -------
    int
        1 when 'shared' occurs in the text (case-insensitive), otherwise 0.
        Missing or non-string values (NaN, None) yield 0.
    """
    # Missing values arrive as float NaN (or None); treat anything that is not
    # text as "not shared" instead of raising on the substring test.
    if not isinstance(x, str):
        return 0
    # Lower-case first so capitalised values such as 'Shared half-bath' match.
    return int('shared' in x.lower())
    
    
# Flag listings whose bathroom description mentions a shared bathroom (1/0).
shared_flags = train['bathrooms_text'].apply(is_shared)
train['bathrooms_shared'] = shared_flags.astype(int)

# The raw date columns are dropped now that the "*_years" features exist.
raw_date_columns = ['host_since', 'first_review', 'last_review']
train_clean = train.drop(columns=raw_date_columns)


In [4]:
# Clean and process the test data with the same steps used for the training data
test = raw_test.copy()

# Rates are stored as strings with a '%' suffix: strip it and parse as float
for rate_col in ['host_acceptance_rate', 'host_response_rate']:
    test[rate_col] = test[rate_col].str.replace('%', '').astype(float)

# Process bathroom column into numeric column.
# Raw string avoids the invalid '\d' escape warning. Only the first run of
# digits is captured, so '1.5 baths' becomes 1 and text without digits is NaN.
test['bathrooms_num'] = test['bathrooms_text'].str.extract(r'(\d+)', expand=False).astype(float)

# Convert date columns to "years since", measured from one shared timestamp.
# first_review_years is included so the test frame has the same engineered
# columns as the training frame.
reference_time = datetime.now()
for date_col in ['host_since', 'first_review', 'last_review']:
    test[f'{date_col}_years'] = (reference_time - pd.to_datetime(test[date_col])).dt.days / 365

# Encode 't'/'f' flags as 1/0, the same encoding the training data uses.
# host_is_superhost is the target and is not part of this list for the test set.
t_f_vars = ['host_has_profile_pic', 'host_identity_verified', 'has_availability', 'instant_bookable']
test[t_f_vars] = test[t_f_vars].replace({'f': 0, 't': 1}).infer_objects()

# Flag listings whose bathroom description mentions a shared bathroom (1/0)
test['bathrooms_shared'] = test['bathrooms_text'].apply(is_shared).astype(int)

# Drop the raw date columns now that the "*_years" features exist
test_clean = test.drop(columns=['host_since', 'first_review', 'last_review'])


In [5]:
## Host locations with fewer than 350 training observations are grouped into 'Other'
rare_cutoff = 350

location_counts = train_clean['host_location'].value_counts()

# Locations present in the test data but never seen in the training data.
seen_locations = location_counts.index
test_only_hoods = [
    loc for loc in test_clean['host_location'].unique()
    if not (loc in seen_locations or loc == 'Other')
]

# Training locations whose frequency falls below the cutoff.
other_hoods = location_counts[location_counts < rare_cutoff].index.tolist()
    
def clean_hoods(row):
    """Relabel a row's host_location as 'Other' when it is rare or test-only.

    Reads the module-level lists `other_hoods` and `test_only_hoods`.
    The (possibly modified) row is returned so the function can be used
    with DataFrame.apply(axis=1).
    """
    location = row.loc['host_location']
    if location in other_hoods or location in test_only_hoods:
        row['host_location'] = 'Other'
    return row
       
def clean_rooms(row):
    """Collapse hotel, private and shared rooms into a single 'Single room' level.

    Any other room_type value is left unchanged. The row is returned so the
    function can be used with DataFrame.apply(axis=1).
    """
    single_room_types = ('Hotel room', 'Private room', 'Shared room')
    if row.loc['room_type'] in single_room_types:
        row['room_type'] = 'Single room'
    return row
    

    
# Collapse the room types row by row in both splits, then show the resulting
# level counts for the training data.
train_clean, test_clean = (
    frame.apply(clean_rooms, axis=1) for frame in (train_clean, test_clean)
)
print(train_clean['room_type'].value_counts())

Entire home/apt    3801
Single room        1176
Name: room_type, dtype: int64


In [6]:
# Group rare and test-only host locations into 'Other' in both splits.
train_clean, test_clean = (
    frame.apply(clean_hoods, axis=1) for frame in (train_clean, test_clean)
)

### Imputation

In [7]:
# Impute missing host_acceptance_rate values from an OLS regression on
# instant_bookable, fitted on the training data only.
acceptance_model = smf.ols(formula='host_acceptance_rate~C(instant_bookable)', data=train_clean).fit()

# Fill only the missing entries. The result is assigned back to the column
# rather than using fillna(inplace=True) on a column selection, which is
# chained assignment and does not update the frame under pandas Copy-on-Write.
train_clean['host_acceptance_rate'] = train_clean['host_acceptance_rate'].fillna(
    acceptance_model.predict(train_clean)
)

# Apply the same train-fitted imputation to the test data; otherwise test rows
# with a missing rate produce NaN probabilities from the classifier.
test_clean['host_acceptance_rate'] = test_clean['host_acceptance_rate'].fillna(
    acceptance_model.predict(test_clean)
)


## 3) Developing the Model

host_response_rate was included because a higher response rate probably indicates a more attentive, and therefore better, host. host_since_years was included because more experience should make a better host. host_total_listings_count\*number_of_reviews_ltm are evaluated together as an interaction because the number of reviews depends directly on how many listings a host has. review_scores_cleanliness is another variable that intuitively connects to the quality of a host.

## 4) Model

In [8]:
# Logistic regression for host_is_superhost, fitted on the cleaned training data.
# In the formula, `a*b` adds both main effects and their interaction, and
# I(...) makes the cubic cleanliness term be evaluated as plain arithmetic.
# NOTE(review): the summary below reports 3660 observations, fewer than the
# training rows counted elsewhere, so rows with missing predictors appear to
# be dropped during fitting -- confirm.
formula= '''host_is_superhost ~ host_total_listings_count*number_of_reviews_ltm + 
number_of_reviews_ltm*host_response_rate + host_acceptance_rate*number_of_reviews_ltm +
availability_90 + 
review_scores_cleanliness + I(review_scores_cleanliness**3) + 
host_since_years*host_total_listings_count'''

model = smf.logit(formula=formula, data=train_clean).fit()

# Last expression of the cell: display the fitted-model summary table.
model.summary()

Optimization terminated successfully.
         Current function value: 0.532047
         Iterations 10


0,1,2,3
Dep. Variable:,host_is_superhost,No. Observations:,3660.0
Model:,Logit,Df Residuals:,3647.0
Method:,MLE,Df Model:,12.0
Date:,"Sun, 18 Feb 2024",Pseudo R-squ.:,0.2288
Time:,14:39:45,Log-Likelihood:,-1947.3
converged:,True,LL-Null:,-2525.1
Covariance Type:,nonrobust,LLR p-value:,6.247e-240

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.1606,2.011,-2.068,0.039,-8.103,-0.218
host_total_listings_count,-0.0030,0.001,-3.184,0.001,-0.005,-0.001
number_of_reviews_ltm,-0.1015,0.057,-1.779,0.075,-0.213,0.010
host_total_listings_count:number_of_reviews_ltm,-0.0005,7.09e-05,-6.773,0.000,-0.001,-0.000
host_response_rate,0.0162,0.007,2.285,0.022,0.002,0.030
number_of_reviews_ltm:host_response_rate,0.0023,0.000,5.873,0.000,0.002,0.003
host_acceptance_rate,0.0184,0.004,5.128,0.000,0.011,0.025
host_acceptance_rate:number_of_reviews_ltm,-0.0009,0.000,-2.307,0.021,-0.002,-0.000
availability_90,-0.0019,0.001,-1.736,0.083,-0.004,0.000


In [9]:
def get_acc(thresh):
    """Return the training-set accuracy for a given probability cut-off.

    Rows whose predicted probability from `model` exceeds `thresh` are
    labelled positive and compared against train_clean.host_is_superhost.
    """
    predicted_labels = model.predict(train_clean) > thresh
    return accuracy_score(train_clean.host_is_superhost, predicted_labels)

In [10]:
# Coarse pass: evaluate training accuracy on 21 evenly spaced cut-offs in [0.4, 0.6].
thresholds1 = np.linspace(0.4, 0.6, num=21)
accuracies1 = pd.Series([get_acc(cutoff) for cutoff in thresholds1])

# Position of the best cut-off, shown with its accuracy and value.
idx1 = accuracies1.idxmax()
idx1, accuracies1[idx1], thresholds1[idx1]

(14, 0.7578862768736186, 0.54)

In [11]:
# Second pass: evaluate 101 cut-offs between the coarse grid points on either
# side of the coarse optimum.
# Clamp the neighbour positions so an optimum at either end of the coarse grid
# neither wraps around (index -1) nor runs past the end of the array.
lower_pos1 = max(idx1 - 1, 0)
upper_pos1 = min(idx1 + 1, len(thresholds1) - 1)
thresholds2 = np.linspace(thresholds1[lower_pos1], thresholds1[upper_pos1], num=101)

accuracies2 = pd.Series(thresholds2).apply(get_acc)

# Position of the best cut-off, shown with its accuracy and value.
idx2 = accuracies2.idxmax()
idx2, accuracies2[idx2], thresholds2[idx2]


(8, 0.7584890496282901, 0.5316000000000001)

In [12]:
# Third pass: evaluate 101 cut-offs between the grid points on either side of
# the second-pass optimum.
# Clamp the neighbour positions so an optimum at either end of the grid
# neither wraps around (index -1) nor runs past the end of the array.
lower_pos2 = max(idx2 - 1, 0)
upper_pos2 = min(idx2 + 1, len(thresholds2) - 1)
thresholds3 = np.linspace(thresholds2[lower_pos2], thresholds2[upper_pos2], num=101)

accuracies3 = pd.Series(thresholds3).apply(get_acc)

# Position of the best cut-off, shown with its accuracy and value.
idx3 = accuracies3.idxmax()
idx3, accuracies3[idx3], thresholds3[idx3]


(50, 0.7584890496282901, 0.5316000000000001)

In [13]:
# Final cut-off: the best value from the third search pass, rounded to 6 decimals.
threshold = round(thresholds3[idx3], 6)

# Label a training row positive when its predicted probability exceeds the cut-off.
train_probs = model.predict(train_clean)
y_pred = train_probs > threshold

# Show how many rows fall in each predicted class.
y_pred.value_counts()

False    2813
True     2164
dtype: int64

In [14]:
# Confusion matrix layout (rows = actual, columns = predicted):
#   tn, fp
#   fn, tp
y_true = train_clean.host_is_superhost
conf_matrix = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# False-positive and false-negative rates, in percent.
FPR = round(100*(fp / (fp + tn)), 4)
FNR = round(100*(fn / (fn + tp)), 4)

print(f"FPR: {FPR}\tFNR: {FNR}")

conf_df = pd.DataFrame(conf_matrix, columns=["Predicted 0", "Predicted 1"], index=["Actual 0", "Actual 1"])

# Headline training-set metrics, in percent.
acc_score = round(accuracy_score(y_true, y_pred) * 100, 4)
precision = round(precision_score(y_true, y_pred) * 100, 4)
recall = round(recall_score(y_true, y_pred)*100, 4)

print(f"Accuracy: {acc_score}\nPrecision: {precision}\nRecall: {recall}")
print(f"\n{conf_df}")


FPR: 21.16	FNR: 27.9762
Accuracy: 75.8489
Precision: 72.6895
Recall: 72.0238

          Predicted 0  Predicted 1
Actual 0         2202          591
Actual 1          611         1573


In [15]:
# Score the test data and label a row 1 when its probability exceeds the
# chosen threshold. astype(int) converts the boolean labels directly, and the
# series is named up front so the output column does not depend on the
# implicit column label that concat gives an unnamed series.
test_probs = model.predict(test_clean)
test_pred = (test_probs > threshold).astype(int).rename('predicted')

# One row per listing id with its predicted class.
predicted_values = pd.concat([test_clean['id'], test_pred], axis=1).set_index('id')

predicted_values.to_csv('classification_model_results.csv')