## 1) Exploratory Data Analysis

- First, some columns contained non-numeric characters (%, \$), so I cleaned those columns and converted them to numeric. 
- There were different variations to quantify the number of listings a host has, so using correlations and some trial and error only one of the host listing count predictors was selected. 
- Host response time was categorical so I converted the categorical values to approx. numeric values
- Review scores are averaged because they are very highly correlated.
- There were some columns that were dates so they were converted to datetime objects and also a column was created for 'months since' that date to make the date columns easier to utilize
- Property types and neighbourhoods had some categories with few occurrences, so those were grouped into 'Other'
- Missing values were imputed naively with each column's median value

## 2) Data Cleaning/Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

from datetime import date, datetime

In [2]:
# Load the raw training and test splits from the local datasets folder.
train_csv_path = 'datasets/train_classification.csv'
test_csv_path = 'datasets/test_classification.csv'

raw_train = pd.read_csv(train_csv_path)
raw_test = pd.read_csv(test_csv_path)

In [3]:
# Create working copies so the raw frames stay untouched and this cell can be re-run safely
train = raw_train.copy()
test = raw_test.copy()

# Percentage columns stored as strings (e.g. '95%') -> name of the cleaned numeric column
percent_columns = {
    'host_acceptance_rate': 'acceptance_rate',
    'host_response_rate': 'response_rate',
}

# The same cleaning is applied to both datasets
for frame in (train, test):
    # Strip the '%' sign, cast to float and divide by 100 to get a fraction
    for raw_col, clean_col in percent_columns.items():
        frame[clean_col] = frame[raw_col].str.replace('%', '', regex=False).astype(float) / 100

    # Drop the original string columns now that numeric versions exist
    frame.drop(columns=list(percent_columns), inplace=True)

    # Extract the first number from 'bathrooms_text' and convert it to float.
    # The pattern keeps an optional decimal part so that '1.5 baths' becomes 1.5 (not 1),
    # and it is a raw string so '\d' is not treated as an (invalid) string escape.
    frame['bathrooms_num'] = (
        frame['bathrooms_text']
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )

    # Entries that mention 'Half-bath' contain no digits, so they are still missing: set them to 0.5
    is_half_bath = frame['bathrooms_text'].str.contains('Half-bath', case=False, na=False)
    frame.loc[is_half_bath & frame['bathrooms_num'].isna(), 'bathrooms_num'] = 0.5


In [5]:
# Convert date columns
def strip_date(row):
    """Parse a 'YYYY-MM-DD' string into a ``datetime.date``.

    Anything that is not a string (for example a missing value) is returned
    unchanged.
    """
    if not isinstance(row, str):
        return row
    return datetime.strptime(row, '%Y-%m-%d').date()

# Date columns to parse; each one also gets a derived '<column>_in_months' feature
date_columns = ['host_since', 'first_review', 'last_review']

# Capture "today" once so every derived feature in both datasets shares the same reference date
# (previously it was re-evaluated for each column, which could straddle midnight).
# NOTE(review): these features still depend on the day the notebook is run; pin a fixed
# reference date here if the results must be exactly reproducible.
reference_date = datetime.now().date()

for frame in (train, test):
    for col in date_columns:
        # Convert date strings to date objects (non-string values pass through strip_date unchanged)
        frame[col] = frame[col].apply(strip_date)

        # Approximate months elapsed since the date: days / 30, rounded to 2 decimals
        frame[f'{col}_in_months'] = round(((reference_date - frame[col]).dt.days) / 30, 2)


# Snapshot the prepared frames; the later cleaning cells work on these copies
train_clean = train.copy()
test_clean = test.copy()

In [4]:
# Approximate numeric value assigned to each host response-time category
response_time_dict = {
    'within an hour': 1,
    'within a few hours': 12,
    'within a day': 24,
    'a few days or more': 72,
}

def replace_response_time(row):
    """Map a response-time category to its numeric value from ``response_time_dict``.

    Returns None for missing input and for categories that are not in the dictionary.
    """
    if pd.isna(row):
        return None
    return response_time_dict.get(row)

# Add the numeric response time to every frame the later cells rely on.
# train_clean / test_clean were copied from train / test in the preceding cell, before this
# column existed, so they must receive it too; otherwise the model formula's
# C(response_time) term has no column to use when the notebook is run top to bottom.
for frame in (train, test, train_clean, test_clean):
    frame['response_time'] = frame['host_response_time'].apply(replace_response_time)

### Clean/Transform Variables

In [6]:
# Group 'Hotel room' into 'Private room' because there are few hotel-room occurrences
def clean_rooms(row):
    """Return the row with room_type 'Hotel room' relabelled as 'Private room'.

    The input row is never modified: pandas does not support functions passed to
    ``DataFrame.apply`` mutating the object they receive, so a changed copy is
    returned instead. Rows with any other room type are returned as-is.
    """
    if row.loc['room_type'] != 'Hotel room':
        return row

    updated = row.copy()
    updated['room_type'] = 'Private room'
    return updated

# Relabel 'Hotel room' as 'Private room' in both datasets.
# A vectorised replace gives the same room_type values as applying clean_rooms to every
# row, without a Python-level call per row.
train_clean['room_type'] = train_clean['room_type'].replace('Hotel room', 'Private room')
test_clean['room_type'] = test_clean['room_type'].replace('Hotel room', 'Private room')


In [7]:
# Derive review-rate features for both datasets: reviews per listing, reviews per month
# of host tenure, and reviews per listing per month of host tenure.
# NOTE(review): a zero divisor would produce inf here — confirm the inputs exclude zeros.
for frame in (train_clean, test_clean):
    reviews = frame['number_of_reviews']
    tenure_months = frame['host_since_in_months']

    frame['reviews_per_listing'] = reviews / frame['calculated_host_listings_count']
    frame['reviews_per_month'] = reviews / tenure_months
    frame['reviews_per_listing_per_month'] = frame['reviews_per_listing'] / tenure_months


#### Sophisticated Cleaning of Neighbourhoods

In [8]:
# Neighbourhoods with fewer than 150 occurrences in the training data are grouped into 'Other'
neighbourhood_counts = train_clean['neighbourhood_cleansed'].value_counts()

# Neighbourhoods that appear in the test data but never in the training data
test_only_hoods = []
for hood in test_clean['neighbourhood_cleansed'].unique():
    if hood != 'Other' and hood not in neighbourhood_counts:
        test_only_hoods.append(hood)

# Training neighbourhoods that fall below the 150-occurrence cut-off
other_hoods = [hood for hood, count in neighbourhood_counts.items() if count < 150]
    
def clean_hoods(row):
    """Return a copy of the row with a 'neighbourhood_grouped' entry added.

    The new entry is 'Other' when the row's 'neighbourhood_cleansed' value is listed
    in ``other_hoods`` or ``test_only_hoods``; otherwise it repeats the original
    neighbourhood. The input row is left untouched because pandas does not support
    functions passed to ``DataFrame.apply`` mutating the object they receive.
    """
    hood = row.loc['neighbourhood_cleansed']
    is_grouped = hood in other_hoods or hood in test_only_hoods

    updated = row.copy()
    updated['neighbourhood_grouped'] = 'Other' if is_grouped else hood
    return updated
   
    
# Add the 'neighbourhood_grouped' column to both datasets by running clean_hoods on every row
train_clean, test_clean = (
    frame.apply(clean_hoods, axis=1) for frame in (train_clean, test_clean)
)

In [9]:
# Filler words stripped from property types; each entry is removed as a plain substring, in order
words_to_remove = ['place', 'room', 'private', 'shared', 'entire', ' in', ' room', ' private', ' shared', ' entire', ' in',]

def remove_words(text):
    """Lower-case ``text``, delete every filler substring and trim surrounding whitespace."""
    cleaned = text.lower()
    for filler in words_to_remove:
        cleaned = cleaned.replace(filler.lower(), '')
    return cleaned.strip()

# Normalise the property type text in both datasets
for frame in (train_clean, test_clean):
    frame['property_type'] = frame['property_type'].apply(remove_words)


# Keep only property types with more than 10 occurrences in the training data;
# everything else is later grouped into 'Other'
property_counts = train_clean['property_type'].value_counts()
keep = property_counts[property_counts > 10].index.tolist()

def clean_property(row):
    """Return the property type unchanged if it is non-empty and listed in ``keep``, else 'Other'."""
    is_kept = row in keep and row != ""
    return row if is_kept else 'Other'

# Store the grouped property type in a new column for both datasets
for frame in (train_clean, test_clean):
    frame['property_type_cleansed'] = frame['property_type'].apply(clean_property)


In [10]:
# Fill in remaining missing values in the numeric columns with the median.
# The medians are computed once, from the training data, and applied to BOTH datasets:
# previously the training frame was filled twice and the test frame was never imputed.
# Using the training medians for the test set also avoids leaking test information.
train_medians = train_clean.median(numeric_only=True)
train_clean = train_clean.fillna(train_medians)
test_clean = test_clean.fillna(train_medians)

# Create final DataFrames
train_final = train_clean.copy()
test_final = test_clean.copy()


In [11]:
# The individual review scores are highly correlated, so they are collapsed into one average
review_score_columns = [
    'review_scores_rating',
    'review_scores_value',
    'review_scores_location',
    'review_scores_cleanliness',
]

for frame in (train_final, test_final):
    frame['review_scores_avg'] = frame[review_score_columns].mean(axis=1)

# Any average that is still missing is filled with the median of the training averages
train_avg_median = train_final['review_scores_avg'].median()
train_final['review_scores_avg'] = train_final['review_scores_avg'].fillna(value=train_avg_median)
test_final['review_scores_avg'] = test_final['review_scores_avg'].fillna(value=train_avg_median)

In [12]:
# Encode the response variable numerically in the training data: 'f' -> 0, 't' -> 1
superhost_codes = {"f": 0, "t": 1}
train_final['host_is_superhost'] = train_final['host_is_superhost'].replace(superhost_codes)

## 3) Developing the Model

Host experience, number of listings, and number of reviews all influence host_is_superhost and appear to have a quadratic relationship with it, so those predictors are squared. Number of reviews depends on the host's number of listings, so these variables interact; similarly, host listing count is connected to reviews per month. How long someone has been a host is connected to their number of listings, so those variables interact. 
The review score average interacts with number of reviews ltm because the average is more reliable when there are more reviews, and a larger number of high reviews may indicate that a host is more likely to be a superhost. Certain neighbourhoods may require more experienced/high-level hosts, so neighbourhood is added. Lastly, the review score average has higher-order terms because reviews are directly and strongly correlated with superhost status. 

## 4) Model

In [13]:
# Model specification in statsmodels/patsy formula syntax, with host_is_superhost as the response.
# In this syntax `a*b` expands to both main effects plus their interaction, I(...) evaluates the
# enclosed expression arithmetically (used here for squared/cubed terms), and C(...) treats the
# variable as categorical. Terms that appear more than once (e.g. review_scores_avg) are
# only included once in the design matrix.
formula= '''host_is_superhost ~
C(response_time) + 

I(host_since_in_months**2) + I(host_total_listings_count**2) +
I(number_of_reviews_ltm**2) +

host_total_listings_count*number_of_reviews_ltm + 
host_since_in_months*host_total_listings_count + 
host_since_in_months*reviews_per_listing + 
host_total_listings_count*reviews_per_month +

review_scores_avg*number_of_reviews_ltm + 
review_scores_avg + I(review_scores_avg**2) + I(review_scores_avg**3) + 

C(neighbourhood_grouped)'''

# Fit a logistic regression on the prepared training data
model = smf.logit(formula=formula, data=train_final).fit()


Optimization terminated successfully.
         Current function value: 0.470835
         Iterations 18


In [14]:
# Find the classification threshold (between 0.4 and 0.6) that maximises accuracy
# NOTE: the threshold is tuned on the training data, so the reported accuracy is optimistic
thresholds = np.linspace(0.4, 0.6, num=201)

# The predicted probabilities do not depend on the threshold, so compute them once
# instead of re-running model.predict for each of the 201 candidate thresholds
train_probabilities = model.predict(train_final)

accuracies = [
    accuracy_score(train_final.host_is_superhost, train_probabilities > a)
    for a in thresholds
]

# Index of the first threshold that reaches the highest accuracy
max_idx = np.argmax(accuracies)

print(max_idx, round(thresholds[max_idx], 6), round(100*np.max(accuracies), 4))

optimal_threshold = thresholds[max_idx]

88 0.488 77.6773


In [15]:
# Classify the training rows with the tuned threshold and report the resulting metrics
y_true = train_final.host_is_superhost
y_pred = model.predict(train_final) > optimal_threshold

print(y_pred.value_counts())

# Percent-scaled scores
acc_score = 100 * accuracy_score(y_true, y_pred)
precision = 100 * precision_score(y_true, y_pred)
recall = 100 * recall_score(y_true, y_pred)

# Unpack the 2x2 confusion matrix to derive the false positive / false negative rates
conf_matrix = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
FPR = fp/(fp + tn)
FNR = fn/(fn + tp)

print(f"\nAccuracy: {round(acc_score, 4)}")
print(f"Precision: {round(precision, 4)}\tRecall: {round(recall, 4)}")
print(f"FPR: {round(FPR*100, 4)}\t\tFNR: {round(FNR*100, 4)}\n")

print(f"Confusion Matrix:\n{conf_matrix}\n")

# Keep the training accuracy available under this name
last_acc_score = acc_score


False    3046
True     1931
dtype: int64

Accuracy: 77.6773
Precision: 77.7835	Recall: 68.7729
FPR: 15.3598		FNR: 31.2271

Confusion Matrix:
[[2364  429]
 [ 682 1502]]



In [16]:
# Initial prediction: threshold the predicted probabilities and encode False/True as 0/1
test_probabilities = model.predict(test_final)
test_pred = (test_probabilities > optimal_threshold).replace({False:0, True:1})

# Put the prediction next to the listing and host identifiers
predicted_values = pd.concat(
    [test_final[['id', 'host_id']], test_pred.rename('predicted')],
    axis=1,
)

# Hosts that are in both dataframes: one row per host with the label from the training data
shared_host_rows = train_final[train_final['host_id'].isin(test_final['host_id'])]
overlapping_hosts = shared_host_rows.drop_duplicates('host_id')[['host_id', 'host_is_superhost']]

# If a host is in both dataframes, overwrite the predicted value with the value from the training data
def overwrite(row):
    """Return the row with 'predicted' replaced by the host's label from ``overlapping_hosts``.

    Rows whose 'host_id' is not in ``overlapping_hosts`` are returned as-is. When there
    is a match, a modified copy is returned rather than mutating the input, because
    pandas does not support functions passed to ``DataFrame.apply`` mutating the object
    they receive. The lookup is done in a single scan of ``overlapping_hosts``.
    """
    known_labels = overlapping_hosts.loc[
        overlapping_hosts['host_id'] == row['host_id'], 'host_is_superhost'
    ]
    if known_labels.empty:
        return row

    updated = row.copy()
    updated['predicted'] = known_labels.iloc[0]
    return updated
        
    
# Apply the known-host override row by row, then keep only the prediction indexed by listing id
predicted_values = (
    predicted_values
    .apply(overwrite, axis=1)
    .loc[:, ['id', 'predicted']]
    .set_index('id')
)

# Write the final predictions to disk
predicted_values.to_csv('classification_model_predicted_values.csv')