In [None]:
import math
import operator
import re
from ast import literal_eval

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error,plot_roc_curve
from sklearn.model_selection import train_test_split

import common as c

%matplotlib inline

# Notebook-wide display settings: wide, short default figures and no
# truncation of rows/columns when pandas renders a frame.
plt.rcParams["figure.figsize"] = (20, 3)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Read the raw listings export into a DataFrame; every later cell derives from it.
LISTINGS_CSV_PATH = './data/boston/listings.csv'
listings_df = pd.read_csv(LISTINGS_CSV_PATH)

## Data Study

In [None]:
# Summary statistics of the raw listings frame, shown as the cell output.
listings_df.describe()

## Data Cleanup And Normalization 

In [None]:
def fillna_with_mean(df):
    """
    Fills the NAs on numerical columns with the mean of the column.

    Only columns selected by ``select_dtypes(include=['float', 'int'])`` are
    touched; every other column is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. Its numeric columns are reassigned, so the frame passed
        in is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that form is chained assignment,
        # which warns on recent pandas and does not update df under
        # copy-on-write.
        df[col] = df[col].fillna(df[col].mean())

    return df

def convert_baths_text_to_numbers(baths_text):
    """
    Converts the bathrooms to number.

    Parameters
    ----------
    baths_text : object
        Bathroom description such as ``"1.5 baths"``, ``"1 shared bath"`` or
        ``"Shared half-bath"``. Non-string values are converted with ``str``.

    Returns
    -------
    float
        ``0.5`` for the bare half-bath descriptions, otherwise the number the
        text starts with.

    Raises
    ------
    ValueError
        If the text is not a half-bath description and does not start with a
        number.
    """
    baths_str = str(baths_text).strip().lower()
    if baths_str in ("half-bath", "private half-bath", "shared half-bath"):
        return 0.5

    # Match on the normalised string (not the raw argument) so non-string
    # inputs work, and use a raw pattern that accepts "2", "1.5" or ".5".
    leading_number = re.match(r"(\d+\.?\d*|\.\d+)", baths_str)
    if leading_number is None:
        raise ValueError(
            "Cannot parse a bathroom count from {!r}".format(baths_text)
        )
    return float(leading_number.group(1))
    
def explode_list_to_col(df, id_col, col_to_explode, prefix=""):
    """
    Explodes a column of the type List into columns with a flag (0 or 1) of observed or not.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    id_col : str
        Column identifying each row; used as the pivot index and merge key.
    col_to_explode : str
        Column whose cells are lists, or string representations of lists
        (parsed with ``ast.literal_eval``).
    prefix : str, optional
        Text prepended to every list item to build the flag column name.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged with one 0/1 column per distinct list item. Rows whose
        list is empty keep their place and get 0 in every flag column.
    """
    exploded = df[[id_col, col_to_explode]].copy()
    # Parse string cells; cells that are already lists pass through untouched.
    exploded[col_to_explode] = exploded[col_to_explode].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )
    # An empty list explodes to NaN; drop those rows so they do not turn into
    # a meaningless "<prefix>nan" flag column.
    exploded = exploded.explode(col_to_explode).dropna(subset=[col_to_explode])
    if exploded.empty:
        # Nothing to flag: return an unmodified copy.
        return df.copy()

    exploded[col_to_explode] = exploded[col_to_explode].map(
        lambda item: str(prefix) + str(item)
    )
    exploded["observed"] = 1
    flags = pd.pivot_table(
        exploded,
        index=id_col,
        columns=[col_to_explode],
        values="observed",
        aggfunc="max",
        fill_value=0,
    )
    # Ids whose list was empty are missing from the pivot; add them back with
    # all-zero flags so the merge below keeps every row of df.
    flags = flags.reindex(pd.Index(df[id_col].unique(), name=id_col), fill_value=0)
    merged_df = pd.merge(df, flags, on=id_col)
    return merged_df

def cleanup_and_normalize_dataset(original_df):
    """
    Applies the necessary operations to clean up the AirBnB listings dataset.

    Steps, in order: strip '%' from the host rate columns, turn the 't'/'f'
    flag columns into 1/0, derive bathroom features from ``bathrooms_text``,
    turn ``license`` into a presence flag, clean the price, keep only rows with
    a positive price, fill numeric NaNs with the column mean and replace the
    categorical columns with their category codes.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Raw listings frame. It is copied first and never modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    df = original_df.copy()

    # Remove percentage chars from rates.
    for rate_col in ('host_response_rate', 'host_acceptance_rate'):
        df[rate_col] = df[rate_col].str.replace('%', '', regex=False).astype(float)

    # Convert booleans with 0s and 1s. Comparing against 't' maps missing
    # values to 0 instead of failing in astype(int).
    for flag_col in ('host_is_superhost', 'host_has_profile_pic',
                     'host_identity_verified', 'instant_bookable'):
        df[flag_col] = df[flag_col].eq('t').astype(int)

    # Extract info from bathrooms text (missing text counts as '0 baths').
    baths_text = df['bathrooms_text'].fillna('0 baths')
    baths_lower = baths_text.str.lower()
    df['num_bathrooms'] = baths_text.apply(convert_baths_text_to_numbers)
    df['shared_bathrooms'] = baths_lower.str.contains('shared', regex=False).astype(int)
    df['half_bathrooms'] = baths_lower.str.contains('half', regex=False).astype(int)
    df = df.drop(columns=['bathrooms_text'])

    # Convert License into a boolean: 1 when a license value is present,
    # 0 when it is missing (or the literal '0'). The previous mapping was
    # inverted and flagged the unlicensed listings with 1.
    df['license'] = df['license'].fillna('0').ne('0').astype(int)

    # Remove special chars from price and convert it into a float
    df['price'] = c.clean_price(df)

    # Consider only listings with price greater than 0. Copy so the in-place
    # fill below works on an independent frame rather than a slice.
    df = df[df.price > .0].copy()

    # Fix NaNs with the mean of the column.
    df = fillna_with_mean(df)

    # Categorical features: replace each label with its integer category code.
    for cat_col in ('room_type', 'property_type',
                    'neighbourhood_cleansed', 'neighbourhood_group_cleansed'):
        df[cat_col] = df[cat_col].astype('category').cat.codes

    return df

def explode_lists(df):
    """
    Explodes amenities and host_verifications into 0/1 flag columns.

    Amenity flags are prefixed with ``amenity_`` and host verification flags
    with ``host_verifications_``. Rows are matched on the ``id`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``id``, ``amenities`` and ``host_verifications``; the
        two list columns contain string representations of lists. The frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the flag columns added and the two source list
        columns dropped.
    """
    list_columns = (
        ('amenities', 'amenity_'),
        ('host_verifications', 'host_verifications_'),
    )
    exploded_df = df
    for list_col, flag_prefix in list_columns:
        # explode_list_to_col parses the string cells itself and returns a new
        # frame, so no pre-parsing (and no write to the caller's frame) is needed.
        exploded_df = explode_list_to_col(exploded_df, 'id', list_col, flag_prefix)

    # Drop unnecessary columns
    return exploded_df.drop(columns=['amenities', 'host_verifications'])

In [None]:
# cleanup_and_normalize_dataset copies its input before changing anything, so
# the raw frame can be passed directly without an extra full copy.
cleaned_df = cleanup_and_normalize_dataset(listings_df)

In [None]:
# Columns carried into the modelling frame. 'id' is the key used when the list
# columns are exploded, 'amenities' and 'host_verifications' are the list
# columns turned into flags by explode_lists, and 'price' is the target.
relevant_features = [
    'id',
    'amenities',
    'host_verifications',
    'price',
    'host_is_superhost', 
    'host_identity_verified', 
    'property_type',
    'room_type',
    'neighbourhood_group_cleansed', 
    'neighbourhood_cleansed',
    'accommodates',
    'bedrooms',
    'beds',
    'num_bathrooms', 
    'shared_bathrooms',
    'review_scores_rating',
    'number_of_reviews',
    'instant_bookable',
    'license',
    'availability_365',
    'review_scores_location',
    'host_response_rate',
    'number_of_reviews_l30d',
    'review_scores_checkin'
]

# Work on a copy of the selected columns, then replace the two list columns
# with one 0/1 flag column per distinct item.
all_features_df = cleaned_df[relevant_features].copy()
all_features_df = explode_lists(all_features_df)
# Compare the raw frame's shape with the shape after selection and explosion.
print("Original: [{}], Cleaned: [{}]".format(listings_df.shape, all_features_df.shape))
all_features_df.describe()

### Clean up data
Remove columns that won't contribute positively to the model.
- Price == 0 or null
- Amenities with all 0s or 1s
- Amenities with less than 10 entries

In [None]:
# Minimum number of listings that must have an amenity for its flag column to be kept.
MIN_AMENITY_LISTINGS = 10

# Drop constant columns (all 1s or all 0s). Both masks are computed on the same
# frame they are applied to.
not_all_ones = (all_features_df != 1).any(axis=0)
not_all_zeros = (all_features_df != 0).any(axis=0)
all_cleaned_df = all_features_df.loc[:, not_all_ones & not_all_zeros]

# Keep only rows with a positive price.
all_cleaned_df = all_cleaned_df[all_cleaned_df.price > .0]

# Drop amenity flags set on fewer than MIN_AMENITY_LISTINGS listings
# (amenity flag columns carry the 'amenity_' prefix).
amenity_counts = all_cleaned_df.filter(regex='^amenity_').sum()
rare_amenities = amenity_counts[amenity_counts < MIN_AMENITY_LISTINGS].index
all_cleaned_df = all_cleaned_df.drop(columns=rare_amenities)

print("Original: [{}], Cleaned: [{}]".format(all_features_df.shape, all_cleaned_df.shape))

all_cleaned_df.head(5)

## Check and fix price distribution

Our target variable is continuous, so we need to check whether its distribution is skewed.
As the charts below show, the price is quite skewed. To correct this we explore three transformations: square root, log, and Box-Cox. Of the three, Box-Cox gives the best approximation of a normal distribution for our target variable.


In [None]:
# Put the raw price next to its log, square-root and Box-Cox transforms and
# draw one histogram per column to compare their shapes.
price_distribution = all_features_df[["price"]].assign(
    price_log=lambda frame: np.log(frame["price"]),
    price_sqrt=lambda frame: np.sqrt(frame["price"]),
    # stats.boxcox returns (transformed values, fitted lambda); keep the values.
    price_box_cox=lambda frame: stats.boxcox(frame["price"])[0],
)
price_distribution.hist(figsize=(20, 20), bins=25)

In [None]:
def train_model(df, cols_to_drop, test_size=.3, random_state=1):
    """
    Fits a linear regression on a Box-Cox transformed price and scores it.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing at least ``id`` and ``price``.
    cols_to_drop : list of str
        Extra columns to exclude from the features. ``id`` and ``price`` are
        always excluded and do not need to be listed.
    test_size : float, optional
        Share of the rows held out for testing (default 0.3).
    random_state : int, optional
        Seed for the train/test split (default 1).

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test, y_test_preds,
        y_train_preds, test_score, train_score, lm_model)`` where the scores
        are R^2 values and ``y`` is the Box-Cox transformed price.

    Notes
    -----
    The Box-Cox transforms are estimated on the full data before the split and
    the fitted lambda for the price is discarded, so targets and predictions
    stay in transformed units.
    """
    # 'id' and 'price' are always removed; de-duplicate in case the caller
    # lists them again.
    drop_cols = list(dict.fromkeys(['id', 'price'] + list(cols_to_drop)))
    X = df.drop(columns=drop_cols)
    y, _ = stats.boxcox(df["price"])

    # Box-Cox these two features as well, skipping any the caller dropped.
    for skewed_col in ('accommodates', 'review_scores_rating'):
        if skewed_col in X.columns:
            X[skewed_col], _ = stats.boxcox(X[skewed_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2.
    # For a full-rank design, ordinary least squares gives the same fit with
    # or without feature scaling, so the plain estimator is used.
    lm_model = LinearRegression()  # Instantiate
    lm_model.fit(X_train, y_train)  # Fit

    # Predict using your model
    y_test_preds = lm_model.predict(X_test)
    y_train_preds = lm_model.predict(X_train)

    # Score using your model
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)

    return X, y, X_train, X_test, y_train, y_test, y_test_preds, y_train_preds, test_score, train_score, lm_model

In [None]:
# Fit on the cleaned frame built in the "Clean up data" section; previously the
# uncleaned all_features_df was used and the cleaned frame was never consumed.
# train_model already drops 'id' and 'price', so no extra columns are listed.
X, y, X_train, X_test, y_train, y_test, y_test_preds, y_train_preds, test_score, train_score, lm_model = train_model(all_cleaned_df, [])
print("The rsquared on the training data is {}. The rsquared on the test data is {}.".format(train_score, test_score))

In [None]:
# Collect the fitted model's coefficients through the project helper. The result
# is consumed below via .items(), so it is a mapping; presumably feature name ->
# coefficient — confirm in common.get_model_coefs.
coefs = c.get_model_coefs(lm_model, X)

In [None]:
# Rank the (key, coefficient) pairs from the largest to the smallest coefficient.
# `operator` is imported with the other modules in the notebook's import cell.
sorted_coefs = sorted(coefs.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
# Display the full ranked list of (key, coefficient) pairs as the cell output.
sorted_coefs