In [71]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn theme used by every figure drawn later in the notebook.
sns.set_style('darkgrid')

# Read the development and evaluation CSV files (development first).
dev_df, eval_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../Datasets/NYC_Airbnb/development.csv',
                     '../Datasets/NYC_Airbnb/evaluation.csv')
)
# Keep the evaluation ids at hand under their own name.
labels = eval_df['id']

In [72]:
# (rows, columns) of the development and evaluation frames, in that order.
tuple(frame.shape for frame in (dev_df, eval_df))

((39116, 16), (9779, 15))

In [73]:
def clean(df):
    """Drop the free-text columns and fill missing review rates with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``name``, ``host_name``,
        ``last_review`` and ``reviews_per_month``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``name``, ``host_name`` and ``last_review`` and
        with no NaN left in ``reviews_per_month``. The input frame is not
        modified.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing (for instance when the
        function is applied twice to the same data).
    """
    # eliminate attributes that are not used as features
    df = df.drop(columns=["name", "host_name", "last_review"])

    # null values on reviews_per_month: fill with 0
    # (presumably NaN means the listing has no reviews yet -- TODO confirm).
    # The result is assigned back instead of calling fillna(inplace=True) on
    # the selected column: that form is chained assignment, which does not
    # update `df` under pandas Copy-on-Write and warns on pandas 2.x.
    df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

    return df

def encode_categories(dev, ev, columns):
    """Integer-encode categorical columns consistently across both frames.

    The two frames are stacked so that each category value receives the same
    integer code in both outputs, then split back by row position.

    Parameters
    ----------
    dev : pandas.DataFrame
        Development frame; must contain a ``price`` column.
    ev : pandas.DataFrame
        Evaluation frame.
    columns : iterable of str
        Names of the columns to replace with integer codes.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(dev_encoded, ev_encoded)``. ``dev_encoded`` holds the rows of
        ``dev`` whose ``price`` is not NaN; ``ev_encoded`` holds every row of
        ``ev`` without the ``price`` column. Neither input is modified.

    Notes
    -----
    ``factorize`` numbers values in order of first appearance and maps NaN
    to ``-1``.
    """
    n_dev = len(dev)

    # Stack once (not once per column) so both frames share one code table.
    combined = pd.concat([dev, ev])
    for column in columns:
        combined[column] = combined[column].factorize()[0]

    # Split by row position rather than by "price is NaN": a development row
    # with a missing price must not end up in the evaluation output.
    dev_encoded = combined.iloc[:n_dev].dropna(subset=['price']).copy()
    ev_encoded = combined.iloc[n_dev:].drop(columns=['price']).copy()

    return dev_encoded, ev_encoded

def normalize(df,columns):
    """Z-score the given columns using each column's own mean and std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to standardise.
    columns : iterable of str
        Names of the numeric columns to standardise.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every listed column has been replaced by
        ``(x - mean) / std`` with the population standard deviation
        (``ddof=0``). The input frame is not modified.
    """
    # Work on a copy so the caller's frame keeps its original values.
    out = df.copy()
    for col in columns:
        centered = out[col] - out[col].mean()
        spread = out[col].std(ddof=0)
        # A constant column has zero spread; dividing would turn every value
        # into NaN, so it is left centred (all zeros) instead.
        out[col] = centered / spread if spread != 0 else centered

    return out
    

In [74]:
columns_to_encode = ['neighbourhood_group', 'room_type','neighbourhood']
columns_to_normalize = ['calculated_host_listings_count','number_of_reviews','minimum_nights']

# NOTE: this cell overwrites dev_df / eval_df, so it is not re-runnable on its
# own: clean() drops columns that are already gone on a second run. Reload the
# CSVs before executing it again.
dev_df = clean(dev_df)
eval_df = clean(eval_df)

# Scaling statistics come from the development data only, and are captured
# before dev_df itself is standardised.
norm_mean = dev_df[columns_to_normalize].mean()
norm_std = dev_df[columns_to_normalize].std(ddof=0)

dev_df = normalize(dev_df,columns_to_normalize)

# The evaluation features must go through the same transformation as the
# training features; otherwise the model is fitted on z-scores and asked to
# predict on raw-scale values.
eval_scaled = (eval_df[columns_to_normalize] - norm_mean) / norm_std
eval_df = eval_df.assign(**{col: eval_scaled[col] for col in columns_to_normalize})

dev_df_enc, eval_df_enc = encode_categories(dev_df.copy(),eval_df.copy(),columns_to_encode)

In [75]:
# (rows, columns) of the encoded development and evaluation frames.
tuple(frame.shape for frame in (dev_df_enc, eval_df_enc))

((39116, 13), (9779, 12))

In [76]:
# Absolute correlation of every column with the target, strongest first.
price_corr = dev_df_enc.corr()['price']
price_corr.abs().sort_values(ascending=False)

price                             1.000000
room_type                         0.204980
longitude                         0.148891
availability_365                  0.082667
calculated_host_listings_count    0.055070
reviews_per_month                 0.053285
neighbourhood_group               0.049987
number_of_reviews                 0.048254
minimum_nights                    0.044238
latitude                          0.031274
host_id                           0.015168
neighbourhood                     0.013436
id                                0.009273
Name: price, dtype: float64

## Model

In [79]:
# Columns that are kept out of the feature matrices.
non_feature_cols = ['host_id', 'neighbourhood', 'id']

# Target vector and feature matrices for development and evaluation.
y_dev = dev_df_enc['price']
X_dev = dev_df_enc.drop(columns=['price'] + non_feature_cols)
X_eval = eval_df_enc.drop(columns=non_feature_cols)

In [53]:
## LINEAR REGRESSION ##
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

def doLinearReg(in_X,in_y):
    """Grid-search a LinearRegression with 5-fold CV and report the best R2.

    Parameters
    ----------
    in_X : array-like
        Feature matrix.
    in_y : array-like
        Target values.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object. The best parameter set and the best mean
        cross-validated R2 are also printed, in that order.
    """
    model = LinearRegression()
    parameters = {'fit_intercept':[True,False], 'copy_X':[True, False]}
    # `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where
    # listing it in the grid makes the search raise. Only tune it when the
    # installed estimator still exposes it.
    if 'normalize' in model.get_params():
        parameters['normalize'] = [True, False]

    gs = GridSearchCV(estimator=model,
                         param_grid=parameters,
                         scoring='r2',
                         cv=5,
                         n_jobs=-1)

    ## Lastly, finding the best parameters.
    gs.fit(in_X, in_y)
    best_parameters_LR = gs.best_params_
    best_score_LR = gs.best_score_
    print(best_parameters_LR)
    print(best_score_LR)

    return gs

# Assigned so the cell shows only the two printed lines, not the search repr.
lr_search = doLinearReg(X_dev,y_dev)

{'copy_X': True, 'fit_intercept': True, 'normalize': False}
0.07815010096011774


In [None]:
## RANDOM FOREST ##
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def doRandomForest(in_X,in_y,random_state=105):
    """Grid-search a RandomForestRegressor with 3-fold CV and report the best R2.

    Parameters
    ----------
    in_X : array-like
        Feature matrix.
    in_y : array-like
        Target values.
    random_state : int, optional
        Seed passed to the forest so that repeated searches give the same
        scores and the same winning parameters.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object. The best parameter set and the best mean
        cross-validated R2 are also printed, in that order.
    """
    # Without a seed every run grows different trees, so the reported best
    # parameters and score could change from one execution to the next.
    model = RandomForestRegressor(n_jobs=-1, random_state=random_state)
    parameters = {
        'n_estimators':[400,600,700],
        'max_depth':[400,500]
    }

    gs = GridSearchCV(estimator=model,
                         param_grid=parameters,
                         scoring='r2',
                         cv=3,
                         n_jobs=-1,
                         verbose=True)

    ## Lastly, finding the best parameters.
    gs.fit(in_X, in_y)
    best_parameters = gs.best_params_
    best_score = gs.best_score_
    print(best_parameters)
    print(best_score)

    return gs

# Assigned so the cell shows only the printed lines, not the search repr.
rf_search = doRandomForest(X_dev,y_dev)

In [78]:
## TRAIN ON RANDOM FOREST ##
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
# train_test_split was used below without ever being imported.
from sklearn.model_selection import train_test_split

# Hold out 20% of the development data to estimate R2 on unseen rows.
X_train, X_test, y_train, y_test = train_test_split(X_dev,y_dev,test_size=0.2,random_state=105)

# Seeded so the printed score is the same on every run.
rf = RandomForestRegressor(max_depth=400, n_estimators=700, random_state=105)
rf.fit(X_train, y_train)
# Predict with the model fitted just above and score against the matching
# hold-out targets (the cell previously referenced the undefined names
# `rf4` and `y_test2`).
rf_pred = rf.predict(X_test)

print(r2_score(y_test,rf_pred))

0.14813950077511662


## Final train

In [23]:
# Refit the forest on the whole development set.
final_params = dict(max_depth=400, n_estimators=700)
rf_final = RandomForestRegressor(**final_params)
rf_final.fit(X_dev, y_dev)

RandomForestRegressor(max_depth=400, n_estimators=700)

In [24]:
# Predictions of the final model for the evaluation features.
rf_pred_final = rf_final.predict(X=X_eval)

In [63]:
# Pair every evaluation id with its prediction and write the submission file.
submission = pd.DataFrame({'Id': eval_df['id'], 'Predicted': rf_pred_final})
submission.to_csv("submissionL9_V7.csv", sep=",", index=False)