In [44]:
# Packages for data manipulation
import numpy as np
import pandas as pd
import json

In [45]:
# Packages for dataviz
import plotly.express as px
import plotly.tools as tls
import plotly.graph_objects as go
import cufflinks as cf

In [46]:
# Plotly theme shared by the figures below. It has to be defined here because
# the histogram cell passes `template=template`; leaving it commented out
# raises a NameError on a fresh kernel.
template = "plotly_dark"

# Switch cufflinks to offline mode
cf.go_offline()

In [47]:
# Let wide frames display up to 110 columns
pd.options.display.max_columns = 110

In [48]:
# Metadata dictionary: target column name under 'y', column lists such as 'quant'
with open('../data/meta_data.json') as meta_file:
    meta = json.load(meta_file)

# Cleaned listings table
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# saved index - consider reading with index_col=0 after checking the metadata.
data = pd.read_csv('../data/clean.csv')

In [50]:
# Show the name of the target column, then preview the table. Only the last
# expression of a cell is rendered, so the name has to be printed explicitly.
print(meta['y'])
data.head()

Unnamed: 0,review_scores_rating,accommodates,guests_included,availability_365,number_of_reviews_ltm,minimum_nights_avg_ntm,maximum_nights_avg_ntm,bathrooms,bedrooms,beds,price,extra_people,security_deposit,cleaning_fee,host_response_rate,maximum_nights,minimum_nights,city,property_type,room_type,bed_type,cancellation_policy,is_location_exact,requires_license,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,host_is_superhost,host_has_profile_pic,host_identity_verified
0,97.0,3,2,85,52,1,30,1.0,1.0,2.0,170,25,100.0,100.0,100.0,30,1,San Francisco,Apartment,Entire home/apt,Real Bed,moderate,True,True,False,False,False,True,True,True
1,94.0,2,2,62,36,1,5,1.0,1.0,1.0,99,20,0.0,10.0,100.0,5,1,San Francisco,House,Private room,Real Bed,strict_14_with_grace_period,True,True,False,False,False,True,True,True
2,98.0,5,2,0,0,30,60,1.0,2.0,3.0,235,0,,100.0,80.0,60,30,San Francisco,Apartment,Entire home/apt,Real Bed,strict_14_with_grace_period,True,True,False,False,False,False,True,True
3,86.0,2,1,365,1,32,60,4.0,1.0,1.0,65,12,200.0,50.0,86.0,60,32,San Francisco,Apartment,Private room,Real Bed,strict_14_with_grace_period,True,True,False,False,False,True,True,True
4,93.0,2,1,365,0,32,90,4.0,1.0,1.0,65,12,200.0,50.0,86.0,90,32,San Francisco,Apartment,Private room,Real Bed,strict_14_with_grace_period,True,True,False,False,False,True,True,True


# Discretisation de Y

Le prof veut qu'on fasse de la classification et pas de la régression : il faut donc choisir les catégories de la variable Y.

In [51]:
# Distribution of the target, with a box plot drawn in the margin
target_hist = px.histogram(data, x=meta['y'], marginal="box", template=template)
target_hist.show()

On peut couper selon le 1er quartile et la moyenne. Cela permet de distribuer les valeurs de manière à peu près homogène.

In [52]:
# Discretise the rating into three ordered classes: [0, 95], (95, 98], (98, 100].
# pd.cut bins are right-closed and left-open by default, so without
# include_lowest=True a rating of exactly 0 would silently become NaN.
y_cat = pd.cut(
    data[meta['y']],
    bins=[0, 95, 98, 100],
    labels=['bad', 'good', 'excellent'],
    include_lowest=True,
)

In [56]:
# Size of each class after discretisation
class_counts = y_cat.value_counts()
print(class_counts)

excellent    2625
bad          1979
good         1848
Name: review_scores_rating, dtype: int64


In [54]:
# Replace the target column of `data` with its discretised version.
# NOTE(review): this mutates `data` in place, so re-running the pd.cut cell
# afterwards would operate on the already-discretised column.
data[meta['y']] = y_cat

# Separation entrainement/validation

In [55]:
from sklearn.model_selection import train_test_split

# Target column on one side, every other column as predictors
target_name = meta['y']
X = data.drop(columns=target_name)
y = data[target_name]

# 80 % training / 20 % validation, reproducible through the fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [41]:
# Compare class sizes between the training and validation sets.
# value_counts() orders the classes by frequency, so the x labels are taken
# from each result's own index; a hard-coded label list would mislabel the
# bars as soon as the frequency order differs from the assumed one.
train_counts = y_train.value_counts()
valid_counts = y_valid.value_counts()

fig = go.Figure()

fig.add_trace(go.Bar(
    x = train_counts.index.tolist(),
    y = train_counts.values,
    name = 'train',
))

fig.add_trace(go.Bar(
    x = valid_counts.index.tolist(),
    y = valid_counts.values,
    name = 'validation',
))

fig.show()

In [57]:
# Class counts of the training set, a blank line, then those of the validation set
print(y_train.value_counts(), y_valid.value_counts(), sep="\n\n")

excellent    2103
bad          1573
good         1485
Name: review_scores_rating, dtype: int64

excellent    522
bad          406
good         363
Name: review_scores_rating, dtype: int64


On constate que les classes sont à peu près bien réparties dans les ensembles "train" et "validation".

# Preprocessing et entrainement du modele

In [132]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree


### Tri à faire dans les imports

In [59]:
# TODO: look more closely at the SimpleImputer strategies

# Numerical columns: fill missing values with a constant
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: constant imputation followed by ordinal encoding
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('ordinal', OrdinalEncoder()),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Boolean columns: ordinal encoding only
boolean_transformer = OrdinalEncoder()

# Only the quantitative columns are routed through the preprocessor for now;
# the categorical and boolean transformers above are defined but not wired in.
quantitative_branch = ('num', numerical_transformer, meta['quant'])
preprocessor = ColumnTransformer(transformers=[quantitative_branch])

In [60]:
from sklearn.base import clone

# Define the two models, both seeded for reproducibility
model_rf = RandomForestClassifier(n_estimators=100, random_state=0)
model_tree = DecisionTreeClassifier(random_state=0)

# Bundle preprocessing and model in one pipeline per classifier. Each pipeline
# gets its own unfitted copy of the preprocessor: Pipeline.fit fits its steps
# in place, so sharing a single instance would let fitting one pipeline refit
# the transformer used by the other.
clf_rf = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', model_rf),
])

clf_tree = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', model_tree),
])

# Etude des résultats

In [70]:
# Fit both pipelines (preprocessing + model) on the training set
for pipeline in (clf_rf, clf_tree):
    pipeline.fit(X_train, y_train)

# Predict the validation set with each fitted pipeline
preds_rf, preds_tree = (
    pipeline.predict(X_valid) for pipeline in (clf_rf, clf_tree)
)

In [127]:
def show_results(name, y_valid, preds, labels=("bad", "good", "excellent")):
    """Print the accuracy of a model and display its confusion matrix.

    Parameters
    ----------
    name : str
        Model name, used in the printed line and in the figure title.
    y_valid : array-like
        True class labels.
    preds : array-like
        Predicted class labels, aligned with ``y_valid``.
    labels : sequence of str, optional
        Class order used for the rows and columns of the confusion matrix.
        Defaults to the three rating classes used in this notebook.
    """
    labels = list(labels)

    # Overall accuracy
    print("Accuracy", name, ":", metrics.accuracy_score(y_valid, preds),"\n")

    # Confusion matrix: rows are the true classes, columns the predicted ones
    matrix = metrics.confusion_matrix(y_valid, preds, labels = labels)

    # Rows and their labels are flipped together, which keeps every row
    # attached to its label while changing the drawing order of the heatmap.
    fig = go.Figure(data=go.Heatmap(
        z = matrix[::-1],
        x = labels,
        y = labels[::-1],
        colorscale = "oryel",
    ))
    # Without axis titles the reader cannot tell true from predicted classes.
    fig.update_layout(
        title = "Confusion matrix - " + name,
        xaxis_title = "predicted class",
        yaxis_title = "true class",
    )
    fig.show()

## Accuracy et matrice de confusion

In [128]:
# Accuracy and confusion matrix of the random forest
show_results(name="random forest", y_valid=y_valid, preds=preds_rf)

Accuracy random forest : 0.5445391169635941 



In [129]:
# Accuracy and confusion matrix of the single decision tree
show_results(name="classification tree", y_valid=y_valid, preds=preds_tree)

Accuracy classification tree : 0.4469403563129357 



## Importance des variables

In [133]:
# etude de l'importance des variables
imp_tree = clf_tree.steps[1][1].feature_importances_
imp_rf = clf_rf.steps[1][1].feature_importances_

In [134]:
# Horizontal bars: one trace per model, one bar per quantitative feature
fig = go.Figure()

importances_by_model = (
    ('tree', imp_tree),
    ('random forest', imp_rf),
)
for model_name, importances in importances_by_model:
    fig.add_trace(go.Bar(
        x = importances,
        y = meta['quant'],
        name = model_name,
        orientation = 'h',
    ))

fig.show()

# Inutile : ne pas executer

In [None]:
# Unused cell, apparently leftover exercise scaffolding. It is kept for
# reference only and disabled on purpose: run as-is it would rebind
# `numerical_transformer` and `categorical_transformer`, then fail on names
# that are not defined in the visible notebook (`numerical_cols`,
# `categorical_cols`, `RandomForestRegressor`, `step_1`), which breaks
# "Restart & Run All".
#
# # Preprocessing for numerical data
# numerical_transformer = SimpleImputer(strategy='constant')
#
# # Preprocessing for categorical data
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])
#
# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])
#
# # Define model
# model = RandomForestRegressor(n_estimators=1000, random_state=0)
#
# # Check your answer
# step_1.a.check()