In [None]:
# Packages for data manipulation
import numpy as np
import pandas as pd
import json

In [None]:
# Packages for dataviz
import plotly.express as px
import plotly.tools as tls
import plotly.graph_objects as go
import cufflinks as cf
import matplotlib.pyplot as plt

In [None]:
# Configure cufflinks for offline use
cf.go_offline()

In [None]:
# Let pandas display up to 110 columns when rendering a DataFrame
pd.set_option('display.max_columns', 110)

In [None]:
# Load the column metadata (a JSON document indexed further down by keys such
# as 'y' and 'text') and the cleaned dataset.
with open('../data/meta_data.json') as meta_file:
    meta = json.load(meta_file)

data = pd.read_csv('../data/clean.csv')

In [None]:
# Show the name of the target column, then preview the data.
# The name is printed explicitly: a notebook cell only echoes its last
# expression, so a bare `meta['y']` on the first line displays nothing.
print(meta['y'])
data.head()

# 1. Modification des variables

## 1.1 Discretisation de Y

Le prof veut qu'on fasse de la classification et non de la régression : il faut donc choisir les catégories de la variable Y.

In [None]:
# Distribution of the target variable, with a box plot in the margin
target_histogram = px.histogram(data, x=meta['y'], marginal="box")
target_histogram.show()

On peut couper selon le 1er quartile et la moyenne. Cela permet de répartir les valeurs de manière à peu près homogène entre les classes.

In [None]:
# Discretise the target into three ordered classes:
# (0, 95] -> 'bad', (95, 98] -> 'good', (98, 100] -> 'excellent'
rating_bins = [0, 95, 98, 100]
rating_labels = ['bad', 'good', 'excellent']
y_cat = pd.cut(data[meta['y']], bins=rating_bins, labels=rating_labels)

In [None]:
# Number of observations in each class of the discretised target
print(y_cat.value_counts())

In [None]:
# Replace the target column by its categorical version.
# NOTE: once this cell has run, the target column holds class labels, so the
# pd.cut cell above cannot be re-run without reloading the data first.
data[meta['y']] = y_cat

## 1.2 Traitement des donnees geospatiales

Pour les données géospatiales, on propose de diviser les quartiers (neighbourhood_cleansed) en trois catégories (good, neutral, bad). On suppose que l'indicateur qui aura le plus grand impact sur la perception de la qualité des appartements, directement et indirectement, sera le taux de criminalité dans le quartier. Données tirées de https://www.areavibes.com/san+francisco-ca/crime/

Selon la note de criminalité de chaque quartier, on attribue un score 0, 1 ou 2 :

F a D- : 2
D a B- : 1
B a A+ : 0

On ajoute une colonne 'crime' aux données, qui attribue ce score 0/1/2.

In [None]:
# Add a 'crime' column.
#
# neighbourhood_crime groups the neighbourhood names in three lists; the cell
# that fills the column gives score 2 to the names of the first list, 1 to the
# second list and 0 to the third list.

neighbourhood_crime=[['Bayview', 'Bernal Heights', 'Castro/Upper Market', 'Chinatown', 'Diamond Heights', 'Downtown/Civic Center','Haight Ashbury','Lakeshore', 'Marina', 'Mission', 'Nob Hill', 'Outer Mission', 'Potrero Hill', 'Russian Hill', 'South of Market', 'Visitacion Valley', 'Western Addition'],
                     ['Excelsior', 'Financial District', 'Golden Gate Park', 'Glen Park', 'Inner Richmond', 'North Beach', 'Pacific Heights', 'Parkside', 'Presidio Heights', 'Seacliff', 'Twin Peaks', 'West of Twin Peaks'],
                     ['Crocker Amazon', 'Inner Sunset', 'Noe Valley', 'Ocean View', 'Outer Richmond', 'Outer Sunset', 'Treasure Island/YBI']]

# The column starts as all-NaN; crime_color will hold one colour name per row
data['crime'] = float('nan')
crime_color = []

In [None]:
# Fill the 'crime' column and build the per-row colour list used on the map.
#
# The assignment is vectorised with Series.map instead of a row-by-row
# `data['crime'][i] = ...` loop: that chained indexing triggers
# SettingWithCopyWarning and writes nothing at all when pandas copy-on-write
# is enabled. `crime_color` is rebuilt from scratch, so re-running the cell
# no longer appends a second set of colours to the list.

# (score, colour) associated with each list of neighbourhood_crime, in order
crime_levels = [(2, 'maroon'), (1, 'darkorange'), (0, 'orangered')]

crime_score_by_neighbourhood = {}
crime_color_by_neighbourhood = {}
for level_names, (level_score, level_color) in zip(neighbourhood_crime, crime_levels):
    for neighbourhood_name in level_names:
        # setdefault keeps the first list a name appears in, which matches
        # the precedence of an if / elif chain over the three lists
        crime_score_by_neighbourhood.setdefault(neighbourhood_name, level_score)
        crime_color_by_neighbourhood.setdefault(neighbourhood_name, level_color)

# Neighbourhoods absent from every list keep a NaN score; the column stays float
data['crime'] = data['neighbourhood_cleansed'].map(crime_score_by_neighbourhood).astype(float)

# ... and are drawn in grey
crime_color = [crime_color_by_neighbourhood.get(neighbourhood_name, 'grey')
               for neighbourhood_name in data['neighbourhood_cleansed']]

In [None]:
# Plot every listing on the San Francisco map, coloured by crime level.

# Extent of the background image: (lon_min, lon_max, lat_min, lat_max)
Box = (-122.5132, -122.3686, 37.7045, 37.8290)
carte = plt.imread("../data/map.png")

fig, ax = plt.subplots(figsize=(11.48, 12.48))
ax.scatter(data.longitude, data.latitude, zorder=1, alpha=1, c=crime_color, s=10)
ax.set_title('Plotting Spatial Data on SF Map')
ax.set_xlim(Box[0], Box[1])
ax.set_ylim(Box[2], Box[3])
# The map picture is drawn below the points (zorder 0 < 1)
ax.imshow(carte, zorder=0, extent=Box, aspect='equal')

## 1.3 Traitement des variables categoriques

In [None]:
# Frequency of each property type before grouping
data['property_type'].value_counts()

In [None]:
# Merge some property types into broader families.
# Keys are the family names, values the original types folded into them;
# the replacements are applied one family at a time, in this order.
property_type_groups = {
    'Apartment': ['Serviced apartment', 'Loft'],
    'Hotel': ['Boutique hotel', 'Hostel','Aparthotel'],
    'House': ['Townhouse', 'Guesthouse', 'Villa'],
    'Guest suite': ['Bed and breakfast'],
    'Other': ['Bungalow', 'Cottage', 'Earth house', 'Cabin', 'In-law',
              'Dome house', 'Resort', 'Castle', 'Tiny house'],
}

for family_name, original_types in property_type_groups.items():
    data['property_type'] = data['property_type'].replace(original_types, family_name)

Il faut vérifier les catégories que j'ai créées : j'ai des doutes sur certains regroupements, par exemple pour **Loft, Aparthotel, ...**

In [None]:
# Map the listed spelling variants of the city (and 'Brisbane') to the
# single value 'San Francisco'
data['city'] = data['city'].replace(['San Francisco, Hayes Valley', 'Noe Valley - San Francisco', 
                                     'San Francisco ', 'Brisbane'], 'San Francisco')

In [None]:
# Sanity check on the 'crime' column: number of rows left without a score,
# then the number of rows per score
print(data['crime'].isna().sum())
data['crime'].value_counts()

In [None]:
# Replace missing values in the text columns by empty strings, so that later
# string concatenation does not produce NaN.
# fillna states the intent directly; the previous
# replace(np.nan, "", regex=True) relied on replace() accepting NaN as a pattern.
for var in meta["text"]:
    data[var] = data[var].fillna("")

In [None]:
# Concatenate related text columns, separated by a single space.
# Each key receives its own text followed by the text of the listed columns.
text_merges = {
    "name": ["summary", "space", "description"],
    "neighborhood_overview": ["notes", "interaction", "house_rules", "host_about"],
    "transit": ["access"],
}

for target_column, appended_columns in text_merges.items():
    merged_text = data[target_column]
    for appended_column in appended_columns:
        merged_text = merged_text + " " + data[appended_column]
    data[target_column] = merged_text

In [None]:
# Remove the text columns that were merged into others, then give the two
# merged columns a name that reflects their new content.
merged_away_columns = ['summary', 'space', 'description', 'notes',
                       'interaction', 'house_rules', 'host_about', 'access']

data = (
    data
    .drop(columns=merged_away_columns)
    .rename(columns={"name": "housing_description",
                     "neighborhood_overview": "context_description"})
)

In [None]:
# Preview the data after the column transformations
data.head()

# 2. Separation entrainement/validation

In [None]:
from sklearn.model_selection import train_test_split

# Split the table into predictors X and target y
target_column = meta['y']
X = data.drop(columns=target_column)
y = data[target_column]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Compare the class distribution of the target in the training and
# validation sets.
#
# The counts are computed class by class in a fixed order. Pairing a
# hard-coded label list with value_counts() is unsafe: value_counts() sorts
# by decreasing frequency, and that order is not guaranteed to match the
# labels nor to be the same in both splits, which would mislabel the bars.
class_order = ["bad", "good", "excellent"]

fig = go.Figure()

for split_name, split_target in (("train", y_train), ("validation", y_valid)):
    split_counts = [int((split_target == class_name).sum()) for class_name in class_order]
    fig.add_trace(go.Bar(
        x = class_order,
        y = split_counts,
        name = split_name,
    ))

fig.show()

In [None]:
# Class counts in the training set, then in the validation set
print(y_train.value_counts())
print()
print(y_valid.value_counts())

On constate que les classes sont à peu près bien réparties dans les ensembles "train" et "validation".

## 2.1 Données textuelles

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.cluster import KMeans
from nltk.stem import LancasterStemmer
import nltk
nltk.download('punkt')
import re
import string

In [None]:
stemmer = LancasterStemmer()

def tokenize(text):
    """Split a text into stemmed word tokens.

    Each token produced by ``nltk.word_tokenize`` is stripped of its digits
    and punctuation characters, then stemmed with the module-level
    ``stemmer``.

    Tokens are filtered on their length *after* cleaning: a token made only
    of digits or punctuation (e.g. ``"12"`` or ``"--"``) becomes an empty
    string once cleaned and must not be emitted as a term.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        The stemmed tokens whose cleaned form has more than one character.
    """
    # Built once per call rather than once per word
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokens = []
    for word in nltk.word_tokenize(text):
        cleaned = re.sub(r'\d+', '', word).translate(strip_punctuation)
        if len(cleaned) > 1:
            tokens.append(stemmer.stem(cleaned))
    return tokens


In [None]:
# Text columns that are turned into cluster ids below
var_text = ["housing_description", "context_description", "transit"]

In [None]:
# Replace each text column by a cluster id (an integer between 0 and 4).
#
# The vectorizer, the scaler and the clustering are fitted on the training set
# only and then applied unchanged to the validation set. Re-fitting a
# vectorizer or re-scaling on the validation documents would give them their
# own IDF weights and their own mean/variance, so they would not live in the
# space in which the clusters were learned.

# Work on copies so that the column assignments below modify independent
# frames (avoids SettingWithCopyWarning on the split outputs).
X_train = X_train.copy()
X_valid = X_valid.copy()

for var in var_text:

    # Document-term matrix of the training documents, with TF-IDF weighting
    vectorizer = TfidfVectorizer(lowercase = True,
                                 stop_words = 'english',
                                 min_df = 200,
                                 tokenizer = tokenize
                                )
    td = vectorizer.fit_transform(X_train[var])

    # Vocabulary in column order. It is read from vocabulary_ because
    # get_feature_names() was removed in scikit-learn 1.2.
    voc = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

    print(voc)

    # Standardise the term columns with statistics learned on the training set.
    # toarray() returns a plain ndarray; todense() returns a numpy.matrix,
    # which recent scikit-learn versions refuse.
    scaler = preprocessing.StandardScaler()
    td_scaled = scaler.fit_transform(td.toarray())

    # Cluster the training documents and replace each one by its cluster id
    kmeans = KMeans(n_clusters=5, init = 'random', random_state=0).fit(td_scaled)
    X_train[var] = kmeans.predict(td_scaled)

    # Validation documents: same vocabulary, same IDF weights, same scaling,
    # then assignment to the nearest training cluster
    new_td = vectorizer.transform(X_valid[var])
    X_valid[var] = kmeans.predict(scaler.transform(new_td.toarray()))

    print( var + " : done" )

# 3. Preprocessing et entrainement du modele

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree


### Tri à faire dans les imports

In [None]:
# TODO: study the SimpleImputer strategies

# Preprocessing for numerical data: missing values are replaced by the most
# frequent value of the column
numerical_transformer = SimpleImputer(strategy = 'most_frequent')

# Preprocessing for categorical data: same imputation, then each category is
# encoded as an ordinal code
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordinal', OrdinalEncoder())
    #('label', LabelEncoder())
])


# Bundle preprocessing for numerical and categorical data.
# The text columns (var_text) are routed through the categorical branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, meta['quant'] + meta['date']),
        #('cat', categorical_transformer, meta['cat'] + meta['bool'] + ['crime'])
        ('cat', categorical_transformer, meta['cat'] + meta['bool'] + ['crime'] + var_text)
    ])

# Feature names, listed in the same order as the columns given to the
# transformers above ('num' columns first, then 'cat' columns)
label = meta['quant'] + meta['date'] + meta['cat'] + meta['bool'] + ['crime'] + var_text
#label = meta['quant'] + meta['date'] + meta['cat'] + meta['bool'] + ['crime']

In [None]:
# The two classifiers to compare, both with a fixed seed
model_rf = RandomForestClassifier(n_estimators=100, random_state=0)
model_tree = DecisionTreeClassifier(random_state=0)

# One pipeline per classifier: preprocessing first, then the model.
# Both pipelines reference the same `preprocessor` object.
clf_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_rf),
])

clf_tree = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_tree),
])

# 4. Etude des résultats

In [None]:
# Fit both pipelines (preprocessing + model) on the training data
clf_rf.fit(X_train, y_train)
clf_tree.fit(X_train, y_train)

# Apply the fitted pipelines to the validation data to get the predicted classes
preds_rf = clf_rf.predict(X_valid)
preds_tree = clf_tree.predict(X_valid)

In [None]:
import plotly.figure_factory as ff

def show_results(name, y_valid, preds):
    """Print the accuracy of a model and display its confusion matrix.

    Parameters
    ----------
    name : str
        Model name, printed next to the accuracy.
    y_valid : array-like
        True classes of the validation set.
    preds : array-like
        Classes predicted for the validation set.

    The confusion matrix is computed for the classes "bad", "good",
    "excellent" (rows are the true classes, columns the predicted ones) and
    shown as an annotated heatmap. Its rows are passed in reverse order,
    together with reversed y labels.
    """
    classes = ["bad", "good", "excellent"]
    classes_reversed = classes[::-1]

    # accuracy
    print("Accuracy", name, ":", metrics.accuracy_score(y_valid, preds),"\n")

    # confusion matrix
    confusion = metrics.confusion_matrix(y_valid, preds, labels = classes)

    # heatmap: rows from last to first, matching classes_reversed
    rows_reversed = [row for row in confusion[::-1]]

    heatmap = ff.create_annotated_heatmap(
        z = rows_reversed,
        x = classes,
        y = classes_reversed,
        colorscale = "oryel"
    )
    heatmap.show()

## 4.1 Accuracy et matrice de confusion

In [None]:
# Accuracy and confusion matrix of the random forest on the validation set
show_results("random forest", y_valid, preds_rf)

In [None]:
# Accuracy and confusion matrix of the decision tree on the validation set
show_results("classification tree", y_valid, preds_tree)

## 4.2 Importance des variables

In [None]:
# Feature importances of the fitted models (the 'model' step of each pipeline)
imp_tree = clf_tree.named_steps['model'].feature_importances_
imp_rf = clf_rf.named_steps['model'].feature_importances_

In [None]:
# Horizontal bar chart comparing the feature importances of the two models
fig = go.Figure()

for model_name, model_importances in (('tree', imp_tree), ('random forest', imp_rf)):
    fig.add_trace(go.Bar(
        x = model_importances,
        y = label,
        name = model_name,
        orientation = 'h'
    ))

# Tall figure so that every feature name stays readable
fig.update_layout(height = 1000)

fig.show()

## 4.3 Affichage de l'arbre

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [None]:
# Draw the first levels (max_depth=5) of the fitted decision tree.
# clf_tree[1] is the second step of the pipeline, i.e. the tree model itself.
plt.figure(figsize=(100,20))
a = plot_tree(clf_tree[1], 
              feature_names = label, 
              filled = True, 
              rounded = True, 
              fontsize = 14,
              max_depth=5,
              impurity = False)

value : [ bad, excellent, good ]

In [None]:
# Class counts in the training set
y_train.value_counts()

In [None]:
# Put the training predictors and their target back into a single table
data_to_print = X_train.join(y_train)

In [None]:
# Preview the joined table
data_to_print.head()

In [None]:
# One colour per row of data_to_print, chosen from its rating class;
# any other value (e.g. a missing rating) is drawn in grey.
rating_palette = {
    "excellent": 'maroon',
    "good": 'darkorange',
    "bad": 'orangered',
}

color = [rating_palette.get(rating, 'grey')
         for rating in data_to_print['review_scores_rating']]

In [None]:
# Plot the training listings on the San Francisco map, coloured by rating class.

# Extent of the background image: (lon_min, lon_max, lat_min, lat_max)
Box = (-122.5132, -122.3686, 37.7045, 37.8290)
carte = plt.imread("../data/map.png")

fig, ax = plt.subplots(figsize=(11.48, 12.48))
ax.scatter(data_to_print.longitude, data_to_print.latitude, zorder=1, alpha=1, c=color, s=10)
ax.set_title('Plotting rating on SF Map')
ax.set_xlim(Box[0], Box[1])
ax.set_ylim(Box[2], Box[3])
# The map picture is drawn below the points (zorder 0 < 1)
ax.imshow(carte, zorder=0, extent=Box, aspect='equal')