In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from cleaned_data.csv (path is relative to the current working directory).
data = pd.read_csv('cleaned_data.csv')

In [3]:
# Work on a copy so the frame loaded from disk (`data`) stays untouched.
df = data.copy()

# PREPROCESSING

In [4]:
# Flag columns that get re-encoded as integers (True -> 1, anything else -> 0).
boolean_features = ["No Transfer Duty", "Office","Pets Allowed", "Standalone Building"]


def process_boolean_features(value):
    """Encode a single flag value as an integer.

    Returns 1 when ``value`` compares equal to ``True`` and 0 for every other
    value, which includes ``False``, ``None``, NaN and strings.
    """
    # `==` rather than `is`: NumPy booleans coming out of a DataFrame are not
    # the `True` singleton but still compare equal to it.
    return 1 if value == True else 0

In [5]:
# Re-encode each flag column element-wise: True becomes 1, every other value
# (including missing ones) becomes 0.
for flag_column in boolean_features:
    df[flag_column] = df[flag_column].map(process_boolean_features)

In [6]:
# Missing-value rate of each column: number of NaN cells divided by the number of rows.
missing_rate = df.isna().sum()/df.shape[0]

In [7]:
# Keep only the columns whose missing-value rate is strictly below 90%.
columns_to_keep = df.columns[missing_rate < 0.90]
df = df[columns_to_keep]

In [8]:
# Discard the rows whose target value ("Price") is missing.
df = df.dropna(subset=["Price"])

# Train/test split - Cleaning - Encoding

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [10]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible.
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector

In [12]:
def encodage():
    """Build the column-wise preprocessing step placed in front of the model.

    Three groups of columns are processed independently:

    * categorical columns: missing values are replaced by the most frequent
      category, then the column is one-hot encoded (categories not seen
      during fit are ignored at transform time);
    * count / flag columns: missing values are replaced by the imputer's
      constant fill value (scikit-learn's default), no scaling;
    * size / amount columns: missing values are imputed with the
      ``SimpleImputer`` default strategy, then standardised.

    Returns:
        The unfitted transformer created by ``make_column_transformer``.
    """
    categorical_columns = [
        'Wall', 'Security', 'Roof', 'City', 'Province', 'Suburbaine',
        'Type of Property',
    ]

    count_columns = [
        "Bathrooms", "Bedrooms", "Garage", "Parking",
        "Reception Rooms", "Special Features",
        "Temperature Control", "Dining Rooms", "Garden", "Kitchens", "Lounges",
        "No Transfer Duty", "Office", "Pets Allowed", "Standalone Building",
    ]

    # NOTE(review): 'Price per m²' looks like it is derived from the target
    # ('Price'); if so, using it as a feature leaks the target - confirm.
    size_columns = ['Erf Size', 'Floor Size', 'Price per m²', 'Rates and Taxes', 'Levies']

    categorical_steps = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OneHotEncoder(handle_unknown="ignore"),
    )
    count_imputer = SimpleImputer(strategy='constant')
    size_steps = make_pipeline(SimpleImputer(), StandardScaler())

    return make_column_transformer(
        (categorical_steps, categorical_columns),
        (count_imputer, count_columns),
        (size_steps, size_columns),
    )

In [13]:
def preprocessing(df):
    """Separate the features from the target.

    Args:
        df: DataFrame that contains a 'Price' column.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df`` without the 'Price' column
        and ``y`` is the 'Price' column.
    """
    target = df['Price']
    features = df.drop(columns='Price')
    return features, target

In [14]:
# Split the training rows into the feature frame and the 'Price' target.
X_train, y_train = preprocessing(trainset)

In [15]:
# Split the held-out rows into the feature frame and the 'Price' target.
X_test, y_test = preprocessing(testset)

# Modeling

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Build the (still unfitted) column transformer returned by encodage().
preprocessor = encodage()

In [18]:
# Chain the preprocessing and a 100-tree random forest into one estimator;
# random_state=0 fixes the forest's randomness.
RandomForestReg = make_pipeline(preprocessor, RandomForestRegressor(n_estimators=100, random_state=0))

In [19]:
# Fit the preprocessing steps and the forest on the training data.
RandomForestReg.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Wall', 'Security', 'Roof',
                                                   'City', 'Province',
                                                   'Suburbaine',
                                                   'Type of Property']),
                                                 ('simpleimputer',
                                                  SimpleImputer(strategy='constant'),
                                                  ['Bath...
          

In [None]:
# Inspect the fitted random forest. It is the last step of the pipeline; the
# Pipeline object does not expose the forest's fitted attributes (such as
# `base_estimator_`), so the step is looked up by the name make_pipeline
# assigned to it (the lower-cased class name).
RandomForestReg.named_steps['randomforestregressor']

In [57]:
import pickle
import json

In [21]:
# Persist the fitted pipeline to 'modelsPickle.p'. The context manager ensures
# the file is flushed and closed even if pickling fails part-way through.
# NOTE: only unpickle this file from a trusted source - pickle.load can
# execute arbitrary code.
pickl = {
    'regressor': RandomForestReg
}
with open('modelsPickle' + ".p", "wb") as model_file:
    pickle.dump(pickl, model_file)

In [75]:
# Show the first test row; the 0:1 slice keeps it as a one-row DataFrame rather than a Series.
X_test.iloc[0:1,:]

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garage,Levies,No Transfer Duty,Office,Parking,Pets Allowed,...,Temperature Control,Type of Property,Wall,City,Province,Suburbaine,Dining Rooms,Garden,Kitchens,Lounges
16223,1.0,2.0,,62.0,,1050.0,0,0,2.0,1,...,0,Apartment / Flat,,sandton,gauteng,broadacres,,,1.0,1.0


In [76]:
# Predict the price for that single test row (passed as a one-row DataFrame).
RandomForestReg.predict(X_test.iloc[0:1,:])

array([922885.5])

In [55]:
# Hand-written JSON record describing a single property, used below to try the
# model on input that does not come from the test set. JSON null marks a missing value.
a = "{\"Bathrooms\":4,\"Bedrooms\":5,\"Erf Size\":992,\"Floor Size\":null,\"Garage\":2,\"Levies\":1050,\"No Transfer Duty\":0,\"Office\":1,\"Parking\":0,\"Pets Allowed\":1,\"Price per m²\":null,\"Rates and Taxes\":2300,\"Reception Rooms\":null,\"Roof\":null,\"Security\":null,\"Special Features\":0,\"Standalone Building\":0,\"Temperature Control\":0,\"Type of Property\":\"House\",\"Wall\":null,\"City\":\"edenvale\",\"Province\":\"gauteng\",\"Suburbaine\":\"dowerglen-ext-5\",\"Dining Rooms\":1,\"Garden\":1,\"Kitchens\":1,\"Lounges\":1}"

In [61]:
# Decode the JSON text into a dict (JSON null becomes None).
# NOTE(review): this rebinds `a` from str to dict, so running this cell a
# second time raises TypeError - a separate name would make it re-runnable.
a = json.loads(a)

In [77]:
# Build a one-row DataFrame from the decoded record in a single constructor
# call. None (JSON null) is replaced by NaN, the missing-value marker the
# pipeline's imputers look for by default.
obj = pd.DataFrame({
    field: [np.nan if value is None else value]
    for field, value in a.items()
})

obj

Unnamed: 0,Bathrooms,Bedrooms,Erf Size,Floor Size,Garage,Levies,No Transfer Duty,Office,Parking,Pets Allowed,...,Temperature Control,Type of Property,Wall,City,Province,Suburbaine,Dining Rooms,Garden,Kitchens,Lounges
0,4,5,992,,2,1050,0,1,0,1,...,0,House,,edenvale,gauteng,dowerglen-ext-5,1,1,1,1


In [78]:
# Predict the price for the hand-built one-row record.
RandomForestReg.predict(obj)

array([3506360.])

In [None]:
# Predict prices for every row of the held-out test set. `X_test` is used
# because no notebook-level `X` exists (that name is local to preprocessing()).
RandomForestReg.predict(X_test)