## Tunisian Property Price prediction

Given data about properties in Tunisian, let's try to predict the price of a given property.

We will use a variety of regression models to make our predictions.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv("C:/Users/Ashraf/Documents/Extra-folder/Property_Prices_in_Tunisia.csv")

In [3]:
data

Unnamed: 0,category,room_count,bathroom_count,size,type,price,city,region,log_price
0,Terrains et Fermes,-1.0,-1.0,-1.0,À Vendre,100000.0,Ariana,Raoued,5.000000
1,Terrains et Fermes,-1.0,-1.0,-1.0,À Vendre,316000.0,Ariana,Autres villes,5.499687
2,Appartements,2.0,1.0,80.0,À Louer,380.0,Ariana,Autres villes,2.579784
3,Locations de vacances,1.0,1.0,90.0,À Louer,70.0,Ariana,Autres villes,1.845098
4,Appartements,2.0,2.0,113.0,À Vendre,170000.0,Ariana,Ariana Ville,5.230449
...,...,...,...,...,...,...,...,...,...
12743,Terrains et Fermes,-1.0,-1.0,-1.0,À Vendre,3200000.0,Tunis,Sidi Bou Said,6.505150
12744,Appartements,1.0,1.0,100.0,À Louer,600.0,Tunis,Autres villes,2.778151
12745,Maisons et Villas,3.0,1.0,760.0,À Vendre,1950000.0,Tunis,La Marsa,6.290035
12746,Maisons et Villas,3.0,1.0,190.0,À Vendre,240000.0,Tunis,La Marsa,5.380211


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12748 entries, 0 to 12747
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   category        12748 non-null  object 
 1   room_count      12748 non-null  float64
 2   bathroom_count  12748 non-null  float64
 3   size            12748 non-null  float64
 4   type            12748 non-null  object 
 5   price           12748 non-null  float64
 6   city            12748 non-null  object 
 7   region          12748 non-null  object 
 8   log_price       12748 non-null  float64
dtypes: float64(5), object(4)
memory usage: 896.5+ KB


## Preprocessing

In [5]:
def preprocess_inputs(df):
    df = df.copy()
    # Encode missing values properly
    df = df.replace(-1, np.NaN)
    
    # Fill missing values with column median
    for column in ['room_count', 'bathroom_count', 'size']:
        df[column]=df[column].fillna(df[column].median())
    # Binary_encoding
    df['type'] = df['type'].replace({'À Louer':0, 'À Vendre':1})
    
    # OneHot_encoding
    for column in ['category', 'city', 'region']:
        dummies=pd.get_dummies(df[column],prefix=column)
        df = pd.concat([df,dummies], axis=1)
        df = df.drop(column,axis=1)
        
    # Drop log_price
    df - df.drop('log_price',axis=1)
    
    # Split df into X and y
    y = df['price']
    X = df.drop('price',axis=1)
    
    # Train_Test_Split
    X_train,X_test,y_train,y_test = train_test_split(X,y, train_size=0.7, shuffle=True, random_state=42)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train),index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test),index=X_test.index, columns=X_test.columns)
    
    
    return  X_train,X_test,y_train,y_test

In [6]:
 X_train,X_test,y_train,y_test = preprocess_inputs(data)

In [7]:
X_train

Unnamed: 0,room_count,bathroom_count,size,type,log_price,category_Appartements,category_Bureaux et Plateaux,category_Colocations,category_Locations de vacances,"category_Magasins, Commerces et Locaux industriels",...,region_Tozeur,region_Tunis,region_Téboulba,region_Téboursouk,region_Utique,region_Zaghouan,region_Zaouit-Ksibat Thrayett,region_Zarzis,region_Zarzouna,region_Zéramdine
4991,-0.594067,-1.852280,-0.771113,0.802990,0.574474,-0.769951,-0.196891,-0.070395,-0.154108,4.240885,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
6224,0.126648,-0.422036,-0.256937,0.802990,0.733808,-0.769951,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
2448,-0.594067,-0.422036,-0.514025,-1.245346,-1.237482,1.298785,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
474,0.126648,-0.422036,-0.373795,0.802990,0.613515,-0.769951,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
6434,-0.594067,1.008207,-0.589983,0.802990,0.733808,1.298785,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,-0.594067,-0.422036,-0.490653,-1.245346,-1.363953,1.298785,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
5191,-0.594067,-0.422036,0.385783,0.802990,0.721075,-0.769951,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
5390,0.126648,1.008207,1.963370,0.802990,0.790677,-0.769951,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587
860,0.126648,-0.422036,-0.256937,0.802990,0.563900,-0.769951,-0.196891,-0.070395,-0.154108,-0.235800,...,-0.056106,-0.110168,-0.021177,-0.018339,-0.02802,-0.084996,-0.056106,-0.021177,-0.023678,-0.010587


In [8]:
y_train

4991     150000.0
6224     250000.0
2448        450.0
474      170000.0
6434     250000.0
           ...   
11964       300.0
5191     240000.0
5390     300000.0
860      145000.0
7270      33000.0
Name: price, Length: 8923, dtype: float64

## Training

In [9]:
models = {
    '                     LinearRegression:': LinearRegression(),
    ' LinearRegression (L2 Regularization):': Ridge(),
    ' LinearRegression (L1 Regularization):': Lasso(),
    '                  K-Nearest Neighbors:': KNeighborsRegressor(),
    '                      Neural Networks:': MLPRegressor(),
    '                        Decision Tree:': DecisionTreeRegressor(),
    '                        Random Forest:': RandomForestRegressor(),
    '                   Gradient Boosting:': GradientBoostingRegressor()
 
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + "trained.")

                     LinearRegression:trained.
 LinearRegression (L2 Regularization):trained.
 LinearRegression (L1 Regularization):trained.
                  K-Nearest Neighbors:trained.
                      Neural Networks:trained.
                        Decision Tree:trained.
                        Random Forest:trained.
                   Gradient Boosting:trained.


## Results

In [10]:
for name, model in models.items():
    y_pred=model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred)**2))
    print(name + "RMSE: {:.2f}".format(rmse))

                     LinearRegression:RMSE: 43644032221070721024.00
 LinearRegression (L2 Regularization):RMSE: 155379349.37
 LinearRegression (L1 Regularization):RMSE: 155563399.06
                  K-Nearest Neighbors:RMSE: 33394106.05
                      Neural Networks:RMSE: 31833386.11
                        Decision Tree:RMSE: 1683062.09
                        Random Forest:RMSE: 3365957.88
                   Gradient Boosting:RMSE: 1301139.91


In [13]:
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(name + " R^2: {:.4f}".format(r2))

                     LinearRegression: R^2: -1891604095234189845069824.0000
 LinearRegression (L2 Regularization): R^2: -22.9755
 LinearRegression (L1 Regularization): R^2: -23.0323
                  K-Nearest Neighbors: R^2: -0.1074
                      Neural Networks: R^2: -0.0063
                        Decision Tree: R^2: 0.9972
                        Random Forest: R^2: 0.9887
                   Gradient Boosting: R^2: 0.9983


In [14]:
np.mean(y_test)

2530941.939084967