In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

%matplotlib inline
# set random seed
import random
random.seed(335)
# magic word
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# for better viz
import pprint
import warnings
warnings.filterwarnings('ignore')

### reference
-------------------

- [pandas cheat sheet](https://github.com/pandas-dev/pandas/tree/master/doc/cheatsheet)
- [numpy cheat sheet(data camp)](https://www.datacamp.com/community/blog/python-numpy-cheat-sheet)
- [scikit-learn cheat sheet(data camp)](datacamp.com/community/blog/scikit-learn-cheat-sheet)

# modeling
---------------------
In this phase, various modeling techniques are selected and applied and their parameters are calibrated to optimal values. Typically, there are several techniques for the same data mining problem type. Some techniques have specific requirements on the form of data. Therefore, stepping back to the data preparation phase is often necessary.

In [2]:
df = pd.read_csv('/Users/esadmazi/Desktop/INNO/COURSES/DMining/LAB/CRISP-DMtemplate/listing_preprocessed.csv', encoding='utf-8')
df.head()


Unnamed: 0,price,beds,livings,wc,area,street_width,age,street_direction,furnished,location_lat,location_lng,user_review,width,length,advertiser_type,distance_to_center,price_per_m2,"Land, sell","Villa, sell","Building, sell","House, sell","Esterahah, sell","Farm, sell","Store, sell",Furnished apartment,"Floor, sell",total_rooms,district_encoded,city_encoded
0,800000.0,0.0,0.0,0.0,750.0,15.0,0.0,4.0,0.0,25.160826,46.42178,4.35,30.0,25.0,agent,6454.112014,1066.666667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,791056.3,2153276.0
1,250000.0,0.0,0.0,0.0,920.0,30.0,0.0,3.0,0.0,24.639906,39.306129,4.29,30.0,30.0,agent,10937.86842,271.73913,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,483293.1,1288045.0
2,1200000.0,5.0,2.0,3.0,700.0,20.0,10.0,4.0,0.0,24.501982,39.721149,3.8,20.0,35.0,agent,10972.831431,1714.285714,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,1663143.0,1288045.0
3,1312500.0,0.0,0.0,0.0,375.0,20.0,0.0,2.0,0.0,24.846313,46.796188,4.3,13.0,28.0,normal_marketer,6495.591391,3500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1625998.0,2153276.0
4,400000.0,4.0,1.0,3.0,600.0,10.0,0.0,1.0,1.0,17.702763,42.267738,5.0,30.0,20.0,owner,10682.8812,666.666667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,551731.7,551731.7


In [3]:
one_hot_encoded = pd.get_dummies(df['advertiser_type'])

# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded], axis=1)
df = df.drop(columns=['advertiser_type'])
df

Unnamed: 0,price,beds,livings,wc,area,street_width,age,street_direction,furnished,location_lat,location_lng,user_review,width,length,distance_to_center,price_per_m2,"Land, sell","Villa, sell","Building, sell","House, sell","Esterahah, sell","Farm, sell","Store, sell",Furnished apartment,"Floor, sell",total_rooms,district_encoded,city_encoded,agent,exclusive_marketer,normal_marketer,owner
0,800000.0,0.0,0.0,0.0,750.0,15.0,0.0,4.0,0.0,25.160826,46.421780,4.35,30.0,25.0,6454.112014,1066.666667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.910563e+05,2.153276e+06,1,0,0,0
1,250000.0,0.0,0.0,0.0,920.0,30.0,0.0,3.0,0.0,24.639906,39.306129,4.29,30.0,30.0,10937.868420,271.739130,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.832931e+05,1.288045e+06,1,0,0,0
2,1200000.0,5.0,2.0,3.0,700.0,20.0,10.0,4.0,0.0,24.501982,39.721149,3.80,20.0,35.0,10972.831431,1714.285714,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,1.663143e+06,1.288045e+06,1,0,0,0
3,1312500.0,0.0,0.0,0.0,375.0,20.0,0.0,2.0,0.0,24.846313,46.796188,4.30,13.0,28.0,6495.591391,3500.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.625998e+06,2.153276e+06,0,0,1,0
4,400000.0,4.0,1.0,3.0,600.0,10.0,0.0,1.0,1.0,17.702763,42.267738,5.00,30.0,20.0,10682.881200,666.666667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,5.517317e+05,5.517317e+05,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229717,1000050.0,3.0,0.0,3.0,312.0,15.0,0.0,4.0,0.0,25.173681,46.321789,5.00,13.0,24.0,6443.936332,3205.288462,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,7.910563e+05,2.153276e+06,0,1,0,0
229718,900000.0,0.0,0.0,0.0,900.0,36.0,0.0,5.0,0.0,24.454569,46.736145,3.40,30.0,30.0,6494.647391,1000.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.097373e+05,2.153276e+06,0,1,0,0
229719,1750000.0,0.0,0.0,0.0,625.0,15.0,0.0,8.0,0.0,24.626093,46.797058,5.00,25.0,25.0,6498.524028,2800.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.795889e+05,2.153276e+06,1,0,0,0
229720,1450000.0,4.0,2.0,4.0,288.0,19.0,0.0,4.0,0.0,24.889165,46.840835,3.99,12.0,24.0,6499.525462,5034.722222,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.625998e+06,2.153276e+06,0,1,0,0


In [4]:
# split data
## closed test
sample_size = 20000
sampled_df = df.sample(n=sample_size, random_state=42)

X = sampled_df.drop('price', axis=1)  # Drop the target column to get features
y = sampled_df['price']  # Select
# Split the sampled data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Machine": SVR()
}

results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    rmse_scores = np.sqrt(-scores)
    results[name] = rmse_scores.mean()

In [6]:
# Select the best model
best_model = min(results, key=results.get)
print("Best Model:", best_model)

# Train the best model on the full training set
best_model = models[best_model]
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test RMSE:", np.sqrt(mse))

Best Model: Random Forest
Test RMSE: 40457.81510164364


In [7]:
# Assuming best_model has been fit with the Random Forest model
feature_importances = best_model.feature_importances_
features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Sort the dataframe by importance score in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the sorted dataframe
print(importance_df)

                Feature    Importance
3                  area  3.444110e-01
25     district_encoded  3.420372e-01
14         price_per_m2  3.118260e-01
8          location_lat  2.118995e-04
13   distance_to_center  2.107342e-04
9          location_lng  1.947025e-04
12               length  1.669394e-04
10          user_review  1.504034e-04
11                width  1.273728e-04
4          street_width  1.138170e-04
6      street_direction  1.092627e-04
5                   age  7.971720e-05
26         city_encoded  4.846183e-05
24          total_rooms  4.773023e-05
1               livings  3.590603e-05
0                  beds  3.585797e-05
7             furnished  3.477326e-05
29      normal_marketer  2.369804e-05
2                    wc  2.175170e-05
28   exclusive_marketer  2.087169e-05
15           Land, sell  1.962135e-05
27                agent  1.862499e-05
16          Villa, sell  1.520852e-05
21          Store, sell  1.390114e-05
17       Building, sell  1.323568e-05
30          