In [1]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.linear_model import LinearRegression as lr
from sklearn.model_selection import train_test_split as tts, cross_val_score

In [2]:
data = pd.read_csv('C:/Users/EGBUNA/Jiji_web_data/jiji_house_data(engineered).csv')

## first creat a function for cross validataion and model fitting


In [3]:

def trained(model, x, y):
    X_train,X_test,y_train,y_test = tts(X,np.log(y), test_size = 0.2, random_state = 42)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    cvs = cross_val_score(model, X_train, y_train, scoring = 'neg_mean_absolute_error', cv = 10)
    print('mean = ', np.mean(cvs))
    print('std = ', np.std(cvs))
    return model

## using LinearRegression to check our model

In [4]:
from sklearn.linear_model import LinearRegression as LR
model = LR()
X,y = data.drop(['price'], axis = 1), data.price
trained(model, X,y)

mean =  -0.04527072349418293
std =  0.006603782700596411


LinearRegression()

## using GridSearchCV for model tunning

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
clf = GridSearchCV(RandomForestRegressor(),param_grid = {'n_estimators' : [100,200,300,400,500],'max_depth': [7,9,11,13,15,17]}, cv = 5, return_train_score = False)
X_train,X_test,y_train,y_test = tts(X,y,test_size = 0.2, random_state = 42)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [7, 9, 11, 13, 15, 17],
                         'n_estimators': [100, 200, 300, 400, 500]})

In [6]:
clf.best_params_

{'max_depth': 7, 'n_estimators': 300}

## using Random Forest Regressor to check our model

In [7]:

model = RandomForestRegressor(n_estimators = 100, max_depth = 11)
trained(model, X,y)

mean =  -0.03962469810788373
std =  0.00443395846175076


RandomForestRegressor(max_depth=11)

In [17]:
model = RandomForestRegressor(n_estimators = 400, max_depth = 7)
trained(model, X,y)

mean =  -0.03957826116640015
std =  0.003709165648313345


RandomForestRegressor(max_depth=7, n_estimators=400)

## using StackingRegressor Regressor to check our model

In [9]:
## creating my stacking algorithms 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
# define the base models
level0 = list()
level0.append(('knn', KNeighborsRegressor()))
level0.append(('cart', DecisionTreeRegressor(max_depth = 15)))
level0.append(('svm', SVR()))
level0.append(('linear_Regression', LinearRegression()))
# define meta learner model
level1 = RandomForestRegressor(n_estimators = 300, max_depth = 7)
# define the stacking ensemble
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
trained(model,X,y)

mean =  -0.04246813694406033
std =  0.004917054022437533


StackingRegressor(cv=5,
                  estimators=[('knn', KNeighborsRegressor()),
                              ('cart', DecisionTreeRegressor(max_depth=15)),
                              ('svm', SVR()),
                              ('linear_Regression', LinearRegression())],
                  final_estimator=RandomForestRegressor(max_depth=7,
                                                        n_estimators=300))

In [16]:

model = RandomForestRegressor(n_estimators = 300, max_depth = 7)
trained(model, X,y)

mean =  -0.03965814325938169
std =  0.0038481002395931384


RandomForestRegressor(max_depth=7, n_estimators=300)

## i'll be using the random forest model with n_estimators of 400 and max_depth of 7

In [18]:
predict = model.predict(X_test)


### revert it back to normal price and check the diffrence between the actual value and predicted value 

In [19]:
pred = np.exp(np.exp(predict))

In [20]:
np.round(pred)

array([  810803.,   360160.,  7131713.,   993592.,  4284974.,   408361.,
         671871.,   866469.,  1889351.,  2233528.,   256125.,   750741.,
        2133380.,  2508311.,  1858493.,  6657751.,   563290.,  1601422.,
         676860.,  5373567.,  1010794.,  1152488., 10512658.,   875856.,
         938951.,  1372027.,  2526550.,  3022897.,   562669.,  3047299.,
         651568.,   743396.,  3910098.,  2002000.,  1103097.,  1137483.,
         243811.,   448114.,  4055040.,  1037468.,  3200455.,   872350.,
        1970003.,  8728546.,  9373816.,   468755.,  2360420., 14418048.,
        7413262., 15226515.,   621703.,  2711382.,  7744134.,  9126591.,
        2782479., 11037506.,   354844.,  9909589.,   668929.,  3395260.,
         659264.,  1434759.,  1206083.,   452771., 22732990.,  4039625.,
         813102.,   507551.,   671871.,   447149., 11375512.,  1152488.,
        1715832., 12287798.,   454053.,   214569.,   980252.,  9909589.,
         223758.,  1354863.,  2069995.,  1350356., 

In [21]:
actual = np.exp(y_test)

In [22]:
index = y_test.index

In [23]:
new_data = pd.DataFrame({'actual':actual, 'predicted':np.round(pred)})
new_data['difference'] = np.abs(new_data.actual - np.round(new_data.predicted))
new_data

Unnamed: 0,actual,predicted,difference
641,600000.0,810803.0,210803.0
302,1700000.0,360160.0,1339840.0
369,8000000.0,7131713.0,868287.0
493,10800000.0,993592.0,9806408.0
579,10000000.0,4284974.0,5715026.0
...,...,...,...
51,720000.0,332698.0,387302.0
204,1800000.0,3901141.0,2101141.0
544,4800000.0,1646521.0,3153479.0
428,10000000.0,5136640.0,4863360.0


In [24]:
new_data.head(20)

Unnamed: 0,actual,predicted,difference
641,600000.0,810803.0,210803.0
302,1700000.0,360160.0,1339840.0
369,8000000.0,7131713.0,868287.0
493,10800000.0,993592.0,9806408.0
579,10000000.0,4284974.0,5715026.0
54,230000.0,408361.0,178361.0
645,1500000.0,671871.0,828129.0
257,400000.0,866469.0,466469.0
663,1500000.0,1889351.0,389351.0
478,1700000.0,2233528.0,533528.0


## saving the model to a Pickle file

In [25]:
import joblib
joblib.dump(model, '/Users/EGBUNA/jiji_web_data/jiji_prediction_model_new.pkl')

['/Users/EGBUNA/jiji_web_data/jiji_prediction_model_new.pkl']

In [None]:
joblib.load('/Users/EGBUNA/jiji_web_data/jiji_prediction_model_new.pkl')

In [None]:
data.columns