In [None]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
from IPython import get_ipython


In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # dataviz
import seaborn as sns # dataviz
from pandas.plotting import scatter_matrix

Rental= pd.read_csv("./dataset/immo_data.csv")

get_ipython().run_line_magic('matplotlib', 'inline')


In [None]:
Rental.info()

In [None]:
Rental.describe() #shows a summary of the numerical attributes


In [None]:
Berlin=Rental.loc[Rental["regio2"]=='Berlin']



In [None]:
Berlin.shape

In [None]:
# Correlate numeric/boolean columns only: Berlin still holds text columns here
# (e.g. regio2), and DataFrame.corr() in pandas >= 2.0 raises on them instead
# of silently skipping them.
corr_matrix = Berlin.select_dtypes(include=["number", "bool"]).corr()
corr_matrix["totalRent"].sort_values(ascending=False)


In [None]:
# Pairwise scatter plots of the rent-related numeric attributes.
attributes = ["baseRent","totalRent","livingSpace", "serviceCharge", "noRooms","heatingCosts","picturecount"]
# Trailing ';' suppresses the array-of-Axes repr; the stray bare `scatter_matrix`
# expression (which only printed the function object) is removed.
scatter_matrix(Berlin[attributes], figsize=(16, 12));


In [None]:
Berlin["totalRent"].describe()


In [None]:
# Histogram of totalRent, clipped to the 100-4000 range.
Berlin['totalRent'].hist(bins=30, range=(100,4000), grid=True, color='#86bf91')
# The plotted column is totalRent, so the title must not say "Base Rents".
plt.title('Distribution of Total Rents')
plt.xlabel('Total Rent')
plt.ylabel('Count')


In [None]:
Berlin.plot(kind="scatter", x="livingSpace", y="totalRent", alpha=0.1)


In [None]:
Berlin.plot(kind="scatter", x="yearConstructed", y="totalRent", alpha=0.1)

In [None]:
m=Berlin.groupby(['regio3'])['baseRent'].mean()
m.sort_values()


In [None]:
# Columns removed before modelling; none of them appears in the feature lists used below.
cols_to_drop = ["telekomHybridUploadSpeed", "picturecount", "telekomUploadSpeed",
                "geo_bln", "houseNumber", "geo_krs", "geo_plz", "regio3", "description",
                "facilities"]

# One chained, non-mutating pipeline:
# - errors="ignore" lets the cell be re-run after the columns are already gone;
# - the range filters also remove rows where the compared column is missing,
#   because a comparison against NaN is False;
# - the index is reset last so it is contiguous after filtering;
# - no inplace=True, so the result is a fresh frame rather than a mutated slice.
# (Berlin was already restricted to regio2 == "Berlin" when it was created,
#  so that filter is not repeated here.)
Berlin = (
    Berlin
    .drop(columns=cols_to_drop, errors="ignore")
    .query("totalRent >= 100 and totalRent < 10000")
    .query("baseRent >= 100 and baseRent < 10000")
    .query("livingSpace >= 10 and livingSpace < 400")
    .query("noRooms >= 0 and noRooms < 15")
    .sort_values(by="totalRent")
    # Replace False/True with 0/1
    .replace({False: 0, True: 1})
    .reset_index(drop=True)
)

# Remaining missing values per column (last expression, so it is displayed).
Berlin.isna().sum()


In [None]:
# Binary flag: the apartment is refurbished or new (any of the listed condition values).
like_new_conditions = ['refurbished', 'first_time_use', 'mint_condition',
                       'fully_renovated', 'first_time_use_after_refurbishment']
Berlin['refurbished'] = Berlin['condition'].isin(like_new_conditions)

# Binary flag: interior quality is rated sophisticated or luxury.
Berlin['greatInterior'] = Berlin['interiorQual'].isin(['sophisticated', 'luxury'])

# Binary flag: one of the heating types treated as "good" in this analysis.
good_heating_types = ['central_heating', 'floor_heating', 'self_contained_central_heating']
Berlin['goodHeating'] = Berlin['heatingType'].isin(good_heating_types)

# Binary flag: ad carries the 'Sep18' date tag, to factor in any inflationary effects.
Berlin['2018_ads'] = Berlin['date'].eq('Sep18')

# Model log(totalRent) instead of totalRent: better distribution and interpretability.
Berlin['logRent'] = np.log(Berlin['totalRent'])


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3].values
#y = Berlin[y_var].values
#X = Berlin[X_var].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
X_train.shape

In [None]:
X_test.shape

# Fine-tuning

## 1. Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression

linear_regressor = LinearRegression()
#linear_regressor.fit(np.array(X_train.reshape(-1, 1)), y_train.reshape(-1, 1))
linear_regressor.fit(X_train.reshape(-1, 1), y_train.reshape(-1, 1))
                     

y_predict = linear_regressor.predict(X_train.reshape(-1, 1))
print(y_predict)

In [None]:
# Training observations and the fitted regression line.
# X holds the single livingSpace column and y is logRent, so the axes are
# labelled accordingly (the old labels claimed raw totalRent in USD).
plt.scatter(X_train.reshape(-1, 1), y_train.reshape(-1, 1), color='teal', edgecolors='black', label='Training-set observation points')
plt.plot(X_train, y_predict, color='grey', label='Fit Regression Line')
plt.title('log(totalRent) vs livingSpace')
plt.xlabel('livingSpace')
plt.ylabel('log(totalRent)')

# Test observations on the same axes.
plt.scatter(X_test, y_test, color='red', edgecolors='black', label='Test-set observation points')
plt.legend()
plt.show()

## 2. Decision Tree model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train.reshape(-1, 1), y_train.reshape(-1, 1))
tree_scores = cross_val_score(tree_reg, X_train.reshape(-1, 1), y_train.reshape(-1, 1),
                              scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
tree_rmse_scores

In [None]:
def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    # Each statistic is computed just before it is printed.
    for label, stat in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, stat)())

display_scores(tree_rmse_scores)

In [None]:
from sklearn.linear_model import LinearRegression

# Cross-validate the *linear* model on the *training* split.
# The previous version scored tree_reg on the test split, so "lin_*" actually
# held decision-tree scores and the held-out data leaked into model comparison.
lin_scores = cross_val_score(LinearRegression(), X_train.reshape(-1, 1), y_train,
                             scoring="neg_mean_squared_error", cv=10)
# Scores are negative MSE values; negate before taking the root.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

## 3. Random forest model

In [None]:
y_var = ['logRent']
# NOTE(review): 'baseRent' is included here but in no other model's feature list.
# totalRent presumably contains baseRent, so this may leak the target and makes
# the random-forest scores incomparable with the other models - confirm intent.
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 'baseRent',
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

# Flatten the target to 1-D: an (n, 1) column vector makes the ensemble
# regressors emit a DataConversionWarning on every fit.
y = Berlin[y_var].values.ravel()
X = Berlin[X_var].values

print(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=0)

In [None]:
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
prediction = forest_reg.predict(X_test)
forest_mse21 = mean_squared_error(y_test, prediction)
forest_rmse21 = np.sqrt(forest_mse21)
print("rmse:", forest_rmse21)
print("mse:", forest_mse21)

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate on the training split; the test split is reserved for the
# final evaluation and must not take part in model selection.
forest_scores = cross_val_score(forest_reg, X_train, np.ravel(y_train),
                                scoring="neg_mean_squared_error", cv=10)
# Scores are negative MSE values; negate before taking the root.
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:

# Summarise the cross-validation RMSEs computed in the previous cell instead of
# re-running the identical 10-fold fit (slow, and the forest is unseeded, so a
# second run would summarise different numbers than the ones shown above).
pd.Series(forest_rmse_scores).describe()

### Fine-tuning the Random Forest Regressor (Grid Search)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 11]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_


In [None]:
grid_search.best_estimator_

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

### Fine-tuning the Random Forest Regressor (Randomized Search)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg1 = RandomForestRegressor()
rnd_search = RandomizedSearchCV(forest_reg1, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error')
rnd_search.fit(X_train, y_train)

In [None]:
rnd_search.best_params_

In [None]:
rnd_search.best_estimator_

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
feature_importances = rnd_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Both searches used the same data, CV folds count and scoring, so their
# best_score_ values (negative MSE, higher is better) are directly comparable.
# Take the winner rather than always using the grid search.
best_forest_search = max((grid_search, rnd_search), key=lambda search: search.best_score_)
final_model = best_forest_search.best_estimator_

# Single evaluation on the held-out test split.
final_predictions = final_model.predict(X_test)

final_mse0 = mean_squared_error(y_test, final_predictions)
final_rmse0 = np.sqrt(final_mse0)
print("rmse:", final_rmse0)
print("mse:", final_mse0)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3:4].values
#y = Berlin[y_var].values
#X = Berlin[X_var].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
…
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seeded forest fitted on the current training split.
forest_regressor = RandomForestRegressor(n_estimators = 30, random_state = 42)
forest_regressor.fit(X_train, y_train)

# Dense, sorted grid over the observed feature range for drawing the prediction curve.
# X.min()/X.max() return scalars for 1-D and 2-D X alike; the builtin min()/max()
# on a 2-D array return 1-element arrays, which np.arange only accepts via a
# deprecated array-to-scalar conversion.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

In [None]:

# Training observations and the forest's prediction curve.
# The target is logRent, not raw rent in USD.
# NOTE(review): the x label assumes X is the livingSpace column, as in the other
# single-feature cells - confirm against the cell that builds X.
plt.scatter(X_train, y_train, color='teal', edgecolors='black', label='Training-set observation points')
plt.plot(X_grid, forest_regressor.predict(X_grid), color='grey', label='Random Regressor Line')
plt.title('log(totalRent) vs livingSpace')
plt.xlabel('livingSpace')
plt.ylabel('log(totalRent)')

# Test observations on the same axes.
plt.scatter(X_test, y_test, color='red', edgecolors='black', label='Test-set observation points')
plt.legend()
plt.show()

## 4. GradientBoosting 

In [None]:
y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden',
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

# Flatten the target to 1-D: an (n, 1) column vector makes
# GradientBoostingRegressor emit a DataConversionWarning on every fit.
y = Berlin[y_var].values.ravel()
X = Berlin[X_var].values

print(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=0)


In [None]:
"""from sklearn.model_selection import cross_val_score

from sklearn.ensemble import GradientBoostingRegressor


gradient_reg = GradientBoostingRegressor()
gradient_reg.fit(X_train, y_train)

gradient_scores = cross_val_score(gradient_reg, X_train, y_train,
                              scoring="neg_mean_squared_error", cv=10)
gradient_rmse_scores = np.sqrt(-gradient_scores)"""

In [None]:
"""gradient_scores = cross_val_score(gradient_reg, X_train, y_train,
                             scoring="neg_mean_squared_error", cv=10)
gradient_rmse_scores = np.sqrt(-gradient_scores)
display_scores(gradient_rmse_scores)"""

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Fit with default hyper-parameters and report the error on the data it was fitted on.
# This is a training error (optimistic), so it is labelled as such rather than "final".
grad_reg = GradientBoostingRegressor()
grad_reg.fit(X_train, np.ravel(y_train))  # 1-D target avoids DataConversionWarning
housing_predictions = grad_reg.predict(X_train)
grad_mse12 = mean_squared_error(y_train, housing_predictions)
grad_rmse12 = np.sqrt(grad_mse12)
print("train_rmse:", grad_rmse12)
print("train_mse:", grad_mse12)

In [None]:
from sklearn.model_selection import cross_val_score

grad_scores = cross_val_score(grad_reg, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
grad_rmse_scores = np.sqrt(-grad_scores)
display_scores(grad_rmse_scores)

In [None]:
# Summarise the cross-validation RMSEs computed in the previous cell instead of
# repeating the identical 10-fold fit.
pd.Series(grad_rmse_scores).describe()

### Fine-tuning Gradient Boosting (Grid Search)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Single grid covering every combination the two former dicts listed.
# The second dict was the random-forest grid with 'bootstrap' removed; without it,
# its (n_estimators in {3, 10}) x (max_features in {2, 4}) candidates were exact
# duplicates of the first dict and were fitted twice.
param_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 3, 4, 6, 11]}

gradientBoosting_reg = GradientBoostingRegressor()

gradientBoosting_search = GridSearchCV(gradientBoosting_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
gradientBoosting_search.fit(X_train, y_train)

In [None]:
gradientBoosting_search.best_params_


In [None]:
gradientBoosting_search.best_estimator_


In [None]:
cvres = gradientBoosting_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

### Fine-tuning Gradient Boosting (Randomized Search)

In [None]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

gradientBoost_reg1 = GradientBoostingRegressor()
gradientBoost_search = RandomizedSearchCV(gradientBoost_reg1, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error')
gradientBoost_search.fit(X_train, y_train)

In [None]:
gradientBoost_search.best_params_

In [None]:
gradientBoost_search.best_estimator_

In [None]:
cvres = gradientBoost_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
feature_boost_importances = gradientBoost_search.best_estimator_.feature_importances_
feature_boost_importances

In [None]:
# Both searches used the same data, CV fold count and scoring, so their
# best_score_ values (negative MSE, higher is better) are directly comparable.
# Take the winner rather than always using the grid search.
best_boost_search = max((gradientBoosting_search, gradientBoost_search),
                        key=lambda search: search.best_score_)
final_model1 = best_boost_search.best_estimator_

# Single evaluation on the held-out test split.
final_predictions = final_model1.predict(X_test)

final_mse2 = mean_squared_error(y_test, final_predictions)
final_rmse2 = np.sqrt(final_mse2)
print("final_rmse:", final_rmse2)
print("final_mse:", final_mse2)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3].values
#y = Berlin[y_var].values
#X = Berlin[X_var].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_regressor = GradientBoostingRegressor(n_estimators = 30, random_state =0)
# Features must be 2-D, but the target stays 1-D: reshaping y into an (n, 1)
# column only triggers a DataConversionWarning.
gradient_regressor.fit(X_train.reshape(-1, 1), y_train)

# Dense, sorted grid over the observed feature range for drawing the prediction curve.
# Array methods are used instead of the builtin min()/max(), which iterate the
# array element by element in Python.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

In [None]:
# Training observations and the boosted model's prediction curve.
# X is the livingSpace column and y is logRent, so label the axes accordingly.
plt.scatter(X_train, y_train, color='blue', label='Actual observation points')
plt.plot(X_grid, gradient_regressor.predict(X_grid), label='Gradient regressor')
plt.title('log(totalRent) vs livingSpace (Gradient Boosting)')
plt.xlabel('livingSpace')
plt.ylabel('log(totalRent)')

plt.legend()
plt.show()

# Models ----------------------------------------

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3].values
#y = Berlin[y_var].values
#X = Berlin[X_var].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
#LINEAR REGRESSION
from sklearn.linear_model import LinearRegression
from sklearn import metrics

def linear__regression(xtrain, ytrain, xtest, ytest):
    """Fit an ordinary least-squares model and print its test-set errors.

    A LinearRegression is fitted on (xtrain, ytrain); predictions for xtest
    are compared against ytest and the MAE and MSE are printed.
    Returns None.
    """
    model = LinearRegression().fit(xtrain, ytrain)
    predicted = model.predict(xtest)

    for tag, error_fn in (('MAE:', metrics.mean_absolute_error),
                          ('MSE:', metrics.mean_squared_error)):
        print(tag, error_fn(ytest, predicted))

linear__regression(X_train.reshape(-1, 1), y_train.reshape(-1, 1),
                 X_test.reshape(-1, 1), y_test.reshape(-1, 1))


In [None]:

from sklearn.linear_model import LinearRegression

linear_regressor = LinearRegression()
#linear_regressor.fit(np.array(X_train.reshape(-1, 1)), y_train.reshape(-1, 1))
linear_regressor.fit(X_train.reshape(-1,1), y_train.reshape(-1,1))
                     

y_predict = linear_regressor.predict(X_train.reshape(-1,1))
print(y_predict)

In [None]:
# Plot points and fit line for training data
plt.scatter(X_train, y_train, color='teal', edgecolors='black', label='Training-set observation points')
plt.plot(X_train, y_predict, color='grey', label='Fit Regression Line')
plt.title('totalRent vs Ex_features')
plt.xlabel('Ex_features')
plt.ylabel('totalRent (in USD)')

# plot scatter points and line for test data
plt.scatter(X_test, y_test, color='red', edgecolors='black', label='Test-set observation points')
plt.legend()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3:4].values
#y = Berlin[y_var].values
#X = Berlin[X_var].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:

#RANDOM FOREST
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

def randomforestreg(msl, mf, md, xtrain, ytrain, xtest, ytest):
    """Fit a 70-tree random forest and report its test-set errors.

    Parameters
    ----------
    msl : min_samples_leaf passed to RandomForestRegressor.
    mf : max_features passed to RandomForestRegressor.
    md : max_depth passed to RandomForestRegressor.
    xtrain, ytrain : data the forest is fitted on.
    xtest, ytest : held-out data the errors are computed on.

    Returns
    -------
    The predictions for `xtest`.
    """
    rfr_best = RandomForestRegressor(n_estimators=70, random_state=1111,
                                     max_depth=md, max_features=mf, min_samples_leaf=msl)
    rfr_best.fit(xtrain,ytrain)
    # Predict on the test features: the metrics below compare against ytest, so
    # predicting on xtrain (as before) produced a length mismatch / meaningless error.
    y_pred_rfr = rfr_best.predict(xtest)
    print('MAE:', metrics.mean_absolute_error(ytest, y_pred_rfr))
    print('MSE:', metrics.mean_squared_error(ytest, y_pred_rfr))
    return y_pred_rfr
#forest_regressor = randomforestreg(10, 6, 30, X_train, y_train, X_test, y_test)


In [None]:
"""from sklearn.svm import SVR

scale_X = StandardScaler()
scale_y = StandardScaler()

X = scale_X.fit_transform(X.reshape(-1,1))
y = scale_y.fit_transform(y.reshape(-1,1))"""

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seeded forest fitted on the single-feature training split.
forest_regressor = RandomForestRegressor(n_estimators = 30, random_state = 42)
forest_regressor.fit(X_train, y_train)

# Dense, sorted grid over the observed feature range for drawing the prediction curve.
# X is 2-D here, so the builtin min()/max() would return 1-element arrays that
# np.arange only accepts via a deprecated array-to-scalar conversion.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

In [None]:

# Training observations and the forest's prediction curve.
# X is the livingSpace column and y is logRent (not raw rent in USD).
plt.scatter(X_train, y_train, color='teal', edgecolors='black', label='Training-set observation points')
plt.plot(X_grid, forest_regressor.predict(X_grid), color='grey', label='Random Regressor Line')
plt.title('log(totalRent) vs livingSpace')
plt.xlabel('livingSpace')
plt.ylabel('log(totalRent)')

# Test observations on the same axes.
plt.scatter(X_test, y_test, color='red', edgecolors='black', label='Test-set observation points')
plt.legend()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

#print(Berlin[X_var])

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3].values
#y = Berlin[y_var].values
#X = Berlin[X_var].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_regressor = GradientBoostingRegressor(n_estimators = 30, random_state =0)
# Features must be 2-D, but the target stays 1-D: reshaping y into an (n, 1)
# column only triggers a DataConversionWarning.
gradient_regressor.fit(X_train.reshape(-1, 1), y_train)

# Dense, sorted grid over the observed feature range for drawing the prediction curve.
# Array methods are used instead of the builtin min()/max(), which iterate the
# array element by element in Python.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

In [None]:
# Training observations and the boosted model's prediction curve.
# X is the livingSpace column and y is logRent, so label the axes accordingly.
plt.scatter(X_train, y_train, color='blue', label='Actual observation points')
plt.plot(X_grid, gradient_regressor.predict(X_grid), label='Gradient regressor')
plt.title('log(totalRent) vs livingSpace (Gradient Boosting)')
plt.xlabel('livingSpace')
plt.ylabel('log(totalRent)')

plt.legend()
plt.show()

## more models --------------------------------------------------------------------- end of final models

In [None]:
#GRADIENT BOOSTING
from sklearn.ensemble import GradientBoostingRegressor

def gradientboostingmachine(md, msl, n, mf, lr, xtrain, ytrain, xtest, ytest):
    """Fit a gradient-boosting regressor and print its test-set MAE and MSE.

    Parameters
    ----------
    md : max_depth of each tree.
    msl : min_samples_leaf.
    n : n_estimators (number of boosting stages).
    mf : max_features considered per split.
    lr : learning_rate.
    xtrain, ytrain : data the model is fitted on.
    xtest, ytest : held-out data the errors are computed on.

    Returns None; the errors are printed.
    """
    gbm_best = GradientBoostingRegressor(n_estimators=n, random_state=1111,
                                         max_depth=md, max_features=mf,
                                         min_samples_leaf=msl, learning_rate=lr
                                         )
    gbm_best.fit(xtrain, ytrain)
    y_pred_gbm = gbm_best.predict(xtest)
    print('MAE:', metrics.mean_absolute_error(ytest, y_pred_gbm))
    print('MSE:', metrics.mean_squared_error(ytest, y_pred_gbm))

# The hyper-parameters below (max_features=5 in particular) need the full
# feature matrix. X_train at this point is the 1-D livingSpace column, which
# fit() rejects, so build a dedicated multi-feature split under separate names
# and leave the notebook-wide X_train/X_test untouched.
X_full = Berlin[X_var].values
y_full = Berlin['logRent'].values
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full, y_full, test_size=0.20, random_state=0)

#gradientboostingmachine(16, 117, 73, 10, 0.07, X_train, y_train, X_test, y_test)
gradientboostingmachine(16, 117, 157, 5, 0.07,
                        X_train_full, y_train_full, X_test_full, y_test_full)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
lin_reg = LinearRegression()

# scikit-learn estimators require a 2-D feature matrix. X_train is a 1-D array
# here, so give it an explicit column axis; an already 2-D X_train keeps its shape.
X_train_2d = X_train.reshape(len(X_train), -1)

scores = cross_val_score(lin_reg, X_train_2d, y_train,
                        scoring="neg_mean_squared_error", cv=10)

# find root mean squared error, scores is an array of negative numbers
rmse_scores = np.sqrt(-scores)

print("Mean:\t\t ", rmse_scores.mean(), "\nStandard Deviation:", rmse_scores.std())


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden', 'baseRent', 
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3].values

#print(X)
#print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=0)


print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
y_var = ['logRent']
X_var = ['balcony', 'hasKitchen', 'cellar', 'livingSpace', 'noRooms', 'garden',
         'refurbished', 'greatInterior', 'newlyConst',
         '2018_ads', 'lift']

y = Berlin[y_var].iloc[:,0].values
X = Berlin[X_var].iloc[:,3:4].values



## Support Vector Regression

In [None]:
from sklearn.svm import SVR

scale_X = StandardScaler()
scale_y = StandardScaler()

X = scale_X.fit_transform(X.reshape(-1,1))
y = scale_y.fit_transform(y.reshape(-1,1))

In [None]:
svr_regressor = SVR(kernel='rbf', gamma='auto')
# y is an (n, 1) column after StandardScaler.fit_transform; SVR expects a 1-D
# target and otherwise emits a DataConversionWarning.
svr_regressor.fit(X, y.ravel())

In [None]:
# Draw the prediction curve through X in ascending order; plotting the rows in
# their stored order connects points back and forth across the figure.
order = X[:, 0].argsort()
X_sorted = X[order]

plt.scatter(X, y, color='red', label='Actual observation points')
plt.plot(X_sorted, svr_regressor.predict(X_sorted), label='SVR regressor')
# Both axes are standardized: X is the scaled livingSpace column, y the scaled logRent.
plt.title('log(totalRent) vs livingSpace (SVR Regression)')
plt.xlabel('livingSpace (standardized)')
plt.ylabel('log(totalRent) (standardized)')

plt.legend()
plt.show()

## Decision Tree - Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(random_state = 0)
tree_regressor.fit(X, y)

In [None]:
# Dense, sorted grid over the observed feature range for drawing the prediction curve.
# X is 2-D here, so the builtin min()/max() would return 1-element arrays that
# np.arange only accepts via a deprecated array-to-scalar conversion.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

plt.scatter(X, y, color='red', label='Actual observation points')
plt.plot(X_grid, tree_regressor.predict(X_grid), label='Tree regressor')
# Both axes are standardized: X is the scaled livingSpace column, y the scaled logRent.
plt.title('log(totalRent) vs livingSpace (Tree Regression)')
plt.xlabel('livingSpace (standardized)')
plt.ylabel('log(totalRent) (standardized)')

plt.legend()
plt.show()

plt.legend()
plt.show()