# Random Forest

In [1]:
import pandas as pd      
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error,mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("final_scout_20201204.csv")

In [3]:
# Split the frame into the regression target ("price") and the predictor columns.
y = df["price"]
X = df.drop(columns=["price"])

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical across kernel restarts. `train_test_split` is already imported in
# the notebook's top import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [5]:
'''
Feature sayisi sistikce, arttikca sentetik sekilde R2_score artmaya meyillidir. 
R2_score, icinde fazla feature i cezalandiran yontem yoktur. 
‘overfitting’ durumunda yüksek R-kare değerlerine ulaşabilirsiniz.
R2_score fikir verir ama burada en onemli metrik RMSE dir.

R2_score, dusukse model basarisizdir, yuksekse baska metriklere bakip model degerlendirmesi yapilir.

'''


def eval_metrics(actual, pred):
    """Print R2, MAE, MSE and RMSE for a set of regression predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, aligned element-wise with ``actual``.

    Returns
    -------
    None
        The four metrics are written to stdout, one per line.
    """
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)  # derived from the MSE above instead of recomputing it
    mae = mean_absolute_error(actual, pred)
    score = r2_score(actual, pred)
    print(f'R2_Score:{score}',  # share of the variation in y that is explained by the features
          f'MAE:{mae}',  # average absolute error of the predictions
          f'MSE:{mse}',  # square of the RMSE
          f'RMSE:{rmse}',  # typical error size; penalises large (outlier) errors more than MAE
          sep='\n')



### Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Baseline random forest with default hyper-parameters.
# random_state pins the bootstrap / feature sampling so the metrics printed
# below can be reproduced after a kernel restart (same seed as the data split);
# n_jobs=-1 builds the trees on all cores, matching the grid-search cells.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [8]:
eval_metrics(y_test,y_pred)

R2_Score:0.949455761319509
MAE:905.2840660824039
MSE:2749808.107576605
RMSE:1658.2545364257578


In [9]:
from sklearn.model_selection import cross_validate

# Evaluate both metrics in ONE 10-fold pass. Calling cross_val_score once per
# metric refits the forest 20 times; cross_validate fits each fold once and
# scores it with every requested scorer, halving the run time.
# "r2" is the same metric a regressor's default .score() reports.
cv_results = cross_validate(
    estimator=rf_model,
    X=X_train,
    y=y_train,
    scoring=("r2", "neg_mean_squared_error"),
    cv=10,
)
print('R2_score:', cv_results["test_r2"].mean())

# The scorer returns negated MSE (higher is better), so flip the sign before the root.
print('RMSE:', np.sqrt(-cv_results["test_neg_mean_squared_error"].mean()))

KeyboardInterrupt: 

In [None]:
print('Train R2_score:', rf_model.score(X_train, y_train))
print('Test R2_score:', rf_model.score(X_test, y_test))

### > RF Tuning

In [None]:
rf=RandomForestRegressor()

* n_estimators=100 (default)
* decision tree 1 tree de sonuc buluyordu. burada [50, 100, 300] tree yap, bunlarin ortalamasini al dedik.
* 500, 1000 de listeye eklenebilir, ancak sure artar.

* max_depth, tree kac defa asagi gidecek, kac defa bolunecek
* defaultu None olanlara deger vermek genelde modelin sonuclarini kotulestirir.(richard Instructor)

* max_features, en iyi bölünmeyi ararken göz önünde bulundurulması gereken feature sayısıdır. Defaultu number of features'dır.

* min_samples_split, bir node u bolmeden once gerekli olan minimum sample sayisidir. Bu sayiya dusunce split duruyor.

In [None]:
# rf_params = {"n_estimators":[100, 200, 300],
#               "max_depth":[7,13,19],
#               "max_features": [8,10,12],
#               "min_samples_split": [2,4,6]}

In [None]:
# rf_cv_model = GridSearchCV(rf, rf_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
# rf_cv_model.best_params_

In [None]:
# Random forest refit with hand-picked hyper-parameters.
# NOTE(review): max_depth=33 and max_features=50 are outside the value ranges
# listed in the commented-out `rf_params` grid — confirm where they came from.
# random_state makes the fit reproducible; n_jobs=-1 trains the 500 trees on
# all cores, matching the setting used by the grid-search cell.
rf_tuned = RandomForestRegressor(
    max_depth=33,
    max_features=50,
    min_samples_split=2,
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

In [None]:
y_pred = rf_tuned.predict(X_test)

In [None]:
eval_metrics(y_test, y_pred)

In [None]:
# Cross-validation score (R2)
#
# With no `scoring` argument, cross_val_score uses the estimator's own .score(),
# which is R2 for a regressor. The value is averaged over the 10 folds requested
# with cv=10, so it is a steadier estimate than the single hold-out R2 above.
# The fold count is explicit here: when `cv` is omitted, current scikit-learn
# releases default to 5 folds, not 10.
# `cross_val_score` is already imported in the notebook's top import cell.
cv_scores = cross_val_score(estimator=rf_tuned, X=X_train, y=y_train, cv=10)
print('R2_score:', cv_scores.mean())

In [None]:
'''
Cross Validation Score:

scoring = "neg_mean_squared_error" olarak degistirilerek 10 iterasyonla bulunan daha guvenilir
RMSE degerine ulasilir. 
'''

# 10-fold CV with the negated-MSE scorer; negate the fold mean to recover the
# MSE, then take the root to report an RMSE.
cv_scores = cross_val_score(
    estimator=rf_tuned,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
mean_mse = -cv_scores.mean()
print('RMSE:', np.sqrt(mean_mse))

### XGBOOST

In [6]:
from xgboost import XGBRegressor

In [7]:
# Baseline XGBoost regressor with the library's default hyper-parameters.
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [8]:
eval_metrics(y_test,y_pred)

R2_Score:0.953230740498514
MAE:959.3030976971531
MSE:2544434.189136971
RMSE:1595.1282672992072


###  Tuning XGBOOST

In [9]:
xgb=XGBRegressor()

In [None]:
# xgb_params={"learning_rate":[0.1,0.01,0.3,0.5],
#              "max_depth":[21,25,33],
#              "n_estimators":[50,100,300],
#              "subsample":[0.1,0.5,0.8,1]}


In [None]:
# xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
# xgb_cv_model.best_params_

In [13]:
# XGBoost refit with hand-picked hyper-parameters.
xgb_tuned = XGBRegressor(
    learning_rate=0.1,
    max_depth=33,
    n_estimators=1000,
    subsample=0.8,
)
xgb_tuned.fit(X_train, y_train)

In [14]:
y_pred = xgb_tuned.predict(X_test)

In [15]:
eval_metrics(y_test, y_pred)

R2_Score:0.9565291109955208
MAE:850.6498740183246
MSE:2364989.68326964
RMSE:1537.8522956609454
