In [1]:
# Random Forests
# A random forest averages the predictions of many deep decision trees, each trained on a different
# bootstrap sample of the same training set, with the goal of reducing the variance of the final model.

In [3]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings

from sklearn.ensemble import RandomForestRegressor

# --- Load the data set and build the train/test split -----------------------
# The CSV location can be overridden with the TEAM_BASKETBALL_CSV environment
# variable so the notebook is not tied to one machine; the default is the
# original path, so existing runs behave exactly as before.
import os

DATA_PATH = os.environ.get(
    "TEAM_BASKETBALL_CSV",
    "C:/Users/Derya/Downloads/Team_Basketball.csv",
)

data = pd.read_csv(DATA_PATH)

# Work on a copy so `data` keeps the raw file contents, then drop every row
# that contains a missing value.
df = data.copy()
df = df.dropna()

# One-hot encode the three categorical columns.
dms = pd.get_dummies(df[["League","Division","NewLeague"]])

# Regression target.
y = df["Salary"]

# Numeric predictors: everything except the target and the raw categoricals.
X_ = df.drop(["Salary","League","Division","NewLeague"], axis=1).astype("float64")

# Append a single indicator column per categorical variable.
X = pd.concat([X_,dms[["League_N","Division_W","NewLeague_N"]]], axis=1)

# Hold out 25% of the rows for testing; the seed makes the split repeatable.
X_train, X_test,y_train, y_test = train_test_split(X,y, test_size=0.25,random_state=42)

# Keep only the "Hits" column as predictor (as a one-column DataFrame).
# NOTE(review): this discards every other predictor assembled above, so all
# models below are fit on a single feature - confirm that this is intended.
X_train = pd.DataFrame(X_train["Hits"])
X_test = pd.DataFrame(X_test["Hits"])

In [4]:
# Baseline forest: every hyper-parameter at its library default, with a fixed
# seed so repeated runs grow the same trees.
rf_model = RandomForestRegressor(
    random_state=42,
)

In [5]:
# Train the baseline forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [6]:
# Preview the first seven test-set predictions of the baseline forest.
rf_model.predict(X=X_test)[:7]

array([ 598.57816667, 1161.28166667,  598.57816667,  453.21931533,
        155.95      ,  229.59830952,  846.44931872])

In [7]:
# Keep the baseline predictions for the whole test split for scoring.
y_pred = rf_model.predict(X=X_test)

In [8]:
# Test RMSE of the baseline forest: square root of the mean squared error.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
np.sqrt(test_mse)

462.01087000489565

In [9]:
# Hyper-parameter grid explored by the cross-validated search.
#
# A split can never consider more features than X_train has columns, so the
# `max_features` candidates are capped at that count and de-duplicated.
# Larger values are rejected by older scikit-learn releases, and where they
# are accepted they only repeat the same fits. When X_train has at least
# 15 columns the grid is exactly [3, 5, 10, 15].
n_features = X_train.shape[1]
rf_params = {'max_depth': list(range(1,10)),
             'max_features': sorted({min(k, n_features) for k in [3,5,10,15]}),
             'n_estimators' : [100, 200, 500, 1000, 2000]}

In [10]:
# Fresh, unfitted forest used as the base estimator of the grid search
# (rebinds `rf_model`, replacing the baseline model fitted earlier).
rf_model = RandomForestRegressor(
    random_state=42,
)

In [None]:
# Exhaustive search over `rf_params` with 10-fold cross-validation.
# n_jobs=-1 asks scikit-learn to run the fits on all available processors.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method.
rf_cv_model = GridSearchCV(estimator=rf_model, param_grid=rf_params, cv=10, n_jobs=-1)

In [None]:
# Run the grid search on the training split (one fit per candidate and fold,
# so this is the slow cell of the notebook).
rf_cv_model.fit(X=X_train, y=y_train)

In [None]:
# Show the hyper-parameter combination that scored best in cross-validation.
rf_cv_model.best_params_

In [None]:
# Final model: a forest built from the hyper-parameters the grid search
# actually selected, instead of hand-copied values that can drift out of sync
# with the search (or exceed the number of columns in X_train).
# The seed matches the one used by the estimator inside the search, so the
# tuned model is reproducible and comparable with the cross-validated results.
rf_tuned = RandomForestRegressor(random_state = 42,
                                 **rf_cv_model.best_params_)

In [None]:
# Train the tuned forest on the training split.
rf_tuned.fit(X=X_train, y=y_train)

In [None]:
# Test-set predictions of the tuned forest (rebinds `y_pred`).
y_pred = rf_tuned.predict(X=X_test)

In [None]:
# Test RMSE of the tuned forest: square root of the mean squared error.
tuned_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
np.sqrt(tuned_mse)

In [None]:
# Feature importances of the tuned forest, scaled by 100 and labelled with
# the training column names.
Importance = pd.DataFrame(
    data={"Importance": rf_tuned.feature_importances_ * 100},
    index=X_train.columns,
)

In [None]:
# Horizontal bar chart of the importances, sorted ascending by value.
ranked_importance = Importance.sort_values(by="Importance", ascending=True)
ranked_importance.plot.barh(color="r")

plt.xlabel("The importance of Variables")