# In this practice session, we will learn to code Random Forest Regression.
# We will perform the following steps to build a simple regressor that predicts the `Score` column of a beer dataset (`beer_data.csv`).

 
 
  - **Data Preprocessing**

    - Importing the libraries.
    - Importing dataset (loaded from the local file `beer_data.csv`).
    - Dealing with the categorical variable.
    - Classifying dependent and independent variables.
    - Splitting the data into a training set and test set.
    - Feature scaling.
 

  -  **Random Forest Regression**

    - Create a Random Forest Regressor.
    - Feed the training data to the regression model.
    - Predicting the score for the test set.
    - Using a root-mean-squared log error (RMSLE) based score as the evaluation metric.

# Load the Dependencies

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Style options handed to the widgets below through their `style=` argument;
# sets the description width to 'initial'.
style = dict(description_width='initial')

In [None]:
#1 Importing essential libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Load the Dataset

In [None]:
#2 Importing the dataset

# Name of the CSV file to load into a DataFrame
file_name = 'beer_data.csv'
dataset = pd.read_csv(filepath_or_buffer=file_name)

In [None]:
# Preview the first eight rows of the dataset
dataset.head(n=8)

In [None]:
# Report the dataset dimensions
n_rows, n_cols = dataset.shape
print(f"Dataset has {n_rows} rows and {n_cols} columns.")

## Feature Engineering

#### Drop Nulls and Fill Nulls Based on Mean

In [None]:
# Count the missing values in each column

dataset.isna().sum()

In [None]:
# Discard rows that have no 'Cellar Temperature' value, then renumber the index from 0
dataset = dataset.dropna(subset=['Cellar Temperature']).reset_index(drop=True)

In [None]:
# Replace missing ABV values with the mean of the column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# flagged by pandas and does not update the DataFrame under Copy-on-Write.
dataset['ABV'] = dataset['ABV'].fillna(dataset['ABV'].mean())

In [None]:
def _parse_rating(raw):
    """Remove every comma from the string *raw* and return the result as np.float32."""
    return np.float32(raw.replace(",", ""))


# Convert the 'Ratings' column from comma-formatted strings to float32 values
dataset['Ratings'] = dataset['Ratings'].apply(_parse_rating)

In [None]:
# Dealing with the categorical data

def _cellar_bound(raw, position):
    """Return the integer found at index *position* after splitting str(raw) on '-'."""
    return int(str(raw).split('-')[position].strip())


# 'Cellar Temperature' holds a "<min>-<max>" range; store each end as its own integer column
cellar_temp = dataset['Cellar Temperature']
dataset.loc[:, 'Minimum_Cellar_Temp'] = cellar_temp.apply(_cellar_bound, position=0)
dataset.loc[:, 'Maximum_Cellar_Temp'] = cellar_temp.apply(_cellar_bound, position=1)

In [None]:
# Remove the raw 'Cellar Temperature' column in place, then list the remaining column names
dataset.drop(columns=['Cellar Temperature'], inplace=True)
list(dataset.columns)

In [None]:
# Classify dependent and independent variables.
# Column names are compared with `!=`: ('Score') is just the string 'Score', so
# `col not in ('Score')` is a substring test that would also discard any column
# whose name happens to be contained in "Score" (e.g. 'S' or 'core').
feature_columns = [col for col in dataset.columns if col != 'Score']
X = dataset[feature_columns].values  # independent variables
y = dataset['Score'].values  # dependent variable

In [None]:
# Preview the first five rows of the feature matrix and of the target vector.
# The first label is spelled "Independent" (it was misspelled in the output).
print("\nIndependent Variables :\n\n", X[:5])
print("\nDependent Variable (Score):\n\n", y[:5])

# Create Train and Test Sets

In [None]:
#4 Creating training set and testing set
from sklearn.model_selection import train_test_split

# Slider for the fraction of rows held out for testing (0.01 to 0.6, initially 0.2).
# NOTE(review): `tooltips` is passed through unchanged; confirm that FloatSlider
# actually supports it (it is a ToggleButtons option) and that the hint is shown.
test_size = widgets.FloatSlider(
    value=0.2,
    min=0.01,
    max=0.6,
    description="Test Size :",
    tooltips=['Usually 20-30%'],
)
display(test_size)

In [None]:
# Split features and target into train and test parts, using the slider value
# as the test fraction and a fixed random_state of 0
split_parts = train_test_split(X, y, test_size=test_size.value, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
def _preview_split(header, features, targets, rows=5):
    """Print *header*, then the first *rows* entries of *features* and *targets*."""
    print(header)
    print("X = \n", features[:rows])
    print("y = \n", targets[:rows])


# Show the first five rows of each half of the split
_preview_split("Training Set :\n----------------\n", X_train, y_train)
_preview_split("\n\nTest Set :\n----------------\n", X_test, y_test)

In [None]:
# Report the dimensions of the training and testing feature matrices
for label, features in (("Training", X_train), ("Testing", X_test)):
    print(f"Shape of {label} set is {features.shape}")

# Apply Random Forest Regression 

In [None]:
# import random forest library
from sklearn.ensemble import RandomForestRegressor

# configure params for the model.

# Choices for RandomForestRegressor(max_features=...).
# The string 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# regressor it meant "consider every feature", so the button keeps its 'auto'
# label but now yields the equivalent fraction 1.0.
max_feat_wig = widgets.ToggleButtons(options=[('log2', 'log2'), ('sqrt', 'sqrt'), ('auto', 1.0)],
                                    description='Number of features for the best split :',
                                    disabled=False,
                                    style=style)

display(max_feat_wig)

# Choices for RandomForestRegressor(max_depth=...).
max_depth_wig = widgets.Dropdown(options=[10, 20, 30, 50],
                            description='The maximum depth of the Tree. :',
                            style=style)

display(max_depth_wig)

# Choices for RandomForestRegressor(min_samples_split=...): the minimum number
# of samples a node must hold before it may be split (not a number of splits,
# as the label used to say).
min_split_wig = widgets.Dropdown(options=[100, 200, 300, 500],
                            description='Minimum samples required to split a node :',
                            style=style)

display(min_split_wig)

# Choices for RandomForestRegressor(n_jobs=...); -1 selects all cores.
njobs_wig = widgets.Dropdown(options=[('One', 1), ('Two', 2), ('Three', 3), ('All Cores', -1)], 
                             description="Number of CPU Cores :", style=style)

display(njobs_wig)

# Predict and Evaluate the Model 

In [None]:
# Train the Regressor with training set

# 'auto' is no longer accepted by recent scikit-learn releases; for a regressor
# it meant "use all features", which the fraction 1.0 expresses in every version.
selected_max_features = max_feat_wig.value
if selected_max_features == 'auto':
    selected_max_features = 1.0

# random_state is fixed (as it is for the train/test split) so that repeated
# runs with the same widget settings give the same forest and the same score.
regressor = RandomForestRegressor(max_features=selected_max_features,
                                  max_depth=max_depth_wig.value,
                                  min_samples_split=min_split_wig.value,
                                  n_jobs=njobs_wig.value,
                                  random_state=0)

# Fit the random forest on the training data
regressor.fit(X_train, y_train)

#7 predict the outcome of test sets
y_Pred = regressor.predict(X_test)
print("\nPredictions = ", y_Pred)

In [None]:
# Score derived from the root mean squared logarithmic error
def rmlse(y_test, y_pred):
    """Return 1 minus the root mean squared difference of base-10 logs.

    Both arrays are shifted by +1 before taking ``log10``; the squared
    differences are averaged, square-rooted and subtracted from 1, so a
    perfect prediction scores 1 and larger errors give smaller scores.

    Args:
        y_test: array of actual values.
        y_pred: array of predicted values.

    Returns:
        The score ``1 - error``.
    """
    log_pred = np.log10(y_pred + 1)
    log_true = np.log10(y_test + 1)
    squared_gap = np.square(log_pred - log_true)
    error = squared_gap.mean() ** 0.5
    return 1 - error

# Evaluate the test-set predictions and report the resulting score
rmlse_score = rmlse(y_test, y_Pred)
print("\n----------------------------\nRMLSE Score = ", rmlse_score)

#9 Comparing actual and predicted scores for the test set
print("\nActual vs Predicted Scores \n------------------------------\n")

# One row per test sample: true value, prediction, and their absolute difference
error_df = pd.DataFrame({"Actual": y_test, "Predicted": y_Pred})
error_df["Abs. Error"] = np.abs(y_test - y_Pred)

error_df

# Feature Importance

In [None]:
# Names of the feature columns: every column of `dataset` except the target.
# `!=` is used because ('Score') is a plain string, which turns
# `col not in ('Score')` into a substring test instead of a name comparison.
feat_names = [col for col in dataset.columns if col != 'Score']

# Horizontal bar chart of the fitted forest's importances, smallest at the bottom
importances = pd.Series(regressor.feature_importances_, index=feat_names)
importances.sort_values(ascending=True).plot(kind='barh', figsize=(16, 9));

plt.title('Feature Importance Random Forest Regressor');

# Actual vs. Predicted 

In [None]:
# Scatter the predicted values against the actual test values on a 16x9 figure
fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(y_test, y_Pred, s=70)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.grid()
plt.show()