In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [2]:
grd = pd.read_csv("../data/graphene_data_final.csv")

We are here creating 2 sets of features(input). In `X` variable we are storing a dataframe containing Graphene Percentage, Feed, Rotation and DOC. In `X2` we are storing only the two labels which have higher correlation with the MRR. Both X and X2 will be used to train the models and we will see which set of features produces better model.

Similarly in `Y` variable we are storing MRR and in `Y2` the `Ra` value to use them as labels because we want to predict them.

In [3]:
X, Y = grd[['Graphene_percentage', 'FEED', 'RPM', 'DOC']], grd['MRR_gm_per_sec']
Y2 = grd['Ra']

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2)#)
X_train2, X_test2, Y2_train, Y2_test = train_test_split(X, Y2, test_size=0.3, random_state=16)

In [5]:
rfr=RandomForestRegressor(random_state=42)

In [6]:
param_grid = { 
    'n_estimators': [5,10,15,20,30,70,100],
    'min_samples_split': [2,4,8,12,16],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [1,3,5,7,9,11,13,15],
}

In [7]:
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5)
CV_rfr.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 13, 15],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_split': [2, 4, 8, 12, 16],
                         'n_estimators': [5, 10, 15, 20, 30, 70, 100]})

In [8]:
CV_rfr.score(X_train, Y_train)

0.9379428352354985

In [9]:
CV_rfr.score(X_test, Y_test)

0.8687536821707537

In [10]:
print(CV_rfr.best_params_)

{'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 8, 'n_estimators': 5}


In [11]:
rfr2 = RandomForestRegressor(random_state=21)

In [12]:
CV_rfr2 = GridSearchCV(estimator=rfr2, param_grid=param_grid, cv= 5)
CV_rfr2.fit(X_train2, Y2_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=21),
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 13, 15],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_split': [2, 4, 8, 12, 16],
                         'n_estimators': [5, 10, 15, 20, 30, 70, 100]})

In [13]:
CV_rfr2.score(X_test2, Y2_test)

0.5041910321714542

In [14]:
print(CV_rfr2.best_params_)

{'max_depth': 7, 'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 30}


Now let's save the models for further usage.

In [15]:
import pickle
with open('../trained_models/random_forest_MRR.pkl','wb') as f:
    pickle.dump(CV_rfr,f)

In [16]:
with open('../trained_models/random_forest_RA.pkl','wb') as f:
    pickle.dump(CV_rfr2,f)