In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score

#### Read data

In [None]:
# Load the prepared training dataset.
# na_filter=False switches off pandas' missing-value detection, so empty
# fields and markers such as "NA" are kept as-is instead of becoming NaN.
df = pd.read_csv(
    '../../../datasets/parte2/treino/dataset_prepared.csv',
    na_filter=False,
)

#### X and y arrays

In [None]:
# Target vector: the 'injection' column.
y = df['injection']
# Feature matrix: every column except the target.
X = df.drop(columns='injection')

#### Train Test Split

Now let's split the data into a training set and a testing set. We will train our model on the training set and then use the test set to evaluate the model.

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2023,
)

#### Training 

Using GridSearchCV to find the best hyperparameters

In [None]:
# Hyperparameter search space for the random forest.
# 'criterion' lists only split-quality measures that RandomForestClassifier
# accepts. The former 'ratio' entry is not a valid criterion, so every
# candidate using it failed to fit and only produced NaN scores and warnings.
# ('log_loss' is intentionally not added: scikit-learn documents it as the
# same Shannon information gain criterion as 'entropy'.)
# Grid size: 2 * 8 * 3 * 3 * 12 = 1728 candidates, i.e. 8640 fits with cv=5.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 15, 20, 25, 30, 35, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],
}
# Base estimator; the fixed seed keeps the forests reproducible.
model = RandomForestClassifier(random_state=2023)

# Use GridSearchCV to find the best hyperparameters: exhaustive search with
# 5-fold cross-validation on the training split, using all CPU cores
# (n_jobs=-1). refit=True retrains the best candidate on the whole training
# split once the search finishes.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, refit=True, verbose=2, cv=5, n_jobs=-1) # , scoring="f1"
grid_search.fit(X_train, y_train)

Inspect the best parameters

In [None]:
# Display the hyperparameter combination selected by the grid search.
grid_search.best_params_

Get the best estimator

In [None]:
# Rebind `model` (previously the unfitted base estimator) to the winning
# estimator. Because the search was created with refit=True, this estimator
# has already been retrained on the full training split.
model = grid_search.best_estimator_

Get the predictions using the trained model

In [None]:
# Predict the class label of every row in the held-out test split.
predictions = model.predict(X_test)

#### Model Evaluation

Classification report

In [None]:
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_true=y_test, y_pred=predictions))

Accuracy Score

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

Confusion Matrix

In [None]:
# Plot true labels against predicted labels for the test split.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=predictions)
plt.show()

Get the predictions for the separate test dataset and export them to CSV (currently commented out)

In [None]:
# NOTE(review): disabled export step — predicts on `df_test` and writes a
# two-column CSV (RowId, Result). `df_test` is not defined in any visible
# cell; load the test dataset before re-enabling these lines.
#predictions = model.predict(df_test)

#df_predicoes = pd.DataFrame({'Result': predictions})
#df_predicoes['RowId'] = range(1, len(predictions) + 1)
#df_predicoes = df_predicoes[['RowId', 'Result']]

#df_predicoes.to_csv('../../../datasets/parte2/teste/previsoesRandom.csv', index=False)