**Best model for the given dataset using Random Forest Classifier and running it with the new dataset**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import joblib

# Train a Random Forest classifier: load and clean the data, hold out 30% for
# testing, standardize the features, tune hyperparameters with a cross-validated
# grid search, report held-out accuracy, and persist the best model.

# Load the dataset (the first CSV column is used as the row index)
df = pd.read_csv("https://raw.githubusercontent.com/andvise/DataAnalyticsDatasets/main/train_dataset.csv", index_col=0)

# Replace infinite and missing values with 0
df = df.replace([np.inf, -np.inf], 0)
df = df.fillna(0)

# Every column except the last is a feature; the last column is the target.
# NOTE: X stays unscaled from here on; only X_train / X_test are standardized.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split the data into training and testing sets BEFORE scaling, so that the
# held-out rows cannot influence the scaling statistics (no data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature normalization using StandardScaler: learn mean/variance on the
# training split only, then apply that same transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the Random Forest Classifier (fixed seed for reproducible forests)
rfc = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Train and tune the Random Forest Classifier using GridSearchCV
# (5-fold cross-validation, using all available CPU cores)
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator and its parameters
best_rfc = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the model on the held-out split
y_pred = best_rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy obtained by Random Forest Classifier:', accuracy)
print('\nClassification report:\n', classification_report(y_test, y_pred))


# Save the best model to a file
# NOTE: only the classifier is persisted; new data must be transformed with the
# fitted `scaler` above before it is passed to the saved model.
joblib.dump(best_rfc, 'random_forest_model.joblib')

# Evaluate the persisted model on a separate, previously unseen dataset.

# Load the saved model (the file was written earlier in this cell, so it is trusted)
rf_bestmodel = joblib.load('random_forest_model.joblib')

# Load the test dataset and clean it the same way as the training data
test_df = pd.read_csv('https://raw.githubusercontent.com/andvise/DataAnalyticsDatasets/main/test_dataset.csv', index_col=0)
test_df = test_df.replace([np.inf, -np.inf], 0)
test_df = test_df.fillna(0)

# Split into features and target (last column is the target)
X_test_dat = test_df.iloc[:, :-1]
y_test_dat = test_df.iloc[:, -1]

# Feature normalization: reuse the scaler that was fitted for training instead
# of fitting a new one. Re-fitting on the new data would standardize it with
# different means/variances than the model saw during training, so the learned
# split thresholds would no longer line up with the inputs.
# Assumes the new dataset has the same feature columns, in the same order, as
# the training dataset -- TODO confirm.
X_test_dat = scaler.transform(X_test_dat)

# Make predictions on the test data
y_pred_b = rf_bestmodel.predict(X_test_dat)

# Calculate the accuracy of the classifier
accuracy_final = accuracy_score(y_test_dat, y_pred_b)
print('\nAccuracy of the best model on the new dataset:', accuracy_final)


Accuracy obtained by Random Forest Classifier: 0.9986187845303868

Classification report:
                 precision    recall  f1-score   support

       3clique       1.00      1.00      1.00        28
        3color       1.00      1.00      1.00        32
       4clique       1.00      0.95      0.98        22
        4color       1.00      1.00      1.00        37
       5clique       0.97      1.00      0.98        30
        5color       1.00      1.00      1.00        35
cliquecoloring       1.00      1.00      1.00        85
    dominating       1.00      1.00      1.00        85
      matching       1.00      1.00      1.00        27
            op       1.00      1.00      1.00        58
           php       1.00      1.00      1.00        70
    subsetcard       1.00      1.00      1.00        77
        tiling       1.00      1.00      1.00        42
       tseitin       1.00      1.00      1.00        96

      accuracy                           1.00       724
     macro 