In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import joblib
import os
import logging
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import make_scorer, f1_score
import csv
import numpy as np

class DataLoader:
    @staticmethod
    def safe_load_csv_dataset(file_path: str, is_train=True):
        try:
            with open(file_path, 'r') as file:
                reader = csv.reader(file)
                next(reader)  # Skip header row
                data = np.array([row for row in reader], dtype=float)

            if is_train:
                # If it's the training data, assume labels are in the last column
                labels = data[:, -1]
                features = data[:, :-1]
            else:
                # If it's the testing data, there are no labels
                labels = None
                features = data

            return features, labels

        except FileNotFoundError as e:
            logging.error(f"File not found: {str(e)}")
            return None, None
        except ValueError as e:
            logging.error(f"Value error: {str(e)}")
            return None, None
logging.basicConfig(level=logging.INFO)

logging.basicConfig(level=logging.INFO)

data_path = "/content/drive/MyDrive/climateDoc/classification-of-extreme-weather-events-udem"
file_names = {'train': 'train.csv', 'test': 'test.csv'}

loader = DataLoader()
train_data, train_labels = loader.safe_load_csv_dataset(os.path.join(data_path, file_names['train']), True)

X_train, X_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
param_grid_xgb = {
    'n_estimators': [50, 100, 200, 500, 1000],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8, 10, 12],
    'colsample_bytree': [0.3, 0.5, 0.7, 0.9, 1],
    'subsample': [0.3, 0.5, 0.7, 0.9, 1],
    'gamma': [0, 0.1, 0.2, 0.3, 0.5, 1, 2],
    'min_child_weight': [1, 2, 3, 4, 5, 6],
    'reg_alpha': [0, 0.1, 0.5, 1, 2],  # L1 regularization term on weights
    'reg_lambda': [1, 1.5, 2, 3, 4.5]  # L2 regularization term on weights
}

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

f1_scorer = make_scorer(f1_score, average='weighted')

# Using RandomizedSearchCV with the XGBoost classifier
random_search_xgb = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid_xgb,
                                      n_iter=200,  # increased iterations
                                      scoring=f1_scorer, cv=5, verbose=4, n_jobs=-1, random_state=42)

logging.info("Starting Randomized Search for XGBoost...")

random_search_xgb.fit(X_train, y_train)

logging.info("Randomized Search for XGBoost complete.")

joblib.dump(random_search_xgb, '/content/drive/MyDrive/climateDoc/saved_models/random_search_xgb.joblib')

logging.info("Saved RandomizedSearchCV object for XGBoost to random_search_xgb.joblib.")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [None]:
import joblib  # saving and loading Python objects efficiently.
import os  # providing a way of using operating system dependent functionality.
from sklearn.ensemble import RandomForestClassifier  # Import the RandomForest algorithm.
from sklearn.model_selection import GridSearchCV, train_test_split  # tools for hyperparameter tuning and splitting
from sklearn.metrics import make_scorer, f1_score  # tools for custom scoring function and the F1 score metric.
import logging  # Import logging to provide event logging to sys.stderr.
import csv
import numpy as np


class DataLoader:
    @staticmethod
    def safe_load_csv_dataset(file_path: str, is_train=True):
        try:
            with open(file_path, 'r') as file:
                reader = csv.reader(file)
                next(reader)  # Skip header row
                data = np.array([row for row in reader], dtype=float)

            if is_train:
                # If it's the training data, assume labels are in the last column
                labels = data[:, -1]
                features = data[:, :-1]
            else:
                # If it's the testing data, there are no labels
                labels = None
                features = data

            return features, labels

        except FileNotFoundError as e:
            logging.error(f"File not found: {str(e)}")
            return None, None
        except ValueError as e:
            logging.error(f"Value error: {str(e)}")
            return None, None


# Configure logging to log informational messages.
logging.basicConfig(level=logging.INFO)

# Define the path to the data.
data_path = "/content/drive/MyDrive/climateDoc/classification-of-extreme-weather-events-udem"
file_names = {'train': 'train.csv', 'test': 'test.csv'}

# Instantiate a DataLoader object.
loader = DataLoader()

# Use os.path.join to construct a pathname with the path and filename.
# Load the training data and labels using the custom DataLoader class.
train_data, train_labels = loader.safe_load_csv_dataset(os.path.join(data_path, file_names['train']), True)

# Split the training data into training and validation subsets.
# 80% of the data is used for training and 20% is used for validation.
# random_state is a seed value to ensure reproducibility between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=42
)

# Define an expanded grid of hyperparameters for tuning the RandomForestClassifier.
# This grid will be explored during the grid search to find the best performing set of hyperparameters.
param_grid_rf = {
    # More trees may increase accuracy but also computational cost.
    'n_estimators': [10, 50, 100, 200, 300],  # Number of trees in the forest.
    # Criterion to split on at each node.
    # 'gini' refers to Gini Impurity which is a measure of misclassification,
    # indicating how mixed the classes are in two groups created by a potential split.
    # A Gini Impurity of 0 indicates perfect separation of classes.
    # 'entropy' refers to Information Gain which measures the reduction in entropy (disorder)
    # achieved by partitioning the dataset.
    # A higher information gain indicates a better split that results in purer subgroups.
    # Both 'gini' and 'entropy' are heuristics used to select the best split at each node by
    # evaluating the splits on all features and all possible threshold values for those features.
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of the trees.
    # None means nodes are expanded until they contain less than min_samples_split samples.
    'min_samples_split': [2, 5, 10, 15],  # Minimum number of samples required to split an internal node.
    'min_samples_leaf': [1, 2, 4, 8],  # Minimum number of samples required to be at a leaf node.
    'bootstrap': [True, False],  # Method for sampling data points (with or without replacement).
    'class_weight': [None, 'balanced', 'balanced_subsample'],  # Weights associated with classes
    'max_features': ['auto', 'sqrt', 'log2'],  # The number of features to consider when looking for the best split.
    'max_leaf_nodes': [None, 10, 50, 100],  # Grow trees with a certain maximum number of leaf nodes.
    # Splitting node only if this split induces a decrease of the impurity greater than or equal to this value.
    'min_impurity_decrease': [0.0, 0.01, 0.05]
}

# Instantiate a RandomForestClassifier object with a fixed random state for reproducibility.
rf = RandomForestClassifier(random_state=42)

# Create a custom scoring function using the make_scorer function and the F1 score metric.
# The F1 score is a measure model's precision and recall.
f1_scorer = make_scorer(f1_score, average='weighted')

# Instantiate a GridSearchCV object to perform a grid search of the RandomForestClassifier hyperparameters.
# This object will explore the parameter grid using cross-validation to find the best set of hyperparameters.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf,
                              scoring=f1_scorer,  # Use the custom F1 scoring function.
                              cv=5,  # Perform 5-fold cross-validation.
                              verbose=4,  # Output messages to the console.
                              n_jobs=-1)  # Use all available cores on the machine for parallel processing.

# Log the start of the grid search process to the console.
logging.info("Starting Grid Search...")

# Fit the GridSearchCV object to the training data.
# train a RandomForestClassifier for each combination of hyperparameters in the grid,
# and evaluate them using cross-validation.
grid_search_rf.fit(X_train, y_train)

# Log the completion of the grid search process to the console.
logging.info("Grid Search complete.")

# Save the fitted GridSearchCV object to disk for later use.
# This object contains the best set of hyperparameters found during the grid search,
# as well as the fitted RandomForestClassifier with those hyperparameters.
joblib.dump(grid_search_rf, '/content/drive/MyDrive/climateDoc/big_grid_search_rf.joblib')

# Log the saving of the GridSearchCV object to the console.
logging.info("Saved GridSearchCV object to grid_search_rf.joblib.")


Fitting 5 folds for each of 172800 candidates, totalling 864000 fits


In [None]:
import joblib
import os
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import make_scorer, f1_score
import csv
import numpy as np

class DataLoader:
    @staticmethod
    def safe_load_csv_dataset(file_path: str, is_train=True):
        try:
            with open(file_path, 'r') as file:
                reader = csv.reader(file)
                next(reader)  # Skip header row
                data = np.array([row for row in reader], dtype=float)

            if is_train:
                # If it's the training data, assume labels are in the last column
                labels = data[:, -1]
                features = data[:, :-1]
            else:
                # If it's the testing data, there are no labels
                labels = None
                features = data

            return features, labels

        except FileNotFoundError as e:
            logging.error(f"File not found: {str(e)}")
            return None, None
        except ValueError as e:
            logging.error(f"Value error: {str(e)}")
            return None, None
logging.basicConfig(level=logging.INFO)

data_path = "/content/drive/MyDrive/climateDoc/classification-of-extreme-weather-events-udem"
file_names = {'train': 'train.csv', 'test': 'test.csv'}

loader = DataLoader()
train_data, train_labels = loader.safe_load_csv_dataset(os.path.join(data_path, file_names['train']), True)

X_train, X_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

# Define an expanded grid of hyperparameters for tuning the RandomForestClassifier.
# This grid will be explored during the grid search to find the best performing set of hyperparameters.
param_grid_rf = {
    # More trees may increase accuracy but also computational cost.
    'n_estimators': [10, 50, 100, 200, 300],  # Number of trees in the forest.
    # Criterion to split on at each node.
    # 'gini' refers to Gini Impurity which is a measure of misclassification,
    # indicating how mixed the classes are in two groups created by a potential split.
    # A Gini Impurity of 0 indicates perfect separation of classes.
    # 'entropy' refers to Information Gain which measures the reduction in entropy (disorder)
    # achieved by partitioning the dataset.
    # A higher information gain indicates a better split that results in purer subgroups.
    # Both 'gini' and 'entropy' are heuristics used to select the best split at each node by
    # evaluating the splits on all features and all possible threshold values for those features.
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of the trees.
    # None means nodes are expanded until they contain less than min_samples_split samples.
    'min_samples_split': [2, 5, 10, 15],  # Minimum number of samples required to split an internal node.
    'min_samples_leaf': [1, 2, 4, 8],  # Minimum number of samples required to be at a leaf node.
    'bootstrap': [True, False],  # Method for sampling data points (with or without replacement).
    'class_weight': [None, 'balanced', 'balanced_subsample'],  # Weights associated with classes
    'max_features': ['auto', 'sqrt', 'log2'],  # The number of features to consider when looking for the best split.
    'max_leaf_nodes': [None, 10, 50, 100],  # Grow trees with a certain maximum number of leaf nodes.
    # Splitting node only if this split induces a decrease of the impurity greater than or equal to this value.
    'min_impurity_decrease': [0.0, 0.01, 0.05]
}

rf = RandomForestClassifier(random_state=42)

f1_scorer = make_scorer(f1_score, average='weighted')

# Using RandomizedSearchCV instead of GridSearchCV
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_rf,
                                     n_iter=100,  # You can adjust the number of iterations
                                     scoring=f1_scorer, cv=5, verbose=4, n_jobs=-1, random_state=42)

logging.info("Starting Randomized Search...")

random_search_rf.fit(X_train, y_train)

logging.info("Randomized Search complete.")

joblib.dump(random_search_rf, '/content/drive/MyDrive/climateDoc/saved_models/random_search_rf.joblib')

logging.info("Saved RandomizedSearchCV object to random_search_rf.joblib.")


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
import os
import csv
import logging
import joblib
import numpy as np
from datetime import datetime
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

# Keep your logging setup and DataLoader class
logging.basicConfig(level=logging.INFO)


class DataLoader:
    @staticmethod
    def safe_load_csv_dataset(file_path: str, is_train=True):
        try:
            with open(file_path, 'r') as file:
                reader = csv.reader(file)
                next(reader)  # Skip header row
                data = np.array([row for row in reader], dtype=float)

            if is_train:
                # If it's the training data, assume labels are in the last column
                labels = data[:, -1]
                features = data[:, :-1]
            else:
                # If it's the testing data, there are no labels
                labels = None
                features = data

            return features, labels

        except FileNotFoundError as e:
            logging.error(f"File not found: {str(e)}")
            return None, None
        except ValueError as e:
            logging.error(f"Value error: {str(e)}")
            return None, None


class SubmissionSaver:
    @staticmethod
    def save_submission(predictions, file_path='/content/drive/MyDrive/climateDoc/submission.csv'):
        with open(file_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNo', 'Label'])
            for i, label in enumerate(predictions, 1):
                writer.writerow([i, label])
        logging.info(f'Submission saved to {file_path}')


class ModelSaver:
    @staticmethod
    def save_model(_model):
        try:
            os.makedirs("saved_models", exist_ok=True)
            class_name = _model.__class__.__name__
            timestamp = datetime.now().strftime("%d_%H_%M_%S")
            filename = f"saved_models/{class_name}_{timestamp}.joblib"
            joblib.dump(_model, filename)
            logging.info(f"Model saved to {filename}")
        except Exception as e:
            logging.error(f"Error saving model: {str(e)}")


class Preprocessor:
    def __init__(self):
        self.mean = None
        self.std = None
        self.min = None
        self.max = None

    # Compute the values along each column, independently
    def fit(self, X):
        # Compute and store the mean of each feature/column in the dataset
        self.mean = np.mean(X, axis=0)
        # Compute and store the standard deviation of each feature/column in the dataset
        self.std = np.std(X, axis=0)
        # Compute and store the minimum value of each feature/column in the dataset
        self.min = np.min(X, axis=0)
        # Compute and store the maximum value of each feature/column in the dataset
        self.max = np.max(X, axis=0)

    # Method to apply either standardization or normalization to the dataset
    def transform(self, X, method='standardize'):
        # Check the method argument to determine the transformation to apply
        if method == 'standardize':
            # Call the standardize method (not defined in provided code) to standardize the dataset
            return self.standardize(X)
        elif method == 'normalize':
            # Call the normalize method to normalize the dataset
            return self.normalize(X)
        else:
            # If an unknown method is provided, raise a ValueError with a descriptive message
            raise ValueError(f"Unknown method: {method}")

    def standardize(self, X):
        # Standardization formula: (X - mean) / std
        # Ensure to add a small value to the denominator to avoid division by zero
        return (X - self.mean) / (self.std + 1e-8)

    def normalize(self, X):
        # Normalization formula: (X - min) / (max - min)
        # Ensure to add a small value to the denominator to avoid division by zero
        return (X - self.min) / (self.max - self.min + 1e-8)


class SimpleDummyClassifier:
    def __init__(self):
        self.unique_labels = None

    def fit(self, _: np.array, y: np.array) -> None:
        self.unique_labels = np.unique(y)

    def predict(self, _: np.array) -> np.array:
        num_samples = _.shape[0]
        return np.random.choice(self.unique_labels, size=num_samples)


class SoftLogisticRegression:
    # initialize the hyperparameters and sets up the initial values of weights and bias.
    def __init__(self, learning_rate=0.01, num_iterations=1000, regularization_strength=0.1):
        self.learning_rate = learning_rate  # step size used during optimization to find the minimum of loss function.
        self.num_iterations = num_iterations  # number of steps the optimizer will take to minimize loss function.
        self.regularization_strength = regularization_strength  # Controls regularization strength, prevent overfitting.
        self.weights = None  # Placeholder for the weights vector that will be learned from the data.
        self.bias = None  # Placeholder for the bias term that will be learned from the data.

    @staticmethod
    def softmax(z):
        # For numerical stability, subtract the maximum value of z for each sample.
        z -= np.max(z, axis=1, keepdims=True)
        # Compute the exponential of z to get unnormalized probabilities.
        exp_z = np.exp(z)
        # Sum the unnormalized probabilities for each sample to normalize them.
        sum_exp_z = np.sum(exp_z, axis=1, keepdims=True)
        # Divide each unnormalized probability by the sum to get the normalized probabilities.
        return exp_z / sum_exp_z

    def fit(self, X, y):
        y = y.astype(int)  # Convert the labels to integer type in case they are not for indexing
        num_samples, num_features = X.shape
        num_classes = len(np.unique(y))  # Get the number of unique labels, which equals the number of classes.
        self.weights = np.zeros((num_features, num_classes))  # Initialize the weights matrix with zeros.
        self.bias = np.zeros(num_classes)  # Initialize the bias vector with zeros.
        y_one_hot = np.eye(num_classes)[y]
        for _ in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            probabilities = self.softmax(linear_model)
            error = probabilities - y_one_hot
            gradient_weights = (1 / num_samples) * np.dot(X.T, error) + self.regularization_strength * self.weights
            gradient_bias = (1 / num_samples) * np.sum(error, axis=0)
            self.weights -= self.learning_rate * gradient_weights
            self.bias -= self.learning_rate * gradient_bias

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        probabilities = self.softmax(linear_model)
        return np.argmax(probabilities, axis=1)

def calculate_metrics(y_true, y_pred, label):
    tp = np.sum((y_true == label) & (y_pred == label))
    fp = np.sum((y_true != label) & (y_pred == label))
    fn = np.sum((y_true == label) & (y_pred != label))
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    # Step 6: Calculate F1 Score
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1


def classification_report_custom(y_true, y_pred):

    labels = np.unique(y_true)

    # Step 2: Iterate Over Each Unique Label
    for label in labels:
        precision, recall, f1 = calculate_metrics(y_true, y_pred, label)
        # Step 3: Display Metrics for Each Label
        print(f'Label: {label}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')


class ClimateAnalysisPipeline:
    def __init__(self):
        logging.info("Initializing ClimateAnalysisPipeline")
        self.training_set = {'data': None, 'labels': None}
        self.inference_set = {'data': None, 'labels': None}

    def load_datasets(self, _path, _file_names):
        loader = DataLoader()
        self.training_set['data'], self.training_set['labels'] = \
            loader.safe_load_csv_dataset(os.path.join(_path, _file_names['train']), True)
        self.inference_set['data'], _ = \
            loader.safe_load_csv_dataset(os.path.join(_path, _file_names['test']), False)

    def train_and_evaluate(self):
        logging.info("Starting model training and evaluation")
        if self.training_set['data'] is not None:
            X_train, X_val, y_train, y_val = train_test_split(
                self.training_set['data'],
                self.training_set['labels'],
                test_size=0.2, random_state=42
            )

            # Adding XGBoost to the model list
            xg_reg = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree=0.3, learning_rate=0.1,
                                       max_depth=5, alpha=10, n_estimators=10)
            # Run gridsearch.py before this line
            grid_search_rf = joblib.load('/content/drive/MyDrive/climateDoc/saved_models/random_search_rf.joblib')
            # Get the best estimator from the GridSearch
            best_rf = grid_search_rf.best_estimator_

              # Run gridsearch.py before this line
            grid_search_xgb = joblib.load('/content/drive/MyDrive/climateDoc/saved_models/random_search_xgb.joblib')
            # Get the best estimator from the GridSearch
            best_xgb = grid_search_xgb.best_estimator_


            baseline_models = [
                ('SimpleDummy', SimpleDummyClassifier()),
                ('SoftLogisticRegression', SoftLogisticRegression()),
                ('Dummy', DummyClassifier(strategy="uniform")),
                ('SGD', SGDClassifier(class_weight='balanced')),
                ('SVC', SVC(class_weight='balanced')),
                ('RandomForest_best_rf', best_rf),
                ('RandomForest', RandomForestClassifier(class_weight='balanced')),
                ('LogisticRegression', LogisticRegression(max_iter=1000, class_weight='balanced')),
                ('XGBoost', xg_reg),
                ('XGBoost_best_xgb', xg_reg)
            ]

            best_f1 = 0.0
            best_model_name = ""
            _best_model = None

            for name, model in baseline_models:
                model.fit(X_train, y_train)
                _predictions = model.predict(X_val)
                print(f'Performance of {name}:')
                classification_report_custom(y_val, _predictions)
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=UndefinedMetricWarning)
                    report = classification_report(y_val, _predictions, output_dict=True, zero_division=0)
                f1 = report['weighted avg']['f1-score']
                logging.info("Model {} F1-score: {:.4f}".format(name, f1))

                if f1 > best_f1:
                    best_f1 = f1
                    best_model_name = name
                    _best_model = model

            logging.info("Best model is {} with F1-score of {:.4f}".format(best_model_name, best_f1))
            ModelSaver.save_model(_best_model)
            return _best_model
        else:
            logging.error("Training data is missing. Cannot proceed.")
            return None



data_path = "/content/drive/MyDrive/climateDoc/classification-of-extreme-weather-events-udem"
file_names = {'train': 'train.csv', 'test': 'test.csv'}

loader = DataLoader()
train_data, train_labels = loader.safe_load_csv_dataset(os.path.join(data_path, file_names['train']), True)
test_data, _ = loader.safe_load_csv_dataset(os.path.join(data_path, file_names['test']), False)

if train_data is None or test_data is None:
    logging.error("Data loading failed. Cannot proceed.")
    exit(1)

preprocessor = Preprocessor()
preprocessor.fit(train_data)  # Compute statistics based on the training data
# Standardize the training data
train_data = preprocessor.transform(train_data, method='standardize')
# Standardize the test data using the same statistics
test_data = preprocessor.transform(test_data, method='standardize')

# Create an instance of ClimateAnalysisPipeline
pipeline = ClimateAnalysisPipeline()

# Set the training and inference datasets
pipeline.training_set['data'], pipeline.training_set['labels'] = train_data, train_labels
pipeline.inference_set['data'] = test_data

# Call train_and_evaluate
best_model = pipeline.train_and_evaluate()

if best_model:
    test_predictions = best_model.predict(test_data)
    SubmissionSaver.save_submission(test_predictions)
else:
    logging.error("No best model, sorry")




Performance of SimpleDummy:
Label: 0.0, Precision: 0.79, Recall: 0.34, F1: 0.47
Label: 1.0, Precision: 0.04, Recall: 0.35, F1: 0.08
Label: 2.0, Precision: 0.17, Recall: 0.33, F1: 0.22
Performance of SoftLogisticRegression:
Label: 0.0, Precision: 0.81, Recall: 0.98, F1: 0.89
Label: 1.0, Precision: 0.72, Recall: 0.04, F1: 0.07
Label: 2.0, Precision: 0.63, Recall: 0.13, F1: 0.21
Performance of Dummy:
Label: 0.0, Precision: 0.80, Recall: 0.33, F1: 0.47
Label: 1.0, Precision: 0.04, Recall: 0.33, F1: 0.07
Label: 2.0, Precision: 0.17, Recall: 0.34, F1: 0.23
Performance of SGD:
Label: 0.0, Precision: 0.88, Recall: 0.84, F1: 0.86
Label: 1.0, Precision: 0.28, Recall: 0.60, F1: 0.38
Label: 2.0, Precision: 0.58, Recall: 0.55, F1: 0.56
Performance of SVC:
Label: 0.0, Precision: 0.98, Recall: 0.76, F1: 0.86
Label: 1.0, Precision: 0.41, Recall: 0.96, F1: 0.57
Label: 2.0, Precision: 0.54, Recall: 0.91, F1: 0.67


  warn(


Performance of RandomForest_best_rf:
Label: 0.0, Precision: 0.98, Recall: 0.86, F1: 0.92
Label: 1.0, Precision: 0.59, Recall: 0.90, F1: 0.72
Label: 2.0, Precision: 0.64, Recall: 0.90, F1: 0.75
Performance of RandomForest:
Label: 0.0, Precision: 0.91, Recall: 0.91, F1: 0.91
Label: 1.0, Precision: 0.66, Recall: 0.59, F1: 0.62
Label: 2.0, Precision: 0.64, Recall: 0.64, F1: 0.64
Performance of LogisticRegression:
Label: 0.0, Precision: 0.95, Recall: 0.58, F1: 0.72
Label: 1.0, Precision: 0.23, Recall: 0.91, F1: 0.36
Label: 2.0, Precision: 0.41, Recall: 0.85, F1: 0.55
Performance of XGBoost:
Label: 0.0, Precision: 0.85, Recall: 0.97, F1: 0.91
Label: 1.0, Precision: 0.83, Recall: 0.35, F1: 0.49
Label: 2.0, Precision: 0.77, Recall: 0.35, F1: 0.48
