In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Cleaning


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# Read the cleaned dataset, drop the 'Order Zipcode' and 'Shipping Date' columns,
# and report how many columns were removed.
df = pd.read_csv('/content/drive/MyDrive/dataset_clean.csv')
num_cols_before = len(df.columns)
df = df.drop(columns=['Order Zipcode', 'Shipping Date'])
num_cols_after = len(df.columns)
num_cols_dropped = num_cols_before - num_cols_after
print("Number of columns dropped:", num_cols_dropped)

Number of columns dropped: 2


In [None]:
# Names of all numeric columns of df, skipping any column whose name contains 'non_numeric'.
numeric_columns = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name not in df.filter(like='non_numeric').columns
]
numeric_columns

In [None]:
def get_column_dtypes(df):
  """Map each column name of ``df`` to a dtype specifier string.

  Non-numeric columns are mapped to ``'category'``; numeric columns are mapped
  to the string form of their current dtype (for example ``'int64'``).

  Args:
    df: pandas DataFrame whose columns are inspected.

  Returns:
    dict mapping column name -> dtype string, in column order.
  """
  dtype_dict = {}
  for col in df.columns:
    dtype = df[col].dtype
    if not is_numeric_dtype(dtype):
      dtype_dict[col] = 'category'
    else:
      # str(dtype) is already the bare dtype name. Do not trim it with
      # str.strip(): strip() removes a *set of characters* from both ends,
      # not a prefix/suffix, and can eat letters of the dtype name itself.
      dtype_dict[col] = str(dtype)
  return dtype_dict


# Collect a dtype specifier for every column of df (non-numeric columns become 'category').
data_types = get_column_dtypes(df)
print(data_types)

{'Type': 'category', 'Days for shipping (real)': 'int64', 'Days for shipment (scheduled)': 'int64', 'Sales per customer': 'float64', 'Delivery Status': 'category', 'Late_delivery_risk': 'int64', 'Category Id': 'int64', 'Category Name': 'category', 'Customer City': 'category', 'Customer Country': 'category', 'Customer Fname': 'category', 'Customer Id': 'int64', 'Customer Lname': 'category', 'Customer Segment': 'category', 'Customer State': 'category', 'Customer Street': 'category', 'Customer Zipcode': 'float64', 'Department Id': 'int64', 'Department Name': 'category', 'Latitude': 'float64', 'Longitude': 'float64', 'Market': 'category', 'Order City': 'category', 'Order Country': 'category', 'Order Customer Id': 'int64', 'Order Date': 'category', 'Order Id': 'int64', 'Order Item Cardprod Id': 'int64', 'Order Item Discount': 'float64', 'Order Item Discount Rate': 'float64', 'Order Item Id': 'int64', 'Order Item Profit Ratio': 'float64', 'Order Item Quantity': 'int64', 'Sales': 'float64', '

In [None]:
# Show the column labels currently present in df.
df.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Sales per customer', 'Delivery Status', 'Late_delivery_risk',
       'Category Id', 'Category Name', 'Customer City', 'Customer Country',
       'Customer Fname', 'Customer Id', 'Customer Lname', 'Customer Segment',
       'Customer State', 'Customer Street', 'Customer Zipcode',
       'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market',
       'Order City', 'Order Country', 'Order Customer Id', 'Order Date',
       'Order Id', 'Order Item Cardprod Id', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Profit Ratio',
       'Order Item Quantity', 'Sales', 'Order Profit Per Order',
       'Order Region', 'Order State', 'Order Status', 'Product Card Id',
       'Product Category Id', 'Product Name', 'Product Price',
       'Product Status', 'Shipping Mode'],
      dtype='object')

In [None]:
# Re-read the same CSV, this time applying the per-column dtypes collected in data_types.
data = pd.read_csv('/content/drive/MyDrive/dataset_clean.csv', delimiter=',', dtype=data_types)

# Alias for the list of numeric column names computed from df.
numerical_features = numeric_columns

In [None]:
# Columns removed from the modelling frame.
# NOTE: re-running this cell raises KeyError because the columns are already gone from data.
columns_to_drop = ['Department Name', 'Latitude', 'Longitude','Customer Zipcode','Type','Product Category Id','Customer State', 'Customer Street','Product Status','Order Zipcode','Shipping Date']
data = data.drop(columns=columns_to_drop)

In [None]:
# Truncate data to at most its first 100000 rows.
data = data.head(100000)

In [None]:
# Recompute the list of numeric column names on the reduced frame,
# skipping any column whose name contains 'non_numeric'.
numeric_columns = [col for col in data.select_dtypes(include=np.number) if col not in data.filter(like='non_numeric')]
numeric_columns

['Days for shipping (real)',
 'Days for shipment (scheduled)',
 'Sales per customer',
 'Late_delivery_risk',
 'Category Id',
 'Customer Id',
 'Department Id',
 'Order Customer Id',
 'Order Id',
 'Order Item Cardprod Id',
 'Order Item Discount',
 'Order Item Discount Rate',
 'Order Item Id',
 'Order Item Profit Ratio',
 'Order Item Quantity',
 'Sales',
 'Order Profit Per Order',
 'Product Card Id',
 'Product Price']

In [None]:
# Report, column by column, how many missing (NaN) values the frame contains.
for col, nan_count in data.isna().sum().items():
  print(col, nan_count)

Days for shipping (real) 0
Days for shipment (scheduled) 0
Sales per customer 0
Delivery Status 0
Late_delivery_risk 0
Category Id 0
Category Name 0
Customer City 0
Customer Country 0
Customer Fname 0
Customer Id 0
Customer Lname 0
Customer Segment 0
Department Id 0
Market 0
Order City 0
Order Country 0
Order Customer Id 0
Order Date 0
Order Id 0
Order Item Cardprod Id 0
Order Item Discount 0
Order Item Discount Rate 0
Order Item Id 0
Order Item Profit Ratio 0
Order Item Quantity 0
Sales 0
Order Profit Per Order 0
Order Region 0
Order State 0
Order Status 0
Product Card Id 0
Product Name 0
Product Price 0
Shipping Mode 0


In [None]:
# Drop every row that contains at least one null value and print the frame shape
# before and after, so the number of removed rows is visible.
print("Before dropping null values:", data.shape)
data = data.dropna()
print("After dropping null values:", data.shape)

Before dropping null values: (20000, 35)
After dropping null values: (20000, 35)


# Feature Selection


In [None]:
# Target: the product category name. Predictors: every other column, with
# categorical columns expanded into indicator variables by get_dummies.
# NOTE(review): 'Category Id' stays among the predictors and presumably identifies
# the target directly — confirm whether that is intended.
y = data['Category Name']
X = pd.get_dummies(data.drop(columns=['Category Name']))

In [None]:
# Print the shapes of X and y; their row counts should match.
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (20000, 34)
y shape: (20000,)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a Random Forest classifier. The fixed seed makes the importance ranking
# (and therefore the selected features) reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Retrieve feature importance scores
feature_importances = model.feature_importances_

# Pair every feature name with its importance score
feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the dataframe by importance scores in descending order
feature_importances_df = feature_importances_df.sort_values('Importance', ascending=False)

# Select the top-k features
k = 5
selected_features = feature_importances_df.head(k)['Feature'].tolist()

# Subset the X dataframe with the selected features
X_selected = X[selected_features]

# Print the selected features
print("Selected Features:")
print(X_selected.columns)

In [None]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time
# Identify categorical and numerical feature columns. The target column
# ('Category Name') is removed first so that it is not one-hot encoded into the
# feature matrix and then scored against itself.
features = data.drop(columns=['Category Name'])
categorical_cols = features.select_dtypes(include=['object', 'category']).columns
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply preprocessing to the feature columns only
X_preprocessed = preprocessor.fit_transform(features)

In [None]:
# Time the mutual-information computation, which is the expensive step of this cell.
start_time = time.time()

# Calculate information gain (mutual information) of each preprocessed feature with respect to y
info_gain = mutual_info_classif(X_preprocessed, y)

end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

# Rebuild the feature names in the order used by the preprocessor:
# numerical columns first, then the one-hot encoded categorical columns.
feature_names = (numerical_cols.tolist() +
                 preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols).tolist())

# Create a dataframe with feature names and information gain scores
feature_info_gain_df = pd.DataFrame({'Feature': feature_names, 'Information_Gain': info_gain})

# Sort the dataframe by information gain scores in descending order
feature_info_gain_df = feature_info_gain_df.sort_values('Information_Gain', ascending=False)

# Select the top-k features (here the top 10)
k = 10
selected_features = feature_info_gain_df.head(k)['Feature'].tolist()
selected_features
# The selected names are names of transformed features (one-hot columns included), so they
# may not correspond one-to-one to columns of the original frame. Map them back to original
# columns, or use the transformed features directly, before subsetting.


In [None]:
# Narrow the selection to the five features with the highest information gain.
k = 5
selected_features = feature_info_gain_df.head(k)['Feature'].tolist()
selected_features

['Order Item Cardprod Id',
 'Category Id',
 'Product Card Id',
 'Order Item Id',
 'Product Price']

In [None]:
from sklearn.feature_selection import RFE

# Base estimator whose feature importances drive the elimination.
# Seeded so that the set of surviving features is reproducible.
model = RandomForestClassifier(random_state=42)

# Perform Recursive Feature Elimination (RFE) down to five features
rfe = RFE(estimator=model, n_features_to_select=5)
X_selected = rfe.fit_transform(X, y)

# Get the selected feature names (rfe.support_ is a boolean mask over X's columns)
selected_feature_names = X.columns[rfe.support_]

# Print the selected feature names
print("Selected Features:")
print(selected_feature_names)

# Models


In [None]:
def take_average(res: list, algorithm_name: str):
    """Average per-fold metrics, print them and record them under ``algorithm_name``.

    Args:
        res: Non-empty list with one entry per fold; each entry holds exactly six
            values in the order accuracy, precision, recall, F1, F1-micro, F1-macro.
        algorithm_name: Label used in the printed report and as the key under which
            the averages are stored in the module-level ``val_dict``.

    Returns:
        list: The six averaged metrics, in the same order as the fold entries.

    Raises:
        ZeroDivisionError: If ``res`` is empty.
        ValueError: If a fold entry does not contain exactly six values.
    """
    totals = [0, 0, 0, 0, 0, 0]
    for acc, prec, rec, f1, f1_micro, f1_macro in res:
        fold = (acc, prec, rec, f1, f1_micro, f1_macro)
        totals = [running + value for running, value in zip(totals, fold)]

    fold_count = len(res)
    averages = [total / fold_count for total in totals]
    avg_accuracy, avg_precision, avg_recall, avg_f1, avg_f1_micro, avg_f1_macro = averages

    print(f"Average Metrics for {algorithm_name}")
    print(f"Accuracy {avg_accuracy}")
    print(f"Precision {avg_precision}")
    print(f"Recall {avg_recall}")
    print(f"F1 {avg_f1}\n")
    print(f"F1-micro {avg_f1_micro}\n")
    print(f"F1-macro {avg_f1_macro}\n")

    # val_dict is the module-level store read by the plotting cells. Create it on
    # first use so that calling this function on a fresh kernel does not raise
    # NameError; an already existing val_dict is reused unchanged.
    globals().setdefault("val_dict", {})[algorithm_name] = averages
    return averages

def print_confusion_matrix(metrics_cv: list, all_y_tests: list, all_y_preds: list, algorithm_name: str):
    """Print the metrics and the confusion matrix of the fold with the highest F1.

    Args:
        metrics_cv: One ``[accuracy, precision, recall, f1, f1_micro, f1_macro]``
            row per fold.
        all_y_tests: True labels of each fold, in the same order as ``metrics_cv``.
        all_y_preds: Predicted labels of each fold, in the same order as ``metrics_cv``.
        algorithm_name: Label used in the printed report.
    """
    f1_position = 3
    # max() keeps the earliest fold when several folds share the top F1 score.
    best_fold = max(range(len(metrics_cv)), key=lambda fold: metrics_cv[fold][f1_position])

    y_test = all_y_tests[best_fold]
    y_pred = all_y_preds[best_fold]
    best_metrics = metrics_cv[best_fold]

    print(f"Best Metric based on F1 for {algorithm_name}")
    print(f"Accuracy {best_metrics[0]}")
    print(f"Precision {best_metrics[1]}")
    print(f"Recall {best_metrics[2]}")
    print(f"F1 {best_metrics[3]}\n")
    print(f"F1-micro {best_metrics[4]}\n")
    print(f"F1-macro {best_metrics[5]}\n")

    print(f"Confusion Matrix {algorithm_name}")
    print(confusion_matrix(y_test, y_pred))


# SVM

In [None]:
def svm(X, y):
    """Evaluate a default-configured SVC with stratified 10-fold cross-validation.

    Args:
        X: pandas object holding the features; rows are selected with ``.iloc``.
        y: pandas Series of class labels aligned with ``X``.

    Returns:
        tuple: ``(metrics_cv, all_y_tests, all_y_preds)``. ``metrics_cv`` has one
        ``[accuracy, precision, recall, f1, f1_micro, f1_macro]`` row per fold
        (precision, recall and f1 use weighted averaging); the other two lists hold
        the true and predicted labels of each test fold.
    """
    classifier = SVC()
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    metrics_cv, all_y_tests, all_y_preds = [], [], []

    for train_index, test_index in splitter.split(X, y):
        classifier.fit(X.iloc[train_index], y.iloc[train_index])
        y_test = y.iloc[test_index]
        y_pred = classifier.predict(X.iloc[test_index])

        fold_metrics = [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='micro', zero_division=0),
            f1_score(y_test, y_pred, average='macro', zero_division=0),
        ]

        all_y_tests.append(y_test)
        all_y_preds.append(y_pred)
        metrics_cv.append(fold_metrics)

    return metrics_cv, all_y_tests, all_y_preds

# Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier

def sgd(X, y):
    """Evaluate an SGD-trained linear classifier with stratified 10-fold cross-validation.

    Args:
        X: pandas object holding the features; rows are selected with ``.iloc``.
        y: pandas Series of class labels aligned with ``X``.

    Returns:
        tuple: ``(metrics_cv, all_y_tests, all_y_preds)``. ``metrics_cv`` has one
        ``[accuracy, precision, recall, f1, f1_micro, f1_macro]`` row per fold
        (precision, recall and f1 use weighted averaging); the other two lists hold
        the true and predicted labels of each test fold.
    """
    # Seed the estimator as well as the splitter so repeated runs give identical scores.
    classifier = SGDClassifier(random_state=42)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    metrics_cv = []
    all_y_tests = []
    all_y_preds = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        metrics_cv.append([
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='micro', zero_division=0),
            f1_score(y_test, y_pred, average='macro', zero_division=0),
        ])
        all_y_tests.append(y_test)
        all_y_preds.append(y_pred)

    return metrics_cv, all_y_tests, all_y_preds

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree(X, y):
    """Evaluate a decision tree classifier with stratified 10-fold cross-validation.

    Args:
        X: pandas object holding the features; rows are selected with ``.iloc``.
        y: pandas Series of class labels aligned with ``X``.

    Returns:
        tuple: ``(metrics_cv, all_y_tests, all_y_preds)``. ``metrics_cv`` has one
        ``[accuracy, precision, recall, f1, f1_micro, f1_macro]`` row per fold
        (precision, recall and f1 use weighted averaging); the other two lists hold
        the true and predicted labels of each test fold.
    """
    # Seed the estimator as well as the splitter so repeated runs give identical scores.
    dt = DecisionTreeClassifier(random_state=42)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    metrics_cv = []
    all_y_preds = []
    all_y_tests = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)

        metrics_cv.append([
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='micro', zero_division=0),
            f1_score(y_test, y_pred, average='macro', zero_division=0),
        ])
        all_y_tests.append(y_test)
        all_y_preds.append(y_pred)

    return metrics_cv, all_y_tests, all_y_preds

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes(X, y):
    """Evaluate Gaussian Naive Bayes with stratified 10-fold cross-validation.

    Args:
        X: pandas object holding the features; rows are selected with ``.iloc``.
        y: pandas Series of class labels aligned with ``X``.

    Returns:
        tuple: ``(metrics_cv, all_y_tests, all_y_preds)``. ``metrics_cv`` has one
        ``[accuracy, precision, recall, f1, f1_micro, f1_macro]`` row per fold
        (precision, recall and f1 use weighted averaging); the other two lists hold
        the true and predicted labels of each test fold.
    """
    classifier = GaussianNB()
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    metrics_cv, all_y_tests, all_y_preds = [], [], []

    for train_index, test_index in splitter.split(X, y):
        classifier.fit(X.iloc[train_index], y.iloc[train_index])
        y_test = y.iloc[test_index]
        y_pred = classifier.predict(X.iloc[test_index])

        fold_metrics = [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='micro', zero_division=0),
            f1_score(y_test, y_pred, average='macro', zero_division=0),
        ]

        metrics_cv.append(fold_metrics)
        all_y_preds.append(y_pred)
        all_y_tests.append(y_test)

    return metrics_cv, all_y_tests, all_y_preds

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def random_forest(X, y):
    """Evaluate a random forest classifier with stratified 10-fold cross-validation.

    Args:
        X: pandas object holding the features; rows are selected with ``.iloc``.
        y: pandas Series of class labels aligned with ``X``.

    Returns:
        tuple: ``(metrics_cv, all_y_tests, all_y_preds)``. ``metrics_cv`` has one
        ``[accuracy, precision, recall, f1, f1_micro, f1_macro]`` row per fold
        (precision, recall and f1 use weighted averaging); the other two lists hold
        the true and predicted labels of each test fold.
    """
    # Seed the estimator as well as the splitter so repeated runs give identical scores.
    forest = RandomForestClassifier(random_state=42)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    metrics_cv = []
    all_y_preds = []
    all_y_tests = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        forest.fit(X_train, y_train)
        y_pred = forest.predict(X_test)

        metrics_cv.append([
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='micro', zero_division=0),
            f1_score(y_test, y_pred, average='macro', zero_division=0),
        ])
        all_y_preds.append(y_pred)
        all_y_tests.append(y_test)

    return metrics_cv, all_y_tests, all_y_preds

# Multi-Layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

def mlp(X, y):
    """Evaluate a small multi-layer perceptron with stratified 10-fold cross-validation.

    The network has two hidden layers of 5 and 2 units and trains for at most
    500 iterations.

    Args:
        X: pandas object holding the features; rows are selected with ``.iloc``.
        y: pandas Series of class labels aligned with ``X``.

    Returns:
        tuple: ``(metrics_cv, all_y_tests, all_y_preds)``. ``metrics_cv`` has one
        ``[accuracy, precision, recall, f1, f1_micro, f1_macro]`` row per fold
        (precision, recall and f1 use weighted averaging); the other two lists hold
        the true and predicted labels of each test fold.
    """
    # Seed the estimator as well as the splitter so repeated runs give identical scores.
    network = MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=500, random_state=42)

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    metrics_cv = []
    all_y_preds = []
    all_y_tests = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        network.fit(X_train, y_train)
        y_pred = network.predict(X_test)

        metrics_cv.append([
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted', zero_division=0),
            recall_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='weighted', zero_division=0),
            f1_score(y_test, y_pred, average='micro', zero_division=0),
            f1_score(y_test, y_pred, average='macro', zero_division=0),
        ])
        all_y_preds.append(y_pred)
        all_y_tests.append(y_test)

    return metrics_cv, all_y_tests, all_y_preds

# Experiment

In [None]:
# Classifier labels, in the same order as the evaluation calls inside the loop.
names = ["Support Vector Machine", "Stochastic Gradient Descent", "Decision Tree", "Naive Bayes", "Random Forest", "Multi-Layer Perceptron"]

for i in range(len(selected_features)):
    feature_set = selected_features[i]
    # An entry may be a single column name or a collection of names. A bare name
    # would make X[...] a 1-D Series, which the estimators reject, so it is wrapped
    # in a list to keep X_selected a 2-D DataFrame in both cases.
    columns = [feature_set] if isinstance(feature_set, str) else list(feature_set)
    X_selected = X[columns]
    print(f"Feature Set #{i}")
    print(selected_features[i])
    results = [svm(X_selected, y), sgd(X_selected, y), decision_tree(X_selected, y), naive_bayes(X_selected, y), random_forest(X_selected, y), mlp(X_selected, y)]
    # Persist only the per-fold metric rows (first element of each result tuple).
    np.save(f"results_f{i}.npy", [row[0] for row in results])
    for name, (metrics_cv, all_y_tests, all_y_preds) in zip(names, results):
        take_average(metrics_cv, name)
        print_confusion_matrix(metrics_cv, all_y_tests, all_y_preds, name)

In [None]:
from scipy.stats import ttest_ind
# Reload the per-fold metrics saved for feature set 0.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use it
# on files produced by this notebook.
results = np.load("results_f0.npy", allow_pickle=True)
# d maps 1-based positions to the flattened values of results.
d = dict(enumerate(results.flatten(), 1))
print(results)

# names = ["Support Vector Machine", "Stochastic Gradient Descent", "Decision Tree", "Naive Bayes", "Random Forest", "Multi-Layer Perceptron"]
# best models: 4 and 2
# Independent two-sample t-test between entries 4 and 2 of results (Random Forest and
# Decision Tree according to the names list above), using all of their flattened values.
t_statistic, p_value = ttest_ind(results[4].flatten(), results[2].flatten())

# Display the results
print("t-statistic:", t_statistic)
print("p-value:", p_value)

def f1_bar_chart_best(results):
    """Draw one bar per algorithm showing the F1 score of the fold at index 3.

    Args:
        results: Indexable collection with one entry per algorithm; each entry is a
            sequence of per-fold metric rows in which position 3 is the F1 score.
            The bars are labelled with the six hard-coded algorithm names below.
    """
    algorithms = ["Support Vector Machine", "Stochastic Gradient Descent", "Decision Tree", "Naive Bayes", "Random Forest", "Multi-Layer Perceptron"]
    fold_index, f1_index = 3, 3
    # NOTE(review): the fold index is fixed at 3; presumably that was the best fold — confirm.
    values = [per_algorithm[fold_index][f1_index] for per_algorithm in results]
    print(values)

    bar_count = len(results)
    positions = np.arange(bar_count)

    fig, ax = plt.subplots(figsize=(10, 8))

    # One 'tab10' colour per bar
    palette = plt.cm.tab10(np.arange(bar_count))
    ax.bar(positions, values, width=0.5, color=palette)

    ax.set_xticks(positions)
    ax.set_xticklabels(algorithms, rotation='vertical')
    ax.set_ylabel('Values')
    ax.set_title('F1 Scores (Feature Set I)')

    plt.show()

# Draw the per-algorithm F1 bar chart for the metrics held in results.
f1_bar_chart_best(results)

def plot_chart_best(data_dict):
    """Draw grouped bars with every metric of the fold at index 3, per algorithm.

    Args:
        data_dict: Indexable collection with one entry per algorithm, in the order of
            the hard-coded algorithm names below; each entry is a sequence of per-fold
            metric rows ordered as accuracy, precision, recall, F1, F1-micro, F1-macro.
    """
    algorithms = ["Support Vector Machine", "Stochastic Gradient Descent", "Decision Tree", "Naive Bayes", "Random Forest", "Multi-Layer Perceptron"]
    fold_index = 3
    # Read from the argument rather than from a module-level variable, so the function
    # plots what it is given.
    values = [per_algorithm[fold_index] for per_algorithm in data_dict]

    legend_names = ['Accuracy', 'Precision', 'Recall', 'F1', 'F1-micro', 'F1-macro']

    num_bars = len(algorithms)
    num_values = len(values[0])

    # Create an array of indices for the bars
    indices = np.arange(num_bars)

    # Set the width of each bar
    bar_width = 0.07

    # Set the spacing between bars
    spacing = 0.05

    fig, ax = plt.subplots(figsize=(10, 8))

    for i in range(num_values):
        # Calculate the x-coordinate for each group of bars
        x = indices + i * (bar_width + spacing)

        # Extract the i-th metric of every algorithm
        y = [value[i] for value in values]

        # Plot the bars
        ax.bar(x, y, width=bar_width)

    # Centre the tick of each algorithm under its group of bars
    ax.set_xticks(indices + (bar_width * num_values + spacing * (num_values - 1)) / 2)
    ax.set_xticklabels(algorithms, rotation='vertical')

    # Set the y-axis label
    ax.set_ylabel('Values')

    # Set the plot title
    ax.set_title('All Metrics per Model')

    # One legend entry per plotted metric
    ax.legend(legend_names[:num_values])

    # Show the plot
    plt.show()

# Draw the grouped all-metrics chart for the metrics held in results.
plot_chart_best(results)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_bar_chart(data_dict):
    """Draw grouped bars with every averaged metric of each algorithm.

    Args:
        data_dict: Non-empty mapping of algorithm name -> list of metric values ordered
            as accuracy, precision, recall, F1, F1-micro, F1-macro. Shorter lists are
            accepted; only the metrics present are plotted and labelled.
    """
    algorithms = list(data_dict.keys())
    values = list(data_dict.values())

    legend_names = ['Accuracy', 'Precision', 'Recall', 'F1', 'F1-micro', 'F1-macro']

    num_bars = len(algorithms)
    num_values = len(values[0])

    # Create an array of indices for the bars
    indices = np.arange(num_bars)

    # Set the width of each bar
    bar_width = 0.07

    # Set the spacing between bars
    spacing = 0.05

    fig, ax = plt.subplots(figsize=(10, 8))

    for i in range(num_values):
        # Calculate the x-coordinate for each group of bars
        x = indices + i * (bar_width + spacing)

        # Extract the i-th metric of every algorithm
        y = [value[i] for value in values]

        # Plot the bars
        ax.bar(x, y, width=bar_width)

    # Centre the tick of each algorithm under its group of bars
    ax.set_xticks(indices + (bar_width * num_values + spacing * (num_values - 1)) / 2)
    ax.set_xticklabels(algorithms, rotation='vertical')

    # Set the y-axis label
    ax.set_ylabel('Values')

    # Set the plot title
    ax.set_title('All Metrics per Model')

    # One legend entry per plotted metric
    ax.legend(legend_names[:num_values])

    # Show the plot
    plt.show()

def f1_bar_chart(data_dict):
    """Draw one bar per algorithm showing the value at position 3 (F1) of its metrics.

    Args:
        data_dict: Mapping of algorithm name -> list of metric values in which
            position 3 is the F1 score.
    """
    f1_position = 3
    labels = list(data_dict)
    scores = [metrics[f1_position] for metrics in data_dict.values()]

    bar_count = len(labels)
    positions = np.arange(bar_count)

    fig, ax = plt.subplots(figsize=(10, 8))

    # One 'tab10' colour per bar
    palette = plt.cm.tab10(np.arange(bar_count))
    ax.bar(positions, scores, width=0.5, color=palette)

    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_ylabel('Values')
    ax.set_title('F1 scores Bar Plot')

    plt.show()

In [None]:
# Plot every averaged metric stored in val_dict, grouped by algorithm.
plot_bar_chart(val_dict)

In [None]:
# Plot the averaged F1 score stored in val_dict for each algorithm.
f1_bar_chart(val_dict)