In [None]:
import pandas as pd
import numpy as np
import lightgbm
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
from pathlib import Path

In [None]:
# Better rendering: cap the width of rendered table headers.
# The HTML object must be passed to display(); a bare HTML(...) expression that is
# not the last line of the cell is discarded and the style is never injected.
from IPython.display import HTML, display
display(HTML("<style>.rendered_html th {max-width: 120px;}</style>"))

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
# Locations of the four per-brand CSV exports.
# The directory can be overridden with the FORECASTING_DATA_DIR environment variable;
# when it is unset the original hard-coded location is used, so existing runs are unaffected.
DATA_DIR = Path(os.environ.get(
    "FORECASTING_DATA_DIR",
    "/Users/galuhprisillia/PycharmProjects/forecasting",
))

# Kept as plain strings, as before.
brand1_csv = str(DATA_DIR / "brand1.csv")
brand2_csv = str(DATA_DIR / "brand2.csv")
brand3_csv = str(DATA_DIR / "brand3.csv")
brand4_csv = str(DATA_DIR / "brand4.csv")

In [None]:
# Read each brand's CSV export into its own DataFrame.
df_brand1, df_brand2, df_brand3, df_brand4 = (
    pd.read_csv(csv_path)
    for csv_path in (brand1_csv, brand2_csv, brand3_csv, brand4_csv)
)

# Stack the four brand tables row-wise; each table's own row index is kept as-is.
df = pd.concat([df_brand1, df_brand2, df_brand3, df_brand4])

In [None]:
# Parse the date column into datetime values.
df['date'] = pd.to_datetime(df['date'])

# The product identifier arrives as default_code; expose it as product_id.
df = df.rename(columns={'default_code': 'product_id'})

# Split the signed product_qty into two non-negative columns:
# strictly positive quantities go to buy_quantity, strictly negative ones go to
# sell_quantity as magnitudes; every other entry becomes 0.
signed_qty = df['product_qty']
df['buy_quantity'] = signed_qty.where(signed_qty > 0, 0).abs()
df['sell_quantity'] = signed_qty.where(signed_qty < 0, 0).abs()

In [None]:
# Keep only rows whose is_pack flag equals False, then discard the columns that
# are not used further on: reference, is_pack and the signed product_qty.
not_pack = df['is_pack'] == False
df = df[not_pack].drop(columns=['reference', 'is_pack', 'product_qty'])

In [None]:
# Order rows by product and then chronologically, and renumber the index from 0.
df_filter = (
    df.sort_values(by=['product_id', 'date'])
      .reset_index(drop=True)
)

In [None]:
df_filter.head()

In [None]:
df_filter.info()

In [None]:
# Keep rows dated from 2023-01-01 through 2023-12-12 (both bounds inclusive),
# then renumber the index from 0.
in_window = df_filter['date'].between('2023-01-01', '2023-12-12')
df_filter = df_filter.loc[in_window].reset_index(drop=True)


In [None]:
df_filter.info()

In [None]:
# Modelling assumptions:
#   - warehouse cost = 2% of the product price, as an annual cost
#   - product price  = 70% of the product list price
#   - lead time is random (depends on transportation); drawn from 2, 3 or 4 days

# Derive product_price from list_price using the 70% assumption.
df_filter['product_price'] = df_filter['list_price'].mul(0.7)

In [None]:
df_filter.info()

In [None]:
# ABC analysis
# Aggregate to annual figures per product_id: totals of buy_quantity and sell_quantity, plus product_price and warehouse_cost
# Use TOPSIS to rank the product_ids


In [None]:
df_filter.head(20)

In [None]:
import pandas as pd

# Per-product share of the utilities cost: 48,000,000 split evenly across the
# distinct product_id values.
utilities_cost = 48000000/len(df_filter['product_id'].unique())

# One pass over the data instead of re-filtering the whole frame for every product.
# sort=False keeps products in order of first appearance, as the original loop over
# unique() did.
result_df = (
    df_filter
    .groupby('product_id', sort=False)
    .agg(
        annual_buy_quantity=('buy_quantity', 'sum'),     # total units bought
        annual_sell_quantity=('sell_quantity', 'sum'),   # total units sold
        annual_product_price=('product_price', 'mean'),  # average price over the period
    )
    .reset_index()
)

# Utilities cost per bought unit. Products with zero bought units get an infinite
# value here; those rows are removed by the infinity filter further down.
result_df['annual_unit_cost'] = utilities_cost / result_df['annual_buy_quantity']


In [None]:
result_df['product_id'].value_counts()

In [None]:
# Add a lead_time column drawn uniformly from {2, 3, 4} days.
# A seeded generator is used so the draw, and therefore the TOPSIS ranking and
# ABC classes derived from it, is the same on every run.
LEAD_TIME_SEED = 42
lead_time_rng = np.random.default_rng(LEAD_TIME_SEED)
# The upper bound of integers() is exclusive, so 5 yields values 2..4.
result_df['lead_time'] = lead_time_rng.integers(2, 5, size=result_df.shape[0])

In [None]:
result_df.info()

In [None]:
import numpy as np

# Remove every row that contains +inf or -inf in any column, then renumber the
# index from 0.
has_infinite = result_df.isin([np.inf, -np.inf]).any(axis=1)
result_df = result_df.loc[~has_infinite].reset_index(drop=True)


In [None]:
import pandas as pd

# Work on an independent copy so result_df keeps its un-normalized values.
df_renamed = result_df.copy(deep=True)

# (disabled) Replace product_id with a positional label:
# df_renamed['product_id'] = 'item ' + (df_renamed.index + 1).astype(str)

In [None]:
df_renamed.info()

In [None]:
df_renamed.head()

In [None]:
# Independent snapshot of df_renamed at this point.
df_renamed_2 = df_renamed.copy(deep=True)

In [None]:
sum([0.45, 0.30, 0.125, 0.125])

In [None]:
# TOPSIS criterion weights, set by the domain professional (Galuh).
# Order of criteria: annual_sell_quantity, annual_product_price, annual_unit_cost, lead_time.
weights =  [0.45, 0.30, 0.125, 0.125]

# Rescale so the weights sum to 1.
weight_total = sum(weights)
normalized_weights = [weight / weight_total for weight in weights]

# normalize the matrix using the weights given by the professional
def normalize(dataset, nCol, weights):
    """Apply weighted vector normalization to the criteria columns, in place.

    Columns at positions 1 .. nCol-1 are each divided by the square root of
    their sum of squares and multiplied by the matching weight. Column 0 is
    left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose column 0 is skipped and whose columns 1 .. nCol-1 are numeric.
        It is modified in place.
    nCol : int
        One past the position of the last column to normalize.
    weights : sequence of float
        weights[i-1] is applied to the column at position i.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for i in range(1, nCol):
        # Work in float so integer columns are neither truncated nor rejected
        # when fractional values are written back.
        values = dataset.iloc[:, i].to_numpy(dtype=float)
        # Root of the sum of squares of the column.
        norm = np.sqrt(np.sum(values ** 2))
        if norm == 0:
            # An all-zero column stays zero instead of turning into NaN (0/0).
            weighted = np.zeros_like(values)
        else:
            weighted = (values / norm) * weights[i - 1]
        # Replace the whole column at once rather than cell by cell.
        dataset[dataset.columns[i]] = weighted
    return dataset

In [None]:
# annual_buy_quantity is not a TOPSIS criterion; remove it from the working frame.
df_renamed = df_renamed.drop(columns='annual_buy_quantity')

In [None]:
# Column 0 is skipped by normalize, so the column bound is one more than the
# number of weights (4 weights -> 5). normalize writes into the frame it is given
# and returns it, so df_renamed_normalized and df_renamed are the same object.
df_renamed_normalized = normalize(df_renamed, len(weights) + 1, weights)

In [None]:
df_renamed_normalized.head(20)

In [None]:
# calculate ideal best and ideal worst

def calc_values(dataset, nCol, impact):
    """Return the TOPSIS ideal-best and ideal-worst value per column.

    The column-wise maximum and minimum of `dataset` are taken, with the first
    column dropped. For each of the first nCol-1 remaining columns whose impact
    is '-', the two values are swapped so that the minimum counts as the ideal.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Weighted, normalized decision matrix; column 0 is skipped.
    nCol : int
        One past the position of the last criterion column to consider.
    impact : sequence of str
        impact[i-1] is '-' for a criterion where smaller is better; any other
        value leaves the maximum as the ideal.

    Returns
    -------
    tuple of numpy.ndarray
        (ideal_best, ideal_worst), each with one entry per column after the first.
    """
    # Take explicit copies: the arrays are modified below, and an array obtained
    # from a pandas object may be read-only.
    p_sln = dataset.max().to_numpy(copy=True)[1:]
    n_sln = dataset.min().to_numpy(copy=True)[1:]
    for i in range(1, nCol):
        if impact[i-1] == '-':
            # Cost criterion: the smallest value is the ideal one.
            p_sln[i-1], n_sln[i-1] = n_sln[i-1], p_sln[i-1]
    return p_sln, n_sln

In [None]:
temp_dataset = df_renamed_normalized.copy()

# One impact sign per criterion ('+' = larger is better).
impact = ['+', '+', '+', '+']
# Column 0 is product_id, so the criteria occupy positions 1 .. len(impact).
# Deriving the bound from impact keeps all four criteria (including lead_time)
# in the distance computation; a bound of 4 would silently drop the last one.
nCol = len(impact) + 1

# Ideal best / ideal worst value per criterion.
p_sln, n_sln = calc_values(temp_dataset, nCol, impact)

# Criteria matrix and ideal vectors as plain float arrays.
criteria = temp_dataset.iloc[:, 1:nCol].to_numpy(dtype=float)
ideal_best = np.asarray(p_sln[:nCol - 1], dtype=float)
ideal_worst = np.asarray(n_sln[:nCol - 1], dtype=float)

# Euclidean distance of every row to the ideal best (pp) and ideal worst (nn).
pp = np.sqrt(((criteria - ideal_best) ** 2).sum(axis=1))
nn = np.sqrt(((criteria - ideal_worst) ** 2).sum(axis=1))

# TOPSIS score: relative closeness to the ideal best (1 = best, 0 = worst).
score = nn / (pp + nn)


In [None]:
# Attach the distances to the ideal best / ideal worst solutions.
df_renamed['distance positive'] = pp
df_renamed['distance negative'] = nn

# Rescale the TOPSIS scores so they sum to 1.
score = np.asarray(score) / sum(score)
df_renamed['Topsis Score'] = score

# Rank 1 is the highest score; tied scores share the largest rank of their group.
rank_values = df_renamed['Topsis Score'].rank(method='max', ascending=False)
df_renamed['Rank'] = rank_values.astype(int)

In [None]:
df_renamed.head()

In [None]:
# Order products from best to worst rank and renumber the index from 0.
df_renamed = df_renamed.sort_values(by=['Rank']).reset_index(drop=True)
df_renamed.head(20)

In [None]:
# cum_sum: running total of the TOPSIS score in rank order.
df_renamed['cum_sum'] = df_renamed['Topsis Score'].cumsum()
# cum_perc: the running total expressed as a percentage.
df_renamed['cum_perc'] = df_renamed['cum_sum'].mul(100)

In [None]:
df_renamed.head(20)

In [None]:
# sum onf topsis score
df_renamed['Topsis Score'].sum()

In [None]:
# ABC classes from the cumulative TOPSIS percentage:
#   cum_perc <= 20  -> 'A'
#   cum_perc <= 50  -> 'B'
#   anything else   -> 'C'
df_renamed['ABC'] = [
    'A' if perc <= 20 else ('B' if perc <= 50 else 'C')
    for perc in df_renamed['cum_perc']
]



In [None]:
df_renamed.head(50)

In [None]:
df_renamed['ABC'].value_counts()

In [None]:
result_df.info()

In [None]:
# Look up each product's ABC class by product_id and store it on result_df.
abc_by_product = df_renamed.set_index('product_id')['ABC']
result_df['class'] = result_df['product_id'].map(abc_by_product)


In [None]:
result_df.head(20)

In [None]:
import seaborn as sns

sns.scatterplot(data=result_df, x='annual_sell_quantity', y='annual_product_price', hue='class')


In [None]:
result_df['class'].value_counts()

In [None]:
# annual_buy_quantity is not used as a model feature; remove it from result_df.
result_df = result_df.drop(columns='annual_buy_quantity')

In [None]:
result_df.info()

In [None]:
result_df.info()

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Features and target for the classifiers.
X = result_df[['annual_sell_quantity', 'annual_product_price', 'annual_unit_cost', 'lead_time']]
y = result_df['class']

# Summarize class distribution
class_counts = Counter(y)
print("Original class distribution:", class_counts)

# Desired number of samples per class after oversampling. These are absolute
# targets (200 / 300 / 500), not a rescaling of the original class proportions.
target_counts = {'A': 200, 'B': 300, 'C': 500}

# Over-sampling can only add samples, so a target must not be below the number
# already present; classes that do not occur in y are left out altogether.
sampling_strategy = {
    label: max(target, class_counts[label])
    for label, target in target_counts.items()
    if class_counts[label] > 0
}

# random_state makes the duplicated rows identical from run to run.
oversample = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

# Fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

# Assemble features and labels into one frame.
df_over = pd.DataFrame(X_over, columns=X.columns)
df_over['class'] = y_over

# Summarize class distribution after oversampling
print("Oversampled class distribution:", Counter(y_over))

df_over.head()


In [None]:
# # change the class to numeric value
# df_over['class'] = df_over['class'].apply(lambda x: 1 if x == 'A' else (2 if x == 'B' else 3))

In [None]:
df_over.info()

In [None]:
# plot the data after oversampling
sns.scatterplot(data=df_over, x='annual_sell_quantity', y='annual_product_price', hue='class')


In [None]:
# using SVM and KNN to classify the data

# Import train_test_split function
from sklearn.model_selection import train_test_split



# Hold out 20% of the oversampled rows as a test set (80% train), with a fixed seed.
feature_columns = ['annual_sell_quantity', 'annual_product_price', 'annual_unit_cost', 'lead_time']
X_train, X_test, y_train, y_test = train_test_split(
    df_over[feature_columns],
    df_over['class'],
    test_size=0.2,
    random_state=1,
)


In [None]:

# Standard-scale the feature matrix used by the cross-validation cells.
from sklearn.preprocessing import StandardScaler

# NOTE(review): these two assignments replace the X_train / y_train produced by
# train_test_split with the complete oversampled frame.
X_train = df_over[['annual_sell_quantity', 'annual_product_price', 'annual_unit_cost', 'lead_time']]
y_train = df_over['class']

scaler = StandardScaler()
# NOTE(review): X is the feature frame built from result_df before oversampling
# (not X_over / df_over), so only the original samples are scaled here — confirm
# this is intended. X is rebound to the scaler's output; by default that is a
# NumPy array rather than a DataFrame.
X = scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error

# 10-fold cross-validation of a polynomial-kernel SVC on features X and labels y.
# random_state fixes the shuffled fold assignment so results are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold metric storage (one row per fold).
train_acc_arr = np.empty((10, 1))
test_acc_arr = np.empty((10, 1))
f1_arr = np.empty((10, 1))
precision_arr = np.empty((10, 1))
recall_arr = np.empty((10, 1))
cnf_arr = []  # one confusion matrix per fold

# Fixed label order for every confusion matrix. Without it, a fold whose test
# split lacks a class yields a smaller matrix, and zero-padding that matrix puts
# the remaining classes in the wrong rows/columns.
class_labels = np.unique(y)
max_classes = len(class_labels)

for x, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_ = SVC(kernel='poly', gamma=0.1, C=1.0, tol=1e-5, verbose=1, max_iter=2500).fit(X_train, y_train)

    y_train_pred = model_.predict(X_train)
    y_test_pred = model_.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')

    print('Train Accuracy : {:.4f}'.format(train_acc))
    print('Test Accuracy  : {:.4f}'.format(test_acc))
    print('SVC f1-score   : {:.4f}'.format(f1))
    print('SVC precision  : {:.4f}'.format(precision))
    print('SVC recall     : {:.4f}'.format(recall))
    print("\n", classification_report(y_test, y_test_pred))
    print("\n-----------------------------\n")

    cnf_arr.append(confusion_matrix(y_test, y_test_pred, labels=class_labels))
    train_acc_arr[x] = train_acc
    test_acc_arr[x] = test_acc
    f1_arr[x] = f1
    precision_arr[x] = precision
    recall_arr[x] = recall

# Cross-validated accuracy, computed once: running a complete 10-fold
# cross_val_score inside every fold repeated the same work ten times.
cv_scores = cross_val_score(model_, X, y, cv=kf, scoring='accuracy')
cv_acc_arr = cv_scores.reshape(-1, 1)
print('Cross-Validated Accuracy: {:.4f}'.format(cv_scores.mean()))

# Total confusion matrix over all folds, as row percentages (share of each true class).
svc_total_cnf_matrix = np.sum(cnf_arr, axis=0)
svc_total_cnf_matrix_percentage = svc_total_cnf_matrix / svc_total_cnf_matrix.sum(axis=1, keepdims=True) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(svc_total_cnf_matrix_percentage, annot=True, fmt='.2f', cmap='Blues', cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('SVC Confusion Matrix (Percentage)')
plt.show()

# Mean and standard deviation of each metric across the folds.
print("%0.4f weighted f1 score with a standard deviation of %0.4f" % (f1_arr.mean(), f1_arr.std()))
print("%0.4f weighted precision with a standard deviation of %0.4f" % (precision_arr.mean(), precision_arr.std()))
print("%0.4f weighted recall with a standard deviation of %0.4f" % (recall_arr.mean(), recall_arr.std()))
print("%0.4f train accuracy with a standard deviation of %0.4f" % (train_acc_arr.mean(), train_acc_arr.std()))
print("%0.4f test accuracy with a standard deviation of %0.4f" % (test_acc_arr.mean(), test_acc_arr.std()))
print("%0.4f cross-validated accuracy with a standard deviation of %0.4f" % (cv_acc_arr.mean(), cv_acc_arr.std()))


In [None]:
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
# import numpy as np
# from sklearn.svm import SVC
# from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
# from sklearn.metrics import r2_score, roc_auc_score, roc_curve, classification_report
# from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error


# model_ = SVC(kernel='poly',gamma=0.1, C=1.0, tol=1e-5, verbose=1,max_iter=2500).fit(X_train, y_train)

# kf = KFold(n_splits=10, shuffle=True)

# train_acc_arr = np.empty((10, 1))
# test_acc_arr = np.empty((10, 1))
# f1_arr = np.empty((10, 1))
# cv_acc_arr = np.empty((10, 1))
# cnf_arr = []
# x = 0

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     model_.fit(X_train, y_train)
#     y_train_pred = model_.predict(X_train)
#     y_test_pred = model_.predict(X_test)

#     train_acc = accuracy_score(y_train, y_train_pred)
#     test_acc = accuracy_score(y_test, y_test_pred)
#     f1 = f1_score(y_test, y_test_pred, average='weighted')

#     print('Train Accuracy: {:.4f}'.format(train_acc))
#     print('Test Accuracy : {:.4f}'.format(test_acc))
#     print('SVC f1-score  : {:.4f}'.format(f1))
#     print('SVC precision : {:.4f}'.format(precision_score(y_test, y_test_pred, average='weighted')))
#     print('SVC recall    : {:.4f}'.format(recall_score(y_test, y_test_pred, average='weighted')))
#     print("\n", classification_report(y_test, y_test_pred))

#     cnf_matrix = confusion_matrix(y_test, y_test_pred)
#     train_acc_arr[x] = train_acc
#     test_acc_arr[x] = test_acc
#     f1_arr[x] = f1

#     # Calculate cross-validated accuracy
#     cv_acc = cross_val_score(model_, X, y, cv=kf, scoring='accuracy').mean()
#     cv_acc_arr[x] = cv_acc
#     print('Cross-Validated Accuracy: {:.4f}'.format(cv_acc))
#     print("\n-----------------------------\n")

#     x = x + 1

# print("%0.4f weighted f1 score with a standard deviation of %0.4f" % (f1_arr.mean(), f1_arr.std()))
# print("%0.4f train accuracy with a standard deviation of %0.4f" % (train_acc_arr.mean(), train_acc_arr.std()))
# print("%0.4f test accuracy with a standard deviation of %0.4f" % (test_acc_arr.mean(), test_acc_arr.std()))
# print("%0.4f cross-validated accuracy with a standard deviation of %0.4f" % (cv_acc_arr.mean(), cv_acc_arr.std()))


In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 10-fold cross-validation of a 3-nearest-neighbours classifier on features X and labels y.
model_ = KNeighborsClassifier(n_neighbors=3)

# random_state fixes the shuffled fold assignment so results are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold metric storage (one row per fold).
train_acc_arr = np.empty((10, 1))
test_acc_arr = np.empty((10, 1))
f1_arr = np.empty((10, 1))
precision_arr = np.empty((10, 1))
recall_arr = np.empty((10, 1))
cnf_matrix = []  # one confusion matrix per fold

# Fixed label order for every confusion matrix. Without it, a fold whose test
# split lacks a class yields a smaller matrix, and adding it into the top-left
# corner of the total mixes up the classes.
class_labels = np.unique(y)

for x, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the KNN model on the training data
    model_.fit(X_train, y_train)

    # Predictions on the training and test sets
    y_train_pred = model_.predict(X_train)
    y_test_pred = model_.predict(X_test)

    # Calculate metrics
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')

    print('Train Accuracy : {:.4f}'.format(train_acc))
    print('Test Accuracy  : {:.4f}'.format(test_acc))
    print('KNN f1-score   : {:.4f}'.format(f1))
    print('KNN precision  : {:.4f}'.format(precision))
    print('KNN recall     : {:.4f}'.format(recall))
    print("\n", classification_report(y_test, y_test_pred))
    print("\n-----------------------------\n")

    cnf_matrix.append(confusion_matrix(y_test, y_test_pred, labels=class_labels))
    train_acc_arr[x] = train_acc
    test_acc_arr[x] = test_acc
    f1_arr[x] = f1
    precision_arr[x] = precision
    recall_arr[x] = recall

# Cross-validated accuracy, computed once: running a complete 10-fold
# cross_val_score inside every fold repeated the same work ten times.
cv_scores = cross_val_score(model_, X, y, cv=kf, scoring='accuracy')
cv_acc_arr = cv_scores.reshape(-1, 1)
print('Cross-Validated Accuracy: {:.4f}'.format(cv_scores.mean()))

# Total confusion matrix over all folds, as row percentages (share of each true class).
knn_total_cnf_matrix = np.sum(cnf_matrix, axis=0)
knn_total_cnf_matrix_percentage = knn_total_cnf_matrix / knn_total_cnf_matrix.sum(axis=1, keepdims=True) * 100

# Plotting the total confusion matrix as percentages
plt.figure(figsize=(8, 6))
sns.heatmap(knn_total_cnf_matrix_percentage, annot=True, fmt='.2f', cmap='Blues', cbar=False)
plt.title('KNN Confusion Matrix (Percentage)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Mean and standard deviation of each metric across the folds.
print("%0.4f weighted f1 score with a standard deviation of %0.4f" % (f1_arr.mean(), f1_arr.std()))
print("%0.4f weighted precision with a standard deviation of %0.4f" % (precision_arr.mean(), precision_arr.std()))
print("%0.4f weighted recall with a standard deviation of %0.4f" % (recall_arr.mean(), recall_arr.std()))
print("%0.4f train accuracy with a standard deviation of %0.4f" % (train_acc_arr.mean(), train_acc_arr.std()))
print("%0.4f test accuracy with a standard deviation of %0.4f" % (test_acc_arr.mean(), test_acc_arr.std()))
print("%0.4f cross-validated accuracy with a standard deviation of %0.4f" % (cv_acc_arr.mean(), cv_acc_arr.std()))


In [None]:
# Draw one heatmap per fold confusion matrix on a two-column grid.
n = len(cnf_matrix)
ncols = 2
nrows = -(-n // ncols)  # ceiling division: enough rows to hold n panels

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20))

# Work with a flat list of axes and drop the panels that have no matrix.
axes = axes.flatten()
for spare_ax in axes[n:]:
    fig.delaxes(spare_ax)

for fold_no, (ax, cm) in enumerate(zip(axes, cnf_matrix), start=1):
    sns.heatmap(cm, annot=True, ax=ax, fmt='.2f', cmap='Blues')
    ax.set_title(f'Confusion Matrix for Fold {fold_no}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.show()

In [None]:
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
# from sklearn.neighbors import KNeighborsClassifier
# import numpy as np

# # Create KNN model
# model_ = KNeighborsClassifier(n_neighbors=3)  # You can adjust the number of neighbors (n_neighbors) as needed

# kf = KFold(n_splits=10, shuffle=True)

# train_acc_arr = np.empty((10, 1))
# test_acc_arr = np.empty((10, 1))
# f1_arr = np.empty((10, 1))
# cv_acc_arr = np.empty((10, 1))
# cnf_matrix = []
# x = 0

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     # Fit the KNN model on the training data
#     model_.fit(X_train, y_train)

#     # Predictions on the training and test sets
#     y_train_pred = model_.predict(X_train)
#     y_test_pred = model_.predict(X_test)

#     # Calculate metrics
#     train_acc = accuracy_score(y_train, y_train_pred)
#     test_acc = accuracy_score(y_test, y_test_pred)
#     f1 = f1_score(y_test, y_test_pred, average='weighted')

#     print('Train Accuracy: {:.4f}'.format(train_acc))
#     print('Test Accuracy : {:.4f}'.format(test_acc))
#     print('KNN f1-score  : {:.4f}'.format(f1))
#     print('KNN precision : {:.4f}'.format(precision_score(y_test, y_test_pred, average='weighted')))
#     print('KNN recall    : {:.4f}'.format(recall_score(y_test, y_test_pred, average='weighted')))
#     print("\n", classification_report(y_test, y_test_pred))

#     cnf_matrix.append(confusion_matrix(y_test, y_test_pred))
#     train_acc_arr[x] = train_acc
#     test_acc_arr[x] = test_acc
#     f1_arr[x] = f1

#     # Calculate cross-validated accuracy
#     cv_acc = cross_val_score(model_, X, y, cv=kf, scoring='accuracy').mean()
#     cv_acc_arr[x] = cv_acc
#     print('Cross-Validated Accuracy: {:.4f}'.format(cv_acc))
#     print("\n-----------------------------\n")

#     x = x + 1

# print("%0.4f weighted f1 score with a standard deviation of %0.4f" % (f1_arr.mean(), f1_arr.std()))
# print("%0.4f train accuracy with a standard deviation of %0.4f" % (train_acc_arr.mean(), train_acc_arr.std()))
# print("%0.4f test accuracy with a standard deviation of %0.4f" % (test_acc_arr.mean(), test_acc_arr.std()))
# print("%0.4f cross-validated accuracy with a standard deviation of %0.4f" % (cv_acc_arr.mean(), cv_acc_arr.std()))


In [None]:
# Per-fold confusion-matrix heatmaps, two panels per row.
n = len(cnf_matrix)
ncols = 2
full_rows, leftover = divmod(n, ncols)
nrows = full_rows + (leftover > 0)  # one extra row when n is not a multiple of ncols

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20))

# Flatten the grid and remove the axes beyond the n-th panel.
axes = axes.flatten()
for unused in axes[n:]:
    fig.delaxes(unused)

for i in range(n):
    panel = axes[i]
    sns.heatmap(cnf_matrix[i], annot=True, ax=panel, fmt='.2f', cmap='Blues')
    panel.set_title(f'Confusion Matrix for Fold {i+1}')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('True')

plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.show()

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 10-fold cross-validation of a 100-tree random forest on features X and labels y.
# random_state fixes the shuffled fold assignment so results are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold metric storage (one row per fold).
train_acc_arr = np.empty((10, 1))
test_acc_arr = np.empty((10, 1))
f1_arr = np.empty((10, 1))
precision_arr = np.empty((10, 1))
recall_arr = np.empty((10, 1))
cnf_arr = []  # one confusion matrix per fold

# Fixed label order for every confusion matrix. Without it, a fold whose test
# split lacks a class yields a smaller matrix, and zero-padding that matrix puts
# the remaining classes in the wrong rows/columns.
class_labels = np.unique(y)
max_classes = len(class_labels)

model_ = RandomForestClassifier(n_estimators=100, random_state=42)

for x, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the Random Forest model on the training data
    model_.fit(X_train, y_train)

    # Predictions on the training and test sets
    y_train_pred = model_.predict(X_train)
    y_test_pred = model_.predict(X_test)

    # Calculate metrics
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='weighted')
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')

    print('Train Accuracy : {:.4f}'.format(train_acc))
    print('Test Accuracy  : {:.4f}'.format(test_acc))
    print('RF f1-score    : {:.4f}'.format(f1))
    print('RF precision   : {:.4f}'.format(precision))
    print('RF recall      : {:.4f}'.format(recall))
    print("\n", classification_report(y_test, y_test_pred))
    print("\n-----------------------------\n")

    cnf_arr.append(confusion_matrix(y_test, y_test_pred, labels=class_labels))
    train_acc_arr[x] = train_acc
    test_acc_arr[x] = test_acc
    f1_arr[x] = f1
    precision_arr[x] = precision
    recall_arr[x] = recall

# Cross-validated accuracy, computed once: running a complete 10-fold
# cross_val_score inside every fold repeated the same work ten times.
cv_scores = cross_val_score(model_, X, y, cv=kf, scoring='accuracy')
cv_acc_arr = cv_scores.reshape(-1, 1)
print('Cross-Validated Accuracy: {:.4f}'.format(cv_scores.mean()))

# Total confusion matrix over all folds, as row percentages (share of each true class).
rf_total_cnf_matrix = np.sum(cnf_arr, axis=0)
rf_total_cnf_matrix_percentage = rf_total_cnf_matrix / rf_total_cnf_matrix.sum(axis=1, keepdims=True) * 100

plt.figure(figsize=(8, 6))
sns.heatmap(rf_total_cnf_matrix_percentage, annot=True, fmt='.2f', cmap='Blues', cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Random Forest Confusion Matrix (Percentage)')
plt.show()

# Mean and standard deviation of each metric across the folds.
print("%0.4f weighted f1 score with a standard deviation of %0.4f" % (f1_arr.mean(), f1_arr.std()))
print("%0.4f weighted precision with a standard deviation of %0.4f" % (precision_arr.mean(), precision_arr.std()))
print("%0.4f weighted recall with a standard deviation of %0.4f" % (recall_arr.mean(), recall_arr.std()))
print("%0.4f train accuracy with a standard deviation of %0.4f" % (train_acc_arr.mean(), train_acc_arr.std()))
print("%0.4f test accuracy with a standard deviation of %0.4f" % (test_acc_arr.mean(), test_acc_arr.std()))
print("%0.4f cross-validated accuracy with a standard deviation of %0.4f" % (cv_acc_arr.mean(), cv_acc_arr.std()))


In [None]:
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
# from sklearn.ensemble import RandomForestClassifier
# import numpy as np

# # Create Random Forest model
# model_ = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators (n_estimators) as needed

# kf = KFold(n_splits=10, shuffle=True)

# train_acc_arr = np.empty((10, 1))
# test_acc_arr = np.empty((10, 1))
# f1_arr = np.empty((10, 1))
# cv_acc_arr = np.empty((10, 1))
# cnf_matrix = []
# x = 0

# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     # Fit the Random Forest model on the training data
#     model_.fit(X_train, y_train)

#     # Predictions on the training and test sets
#     y_train_pred = model_.predict(X_train)
#     y_test_pred = model_.predict(X_test)

#     # Calculate metrics
#     train_acc = accuracy_score(y_train, y_train_pred)
#     test_acc = accuracy_score(y_test, y_test_pred)
#     f1 = f1_score(y_test, y_test_pred, average='weighted')

#     print('Train Accuracy: {:.4f}'.format(train_acc))
#     print('Test Accuracy : {:.4f}'.format(test_acc))
#     print('Random Forest f1-score  : {:.4f}'.format(f1))
#     print('Random Forest precision : {:.4f}'.format(precision_score(y_test, y_test_pred, average='weighted')))
#     print('Random Forest recall    : {:.4f}'.format(recall_score(y_test, y_test_pred, average='weighted')))
#     print("\n", classification_report(y_test, y_test_pred))

#     cnf_matrix.append(confusion_matrix(y_test, y_test_pred))
#     train_acc_arr[x] = train_acc
#     test_acc_arr[x] = test_acc
#     f1_arr[x] = f1

#     # Calculate cross-validated accuracy
#     cv_acc = cross_val_score(model_, X, y, cv=kf, scoring='accuracy').mean()
#     cv_acc_arr[x] = cv_acc
#     print('Cross-Validated Accuracy: {:.4f}'.format(cv_acc))
#     print("\n-----------------------------\n")

#     x = x + 1

# print("%0.4f weighted f1 score with a standard deviation of %0.4f" % (f1_arr.mean(), f1_arr.std()))
# print("%0.4f train accuracy with a standard deviation of %0.4f" % (train_acc_arr.mean(), train_acc_arr.std()))
# print("%0.4f test accuracy with a standard deviation of %0.4f" % (test_acc_arr.mean(), test_acc_arr.std()))
# print("%0.4f cross-validated accuracy with a standard deviation of %0.4f" % (cv_acc_arr.mean(), cv_acc_arr.std()))


In [None]:
# Draw one heatmap per fold on a two-column grid.
# The per-fold matrices are read from cnf_arr, the list with one 2-D confusion
# matrix per fold; iterating a single 2-D matrix instead would walk over its rows.
n = len(cnf_arr)
ncols = 2
nrows = n // ncols + (n % ncols > 0)

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 20))

# Flatten the axes array and remove unused ones
axes = axes.flatten()
for ax in axes[n:]:
    fig.delaxes(ax)

for i, cm in enumerate(cnf_arr):
    ax = axes[i]
    sns.heatmap(cm, annot=True, ax=ax, fmt='.2f', cmap='Blues')
    ax.set_title(f'Confusion Matrix for Fold {i+1}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns

# Side-by-side comparison of the three total confusion matrices (row percentages).
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

# (matrix, colour map, panel title) for each model, left to right.
panels = [
    (rf_total_cnf_matrix_percentage, 'Blues', 'Random Forest Confusion Matrix (Percentage)'),
    (knn_total_cnf_matrix_percentage, 'Reds', 'KNN Confusion Matrix (Percentage)'),
    (svc_total_cnf_matrix_percentage, 'Greens', 'SVC Confusion Matrix (Percentage)'),
]

for ax, (matrix, cmap_name, panel_title) in zip(axes, panels):
    image = ax.imshow(matrix, cmap=cmap_name)
    # Write each cell's percentage at its centre (x = column, y = row).
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            ax.text(col, row, f'{matrix[row, col]:.2f}%', ha='center', va='center', color='black')

    ax.set_title(panel_title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.colorbar(image, ax=ax)

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
rf_total_cnf_matrix_percentage

In [None]:
knn_total_cnf_matrix_percentage

In [None]:
svc_total_cnf_matrix_percentage