# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Importing data sets

In [None]:
rawHisTransDF=pd.read_csv('Historical-transaction-data.csv')
rawStoreInfDF=pd.read_csv('Store-info.csv')
rawTestDF=pd.read_csv('Testing-data.csv')

# Viewing data frame

In [None]:
rawHisTransDF.head()

In [None]:
rawStoreInfDF.head()

# Data Pre processing

#### Fixing data

In [None]:
# convert the date string column to datetime
rawHisTransDF['transaction_date'] = pd.to_datetime(rawHisTransDF['transaction_date'], format='%Y/%m/%d').dt.date

In [None]:
# get count of null values in each column
null_counts = rawHisTransDF.isnull().sum()
# print the counts
print(null_counts)

In [None]:
rawHisTransDF.dropna(subset=['item_description','invoice_id'], inplace=True)

In [None]:
# get count of null values in each column
null_counts = rawHisTransDF.isnull().sum()
# print the counts
print(null_counts)

In [None]:
rawHisTransDF=rawHisTransDF.drop_duplicates()

# Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
rawHisTransDF['item_description'] = le.fit_transform(rawHisTransDF['item_description'])
rawHisTransDF['customer_id'] = le.fit_transform(rawHisTransDF['customer_id'])
rawHisTransDF['shop_id'] = rawHisTransDF['shop_id'].str.replace(r'^SHOP', '').astype(int)
rawStoreInfDF['shop_id'] = rawStoreInfDF['shop_id'].str.replace(r'^SHOP', '').astype(int)

In [None]:
rawStoreInfDF['shop_profile'] = rawStoreInfDF['shop_profile'].replace({'High': 1, 'Moderate': 2, 'Low': 3})
rawStoreInfDF['shop_profile'] = rawStoreInfDF['shop_profile'].fillna(0.0).astype(int)
rawHisTransDF['invoice_id'] = rawHisTransDF['invoice_id'].astype(int)

In [None]:
rawStoreInfDF

In [None]:
rawHisTransDF

# Feature Creation

In [None]:
# group the dataframe by the 'group' column and get the size of each group
transactions_by_shop = rawHisTransDF.groupby('shop_id').size().reset_index()

# rename columns of the new dataframe
transactions_by_shop.columns = ['shop_id', 'num_of_transactions']

In [None]:
# rawStoreInfDF['transaction_by_shop']=transactions_by_shop
rawStoreInfDF = pd.merge(rawStoreInfDF, transactions_by_shop, on='shop_id')
rawStoreInfDF

In [None]:
# get count of null values in each column
null_counts = rawStoreInfDF.isnull().sum()
# print the counts
print(null_counts)

In [None]:
output=pd.read_csv('output.csv')

In [None]:
output = output[['shop_id', 'Daily_Sales_avg', 'revnew', 'rev_per_sqfeet', 'avd_daily_items_types_sold', 'avd_daily_transctions', 'avd_daily_custemers', 'avg_visits']]

In [None]:
output =output.drop_duplicates()

In [None]:
# rawStoreInfDF['transaction_by_shop']=transactions_by_shop
rawStoreInfDF = pd.merge(rawStoreInfDF, output, on='shop_id')
rawStoreInfDF

In [None]:
rawStoreInfDF=rawStoreInfDF.drop(['avd_daily_transctions','revnew','avd_daily_custemers','num_of_transactions'], axis=1)

#### Scaling

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# load your data into a pandas dataframe
# df = pd.read_csv('your_data.csv')

# compute the correlation matrix
corr_matrix = rawStoreInfDF.corr()

# plot the correlation matrix as a heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()


# Split to train and test data

In [None]:
# Split the DataFrame into two based on column B
TestDF = rawStoreInfDF[rawStoreInfDF['shop_profile'] == 0].drop(['shop_profile'], axis=1)
TrainDF = rawStoreInfDF[rawStoreInfDF['shop_profile'] != 0]

In [None]:
TestDF

In [None]:
# Split Fulldata into training and testing sets
from sklearn.model_selection import train_test_split

column_name = 'shop_id'
unique_categories = TrainDF[column_name].nunique()
categories_in_dataset_1 = int(unique_categories * 0.8)
categories_in_dataset_2 = unique_categories - categories_in_dataset_1
dataset_1_categories = TrainDF[column_name].unique()[:categories_in_dataset_1]
dataset_2_categories = TrainDF[column_name].unique()[categories_in_dataset_1:]

train_data = TrainDF[TrainDF[column_name].isin(dataset_1_categories)]
test_data = TrainDF[TrainDF[column_name].isin(dataset_2_categories)]





#train_data, test_data = train_test_split(TrainDF, test_size=0.01)

In [None]:
train_data

In [None]:
train_data.to_csv('gpttdanna.csv', index=False)

In [None]:
train_data

In [None]:
expectedResult=test_data[['shop_id','shop_profile']]


In [None]:
expectedResult

In [None]:
# # group the dataframe by the 'group' column
# grouped = expectedResult.groupby('shop_id')

# # find the mode value of each group
# TestMode_df = grouped['shop_profile'].apply(lambda x: x.mode()[0]).reset_index()

# # rename columns of the new dataframe
# TestMode_df.columns = ['shop_id', 'shop_profile']

In [None]:
# TestMode_df['shop_id']=TestMode_df['shop_id'].astype(int)
# TestMode_df['shop_profile']=TestMode_df['shop_profile'].astype(int)

In [None]:
shop_id_x_test = test_data['shop_id']
shop_id_x_TestDF = TestDF['shop_id']
TestDF=TestDF.drop('shop_id',axis=1)
train_data=train_data.drop('shop_id', axis=1)
test_data=test_data.drop('shop_id',axis=1)

In [None]:
X_train=train_data.drop(['shop_profile'], axis=1)
y_train=train_data['shop_profile']
X_test= test_data.drop(['shop_profile'], axis=1)
y_test=test_data['shop_profile']
X_testres = X_test.reset_index(drop=True)

In [None]:
from sklearn.preprocessing import RobustScaler
import pandas as pd

# Create an instance of RobustScaler
scaler = RobustScaler()

temp_X_train=X_train
temp_x_test=X_test
temp_TestDF=TestDF
# Scale the data
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
TestDF = scaler.fit_transform(TestDF)

# Convert the scaled data back to a pandas dataframe
X_train = pd.DataFrame(X_train, columns=temp_X_train.columns)
X_test = pd.DataFrame(X_test, columns=temp_x_test.columns)
TestDF = pd.DataFrame(TestDF, columns=temp_TestDF.columns)




In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# create an instance of MinMaxScaler
scaler = MinMaxScaler()

# fit and transform the dataset using the scaler
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

TestDF = scaler.transform(TestDF)


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Define the logistic regression model
model_random = RandomForestClassifier(max_depth=25, min_samples_leaf=1, min_samples_split=2, n_estimators=10)

# Train the model on the training data
model_random.fit(X_train, y_train)

# Make predictions on the testing data
predictions = model_random.predict(X_test)
predictions_Test_randomforest=model_random.predict(TestDF)

accu = accuracy_score(y_test, predictions)

print(accu)
# print(f1_score(y_test, predictions, average=None))



In [None]:
predictions_Test_randomforest

In [None]:
predDf=pd.DataFrame(predictions, columns=['shop_profile'])
predDf_Test_randomforest=pd.DataFrame(predictions_Test_randomforest, columns=['shop_profile'])

In [None]:
predDf

In [None]:
shop_id_x_TestDF

In [None]:
shop_id_x_test=shop_id_x_test.reset_index()
shop_id_x_TestDF=shop_id_x_TestDF.reset_index()

In [None]:
shop_id_x_TestDF=shop_id_x_TestDF.drop('index',axis=1)

In [None]:
# Concatenate DataFrames
concatenatedRes_df = pd.concat([shop_id_x_test, predDf], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
concatenatedRes_df_random = pd.concat([shop_id_x_TestDF, predDf_Test_randomforest], axis=1)

In [None]:
concatenatedRes_df_random

In [None]:
concatenatedRes_df_random['shop_profile'] = concatenatedRes_df_random['shop_profile'].replace({1: 'High', 2: 'Moderate', 3: 'Low'})

In [None]:
concatenatedRes_df_random.to_csv('final.csv',index=False)

In [None]:
final = pd.read_csv('final.csv')

In [None]:
concatenatedRes_df

In [None]:
# # group the dataframe by the 'group' column
# grouped = concatenatedRes_df.groupby('shop_id')

# # find the mode value of each group
# result = grouped['shop_profile'].apply(lambda x: x.mode()[0]).reset_index()

# # rename columns of the new dataframe
# result.columns = ['shop_id', 'shop_profile']

In [None]:
# result['shop_id']=result['shop_id'].astype(int)
# result['shop_profile']=result['shop_profile'].astype(int)
# result=result['shop_profile']

In [None]:
expectedResult

In [None]:
concatenatedRes_df

In [None]:
expectedResult=expectedResult['shop_profile']
concatenatedRes_df=concatenatedRes_df['shop_profile']

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df, labels=[1], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df, labels=[2], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

In [None]:
y_train = y_train.replace({1: 0, 2: 1, 3: 2})
y_test = y_test.replace({1: 0, 2: 1, 3: 2})

In [None]:
import xgboost as xgb

# define the XGBoost model
model_xg = xgb.XGBClassifier(objective='multi:softmax', random_state=42)

# train the model on the training data
model_xg.fit(X_train, y_train)

# make predictions on the testing data
y_pred = model_xg.predict(X_test)
y_pred_Test_xgb=model_xg.predict(TestDF)

# calculate the accuracy score of the predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
predDf_xg=pd.DataFrame(y_pred, columns=['shop_profile'])
predDf_xg_Test=pd.DataFrame(y_pred_Test_xgb, columns=['shop_profile'])

In [None]:
# Concatenate DataFrames
concatenatedRes_df_xg = pd.concat([shop_id_x_test, predDf_xg], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
concatenatedRes_df_xg_Test = pd.concat([shop_id_x_TestDF, predDf_xg_Test], axis=1)

In [None]:
concatenatedRes_df_xg_Test

In [None]:
concatenatedRes_df_xg_Test['shop_profile'] = concatenatedRes_df_xg_Test['shop_profile'].replace({0: 'High', 1: 'Moderate', 2: 'Low'})

In [None]:
final=pd.concat([final,concatenatedRes_df_xg_Test], axis=1)

In [None]:
final.to_csv('final.csv',index=False)

In [None]:
# expectedResult=expectedResult['shop_profile']
concatenatedRes_df_xg=concatenatedRes_df_xg['shop_profile']

In [None]:
y_train = y_train.replace({0: 1, 1: 2, 2: 3})
y_test = y_test.replace({0: 1, 1: 2, 2: 3})

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df_xg, labels=[1], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df_xg, labels=[2], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df_xg, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df_xg)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# define the KNN model
model_knn = KNeighborsClassifier(n_neighbors=5)

# train the model on the training data
model_knn.fit(X_train, y_train)

# make predictions on the testing data
y_pred_knn = model_knn.predict(X_test)
y_pred_knn_Test = model_knn.predict(TestDF)

# calculate the accuracy score of the predictions
accuracy = accuracy_score(y_test, y_pred_knn)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
predDf_knn=pd.DataFrame(y_pred_knn, columns=['shop_profile'])
predDf_knn_Test=pd.DataFrame(y_pred_knn_Test, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_knn = pd.concat([shop_id_x_test, predDf_knn], axis=1)
concatenatedRes_df_knn_Test = pd.concat([shop_id_x_TestDF, predDf_knn_Test], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
# expectedResult=expectedResult['shop_profile']
concatenatedRes_df_knn=concatenatedRes_df_knn['shop_profile']

In [None]:
concatenatedRes_df_knn_Test

In [None]:
concatenatedRes_df_knn_Test['shop_profile'] = concatenatedRes_df_knn_Test['shop_profile'].replace({1: 'High', 2: 'Moderate', 3: 'Low'})

In [None]:
final = pd.concat([final,concatenatedRes_df_knn_Test],axis=1)
final.to_csv('final.csv',index=False)

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df_knn, labels=[1], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df_knn, labels=[2], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df_knn, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df_knn)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

---------------------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.tree import DecisionTreeClassifier
# define the KNN model
clf = DecisionTreeClassifier()

# train the model on the training data
clf.fit(X_train, y_train)

# make predictions on the testing data
y_pred_clf = model_knn.predict(X_test)
y_pred_clf_Test = model_knn.predict(TestDF)

# calculate the accuracy score of the predictions
accuracy = accuracy_score(y_test, y_pred_clf)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
predDf_clf=pd.DataFrame(y_pred_clf, columns=['shop_profile'])
predDf_clf_Test=pd.DataFrame(y_pred_clf_Test, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_clf = pd.concat([shop_id_x_test, predDf_clf], axis=1)
concatenatedRes_df_clf_Test = pd.concat([shop_id_x_TestDF, predDf_clf_Test], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
# expectedResult=expectedResult['shop_profile']
concatenatedRes_df_clf=concatenatedRes_df_clf['shop_profile']

In [None]:
concatenatedRes_df_clf_Test

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df_clf, labels=[1], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df_clf, labels=[2], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df_clf, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df_clf)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

In [None]:
# # Import necessary libraries
# from keras.models import Sequential
# from keras.layers import Dense
# from sklearn.datasets import load_iris
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.metrics import accuracy_score

# # One-hot encode the target variable
# enc = OneHotEncoder()
# y_train = enc.fit_transform(y_train.reshape(-1, 1)).toarray()
# y_test = enc.transform(y_test.reshape(-1, 1)).toarray()

# # Create a neural network model
# model = Sequential()
# model.add(Dense(10, input_dim=4, activation='relu'))
# model.add(Dense(3, activation='softmax'))

# # Compile the model
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Train the model on the training data
# model.fit(X_train, y_train, epochs=50, batch_size=10)

# # Predict the classes of the testing data
# y_pred = model.predict_classes(X_test)

# # Decode one-hot encoded labels back to original labels
# y_test = enc.inverse_transform(y_test)
# y_pred = enc.inverse_transform(y_pred.reshape(-1, 1))

# # Calculate the accuracy of the classifier
# accuracy = accuracy_score(y_test, y_pred)

# # Print the accuracy of the classifier
# print("Accuracy:", accuracy)

#### Ensembling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Train the first set of models
model1 = LogisticRegression()
model1.fit(X_train, y_train)

model2 = RandomForestClassifier(max_depth=25, min_samples_leaf=1, min_samples_split=2, n_estimators=10)
model2.fit(X_train, y_train)

# Make predictions on the testing set using the first set of models
y_pred_1 = model1.predict(X_test)
y_pred_2 = model2.predict(X_test)
y_pred_1_test = model1.predict(TestDF)
y_pred_2_test = model2.predict(TestDF)

# Stack the predictions from the first set of models
X_stack = np.column_stack((y_pred_1, y_pred_2))
X_stack_test = np.column_stack((y_pred_1_test,y_pred_2_test))

# Train the final model on the stacked predictions
final_model = LogisticRegression()
final_model.fit(X_stack, y_test)

# Make predictions on the testing set using the final model
y_pred_int = final_model.predict(X_stack)
y_pred_int_test = final_model.predict(X_stack_test)

# Calculate the accuracy of the final predictions
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


In [None]:
predDf_int=pd.DataFrame(y_pred_int, columns=['shop_profile'])
predDf_int_Test=pd.DataFrame(y_pred_int_test, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_int = pd.concat([shop_id_x_test, predDf_int], axis=1)
concatenatedRes_df_int_Test = pd.concat([shop_id_x_TestDF, predDf_int_Test], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
# expectedResult=expectedResult['shop_profile']
concatenatedRes_df_int=concatenatedRes_df_int['shop_profile']
concatenatedRes_df_int_Test

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df_int, labels=[1], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df_int, labels=[2], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df_int, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df_int)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

#### Random forest and XGboost

In [None]:
y_train = y_train.replace({1: 0, 2: 1, 3: 2})
y_test = y_test.replace({1: 0, 2: 1, 3: 2})

In [None]:
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Train the first set of models
model1 = xgb.XGBClassifier(objective='multi:softmax', random_state=42)
model1.fit(X_train, y_train)

model2 = RandomForestClassifier(max_depth=25, min_samples_leaf=1, min_samples_split=2, n_estimators=10)
model2.fit(X_train, y_train)

# Make predictions on the testing set using the first set of models
y_pred_1 = model1.predict(X_test)
y_pred_2 = model2.predict(X_test)
y_pred_1_test = model1.predict(TestDF)
y_pred_2_test = model2.predict(TestDF)

# Stack the predictions from the first set of models
X_stack = np.column_stack((y_pred_1, y_pred_2))
X_stack_test = np.column_stack((y_pred_1_test,y_pred_2_test))

# Train the final model on the stacked predictions
final_model = xgb.XGBClassifier(objective='multi:softmax', random_state=42)
final_model.fit(X_stack, y_test)

# Make predictions on the testing set using the final model
y_pred_int = final_model.predict(X_stack)
y_pred_int_test = final_model.predict(X_stack_test)

# Calculate the accuracy of the final predictions
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


In [None]:
y_train = y_train.replace({0: 1, 1: 2, 2: 3})
y_test = y_test.replace({0: 1, 1: 2, 2: 3})

In [None]:
predDf_int=pd.DataFrame(y_pred_int, columns=['shop_profile'])
predDf_int_Test=pd.DataFrame(y_pred_int_test, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_int = pd.concat([shop_id_x_test, predDf_int], axis=1)
concatenatedRes_df_int_Test = pd.concat([shop_id_x_TestDF, predDf_int_Test], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
# expectedResult=expectedResult['shop_profile']
concatenatedRes_df_int=concatenatedRes_df_int['shop_profile']
concatenatedRes_df_int_Test

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df_int, labels=[0], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df_int, labels=[1], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df_int, labels=[2], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df_int)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

#### Knn and Random Forest

In [1040]:
# from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Train the first set of models
model1 = KNeighborsClassifier(n_neighbors=10)
model1.fit(X_train, y_train)

model2 = RandomForestClassifier(max_depth=100, min_samples_leaf=5, min_samples_split=2, n_estimators=25)
model2.fit(X_train, y_train)

# Make predictions on the testing set using the first set of models
y_pred_1 = model1.predict(X_test)
y_pred_2 = model2.predict(X_test)
y_pred_1_test = model1.predict(TestDF)
y_pred_2_test = model2.predict(TestDF)

# Stack the predictions from the first set of models
X_stack = np.column_stack((y_pred_1, y_pred_2))
X_stack_test = np.column_stack((y_pred_1_test,y_pred_2_test))

# Train the final model on the stacked predictions
final_model = KNeighborsClassifier(n_neighbors=5)
final_model.fit(X_stack, y_test)

# Make predictions on the testing set using the final model
y_pred_int = final_model.predict(X_stack)
y_pred_int_test = final_model.predict(X_stack_test)

# Calculate the accuracy of the final predictions
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


0.2


In [None]:
predDf_int=pd.DataFrame(y_pred_int, columns=['shop_profile'])
predDf_int_Test=pd.DataFrame(y_pred_int_test, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_int = pd.concat([shop_id_x_test, predDf_int], axis=1)
concatenatedRes_df_int_Test = pd.concat([shop_id_x_TestDF, predDf_int_Test], axis=1)
# concatenatedRes_df = pd.concat([concatenatedRes_df, shop_id_x_test], axis=1)
# concatenated_df = pd.concat([shop_id_x_test, predDf], ignore_index=True)
# expectedResult=expectedResult['shop_profile']
concatenatedRes_df_int=concatenatedRes_df_int['shop_profile']
concatenatedRes_df_int_Test

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(expectedResult, concatenatedRes_df_int, labels=[0], average='weighted')
f1_class1 = f1_score(expectedResult, concatenatedRes_df_int, labels=[1], average='weighted')
f1_class2 = f1_score(expectedResult, concatenatedRes_df_int, labels=[2], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(expectedResult, concatenatedRes_df_int)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()