<a href="https://colab.research.google.com/github/codened/DataStorm-4.0/blob/main/stormingRound/DataStorm_4_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.dtreeg" alt="Open In Colab"/></a>

Path 
stormingRound/DataStorm_4_0.ipynb

# Import necessary libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Importing Data Sets

In [3]:
rawHisTransDF=pd.read_csv('Historical-transaction-data.csv')
rawStoreInfDF=pd.read_csv('Store-info.csv')
rawTestDF=pd.read_csv('Testing-data.csv')

#### Viewing Dataframe

In [4]:
rawHisTransDF.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold
0,ORANGE BARLEY 1.5L,2021-12-11T00:00:00.000Z,147.0,BGXA,SHOP008,220,2
1,GINGER BEER 1.5L,2021-10-17T00:00:00.000Z,371.0,IA25,SHOP112,220,2
2,TONIC PET 500ML,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,160,2
3,CREAM SODA 1L,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,150,2
4,STRAWBERRY MILK 180ML,2021-10-23T00:00:00.000Z,1310.0,7S00,SHOP112,210,5


In [5]:
rawStoreInfDF.head()

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile
0,SHOP047,528,Moderate
1,SHOP009,676,High
2,SHOP083,676,Low
3,SHOP117,676,Low
4,SHOP042,676,Low


# Data Pre Processing

### Fixing Data

In [6]:
# convert the date string column to datetime
rawHisTransDF['transaction_date'] = pd.to_datetime(rawHisTransDF['transaction_date'], format='%Y/%m/%d').dt.date

In [7]:
# Performing left join
merged_df = pd.merge(rawHisTransDF, rawStoreInfDF, on='shop_id', how='left')

In [8]:
rawHisTransDF.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
item_description,438046.0,37.0,GINGER BEER 1.5L,59864.0,,,,,,,
transaction_date,473974.0,62.0,2021-12-11,15125.0,,,,,,,
invoice_id,467654.0,,,,1996684.006321,1344594.276666,17.0,1032113.75,2032996.0,3032568.25,8331754.0
customer_id,473974.0,191636.0,RX33,95.0,,,,,,,
shop_id,473974.0,124.0,SHOP043,6631.0,,,,,,,
item_price,473974.0,,,,206.689734,166.764732,35.0,100.0,200.0,220.0,17400.0
quantity_sold,473974.0,,,,1.925506,1.634535,-1.0,1.0,2.0,2.0,101.0


In [9]:
# get count of null values in each column
null_counts = merged_df.isnull().sum()
# print the counts
print(null_counts)

item_description    35928
transaction_date        0
invoice_id           6320
customer_id             0
shop_id                 0
item_price              0
quantity_sold           0
shop_area_sq_ft         0
shop_profile        86633
dtype: int64


In [10]:
merged_df.dropna(subset=['item_description','invoice_id'], inplace=True)

In [11]:
# get count of null values in each column
null_counts = merged_df.isnull().sum()
# print the counts
print(null_counts)

item_description        0
transaction_date        0
invoice_id              0
customer_id             0
shop_id                 0
item_price              0
quantity_sold           0
shop_area_sq_ft         0
shop_profile        79471
dtype: int64


In [12]:
merged_df.drop_duplicates(inplace=True)

### Encoding 

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
merged_df['item_description'] = le.fit_transform(merged_df['item_description'])
merged_df['customer_id'] = le.fit_transform(merged_df['customer_id'])

In [14]:
merged_df['shop_id'] = merged_df['shop_id'].str.replace(r'^SHOP', '').astype(int)

  merged_df['shop_id'] = merged_df['shop_id'].str.replace(r'^SHOP', '').astype(int)


In [15]:
merged_df['shop_profile'] = merged_df['shop_profile'].replace({'High': 0, 'Moderate': 1, 'Low': 2})
merged_df['shop_profile'] = merged_df['shop_profile'].fillna(0.0).astype(int)
merged_df['invoice_id'] = merged_df['invoice_id'].astype(int)

In [16]:
merged_df


Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile
0,27,2021-12-11,147,57272,8,220,2,678,1
1,14,2021-10-17,371,91334,112,220,2,668,1
2,35,2021-12-13,484,158179,8,160,2,678,1
3,4,2021-12-13,484,158179,8,150,2,678,1
4,34,2021-10-23,1310,38862,112,210,5,668,1
...,...,...,...,...,...,...,...,...,...
473820,13,2021-11-14,8014206,176899,3,60,2,810,0
473821,1,2021-10-16,8304754,52159,127,35,2,848,0
473822,14,2021-10-16,8304807,95280,127,220,1,848,0
473823,1,2021-11-07,8313570,8748,127,35,1,848,0


In [17]:
print(merged_df[merged_df['quantity_sold'] == 0])

        item_description transaction_date  invoice_id  customer_id  shop_id  \
54                     5       2021-12-09       24423        70620        3   
103                   32       2021-10-31       31745       177679       90   
230                    3       2021-12-10       52904        26069       40   
273                   24       2021-11-12       59336       154510      103   
465                   22       2021-11-10      111021       100682       32   
...                  ...              ...         ...          ...      ...   
473258                 5       2021-12-04     4363428       108185      113   
473355                32       2021-11-21     4479312       154031       62   
473412                 0       2021-11-13     4603720        68657       58   
473462                35       2021-12-10     4653601        63057       77   
473500                11       2021-10-27     4798624        37513      127   

        item_price  quantity_sold  shop_area_sq_ft 

In [18]:
merged_df = merged_df[merged_df['quantity_sold'] != 0]

In [19]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile
0,27,2021-12-11,147,57272,8,220,2,678,1
1,14,2021-10-17,371,91334,112,220,2,668,1
2,35,2021-12-13,484,158179,8,160,2,678,1
3,4,2021-12-13,484,158179,8,150,2,678,1
4,34,2021-10-23,1310,38862,112,210,5,668,1
...,...,...,...,...,...,...,...,...,...
473820,13,2021-11-14,8014206,176899,3,60,2,810,0
473821,1,2021-10-16,8304754,52159,127,35,2,848,0
473822,14,2021-10-16,8304807,95280,127,220,1,848,0
473823,1,2021-11-07,8313570,8748,127,35,1,848,0


# Feature Engineering

### Feature Creation

In [20]:
merged_df['full_price'] = merged_df['quantity_sold'] * merged_df['item_price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['full_price'] = merged_df['quantity_sold'] * merged_df['item_price']


#### creating Avarage daily sales for each shop

In [21]:
merged_df['Daily_Sales'] = merged_df.groupby(['shop_id', 'transaction_date'])['full_price'].transform('sum')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Daily_Sales'] = merged_df.groupby(['shop_id', 'transaction_date'])['full_price'].transform('sum')


In [22]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375
1,14,2021-10-17,371,91334,112,220,2,668,1,440,23290
2,35,2021-12-13,484,158179,8,160,2,678,1,320,3625
3,4,2021-12-13,484,158179,8,150,2,678,1,300,3625
4,34,2021-10-23,1310,38862,112,210,5,668,1,1050,19600
...,...,...,...,...,...,...,...,...,...,...,...
473820,13,2021-11-14,8014206,176899,3,60,2,810,0,120,27765
473821,1,2021-10-16,8304754,52159,127,35,2,848,0,70,33180
473822,14,2021-10-16,8304807,95280,127,220,1,848,0,220,33180
473823,1,2021-11-07,8313570,8748,127,35,1,848,0,35,109150


In [23]:
subset = merged_df.loc[(merged_df['transaction_date'] == pd.to_datetime('2021-12-11')) & (merged_df['shop_id'] == 8)]

  subset = merged_df.loc[(merged_df['transaction_date'] == pd.to_datetime('2021-12-11')) & (merged_df['shop_id'] == 8)]


In [24]:
# Group by shop id and calculate mean of daily_sales column
avg_sales = merged_df.groupby('shop_id')['Daily_Sales'].mean().reset_index()

# Merge the average sales data back into the original dataframe
merged_df = merged_df.merge(avg_sales, on='shop_id', suffixes=('', '_avg'))

# Print the updated dataframe
merged_df.head()

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales,Daily_Sales_avg
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375,37040.173913
1,35,2021-12-13,484,158179,8,160,2,678,1,320,3625,37040.173913
2,4,2021-12-13,484,158179,8,150,2,678,1,300,3625,37040.173913
3,14,2021-12-10,1000053,159040,8,220,1,678,1,220,27435,37040.173913
4,14,2021-12-10,1000057,43724,8,440,1,678,1,440,27435,37040.173913


#### Full revinew

In [25]:
merged_df['revnew'] = merged_df.groupby(['shop_id'])['full_price'].transform('sum')

In [26]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales,Daily_Sales_avg,revnew
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375,37040.173913,121225
1,35,2021-12-13,484,158179,8,160,2,678,1,320,3625,37040.173913,121225
2,4,2021-12-13,484,158179,8,150,2,678,1,300,3625,37040.173913,121225
3,14,2021-12-10,1000053,159040,8,220,1,678,1,220,27435,37040.173913,121225
4,14,2021-12-10,1000057,43724,8,440,1,678,1,440,27435,37040.173913,121225
...,...,...,...,...,...,...,...,...,...,...,...,...,...
394475,12,2021-11-28,7027965,44679,72,290,2,617,0,580,38705,41364.617086,2410580
394476,24,2021-12-03,7029132,4618,72,440,3,617,0,1320,42170,41364.617086,2410580
394477,16,2021-12-08,7030122,63983,72,100,2,617,0,200,20010,41364.617086,2410580
394478,13,2021-12-11,7030809,89595,72,140,2,617,0,280,42000,41364.617086,2410580


#### Revnew per sqr feet of land

In [27]:
merged_df['rev_per_sqfeet'] = (merged_df['revnew'] / merged_df['shop_area_sq_ft']).round().astype(int)


In [28]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales,Daily_Sales_avg,revnew,rev_per_sqfeet
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375,37040.173913,121225,179
1,35,2021-12-13,484,158179,8,160,2,678,1,320,3625,37040.173913,121225,179
2,4,2021-12-13,484,158179,8,150,2,678,1,300,3625,37040.173913,121225,179
3,14,2021-12-10,1000053,159040,8,220,1,678,1,220,27435,37040.173913,121225,179
4,14,2021-12-10,1000057,43724,8,440,1,678,1,440,27435,37040.173913,121225,179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394475,12,2021-11-28,7027965,44679,72,290,2,617,0,580,38705,41364.617086,2410580,3907
394476,24,2021-12-03,7029132,4618,72,440,3,617,0,1320,42170,41364.617086,2410580,3907
394477,16,2021-12-08,7030122,63983,72,100,2,617,0,200,20010,41364.617086,2410580,3907
394478,13,2021-12-11,7030809,89595,72,140,2,617,0,280,42000,41364.617086,2410580,3907


#### Avarage sold item types per each shop 

In [29]:
# group the original table by Shop ID and Transaction Date and count the unique items sold on each day
daily_items_sold = merged_df.groupby(['shop_id', 'transaction_date'])['item_description'].nunique().reset_index()

# group the resulting table by Shop ID and take the mean of the nunique column
avg_daily_items_sold = daily_items_sold.groupby('shop_id')['item_description'].mean().reset_index()

# rename the columns
avg_daily_items_sold.columns = ['shop_id', 'avd_daily_items_types_sold']
# convert float column to integers
avg_daily_items_sold['avd_daily_items_types_sold'] = avg_daily_items_sold['avd_daily_items_types_sold'].round().astype(int)

# merge with the original dataframe
merged_df = pd.merge(merged_df, avg_daily_items_sold, on='shop_id', how='left')

In [30]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales,Daily_Sales_avg,revnew,rev_per_sqfeet,avd_daily_items_types_sold
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375,37040.173913,121225,179,16
1,35,2021-12-13,484,158179,8,160,2,678,1,320,3625,37040.173913,121225,179,16
2,4,2021-12-13,484,158179,8,150,2,678,1,300,3625,37040.173913,121225,179,16
3,14,2021-12-10,1000053,159040,8,220,1,678,1,220,27435,37040.173913,121225,179,16
4,14,2021-12-10,1000057,43724,8,440,1,678,1,440,27435,37040.173913,121225,179,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394475,12,2021-11-28,7027965,44679,72,290,2,617,0,580,38705,41364.617086,2410580,3907,22
394476,24,2021-12-03,7029132,4618,72,440,3,617,0,1320,42170,41364.617086,2410580,3907,22
394477,16,2021-12-08,7030122,63983,72,100,2,617,0,200,20010,41364.617086,2410580,3907,22
394478,13,2021-12-11,7030809,89595,72,140,2,617,0,280,42000,41364.617086,2410580,3907,22


#### Avarage Daily Transactions per each shop

In [31]:
# group the original table by Shop ID and Transaction Date and count the unique items sold on each day
daily_trans = merged_df.groupby(['shop_id', 'transaction_date'])['invoice_id'].nunique().reset_index()

# group the resulting table by Shop ID and take the mean of the nunique column
avg_daily_trans = daily_trans.groupby('shop_id')['invoice_id'].mean().reset_index()

# rename the columns
avg_daily_trans.columns = ['shop_id', 'avd_daily_transctions']
# convert float column to integers
avg_daily_trans['avd_daily_transctions'] = avg_daily_trans['avd_daily_transctions'].round().astype(int)

# merge with the original dataframe
merged_df = pd.merge(merged_df, avg_daily_trans, on='shop_id', how='left')

In [32]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales,Daily_Sales_avg,revnew,rev_per_sqfeet,avd_daily_items_types_sold,avd_daily_transctions
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375,37040.173913,121225,179,16,32
1,35,2021-12-13,484,158179,8,160,2,678,1,320,3625,37040.173913,121225,179,16,32
2,4,2021-12-13,484,158179,8,150,2,678,1,300,3625,37040.173913,121225,179,16,32
3,14,2021-12-10,1000053,159040,8,220,1,678,1,220,27435,37040.173913,121225,179,16,32
4,14,2021-12-10,1000057,43724,8,440,1,678,1,440,27435,37040.173913,121225,179,16,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394475,12,2021-11-28,7027965,44679,72,290,2,617,0,580,38705,41364.617086,2410580,3907,22,49
394476,24,2021-12-03,7029132,4618,72,440,3,617,0,1320,42170,41364.617086,2410580,3907,22,49
394477,16,2021-12-08,7030122,63983,72,100,2,617,0,200,20010,41364.617086,2410580,3907,22,49
394478,13,2021-12-11,7030809,89595,72,140,2,617,0,280,42000,41364.617086,2410580,3907,22,49


#### Average number of custemers per day

In [33]:
# group the original table by Shop ID and Transaction Date and count the unique items sold on each day
daily_custemers = merged_df.groupby(['shop_id', 'transaction_date'])['customer_id'].nunique().reset_index()

# group the resulting table by Shop ID and take the mean of the nunique column
avg_daily_custemers = daily_custemers.groupby('shop_id')['customer_id'].mean().reset_index()

# rename the columns
avg_daily_custemers.columns = ['shop_id', 'avd_daily_custemers']
# convert float column to integers
avg_daily_custemers['avd_daily_custemers'] = avg_daily_custemers['avd_daily_custemers'].round().astype(int)

# merge with the original dataframe
merged_df = pd.merge(merged_df, avg_daily_custemers, on='shop_id', how='left')

In [34]:
merged_df

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,full_price,Daily_Sales,Daily_Sales_avg,revnew,rev_per_sqfeet,avd_daily_items_types_sold,avd_daily_transctions,avd_daily_custemers
0,27,2021-12-11,147,57272,8,220,2,678,1,440,65375,37040.173913,121225,179,16,32,32
1,35,2021-12-13,484,158179,8,160,2,678,1,320,3625,37040.173913,121225,179,16,32,32
2,4,2021-12-13,484,158179,8,150,2,678,1,300,3625,37040.173913,121225,179,16,32,32
3,14,2021-12-10,1000053,159040,8,220,1,678,1,220,27435,37040.173913,121225,179,16,32,32
4,14,2021-12-10,1000057,43724,8,440,1,678,1,440,27435,37040.173913,121225,179,16,32,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394475,12,2021-11-28,7027965,44679,72,290,2,617,0,580,38705,41364.617086,2410580,3907,22,49,48
394476,24,2021-12-03,7029132,4618,72,440,3,617,0,1320,42170,41364.617086,2410580,3907,22,49,48
394477,16,2021-12-08,7030122,63983,72,100,2,617,0,200,20010,41364.617086,2410580,3907,22,49,48
394478,13,2021-12-11,7030809,89595,72,140,2,617,0,280,42000,41364.617086,2410580,3907,22,49,48


#### Persentage of Avarage number of time the same customer returning for the same shop

In [35]:
# calculate the number of times each customer visited each shop
visits = merged_df.groupby(['customer_id', 'shop_id'])['transaction_date'].count()
# calculate the average number of visits per customer per shop
avg_visits = visits.groupby(['shop_id']).mean()*100

avg_visits=avg_visits.round().astype(int)
# create a new DataFrame with the average visits
avg_visits_df = avg_visits.reset_index().rename(columns={'transaction_date': 'avg_visits'})

# merge the new DataFrame with the original DataFrame to add the average visits column
merged_df = pd.merge(merged_df, avg_visits_df, on=['shop_id'])

In [36]:
merged_df.to_csv('output.csv', index=False)

# Visualizing

In [None]:
# Create correlation matrix
corr = merged_df.corr()

# Set figure size
plt.figure(figsize=(12, 8))

# Plot correlation matrix as heatmap
sns.heatmap(corr, annot=True, cmap='coolwarm')

# Display plot
plt.show()

In [None]:
# Drop highly co related features
cleanedDF = merged_df.drop(['avd_daily_custemers','transaction_date','revnew','item_price','item_description','quantity_sold','full_price','customer_id'], axis=1)

In [None]:
# Create correlation matrix
corr = cleanedDF.corr()

# # Set figure size
# plt.figure(figsize=(12, 8))

# Plot correlation matrix as heatmap
sns.heatmap(corr, annot=True, cmap='coolwarm')

# Display plot
plt.show()

# Split To Test and Train Data

In [None]:
# Split the DataFrame into two based on column B
TestDF = cleanedDF[cleanedDF['shop_profile'] == 0].drop(['shop_profile'], axis=1)
TrainDF = cleanedDF[cleanedDF['shop_profile'] != 0]

In [None]:
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt

# Separate the target variable
X = TrainDF.drop(['shop_profile'], axis=1)
y = TrainDF['shop_profile']

# Compute MI scores
mi_scores = mutual_info_classif(X, y)

# Convert to DataFrame and sort by MI score
mi_scores_df = pd.DataFrame({'feature': X.columns, 'mi_score': mi_scores})
mi_scores_df = mi_scores_df.sort_values('mi_score', ascending=False)

# Plot bar chart of MI scores
plt.figure(figsize=(12,8))
plt.bar(mi_scores_df['feature'], mi_scores_df['mi_score'])
plt.xticks(rotation=90)
plt.xlabel('Feature')
plt.ylabel('MI Score')
plt.show()

In [None]:
# Split Fulldata into training and testing sets
from sklearn.model_selection import train_test_split

column_name = 'shop_id'
unique_categories = TrainDF[column_name].nunique()
categories_in_dataset_1 = int(unique_categories * 0.8)
categories_in_dataset_2 = unique_categories - categories_in_dataset_1
dataset_1_categories = TrainDF[column_name].unique()[:categories_in_dataset_1]
dataset_2_categories = TrainDF[column_name].unique()[categories_in_dataset_1:]

train_data = TrainDF[TrainDF[column_name].isin(dataset_1_categories)]
test_data = TrainDF[TrainDF[column_name].isin(dataset_2_categories)]





#train_data, test_data = train_test_split(TrainDF, test_size=0.01)

In [None]:
# remove store id from the training and testing sets

train_data_noID = train_data.drop(['shop_id'], axis=1)
test_data_noID = test_data.drop(['shop_id'], axis=1)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


# Split data into training and test sets
X_train=train_data_noID.drop('shop_profile', axis=1)
y_train=train_data_noID['shop_profile']
X_test=test_data_noID.drop('shop_profile', axis=1)
y_test=test_data_noID['shop_profile']

# Define hyperparameter search space
params = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'gamma': [0, 0.1, 0.5, 1],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1]
}

# Create XGBoost regressor
xgb = XGBRegressor(tree_method='gpu_hist', gpu_id=0)

# Perform randomized search over hyperparameters
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=50, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters: ", random_search.best_params_)

# Get predictions on test set using best model
best_model = random_search.best_estimator_
xg_pred = best_model.predict(X_test)

# Evaluate model performance on test set
mse = mean_squared_error(y_test, xg_pred)
rmse = np.sqrt(mse)
print("RMSE: ", rmse)


Best hyperparameters:  {'subsample': 0.5, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}

In [None]:
concatenated_XG_res_ = pd.concat([X_testres, predDf_dtree], axis=1)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Split data into training and test sets
X_train=train_data.drop('shop_profile', axis=1)
y_train=train_data['shop_profile']
X_test=test_data.drop('shop_profile', axis=1)
y_test=test_data['shop_profile']


# define XGBoost model with best hyperparameters
xgb_model = xgb.XGBRegressor(
    subsample=0.5,
    reg_lambda=0,
    reg_alpha=0,
    n_estimators=500,
    max_depth=7,
    learning_rate=0.01,
    gamma=0,
    colsample_bytree=0.5,
    gpu_id=0,
    tree_method='gpu_hist' # using GPU acceleration
)

# fit model to training data
xgb_model.fit(X_train, y_train)

# make predictions on test data
xg_pred = xgb_model.predict(X_test)

# evaluate performance of model
mse = mean_squared_error(y_test, xg_pred)
print('MSE:', mse)

In [None]:
# import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# load the data and split into training and testing sets
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# make predictions on the test set
y_pred = clf.predict(X_test)

# calculate the F1 score for each class
f1_class0 = f1_score(y_test, y_pred, labels=[0], average='weighted')
f1_class1 = f1_score(y_test, y_pred, labels=[1], average='weighted')
f1_class2 = f1_score(y_test, y_pred, labels=[2], average='weighted')

# calculate the average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

# print the results
print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")


In [None]:
ResMode_df_XG

#### Random Forest

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


X_train=train_data.drop(['shop_profile'], axis=1)
y_train=train_data['shop_profile']
X_test= test_data.drop(['shop_profile'], axis=1)
y_test=test_data['shop_profile']

# Define the logistic regression model
model = RandomForestClassifier()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
predictions = model.predict(X_test)

accu = accuracy_score(y_test, predictions)

print(accu)
# print(f1_score(y_test, predictions, average=None))



In [None]:
expectedResult=test_data[['shop_id','shop_profile']]
print(expectedResult)
unique_values = expectedResult['shop_id'].nunique()
print(unique_values)

predDf=pd.DataFrame(predictions, columns=['shop_profile'])
X_testres = X_test.reset_index(drop=True)

# Concatenate DataFrames
concatenatedRes_df = pd.concat([X_testres, predDf], axis=1)

ResMode_df = concatenatedRes_df.groupby('shop_id')['shop_profile'].agg(pd.Series.mode)
TestMode_df = expectedResult.groupby('shop_id')['shop_profile'].agg(pd.Series.mode)

In [None]:
TestMode_df

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(TestMode_df, ResMode_df, labels=[1], average='weighted')
f1_class1 = f1_score(TestMode_df, ResMode_df, labels=[2], average='weighted')
f1_class2 = f1_score(TestMode_df, ResMode_df, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(TestMode_df, ResMode_df)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
predictions_logr = model.predict(X_test)

accu = accuracy_score(y_test, predictions_logr)

print(accu)

In [None]:
predDf_logr=pd.DataFrame(predictions_logr, columns=['shop_profile'])

In [None]:
# Concatenate DataFrames
concatenatedRes_df_logr = pd.concat([X_testres, predDf_logr], axis=1)

In [None]:
ResMode_df_logr = concatenatedRes_df_logr.groupby('shop_id')['shop_profile'].agg(pd.Series.mode)

In [None]:
# Calculate F1 score for each class
f1_class0 = f1_score(TestMode_df, ResMode_df_logr, labels=[1], average='weighted')
f1_class1 = f1_score(TestMode_df, ResMode_df_logr, labels=[2], average='weighted')
f1_class2 = f1_score(TestMode_df, ResMode_df_logr, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(TestMode_df, ResMode_df_logr)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Make predictions on the testing data
predictions_dtree = model.predict(X_test)

accu = accuracy_score(y_test, predictions_dtree)

print(accu)

In [None]:
predDf_dtree=pd.DataFrame(predictions_dtree, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_dtree = pd.concat([X_testres, predDf_dtree], axis=1)
ResMode_df_dtree = concatenatedRes_df_dtree.groupby('shop_id')['shop_profile'].agg(pd.Series.mode)

# Calculate F1 score for each class
f1_class0 = f1_score(TestMode_df, ResMode_df_dtree, labels=[1], average='weighted')
f1_class1 = f1_score(TestMode_df, ResMode_df_dtree, labels=[2], average='weighted')
f1_class2 = f1_score(TestMode_df, ResMode_df_dtree, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(TestMode_df, ResMode_df_dtree)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

#### Support Vector

In [None]:
from sklearn.svm import SVC

# Initialize and train model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Make predictions on the testing data
predictions_sv = model.predict(X_test)

accu = accuracy_score(y_test, predictions_sv)

print(accu)

In [None]:
predDf_sv=pd.DataFrame(predictions_sv, columns=['shop_profile'])
# Concatenate DataFrames
concatenatedRes_df_sv = pd.concat([X_testres, predDf_sv], axis=1)
ResMode_df_sv = concatenatedRes_df_sv.groupby('shop_id')['shop_profile'].agg(pd.Series.mode)

# Calculate F1 score for each class
f1_class0 = f1_score(TestMode_df, ResMode_df_sv, labels=[1], average='weighted')
f1_class1 = f1_score(TestMode_df, ResMode_df_sv, labels=[2], average='weighted')
f1_class2 = f1_score(TestMode_df, ResMode_df_sv, labels=[3], average='weighted')

# Calculate average F1 score
f1_average = (f1_class0 + f1_class1 + f1_class2) / 3

print(f"F1 score for class 0: {f1_class0:.2f}")
print(f"F1 score for class 1: {f1_class1:.2f}")
print(f"F1 score for class 2: {f1_class2:.2f}")
print(f"Average F1 score: {f1_average:.2f}")

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create confusion matrix
matrix = confusion_matrix(TestMode_df, ResMode_df_sv)

# Visualize confusion matrix
sns.heatmap(matrix, annot=True)
plt.show()

-----------------------------------------------------------------------------------------------------------

In [None]:
# predict for test values
Evalpredictions = model.predict(TestDF.drop(['transaction_date'], axis=1))

EvalpredictionsDF=pd.DataFrame(Evalpredictions, columns=['shop_profile'])


In [None]:
EvalpredictionsDF

In [None]:
TestDFinReseted = TestDF.reset_index(drop=True)
# Concatenate DataFrames
concatenatedEval_df = pd.concat([TestDFinReseted['shop_id'], EvalpredictionsDF], axis=1)

In [None]:
EvalMode_df = (concatenatedEval_df.groupby('shop_id')['shop_profile'].agg(pd.Series.mode)).to_frame()

In [None]:
EvalMode_df

In [None]:
# Save array to CSV file
np.savetxt('EvalResult.csv', EvalMode_df, delimiter=',')


In [None]:
UploadShid=pd.read_csv('Testing-datatoUpload.csv')


In [None]:
UploadShid

In [None]:
UploadShid['shop_profile'] = UploadShid['shop_profile'].replace({1 : 'High', 2 : 'Moderate',3 : 'Low'})

In [None]:
UploadShid

In [None]:
# Save array to CSV file
UploadShid.to_csv('Testing-datatoUpload.csv', index=False)


In [None]:
# # Import necessary libraries
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# from sklearn.metrics import accuracy_score, confusion_matrix

# X_train=train_data.drop(['shop_profile','transaction_date'], axis=1)
# y_train=train_data['shop_profile']
# X_test= test_data.drop(['shop_profile','transaction_date'], axis=1)
# y_test=test_data['shop_profile']

# # Create a list of machine learning models to try out
# models = []
# models.append(('Logistic Regression', LogisticRegression()))
# models.append(('SVM', SVC()))
# models.append(('Decision Tree', DecisionTreeClassifier()))
# models.append(('Random Forest', RandomForestClassifier()))
# models.append(('AdaBoost', AdaBoostClassifier()))
# models.append(('Extra Trees', ExtraTreesClassifier()))
# models.append(('K-Nearest Neighbors', KNeighborsClassifier()))
# models.append(('Gaussian Naive Bayes', GaussianNB()))
# models.append(('Linear Discriminant Analysis', LinearDiscriminantAnalysis()))
# models.append(('Gradient Boosting', GradientBoostingClassifier()))


# # Define the hyperparameters to tune for each model
# params = {
#     'Logistic Regression': {'C': [0.1, 1, 10]},
#     'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
#     'Decision Tree': {'max_depth': [2, 4, 6]},
#     'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [2, 4, 6]},    
#     'AdaBoost': {'learning_rate': [0.1, 0.01], 'n_estimators': [100, 200, 300]},
#     'Extra Trees': {'n_estimators': [100, 200, 300], 'max_depth': [2, 4, 6]},
#     'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7]},
#     'Gaussian Naive Bayes': {},
#     'Linear Discriminant Analysis': {'solver': ['svd', 'lsqr']},
#     'Gradient Boosting': {'learning_rate': [0.1, 0.01], 'n_estimators': [100, 200, 300], 'max_depth': [2, 4, 6]}
# }

# # Train and evaluate each model with hyperparameter tuning
# results = []
# names = []

# accuResults=[]

# resultsxxx=[]

# for name, model in models:
#     param_grid = params[name]
#     clf = GridSearchCV(model, param_grid, cv=5)
#     clf.fit(X_train, y_train)  # Fit the GridSearchCV object to the training data
#     cv_results = cross_val_score(clf, X_train, y_train, cv=5)
#     results.append(cv_results)
#     names.append(name)

#     y_pred = clf.predict(X_test)
#     accu = accuracy_score(y_test, y_pred)
#     accuResults.append(accu)

#     resultsxxx.append(accu*cv_results)

#     print(f'{name}: cv : {cv_results.mean()}')
#     print(f'{name}: accu : {accu}')
#     print(f'Best parameters: {clf.best_params_}')  # Print the best parameters inside the loop


# # Select the best model based on mean cross-validation score
# best_idx_cv = np.argmax([np.mean(r) for r in results])
# best_model_cv = models[best_idx_cv][1]
# print(f'Best model from cv mean: {names[best_idx_cv]}')

# # Evaluate the best model on the test set
# best_model_cv.fit(X_train, y_train)
# y_pred = best_model_cv.predict(X_test)
# cv_resultscv = cross_val_score(best_model_cv, X_train, y_train, cv=5)
# print(f'CV : {cv_resultscv.mean()}')
# print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# # Create a confusion matrix to visualize the performance of the model
# cm1 = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm1, annot=True, cmap='Blues', fmt='g')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title(f'Confusion Matrix for {best_model_cv}')
# plt.show()



# # Select the best model based on mean accuracy score
# best_idx_acc = np.argmax(accuResults)
# best_model_acc = models[best_idx_acc][1]
# print(f'Best model from accue Accuracy: {names[best_idx_acc]}')

# # Evaluate the best model on the test set
# best_model_acc.fit(X_train, y_train)
# y_pred = best_model_acc.predict(X_test)
# cv_resultsAcc = cross_val_score(best_model_acc, X_train, y_train, cv=5)
# print(f'CV : {cv_resultsAcc.mean()}')
# print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# # Create a confusion matrix to visualize the performance of the model
# cm2 = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm2, annot=True, cmap='Blues', fmt='g')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title(f'Confusion Matrix for {best_model_acc}')
# plt.show()




# # Select the best model based on mean multiplication
# best_id_mul = np.argmax(resultsxxx)
# best_model_mul = models[best_id_mul][1]
# print(f'Best model from multiplication of two: {names[best_id_mul]}')

# # Evaluate the best model on the test set
# best_model_mul.fit(X_train, y_train)
# y_pred = best_model_mul.predict(X_test)
# cv_resultsAcc = cross_val_score(best_model_mul, X_train, y_train, cv=5)
# print(f'CV : {cv_resultsAcc.mean()}')
# print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# print(f'Mul : {cv_resultsAcc.mean()*accuracy_score(y_test, y_pred)}')

# # Create a confusion matrix to visualize the performance of the model
# cm3 = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm3, annot=True, cmap='Blues', fmt='g')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title(f'Confusion Matrix for {best_model_mul}')
# plt.show()