# Import Libraries

In [52]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Importing data sets

In [53]:
# Read the three input CSV files from the working directory into DataFrames.
rawHisTransDF, rawStoreInfDF, rawTestDF = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Historical-transaction-data.csv',
        'Store-info.csv',
        'Testing-data.csv',
    )
)

# Viewing data frame

In [54]:
# Preview the first five transaction rows to inspect the columns and raw value formats.
rawHisTransDF.head(n=5)

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold
0,ORANGE BARLEY 1.5L,2021-12-11T00:00:00.000Z,147.0,BGXA,SHOP008,220,2
1,GINGER BEER 1.5L,2021-10-17T00:00:00.000Z,371.0,IA25,SHOP112,220,2
2,TONIC PET 500ML,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,160,2
3,CREAM SODA 1L,2021-12-13T00:00:00.000Z,484.0,VN7V,SHOP008,150,2
4,STRAWBERRY MILK 180ML,2021-10-23T00:00:00.000Z,1310.0,7S00,SHOP112,210,5


In [55]:
# Preview the first five store rows to inspect the columns and raw value formats.
rawStoreInfDF.head(n=5)

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile
0,SHOP047,528,Moderate
1,SHOP009,676,High
2,SHOP083,676,Low
3,SHOP117,676,Low
4,SHOP042,676,Low


# Data Preprocessing

#### Fixing data

In [56]:
# Convert the timestamp strings to plain calendar dates.
# The raw values are ISO-8601 timestamps such as '2021-12-11T00:00:00.000Z', so the
# previous format='%Y/%m/%d' did not describe them; pandas 2.x parses strictly and
# raises on that mismatch. Letting pandas infer the ISO-8601 layout works on old and
# new versions alike. utc=True makes the handling of the trailing 'Z' explicit.
rawHisTransDF['transaction_date'] = pd.to_datetime(
    rawHisTransDF['transaction_date'], utc=True
).dt.date

In [57]:
# Tally the missing values in every column of the transaction table and print the tally.
null_counts = rawHisTransDF.isna().sum()
print(null_counts)

item_description    35928
transaction_date        0
invoice_id           6320
customer_id             0
shop_id                 0
item_price              0
quantity_sold           0
dtype: int64


In [58]:
# Discard transaction rows that lack an item description or an invoice id.
rawHisTransDF = rawHisTransDF.dropna(subset=['item_description', 'invoice_id'])

In [59]:
# Re-check the per-column missing-value tally of the transaction table after the row drop.
null_counts = rawHisTransDF.isna().sum()
print(null_counts)

item_description    0
transaction_date    0
invoice_id          0
customer_id         0
shop_id             0
item_price          0
quantity_sold       0
dtype: int64


In [60]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
rawHisTransDF = rawHisTransDF.drop_duplicates(keep='first')

# Encoding

In [61]:
from sklearn.preprocessing import LabelEncoder

# Replace the item and customer strings with integer codes.
# The same encoder object is refit for each column, so after this cell `le` only
# holds the customer_id mapping.
le = LabelEncoder()
rawHisTransDF['item_description'] = le.fit_transform(rawHisTransDF['item_description'])
rawHisTransDF['customer_id'] = le.fit_transform(rawHisTransDF['customer_id'])

# Turn 'SHOP008' into 8. regex=True must be passed explicitly: the default became
# regex=False in pandas 2.0, where '^SHOP' would be searched for literally, nothing
# would be stripped, and astype(int) would fail. It also silences the FutureWarning
# on older pandas versions.
rawHisTransDF['shop_id'] = rawHisTransDF['shop_id'].str.replace(r'^SHOP', '', regex=True).astype(int)
rawStoreInfDF['shop_id'] = rawStoreInfDF['shop_id'].str.replace(r'^SHOP', '', regex=True).astype(int)

  rawHisTransDF['shop_id'] = rawHisTransDF['shop_id'].str.replace(r'^SHOP', '').astype(int)
  rawStoreInfDF['shop_id'] = rawStoreInfDF['shop_id'].str.replace(r'^SHOP', '').astype(int)


In [62]:
# Map the shop_profile labels to integer codes; rows with a missing profile become 0.
profile_codes = {'High': 1, 'Moderate': 2, 'Low': 3}
rawStoreInfDF['shop_profile'] = (
    rawStoreInfDF['shop_profile']
    .replace(profile_codes)
    .fillna(0.0)
    .astype(int)
)

# Cast invoice ids (floats in the raw file) to integers.
rawHisTransDF['invoice_id'] = rawHisTransDF['invoice_id'].astype(int)

In [63]:
# Display the store table after encoding: shop_id is an integer and shop_profile is
# coded 1 (High), 2 (Moderate), 3 (Low), with 0 for rows whose profile was missing.
rawStoreInfDF

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile
0,47,528,2
1,9,676,1
2,83,676,3
3,117,676,3
4,42,676,3
...,...,...,...
119,87,527,0
120,50,411,0
121,61,699,0
122,56,597,0


In [64]:
# Display the transaction table after cleaning and encoding: item_description,
# customer_id and shop_id are now integers and transaction_date is a plain date.
rawHisTransDF

Unnamed: 0,item_description,transaction_date,invoice_id,customer_id,shop_id,item_price,quantity_sold
0,27,2021-12-11,147,57272,8,220,2
1,14,2021-10-17,371,91334,112,220,2
2,35,2021-12-13,484,158179,8,160,2
3,4,2021-12-13,484,158179,8,150,2
4,34,2021-10-23,1310,38862,112,210,5
...,...,...,...,...,...,...,...
473820,13,2021-11-14,8014206,176899,3,60,2
473821,1,2021-10-16,8304754,52159,127,35,2
473822,14,2021-10-16,8304807,95280,127,220,1
473823,1,2021-11-07,8313570,8748,127,35,1


# Feature Creation

In [65]:
# Count the rows recorded for each shop_id and store the count as 'num_of_transactions'.
# Every row of rawHisTransDF adds one to its shop's total.
# NOTE(review): an invoice_id can span several rows, so this is a row count rather
# than a count of distinct invoices - confirm that is the intended feature.
transactions_by_shop = (
    rawHisTransDF
    .groupby('shop_id')
    .size()
    .reset_index(name='num_of_transactions')
)

In [66]:
# Attach the per-shop row counts to the store table. This is an inner join on
# shop_id, so a shop that is absent from transactions_by_shop is dropped.
rawStoreInfDF = rawStoreInfDF.merge(transactions_by_shop, how='inner', on='shop_id')
rawStoreInfDF

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,num_of_transactions
0,47,528,2,1396
1,9,676,1,3688
2,83,676,3,2981
3,117,676,3,3544
4,42,676,3,2683
...,...,...,...,...
119,87,527,0,1713
120,50,411,0,3554
121,61,699,0,2761
122,56,597,0,4846


In [67]:
# Tally the missing values in every column of the merged store table and print the tally.
null_counts = rawStoreInfDF.isna().sum()
print(null_counts)

shop_id                0
shop_area_sq_ft        0
shop_profile           0
num_of_transactions    0
dtype: int64


In [68]:
# Load 'output.csv' from the working directory into a DataFrame.
# NOTE(review): nothing in the visible notebook creates this file; presumably it
# holds per-shop features computed elsewhere - confirm and document its provenance.
output=pd.read_csv('output.csv')

In [69]:
# Restrict `output` to the shop key plus seven feature columns.
selected_columns = [
    'shop_id',
    'Daily_Sales_avg',
    'revnew',
    'rev_per_sqfeet',
    'avd_daily_items_types_sold',
    'avd_daily_transctions',
    'avd_daily_custemers',
    'avg_visits',
]
output = output[selected_columns]

In [71]:
# Remove rows of `output` that exactly repeat an earlier row, keeping the first occurrence.
output = output.drop_duplicates(keep='first')

In [72]:
# Attach the feature columns of `output` to the store table. This is an inner join
# on shop_id, so a shop that is absent from `output` is dropped.
rawStoreInfDF = rawStoreInfDF.merge(output, how='inner', on='shop_id')
rawStoreInfDF

Unnamed: 0,shop_id,shop_area_sq_ft,shop_profile,num_of_transactions,Daily_Sales_avg,revnew,rev_per_sqfeet,avd_daily_items_types_sold,avd_daily_transctions,avd_daily_custemers,avg_visits
0,47,528,2,1396,12942.461651,697510,1321,12,19,18,164
1,9,676,1,3688,30764.239580,1605555,2375,21,49,49,166
2,83,676,3,2981,27360.354452,1476445,2184,19,39,39,172
3,117,676,3,3544,38445.995702,2033385,3008,20,47,47,186
4,42,676,3,2683,21678.661223,1133820,1677,18,36,36,160
...,...,...,...,...,...,...,...,...,...,...,...
119,87,527,0,1713,20176.512253,1048015,1989,13,22,22,168
120,50,411,0,3554,42217.525685,2133300,5191,18,48,48,208
121,61,699,0,2761,44238.682756,1983470,2838,17,36,35,166
122,56,597,0,4846,46307.680336,2494620,4179,24,64,64,182


# Split to train and test data

In [73]:
# Separate the shops by label availability: shop_profile == 0 marks rows without a
# label. Those rows form TestDF (with the label column removed); the rest form TrainDF.
is_unlabelled = rawStoreInfDF['shop_profile'] == 0
TestDF = rawStoreInfDF[is_unlabelled].drop(columns=['shop_profile'])
TrainDF = rawStoreInfDF[~is_unlabelled]

In [74]:
# Split the labelled shops into a training part and a held-out part by shop_id.
# The first 80% of the distinct shop ids (in order of appearance, no shuffling) go
# to train_data and the remaining ids go to test_data.
from sklearn.model_selection import train_test_split  # not called in this cell

column_name = 'shop_id'
shop_keys = TrainDF[column_name]

unique_categories = shop_keys.nunique()
categories_in_dataset_1 = int(unique_categories * 0.8)
categories_in_dataset_2 = unique_categories - categories_in_dataset_1

ordered_ids = shop_keys.unique()
dataset_1_categories, dataset_2_categories = (
    ordered_ids[:categories_in_dataset_1],
    ordered_ids[categories_in_dataset_1:],
)

train_data = TrainDF.loc[shop_keys.isin(dataset_1_categories)]
test_data = TrainDF.loc[shop_keys.isin(dataset_2_categories)]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Build the feature matrices and label vectors from the train/held-out split.
# Previously X_train, y_train, X_test, y_test and accuracy_score were never defined
# or imported, so this cell raised NameError on a fresh kernel.
target_col = 'shop_profile'
# NOTE(review): shop_id is excluded because it is an identifier, not a measurement;
# confirm this matches the intended feature set.
feature_cols = [col for col in train_data.columns if col not in (target_col, 'shop_id')]

X_train, y_train = train_data[feature_cols], train_data[target_col]
X_test, y_test = test_data[feature_cols], test_data[target_col]

# Initialize and train the model; a fixed random_state makes the fitted tree repeatable.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict the profile of the held-out shops and report the fraction predicted correctly.
predictions_dtree = model.predict(X_test)

accu = accuracy_score(y_test, predictions_dtree)

print(accu)