**IMPORTS**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta

**LOAD FILES**

In [2]:
# Load the uploaded CSV files
brln_items_file_path = 'BRLN_rd5500.csv'
brln_sales_file_path = 'BRLN_rd5000.csv'

# Read the files into DataFrames
df_barlin_items = pd.read_csv(brln_items_file_path)
df_barlin_sales = pd.read_csv(brln_sales_file_path)

**EDA FOR RD5500 (STORE ITEMS)**

In [3]:
print("Before data cleansing:")
print(df_barlin_items.count())

Before data cleansing:
INCODE      4374
ITE_DESC    4374
DEP_CODE    4374
UNT_PRIC    4374
UNIT           0
dtype: int64


In [4]:
df_barlin_items.head()

Unnamed: 0,INCODE,ITE_DESC,DEP_CODE,UNT_PRIC,UNIT
0,PP9,PAN BIGGS SPAGHETTI,26,1199.0,
1,CHM13,CHICKEN IN A BOX (6),1,669.0,
2,SLD7,NENA'S HARVEST,9,169.0,
3,SAL3,ROASTED SESAME DRESSING,46,0.0,
4,BL1,PINEAPPLE JUICE,42,0.01,


In [5]:
print(df_barlin_items.dtypes)

INCODE       object
ITE_DESC     object
DEP_CODE      int64
UNT_PRIC    float64
UNIT        float64
dtype: object


**CLEAN RD5500 DATA**

In [6]:
# Drop all columns except 'INCODE' and 'ITE_DESC'
df_barlin_items = df_barlin_items[['INCODE', 'ITE_DESC']]

# Get all unique pairs of 'INCODE' and 'ITE_DESC'
df_barlin_filtered_items = df_barlin_items.drop_duplicates()

In [7]:
print("After data cleansing:")
print(df_barlin_filtered_items.count())

After data cleansing:
INCODE      1099
ITE_DESC    1099
dtype: int64


In [8]:
pip install xlwt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
output_file = 'store_items.csv'
df_barlin_filtered_items.to_csv(output_file, index=False)

**MERGE SIMILAR STORE ITEMS BASED ON ITEM DESCRIPTION**

In [10]:
# Normalize the ITE_DESC column to make it case-insensitive and group similar items
df_barlin_filtered_items.loc[:, 'Normalized_ITE_DESC'] = df_barlin_filtered_items['ITE_DESC'].str.strip().str.lower()

# Group by the normalized item description and aggregate INCODE into a list
grouped_df = df_barlin_filtered_items.groupby('Normalized_ITE_DESC').agg({
    'INCODE': list,
    'ITE_DESC': 'first'  # Take the first occurrence of the original case item description
}).reset_index()

# Rename columns for clarity
grouped_df = grouped_df.rename(columns={'INCODE': 'INCODE_LIST', 'ITE_DESC': 'ITEM_DESC'})

# Drop the normalized column as it is no longer needed
grouped_df = grouped_df[['INCODE_LIST', 'ITEM_DESC']]
grouped_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_barlin_filtered_items.loc[:, 'Normalized_ITE_DESC'] = df_barlin_filtered_items['ITE_DESC'].str.strip().str.lower()


Unnamed: 0,INCODE_LIST,ITEM_DESC
0,[BULK123],1 2 TIPS
1,[BO41],1 CRISPY
2,[BULK130],1 CRISPY MISCUT
3,[TEST10],1 PC CHICKEN CAJUN W TROPICAL SLAW
4,[BO43],1BBQ SINANTOLAN RICE
5,[BL86],1CAJCHX SPG PBRD CNC IT
6,[BULK114],1CAJUN MINIBBQ TSMAC
7,[BO71],1CHX CB SPG SOFTD WATER
8,[BULK116],1CHX SPAG SODA WATER
9,"[CHM6, DEL8, REPCHM6, XTR2, BDS6, REPXTR2, DEL...",1PC BIGG CAJUN CHICKEN


In [11]:
print("After incode merge:")
print(grouped_df.count())

After incode merge:
INCODE_LIST    473
ITEM_DESC      473
dtype: int64


In [12]:
output_file = 'barlin_grouped_items.csv'
grouped_df.to_csv(output_file, index=False)

**EDA FOR RD5000 DATA**

In [13]:
df_barlin_sales.head()

Unnamed: 0,BRANCH,POS,TRANSDATE,ITE_CODE,QUANTITY,DEP_CODE,DATE,TIME,TYPE,DELIVERY
0,BRLN,1,2023-12-31,PP9,1,26,2023-12-31,21:10,T,0
1,BRLN,1,2023-12-31,CHM13,1,1,2023-12-31,21:04,T,0
2,BRLN,1,2023-12-31,SLD7,1,9,2023-12-31,20:52,T,0
3,BRLN,1,2023-12-31,SAL3,1,46,2023-12-31,20:52,T,0
4,BRLN,1,2023-12-31,BL1,1,42,2023-12-31,20:33,D,0


In [14]:
print(df_barlin_sales.dtypes)

BRANCH       object
POS           int64
TRANSDATE    object
ITE_CODE     object
QUANTITY      int64
DEP_CODE      int64
DATE         object
TIME         object
TYPE         object
DELIVERY      int64
dtype: object
