In [1]:
import pandas as pd

In [2]:
# Load the raw Sale Report export.
# low_memory=False makes pandas parse the file in one pass, so each column's dtype
# is inferred from all of its values instead of chunk by chunk.
df_sr = pd.read_csv(
    filepath_or_buffer='../data/raw_csv_files/Sale Report.csv',
    low_memory=False,
)

In [3]:
# Preview the first five raw rows
df_sr.head(n=5)

Unnamed: 0,index,SKU Code,Design No.,Stock,Category,Size,Color
0,0,AN201-RED-L,AN201,5.0,AN : LEGGINGS,L,Red
1,1,AN201-RED-M,AN201,5.0,AN : LEGGINGS,M,Red
2,2,AN201-RED-S,AN201,3.0,AN : LEGGINGS,S,Red
3,3,AN201-RED-XL,AN201,6.0,AN : LEGGINGS,XL,Red
4,4,AN201-RED-XXL,AN201,3.0,AN : LEGGINGS,XXL,Red


In [4]:
# Normalise the column headers to snake_case, one step at a time
formatted_columns = df_sr.columns.str.strip()        # trim leading/trailing whitespace
formatted_columns = formatted_columns.str.lower()    # lowercase everything
# Drop every character that is neither a word character nor whitespace (e.g. '-' or '.')
formatted_columns = formatted_columns.str.replace(r'[^\w\s]', '', regex=True)
# Collapse each run of whitespace into a single underscore
df_sr.columns = formatted_columns.str.replace(r'\s+', '_', regex=True)

In [5]:
# Count the distinct non-null values in each column
df_sr.nunique(axis=0, dropna=True)

index        9271
sku_code     9170
design_no    1594
stock         295
category       21
size           11
color          62
dtype: int64

In [6]:
# Drop the redundant 'index' column carried over from the CSV.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError on the missing column.
df_sr = df_sr.drop(columns=['index'], errors='ignore')

In [7]:
# Print column dtypes and non-null counts to spot missing values and wrong types
df_sr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9271 entries, 0 to 9270
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sku_code   9188 non-null   object 
 1   design_no  9235 non-null   object 
 2   stock      9235 non-null   float64
 3   category   9226 non-null   object 
 4   size       9235 non-null   object 
 5   color      9226 non-null   object 
dtypes: float64(1), object(5)
memory usage: 434.7+ KB


In [8]:
# Convert 'stock' to an integer column: missing values become 0 first,
# because NaN cannot be stored in a plain int column
stock_filled = df_sr['stock'].fillna(0)
df_sr['stock'] = stock_filled.astype(int)

# Confirm the resulting dtype
df_sr['stock'].dtype

dtype('int64')

In [9]:
# Preview the frame after the column clean-up and the stock conversion
df_sr.head(n=5)

Unnamed: 0,sku_code,design_no,stock,category,size,color
0,AN201-RED-L,AN201,5,AN : LEGGINGS,L,Red
1,AN201-RED-M,AN201,5,AN : LEGGINGS,M,Red
2,AN201-RED-S,AN201,3,AN : LEGGINGS,S,Red
3,AN201-RED-XL,AN201,6,AN : LEGGINGS,XL,Red
4,AN201-RED-XXL,AN201,3,AN : LEGGINGS,XXL,Red


In [10]:
# Use the shorter column name 'sku' so it matches the other dataframes
df_sr = df_sr.rename(columns={'sku_code': 'sku'})

In [11]:
# List the distinct category labels in order of first appearance
df_sr['category'].drop_duplicates().tolist()

['AN : LEGGINGS',
 'BLOUSE',
 'PANT',
 'BOTTOM',
 'PALAZZO',
 'SHARARA',
 'SKIRT',
 'DRESS',
 'KURTA SET',
 'LEHENGA CHOLI',
 'SET',
 'TOP',
 'KURTA',
 nan,
 'CROP TOP',
 'TUNIC',
 'CARDIGAN',
 'JUMPSUIT',
 'CROP TOP WITH PLAZZO',
 'SAREE',
 'KURTI',
 'NIGHT WEAR']

In [12]:
# Remove a leading 'AN :' prefix (with any spacing around the colon) from category
# labels, then trim whatever whitespace is left at either end
category_no_prefix = df_sr['category'].str.replace(r'^AN\s*:\s*', '', regex=True)
df_sr['category'] = category_no_prefix.str.strip()

In [13]:
# Title-case the category labels so they line up with the other dataframes
df_sr = df_sr.assign(category=df_sr['category'].str.title())

In [14]:
# Count the fully duplicated rows (every repeat after the first occurrence)
print(len(df_sr) - len(df_sr.drop_duplicates()))

38


In [15]:
# Inspect every row that belongs to a duplicate group.
# keep=False flags all members of a group, not only the repeats, so a group
# may show up as two rows or more.
dupes = df_sr.loc[df_sr.duplicated(keep=False)]
display(dupes)

Unnamed: 0,sku,design_no,stock,category,size,color
83,#REF!,BL021,0,Blouse,FREE,Black
84,#REF!,BL021,0,Blouse,FREE,Black
86,#REF!,BL021,0,Blouse,FREE,Black
88,#REF!,BL022,0,Blouse,FREE,Beige
89,#REF!,BL022,0,Blouse,FREE,Beige
9235,,,0,,,
9236,,,0,,,
9237,,,0,,,
9238,,,0,,,
9239,,,0,,,


In [16]:
# Remove the rows whose sku is the literal '#REF!', then the rows in which every
# descriptive column is NaN. Rows that only lack a sku but carry other data are kept,
# because how='all' requires the whole subset to be missing.
df_sr = (
    df_sr.loc[df_sr['sku'].ne('#REF!')]
    .dropna(subset=['sku', 'design_no', 'category', 'size', 'color'], how='all')
)

In [17]:
# Keep the first occurrence of each fully identical row and discard the repeats
df_sr = df_sr.drop_duplicates(keep='first')

In [18]:
# Sanity check: no '#REF!' skus should remain, so this is expected to be empty
df_sr.loc[df_sr['sku'].eq('#REF!')]

Unnamed: 0,sku,design_no,stock,category,size,color


In [19]:
# Re-count the duplicated rows after the clean-up (expected: 0)
print(df_sr.duplicated().to_numpy().sum())

0


In [20]:
# Count the missing values in each column
df_sr.isna().sum()

sku          47
design_no     0
stock         0
category      9
size          0
color         9
dtype: int64

In [21]:
# Show the rows that have no sku
df_sr.loc[df_sr['sku'].isna()]

Unnamed: 0,sku,design_no,stock,category,size,color
135,,BL086,1,Blouse,FREE,Blue
142,,BL087,0,Blouse,FREE,Multicolor
197,,BL097,1,Blouse,XXXL,Maroon
198,,BL098,2,Blouse,FREE,Brown
223,,BL102,1,Blouse,FREE,Navy Blue
248,,BL109,2,Blouse,FREE,Gold
273,,BL113,0,Blouse,XXXL,Maroon
546,,CH203,0,Leggings,FREE,Maroon
901,,J0055,0,,S,
902,,J0055,0,,XL,


In [22]:
# Rows whose sku is missing
df_sku_nulls = df_sr.loc[df_sr['sku'].isna()]

# design_no values that appear on at least one row with a known sku
designs_with_sku = df_sr.loc[df_sr['sku'].notna(), 'design_no'].unique()

# Candidates for imputation: missing-sku rows whose design has a known sku elsewhere
skus_to_fill = df_sku_nulls.loc[df_sku_nulls['design_no'].isin(designs_with_sku)]
skus_to_fill

Unnamed: 0,sku,design_no,stock,category,size,color
135,,BL086,1,Blouse,FREE,Blue
142,,BL087,0,Blouse,FREE,Multicolor
197,,BL097,1,Blouse,XXXL,Maroon
198,,BL098,2,Blouse,FREE,Brown
223,,BL102,1,Blouse,FREE,Navy Blue
248,,BL109,2,Blouse,FREE,Gold
273,,BL113,0,Blouse,XXXL,Maroon
546,,CH203,0,Leggings,FREE,Maroon
1103,,J0096,1,Kurta,4XL,Blue
1104,,J0096,0,Kurta,5XL,Blue


In [23]:
# Exclude the candidates whose size is 'FREE'.
# Per the original analysis, these designs do not use FREE as a size anywhere else
# in the frame, so there is no reliable size token to build a sku from.
skus_to_fill = skus_to_fill.loc[skus_to_fill['size'].ne('FREE')]
skus_to_fill

Unnamed: 0,sku,design_no,stock,category,size,color
197,,BL097,1,Blouse,XXXL,Maroon
273,,BL113,0,Blouse,XXXL,Maroon
1103,,J0096,1,Kurta,4XL,Blue
1104,,J0096,0,Kurta,5XL,Blue
1105,,J0096,0,Kurta,6XL,Blue
2245,,J0290,0,Cardigan,XXXL,Mustard
3306,,JNE1906,1,Kurta,5XL,Black
3451,,JNE2199,1,Kurta,6XL,Pink
3489,,JNE2270,0,Kurta,6XL,Beige
3546,,JNE3065,0,Kurta,XS,Turquoise Blue


In [24]:
# Impute the missing skus by reusing the prefix of a known sku of the same design

# Rows with a known sku; they provide the templates to copy from
known_skus = df_sr[df_sr['sku'].notnull()]

# Lookup: design_no -> first known sku of that design (used as the template)
design_to_example_sku = known_skus.groupby('design_no')['sku'].first().to_dict()

# Every sku that is already in use. A generated sku must not collide with one of
# these (the template may belong to a different colour of the same design) nor with
# a sku generated earlier in this loop; otherwise two distinct rows would share a key.
taken_skus = set(known_skus['sku'])

# Loop through the candidates and swap only the size part of the template sku
adjusted_rows = []
for _, row in skus_to_fill.iterrows():
    # Any known sku for this design_no
    example_sku = design_to_example_sku.get(row['design_no'])

    # Skip when there is no template, or when the template has no hyphen and the
    # size part therefore cannot be separated safely
    if not example_sku or '-' not in example_sku:
        continue

    # Replace the part after the last hyphen with this row's size
    base = example_sku.rsplit('-', 1)[0]
    new_sku = f"{base}-{row['size']}"

    # Leave the row un-imputed rather than create a duplicate sku
    if new_sku in taken_skus:
        continue
    taken_skus.add(new_sku)

    # Copy the row and fill in the sku
    new_row = row.copy()
    new_row['sku'] = new_sku
    adjusted_rows.append(new_row)

# Convert to a dataframe for inspection; passing the columns keeps the layout
# stable even when no row could be imputed
adjusted_df = pd.DataFrame(adjusted_rows, columns=skus_to_fill.columns)
adjusted_df

Unnamed: 0,sku,design_no,stock,category,size,color
197,BL097-XXXL,BL097,1,Blouse,XXXL,Maroon
273,BL113-XXXL,BL113,0,Blouse,XXXL,Maroon
1103,J0096-KR-4XL,J0096,1,Kurta,4XL,Blue
1104,J0096-KR-5XL,J0096,0,Kurta,5XL,Blue
1105,J0096-KR-6XL,J0096,0,Kurta,6XL,Blue
2245,J0290-CD-XXXL,J0290,0,Cardigan,XXXL,Mustard
3306,JNE1906-KR-031-5XL,JNE1906,1,Kurta,5XL,Black
3451,JNE2199-KR-411-6XL,JNE2199,1,Kurta,6XL,Pink
3489,JNE2270-KR-487-6XL,JNE2270,0,Kurta,6XL,Beige
3546,JNE3065-KR-XS,JNE3065,0,Kurta,XS,Turquoise Blue


In [25]:
# Write the imputed skus back into df_sr. Both frames share index labels, so
# update() aligns them row by row.
# Only the 'sku' column is passed: nothing else changed, and handing update() the
# whole frame makes it realign every column against NaN-padded data, which can
# upcast the integer 'stock' column to float on some pandas versions.
if 'sku' in adjusted_df.columns:  # adjusted_df may have no columns when nothing was imputed
    df_sr.update(adjusted_df[['sku']])

# Confirm the updated rows
df_sr.loc[adjusted_df.index, ['design_no', 'size', 'sku']]

Unnamed: 0,design_no,size,sku
197,BL097,XXXL,BL097-XXXL
273,BL113,XXXL,BL113-XXXL
1103,J0096,4XL,J0096-KR-4XL
1104,J0096,5XL,J0096-KR-5XL
1105,J0096,6XL,J0096-KR-6XL
2245,J0290,XXXL,J0290-CD-XXXL
3306,JNE1906,5XL,JNE1906-KR-031-5XL
3451,JNE2199,6XL,JNE2199-KR-411-6XL
3489,JNE2270,6XL,JNE2270-KR-487-6XL
3546,JNE3065,XS,JNE3065-KR-XS


In [26]:
# Check the remaining rows without a sku
df_sr.loc[df_sr['sku'].isna()]

Unnamed: 0,sku,design_no,stock,category,size,color
135,,BL086,1,Blouse,FREE,Blue
142,,BL087,0,Blouse,FREE,Multicolor
198,,BL098,2,Blouse,FREE,Brown
223,,BL102,1,Blouse,FREE,Navy Blue
248,,BL109,2,Blouse,FREE,Gold
546,,CH203,0,Leggings,FREE,Maroon
901,,J0055,0,,S,
902,,J0055,0,,XL,
903,,J0055,0,,XS,
904,,J0055,0,,XXL,


In [27]:
# Drop the rows whose sku is still missing after imputation (unrecoverable)
df_sr = df_sr.dropna(subset=['sku'])

In [28]:
# Confirm that no column has missing values any more
df_sr.isna().sum()

sku          0
design_no    0
stock        0
category     0
size         0
color        0
dtype: int64

In [29]:
# List the distinct size labels in order of first appearance
df_sr['size'].drop_duplicates().tolist()

['L', 'M', 'S', 'XL', 'XXL', 'FREE', 'XS', 'XXXL', '4XL', '5XL', '6XL']

In [30]:
# Relabel 'XXXL' as '3XL' for consistency with the other datasets;
# all other size labels are left as they are
df_sr['size'] = df_sr['size'].replace({'XXXL': '3XL'})

In [31]:
# Re-list the distinct size labels to confirm the relabelling
df_sr['size'].drop_duplicates().tolist()

['L', 'M', 'S', 'XL', 'XXL', 'FREE', 'XS', '3XL', '4XL', '5XL', '6XL']

In [32]:
# Preview the cleaned frame
df_sr.head(n=5)

Unnamed: 0,sku,design_no,stock,category,size,color
0,AN201-RED-L,AN201,5,Leggings,L,Red
1,AN201-RED-M,AN201,5,Leggings,M,Red
2,AN201-RED-S,AN201,3,Leggings,S,Red
3,AN201-RED-XL,AN201,6,Leggings,XL,Red
4,AN201-RED-XXL,AN201,3,Leggings,XXL,Red


In [33]:
# Confirm the final dimensions as (rows, columns)
(len(df_sr.index), len(df_sr.columns))

(9204, 6)

In [34]:
# !!! DO NOT RUN THIS UNLESS YOU ALSO RUN ITS UPDATED VERSION IN amazon_sale_report_dfs.ipynb !!!
# # Save the cleaned Sale Report to .csv
# df_sr.to_csv('../cleaned_data/sale_report.csv', index=False)