In [2]:
import pandas as pd


In [3]:
a = pd.read_csv("International sale Report.csv")

In [4]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37432 entries, 0 to 37431
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               37432 non-null  int64 
 1   Sale_Date           37431 non-null  object
 2   Sale_Month          37407 non-null  object
 3   Customer_Name       36392 non-null  object
 4   Product_Style       36392 non-null  object
 5   Product_SKU         34958 non-null  object
 6   Product_Size        36392 non-null  object
 7   Quantity_Purchased  36392 non-null  object
 8   Price_per_Unit      36392 non-null  object
 9   Gross_Amount        36392 non-null  object
dtypes: int64(1), object(9)
memory usage: 2.9+ MB


In [5]:
# Get unique customer names
print("Unique Customer Names:")
print(a['Customer_Name'].unique())
print("\n")

# Get unique sale dates
print("Unique Sale Dates:")
print(a['Sale_Date'].unique())
print("\n")

# Get unique sale months
print("Unique Sale Months:")
print(a['Sale_Month'].unique())

Unique Customer Names:
['REVATHY LOGANATHAN' 'FARIA ESSOPP' 'MANGALAM SHOP'
 'THANA NAGISSWARY L MARIMUTHU' 'MR.ALWAR MURALI' 'RAZIA ROSEANE NASER'
 'SIRI PADALA' 'FUSION FASHIONS CORP.' 'MIZNA WAHEEDH'
 'AMANI CONCEPT TRADING LLC (KAPDA)' 'NITHARSHA' 'SYEDA MORSHED'
 'VINTAGE INDIA NYC' 'M/S CHARISMAKURTIES(DUBAI)' 'MULBERRIES BOUTIQUE'
 'COTTON CLOSET LTD' 'THANA MARIMUTHU' 'SHWETA' 'MONIYSHAA' 'AANCHOL'
 'KHUSBOO BEEHARRY' 'ARUNA DEVI' 'YAMUNAH PUSPANATHAN'
 'RISHIKESH DASHPUTRE' 'REGA' 'SURE FASHIONS LLC' 'MR ALWAR MURALI'
 'KOGILA SELLAPPAN' 'MONISYAA' 'VINODHA PUSPANATHAN' 'REKA RASAKODY'
 'ARKH FASHION' 'PUVANES GANNASIN' 'VINI’S AUTHENTIC INDIAN ETHNIC WEAR'
 'BINCY SKARIA' 'YASHWINI REDDY' 'BANUJA RASAKODY' 'VAHLAARMATHY'
 'AKASH KAUSHAL' 'KIRUTHIKA V SURESHBABU' 'RIVAAN LLC' 'SINDHU'
 'DHENOOGA RAVINTHERAN' 'VAHARSHA BOUTIQUE' 'MANISH DHOORUNDHUR'
 'RINO SANDARAN' 'VENDAN' 'VISHAL DARSHAN BOUTIQUE' 'KAVEENAZ COLLECTION'
 'ALAMELOO(MALA)' 'MURUGESAN CHANDRA' 'MEERA RASADURAI' 

In [6]:

# Remove rows where Product_SKU is empty/nan/blank
df = a.copy()
df = df.dropna(subset=['Product_SKU'])
df = df[df['Product_SKU'].str.strip() != '']

# Save cleaned dataframe
print(f"Rows before cleaning: {len(a)}")
df.to_csv("International_sale_Report_cleaned.csv", index=False)
print(f"Rows after cleaning: {len(df)}")

# Verify unique values in Product_SKU column
print("\nSample of unique Product_SKU values:")
print(df['Product_SKU'].unique()[:10])

Rows before cleaning: 37432
Rows after cleaning: 34958

Sample of unique Product_SKU values:
['MEN5004-KR-L' 'MEN5004-KR-XL' 'MEN5004-KR-XXL' 'MEN5009-KR-L'
 'MEN5011-KR-L' 'MEN5025-KR-L' 'MEN5015-KR-XL' 'MEN5022-KR-XXL'
 'MEN5014-KR-S' 'MEN5013-KR-S']


In [7]:
import numpy as np
import re


# Helper functions
def is_date(value):
    if pd.isna(value):
        return False
    return bool(re.match(r'^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}$', str(value).strip()))

def is_name(value):
    if pd.isna(value):
        return False
    return bool(re.match(r'^[A-Za-z\s\.\&\(\)]+$', str(value).strip()))

# Step 1: Fix swapped values
for i, row in df.iterrows():
    if is_name(row['Sale_Date']) and is_date(row['Customer_Name']):
        df.at[i, 'Sale_Date'], df.at[i, 'Customer_Name'] = row['Customer_Name'], row['Sale_Date']

# Step 2: Clean Sale_Date & convert to datetime
df['Sale_Date'] = pd.to_datetime(df['Sale_Date'], errors='coerce', dayfirst=True)

# Step 3: Remove rows without valid Sale_Date
df = df.dropna(subset=['Sale_Date'])

# Step 4: Create new Sale_Month
df['Sale_Month_Clean'] = df['Sale_Date'].dt.strftime('%y-%b')

# Step 5: Clean Customer_Name (remove months/dates)
df['Customer_Name'] = df['Customer_Name'].replace(
    r'^\d{1,2}[-/][A-Za-z]{3}|\d+$', np.nan, regex=True
)

# Step 6: Drop duplicates and reset index
df = df.reset_index(drop=True)

df['Sale_Date']=df['Sale_Date'].dt.strftime('%d-%m-%Y')
# Get unique customer names
print("Unique Customer Names:")
print(df['Customer_Name'].unique())
print("\n")

# Get unique sale dates
print("Unique Sale Dates:")
print(df['Sale_Date'].unique())
print("\n")

# Get unique sale months
print("Unique Sale Months:")
print(df['Sale_Month_Clean'].unique())


Unique Customer Names:
['REVATHY LOGANATHAN' 'FARIA ESSOPP' 'MANGALAM SHOP' 'MULBERRIES BOUTIQUE'
 'RAZIA ROSEANE NASER' 'COTTON CLOSET LTD' 'THANA MARIMUTHU' 'SHWETA'
 'SURE FASHIONS LLC' 'MR ALWAR MURALI' 'KOGILA SELLAPPAN' 'MONISYAA'
 'AMANI CONCEPT TRADING LLC (KAPDA)' 'VINODHA PUSPANATHAN' 'REKA RASAKODY'
 'BANUJA RASAKODY' 'PUVANES GANNASIN' 'VAHLAARMATHY' 'AKASH KAUSHAL'
 'KIRUTHIKA V SURESHBABU' 'VISHAL DARSHAN BOUTIQUE'
 'GALAXY GROUP OF COMPANIES PVT. LTD'
 'VINI’S AUTHENTIC INDIAN ETHNIC WEAR' 'NIRUSAH TAILORING' 'DEVY'
 'VAHARSHA BOUTIQUE ' 'KALAIVANI PERMALU' 'ARKH FASHION BOUTIQUE PTY LTD'
 'ALAMELOO(MALA)' 'RINO SANDARAN' 'DHENOOGA RAVINTHERAN' 'MEERA RASADURAI'
 'RANDHIR CHAUDHARY' 'ANJU BHARATI' 'VISHA DEVAN' 'KAZI NURAN HAQUE'
 'PUNITHAVATHY KARUPIAH' 'SUUMAYA CLOTHING' 'NALINI' 'MAAVI FASHION'
 'K.PARTHIBAN' 'KHIRTHIKA ' 'HARSINIY KUMARESON' 'SWATI(SINGAPORE)'
 'THILAS BOMBAY BOUTIQUE SDN BHD' 'MR.JEYARAJ' 'ABBI COLLECTION ' 'AVIN  '
 'SHILPI KUMARI' 'YASSHLINY KUNJU

In [8]:
# Assuming your original DataFrame is named df
columns_to_remove = [
    'index','Sale_Month'
]

# Assign df to b first, then drop columns
df = df.drop(columns=columns_to_remove)

In [9]:
df.info()
count_rows = len(df)    
print("Total Rows:", count_rows)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6978 entries, 0 to 6977
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Sale_Date           6978 non-null   object
 1   Customer_Name       6978 non-null   object
 2   Product_Style       6978 non-null   object
 3   Product_SKU         6978 non-null   object
 4   Product_Size        6978 non-null   object
 5   Quantity_Purchased  6978 non-null   object
 6   Price_per_Unit      6978 non-null   object
 7   Gross_Amount        6978 non-null   object
 8   Sale_Month_Clean    6978 non-null   object
dtypes: object(9)
memory usage: 490.8+ KB
Total Rows: 6978


In [10]:
df = df.rename(columns={
    'Product_SKU': 'sku',
    'Product_Style': 'style',
    'Product_size': 'size',
    'Quantity_purchased': 'quantity',
    'Sale_Date': 'date',
    'Sale_Month_Clean': 'month'
})

# Check result
print(df.head())

         date       Customer_Name    style             sku Product_Size  \
0  06-05-2021  REVATHY LOGANATHAN  MEN5004    MEN5004-KR-L            L   
1  06-05-2021  REVATHY LOGANATHAN  MEN5004   MEN5004-KR-XL           XL   
2  06-05-2021  REVATHY LOGANATHAN  MEN5004  MEN5004-KR-XXL          XXL   
3  06-05-2021  REVATHY LOGANATHAN  MEN5009    MEN5009-KR-L            L   
4  06-05-2021  REVATHY LOGANATHAN  MEN5011    MEN5011-KR-L            L   

  Quantity_Purchased Price_per_Unit Gross_Amount   month  
0                  1         616.56          617  21-May  
1                  1         616.56          617  21-May  
2                  1         616.56          617  21-May  
3                  1         616.56          617  21-May  
4                  1         616.56          617  21-May  


In [24]:
df.to_excel('new_international_sales_report.xlsx', index=False)

In [12]:
df.to_csv('new_international_sales_report.csv', index=False)