In [None]:
# Read the data from csv file
import os
import pandas as pd
# Read both inputs: the cleaned Sale Report (used later as the source of SKU
# codes and categories) and the raw international sales export.
sale_report_path = './cleaned_data/sale_report_clean.csv'
international_report_path = 'Amazon Data - Capstone Project/International sale Report.csv'
df_sale_report = pd.read_csv(sale_report_path)
df_international_sr = pd.read_csv(international_report_path)

In [None]:
# The raw export holds two tables stacked in one CSV: rows 0-19674 use one
# column layout, and from row 19675 onward the column order is different
# (row 19675 itself is the header of that second table).
SECOND_TABLE_START = 19675

# read_csv was called without index_col, so the frame has a default 0..n-1
# index and positional slicing selects exactly the rows on either side of
# the boundary. A single constant keeps both slices in sync.
df_int_first_half = df_international_sr.iloc[:SECOND_TABLE_START].reset_index(drop=True)
df_int_second_half = df_international_sr.iloc[SECOND_TABLE_START:].reset_index(drop=True)

In [None]:
# The second half still carries the 'index' column, and its real header sits
# in its first row (row 19675 of the raw file).
second_half_body = df_int_second_half.drop(columns=['index'])
header_row = second_half_body.iloc[0]
# Keep everything below the header row, renumber from 0, then use the
# promoted row as the column labels.
df_int_second_half = second_half_body.iloc[1:].reset_index(drop=True)
df_int_second_half.columns = header_row

In [None]:
# Per-column count of missing values, shown for both halves side by side
first_half_missing = df_int_first_half.isna().sum()
second_half_missing = df_int_second_half.isna().sum()
first_half_missing, second_half_missing

In [None]:
# Parse 'DATE' with the fixed format month-day-2-digit-year; any value that
# does not fit becomes NaT, which makes the bad rows easy to find.
parsed_dates = pd.to_datetime(df_int_first_half['DATE'], format='%m-%d-%y', errors='coerce')
df_int_first_half['date_dt'] = parsed_dates
# Rows whose 'DATE' could not be parsed
date_nat = df_int_first_half.loc[parsed_dates.isna()]

In [None]:
# Parse 'Months' with the fixed format abbreviated-month-2-digit-year; any
# value that does not fit becomes NaT.
parsed_months = pd.to_datetime(df_int_first_half['Months'], format='%b-%y', errors='coerce')
df_int_first_half['month_dt'] = parsed_months
# Rows whose 'Months' could not be parsed
month_nat = df_int_first_half.loc[parsed_months.isna()]

In [None]:
# How many values failed to parse in each of the two helper columns
df_int_first_half.loc[:, ['month_dt', 'date_dt']].isnull().sum()

In [None]:
# Claim to verify: the rows where DATE / Months failed to parse are the same
# rows, and they are exactly the rows where CUSTOMER is missing.
# The masks are compared over the full frame (same index on both sides), so
# the comparison is well defined even if the two sets of rows differ.
nat_date_mask = df_int_first_half['date_dt'].isna()
nat_month_mask = df_int_first_half['month_dt'].isna()
customer_nan_mask = df_int_first_half['CUSTOMER'].isna()

pd.Series({
    'rows with unparseable DATE': nat_date_mask.sum(),
    'rows with unparseable Months': nat_month_mask.sum(),
    'rows with missing CUSTOMER': customer_nan_mask.sum(),
    'DATE and Months fail on the same rows': nat_date_mask.equals(nat_month_mask),
    'those rows are exactly the missing-CUSTOMER rows': nat_date_mask.equals(customer_nan_mask),
})

In [None]:
# Keep only the rows of the first half that have a CUSTOMER value
has_customer = df_int_first_half['CUSTOMER'].notna()
df_int_first_half = df_int_first_half[has_customer].reset_index(drop=True)

In [None]:
# Missing values per column in the first half after dropping the CUSTOMER-less rows
df_int_first_half.isnull().sum()

In [None]:
# Normalise 'Style' and 'Size' in the first half to upper case
for text_col in ('Style', 'Size'):
    df_int_first_half[text_col] = df_int_first_half[text_col].str.upper()

In [None]:
# Build a (design_no, size) -> sku lookup from the Sale Report
style_size_to_sku = (
    df_sale_report[['design_no', 'size', 'sku']]
    .dropna()
    .set_index(['design_no', 'size'])['sku']
    .to_dict()
)

# Fill missing SKUs in df_int_first_half by looking up (Style, Size)
mask_no_sku = df_int_first_half['SKU'].isna()
# Guard the empty case: DataFrame.apply(axis=1) on a selection with no rows
# yields an empty DataFrame rather than a Series, which is not a valid value
# for a single-column assignment. With nothing to fill, do nothing.
if mask_no_sku.any():
    rows_to_fill = df_int_first_half.loc[mask_no_sku, ['Style', 'Size']]
    looked_up = [
        style_size_to_sku.get(key)  # None when the pair is unknown -> stays missing
        for key in zip(rows_to_fill['Style'], rows_to_fill['Size'])
    ]
    df_int_first_half.loc[mask_no_sku, 'SKU'] = pd.Series(
        looked_up, index=rows_to_fill.index, dtype=object
    )

In [None]:
# For every style that still has rows without a SKU, find the first Sale
# Report SKU that contains the style code as a substring.
nan_sku_styles = df_int_first_half.loc[df_int_first_half['SKU'].isna(), 'Style'].dropna().unique()

# The string view of the SKU column does not depend on the style, so build it once.
sale_report_skus = df_sale_report['sku'].astype(str)

style_to_first_matching_sku = {}
for style in nan_sku_styles:
    # regex=False: the style code is matched literally. Style values can
    # contain characters such as '(' , ')' or '.', which would otherwise be
    # interpreted as regular-expression syntax.
    is_match = sale_report_skus.str.contains(style, na=False, regex=False)
    matches = df_sale_report.loc[is_match, 'sku']
    style_to_first_matching_sku[style] = matches.iloc[0] if not matches.empty else None

style_to_first_matching_sku

In [None]:
# For rows in df_int_first_half that still have no SKU, derive one from the
# reference SKU found for the style: keep everything before the last '-' and
# append the row's own Size.
mask_remaining_no_sku = df_int_first_half['SKU'].isna()
for idx, row in df_int_first_half[mask_remaining_no_sku].iterrows():
    size = row['Size']
    sku = style_to_first_matching_sku.get(row['Style'])
    # Skip rows that cannot yield a meaningful SKU:
    #  - no Size (would otherwise produce a SKU ending in '-nan'),
    #  - no reference SKU for the style, or one that is not a 'base-size' string.
    if pd.isna(size) or not isinstance(sku, str) or '-' not in sku:
        continue
    base = sku.rsplit('-', 1)[0]
    df_int_first_half.at[idx, 'SKU'] = f"{base}-{size}"


In [None]:
# Rows of the second half without a SKU: per the author's inspection these
# carry no size, so they cannot be matched to a SKU code from df_sale_report.
second_half_no_sku = df_int_second_half['SKU'].isna()
df_int_second_half.loc[second_half_no_sku].head()

In [None]:
# Drop every remaining row of df_int_first_half that still has no SKU,
# then show the missing-value counts per column.
has_sku = df_int_first_half['SKU'].notna()
df_int_first_half = df_int_first_half[has_sku].reset_index(drop=True)
df_int_first_half.isna().sum()

In [None]:
# Build a (design_no, size) -> sku lookup from the Sale Report
style_size_to_sku = (
    df_sale_report[['design_no', 'size', 'sku']]
    .dropna()
    .set_index(['design_no', 'size'])['sku']
    .to_dict()
)

# Fill missing SKUs in df_int_first_half by looking up (Style, Size).
# NOTE(review): this cell runs right after rows without a SKU were dropped,
# so the mask is normally all False and the cell is a no-op.
mask_no_sku = df_int_first_half['SKU'].isna()
# Guard the empty case: DataFrame.apply(axis=1) on a selection with no rows
# yields an empty DataFrame rather than a Series, which is not a valid value
# for a single-column assignment.
if mask_no_sku.any():
    rows_to_fill = df_int_first_half.loc[mask_no_sku, ['Style', 'Size']]
    looked_up = [
        style_size_to_sku.get(key)  # None when the pair is unknown -> stays missing
        for key in zip(rows_to_fill['Style'], rows_to_fill['Size'])
    ]
    df_int_first_half.loc[mask_no_sku, 'SKU'] = pd.Series(
        looked_up, index=rows_to_fill.index, dtype=object
    )

In [None]:
# Derive 'Size' for the second half from the SKU: the text after the last '-'
sku_parts = df_int_second_half['SKU'].str.rsplit('-', n=1)
df_int_second_half['Size'] = sku_parts.str[-1]

In [None]:
# Keep only the rows of the second half that have a SKU
second_half_has_sku = df_int_second_half['SKU'].notna()
df_int_second_half = df_int_second_half[second_half_has_sku].reset_index(drop=True)

In [None]:
# Inspect the second half after its header row was promoted to column names,
# 'Size' was derived from the SKU and rows without a SKU were dropped.
df_int_second_half

In [None]:
# Compare the column sets of the two halves: shared columns and the ones
# that exist on only one side.
cols_first = set(df_int_first_half.columns)
cols_second = set(df_int_second_half.columns)

same_cols = cols_first.intersection(cols_second)
diff_first = cols_first.difference(cols_second)
diff_second = cols_second.difference(cols_first)

for label, names in (
    ("Same columns:", same_cols),
    ("Columns only in first half:", diff_first),
    ("Columns only in second half:", diff_second),
):
    print(label, names)

In [None]:
# Columns needed downstream, in the order they should appear in the result
common_cols = ['CUSTOMER', 'DATE', 'Style', 'SKU', 'Size', 'PCS', 'RATE', 'GROSS AMT']

# Project both halves onto that column list ...
df_first_common = df_int_first_half.loc[:, common_cols]
df_second_common = df_int_second_half.loc[:, common_cols]

# ... and stack them into one frame with a fresh 0..n-1 index
halves = [df_first_common, df_second_common]
df_int_clean = pd.concat(halves, ignore_index=True)
df_int_clean.tail()

In [None]:
# Missing values per column in the combined frame
df_int_clean.isnull().sum()

In [None]:
# Rows in which any cell is exactly the string 'SHIPPING'
has_shipping = df_int_clean.eq('SHIPPING').any(axis=1)
shipping_rows = df_int_clean[has_shipping]
print(shipping_rows)

In [None]:
# Do the rows with Style == 'SHIPPING' coincide with the rows with SKU == 'SHIPPING'?
shipping_style_idx = df_int_clean.index[df_int_clean['Style'] == 'SHIPPING']
shipping_sku_idx = df_int_clean.index[df_int_clean['SKU'] == 'SHIPPING']

same_shipping_rows = shipping_style_idx.equals(shipping_sku_idx)
same_shipping_rows


In [None]:
# Drop the rows that hold 'SHIPPING' instead of an actual style
is_shipping = df_int_clean['Style'].eq('SHIPPING')
df_int_clean = df_int_clean[~is_shipping].reset_index(drop=True)

In [None]:
# Lower-case all column names, then rename 'gross amt' to 'gross_amt'
df_int_clean = (
    df_int_clean
    .rename(columns=str.lower)
    .rename(columns={'gross amt': 'gross_amt'})
)

In [None]:
# Convert the 'date' column to datetime using the fixed format
# month-day-2-digit-year; values that do not match become NaT.
parsed_date = pd.to_datetime(df_int_clean['date'], format='%m-%d-%y', errors='coerce')
df_int_clean['date'] = parsed_date

In [None]:
# Look up a category for every style via the Sale Report:
# design_no -> category, built from the rows where both are present.
style_to_category = (
    df_sale_report
    .dropna(subset=['design_no', 'category'])
    .set_index('design_no')['category']
    .to_dict()
)

# Put the new 'category' column directly in front of 'style'
insert_at = df_int_clean.columns.get_loc('style')
category_values = df_int_clean['style'].map(style_to_category)
df_int_clean.insert(insert_at, 'category', category_values)


In [None]:
# Distinct styles for which no category was found
category_missing = df_int_clean['category'].isna()
df_int_clean.loc[category_missing, 'style'].unique()

In [None]:
# Rows without a category whose style contains 'CMB5'
no_category = df_int_clean['category'].isna()
style_has_cmb5 = df_int_clean['style'].str.contains('CMB5')
df_int_clean[no_category & style_has_cmb5]

In [None]:
# Rows still lacking a category whose SKU contains both 'SET' and 'KR' are labelled 'Kurta Set'
sku_text = df_int_clean['sku'].str
mask = df_int_clean['category'].isna() & sku_text.contains('SET') & sku_text.contains('KR')
df_int_clean.loc[mask, 'category'] = 'Kurta Set'

In [None]:
# Rows still lacking a category whose SKU contains 'KR' are labelled 'Kurta'
category_is_missing = df_int_clean['category'].isna()
sku_has_kr = df_int_clean['sku'].str.contains('KR')
mask_kurta = category_is_missing & sku_has_kr
df_int_clean.loc[mask_kurta, 'category'] = 'Kurta'

In [None]:
# Rows without a category whose SKU contains 'SAR086': label them 'Saree'
# and set their SKU to 'SAR086-FREE'.
mask_kurta = df_int_clean['category'].isna() & df_int_clean['sku'].str.contains('SAR086')
df_int_clean.loc[mask_kurta, ['category', 'sku']] = ['Saree', 'SAR086-FREE']

In [None]:
# Remove rows that have no category AND whose style is one of the entries
# listed below.
drop_styles = [
    'TAG PRINTING', 'TAGS(LABOUR)', 'TAGS', 'CMB5',
    'LABEL CHARGE', 'SHIPPING CHARGES', 'LABEL MANUF.CHRAGE', 'SAR086'
]
to_drop = df_int_clean['category'].isna() & df_int_clean['style'].isin(drop_styles)
df_int_clean = df_int_clean[~to_drop].reset_index(drop=True)

In [None]:
# Expand the row with size 'S TO XXL' into 5 rows with sizes S, M, L, XL, XXL
sizes = ['S', 'M', 'L', 'XL', 'XXL']

# Find all rows in df_int_first_half where Size is 'S TO XXL'
mask_multi_size = df_int_clean['size'] == 'S TO XXL'
rows_to_expand = df_int_clean[mask_multi_size]

expanded_rows = []
for _, row in rows_to_expand.iterrows():
    # Divide PCS and GROSS AMT by 5 for each size, keep RATE the same
    pcs = float(row['pcs']) / 5 if pd.notna(row['pcs']) else None
    rate = row['rate']
    gross_amt = float(row['gross_amt']) / 5 if pd.notna(row['gross_amt']) else None
    for size in sizes:
        new_row = row.copy()
        new_row['size'] = size
        # Change SKU to end with the current size
        if pd.notna(row['sku']) and '-' in str(row['sku']):
            base_sku = '-'.join(str(row['sku']).split('-')[:-1])
            new_row['sku'] = f"{base_sku}-{size}"
        new_row['pcs'] = f"{pcs:.2f}" if pcs is not None else None
        new_row['rate'] = rate
        new_row['gross_amt'] = f"{gross_amt:.2f}" if gross_amt is not None else None
        expanded_rows.append(new_row)
        new_row['GROSS AMT'] = f"{gross_amt:.2f}" if gross_amt is not None else None
        expanded_rows.append(new_row)

# Remove the original multi-size rows and append the expanded ones
df_int_clean = df_int_clean[~mask_multi_size].reset_index(drop=True)
df_int_clean = pd.concat([df_int_clean, pd.DataFrame(expanded_rows)], ignore_index=True)

In [None]:
# Check how the expansion worked
df_int_clean[df_int_clean['style'] == 'JAN8641']

In [None]:
# Drop fully duplicated rows in df_int_clean
df_int_clean = df_int_clean.drop_duplicates().reset_index(drop=True)
df_int_clean = df_int_clean.drop(columns=['GROSS AMT'])
# Check for missing values in the cleaned dataframe
df_int_clean.isna().sum()

In [None]:
# Print the distinct values of each relevant column, to eyeball that they
# are coherent with the main amazon_sales CSV.
columns_to_check = ['customer', 'style', 'sku', 'size', 'category','pcs', 'rate', 'gross_amt']
separator = '-' * 40
for col in columns_to_check:
    print(f"Unique values in '{col}':", df_int_clean[col].unique(), separator, sep='\n')

In [None]:
# Sizes that end with a dot.
# na=False: a missing size yields False instead of NaN, so the result is a
# pure boolean mask (a mask containing NaN cannot be used for row selection).
ends_with_dot = df_int_clean['size'].str.endswith('.', na=False)
sizes_with_dot = df_int_clean.loc[ends_with_dot, 'size'].unique()
#print("Sizes ending with '.':", sizes_with_dot)

# Remove the dot(s) at the end of 'size'
df_int_clean['size'] = df_int_clean['size'].str.rstrip('.')

In [None]:
# Use '3XL' instead of 'XXXL':
#  - in 'size', where the whole value is 'XXXL'
#  - in 'sku', where 'XXXL' is the suffix after the last '-'
size_normalised = df_int_clean['size'].replace({'XXXL': '3XL'})
sku_normalised = df_int_clean['sku'].str.replace(r'-(XXXL)$', '-3XL', regex=True)
df_int_clean['size'] = size_normalised
df_int_clean['sku'] = sku_normalised

In [None]:
# Keep only the rows whose size is one of the accepted size labels
allowed_sizes = ['XS', 'S', 'M', 'L', 'XL', 'XXL', '3XL', '4XL', '5XL', '6XL', 'FREE']
size_is_allowed = df_int_clean['size'].isin(allowed_sizes)
df_int_clean = df_int_clean[size_is_allowed]

In [None]:
# Rows whose 'size' is not in the allowed list
size_not_allowed = ~df_int_clean['size'].isin(allowed_sizes)
df_int_clean[size_not_allowed]

In [None]:
# Write customer names in title case (e.g., "Gulnara Mustafayeva")
customer_title_case = df_int_clean['customer'].str.title()
df_int_clean['customer'] = customer_title_case

In [None]:
# Remove the rows whose 'sku' is literally 'KURTI'
sku_is_not_kurti = df_int_clean['sku'].ne('KURTI')
df_int_clean = df_int_clean.loc[sku_is_not_kurti].reset_index(drop=True)

In [None]:
# Remove every row that has at least one missing value, then renumber the index
df_int_clean = df_int_clean.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
# Style 'SET350' is always categorised as 'Kurta Set'
is_set350 = df_int_clean['style'] == 'SET350'
df_int_clean.loc[is_set350, 'category'] = 'Kurta Set'

In [None]:
# Styles that occur both in the Sale Report and in the cleaned international data
styles_amazon = set(df_sale_report['design_no'].unique())
styles_international = set(df_int_clean['style'].unique())
common_styles = styles_amazon.intersection(styles_international)

# style -> category as recorded in the Sale Report
amazon_style_to_category = (
    df_sale_report
    .dropna(subset=['design_no', 'category'])
    .set_index('design_no')['category']
    .to_dict()
)

# style -> category as recorded in the international data
international_style_to_category = (
    df_int_clean
    .dropna(subset=['style', 'category'])
    .set_index('style')['category']
    .to_dict()
)

# A mismatch is a shared style for which both sides have a (truthy) category
# and the two categories differ.
mismatches = []
for style in common_styles:
    cat_amazon = amazon_style_to_category.get(style)
    cat_international = international_style_to_category.get(style)
    if not (cat_amazon and cat_international):
        continue
    if cat_amazon == cat_international:
        continue
    mismatches.append(
        dict(
            style=style,
            category_amazon=cat_amazon,
            category_international=cat_international,
        )
    )

# One row per mismatching style
df_category_mismatches = pd.DataFrame(mismatches)
df_category_mismatches

In [None]:
# TODO: collect the styles whose category differs between the international and Amazon sales into a table with the columns: style, category_amazon, category_international


In [None]:
# Upload to the database schema: import the helpers used by the cells below
# and read the connection settings from the local .env file.
from dotenv import dotenv_values
from sqlalchemy import create_engine, types
from sqlalchemy import text

# Settings found in ./.env; the individual values are read with .get(...) in the next cell
my_details = dotenv_values('./.env')

In [None]:
# Pull the PostgreSQL connection settings out of the .env values;
# a key that is absent yields None.
pg_user, pg_host, pg_port, pg_db, pg_schema, pg_pass = (
    my_details.get(setting)
    for setting in ('pg_user', 'pg_host', 'pg_port', 'pg_db', 'pg_schema', 'pg_pass')
)

In [None]:
# Build the connection URL and create the SQLAlchemy engine.
# User name and password are percent-encoded first: characters such as '@',
# ':' or '/' inside a credential would otherwise be parsed as URL delimiters
# and produce a wrong host / user / database.
from urllib.parse import quote_plus

url = (
    f'postgresql://{quote_plus(str(pg_user))}:{quote_plus(str(pg_pass))}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)
engine = create_engine(url, echo=False)

In [None]:
# Write the cleaned frame to the cleaned-data CSV, without the index column
csv_path = './cleaned_data/international_sales_report_clean.csv'
df_int_clean.to_csv(path_or_buf=csv_path, index=False)

In [None]:
# Load the cleaned CSV into a DataFrame.
# A CSV stores dates as plain text, so 'date' is parsed back to datetime on
# read; otherwise the column would be uploaded as a text column.
#csv_path = './cleaned_data/international_sales_report_clean.csv'
df_upload = pd.read_csv(csv_path, parse_dates=['date'])

# Upload to the PostgreSQL schema; an existing table of that name is replaced
table_name = 'international_clean'
df_upload.to_sql(table_name, engine, schema=pg_schema, if_exists='replace', index=False)