# Monthly Performance (Daily)

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
import glob
import re

import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2.service_account import Credentials

import matplotlib.pyplot as plt

In [2]:
def extract_date_from_filename(filename):
    """
    Parse the report start date out of a campaign-list filename.

    The basename is expected to look like
    SA_Campaign_List_YYYYMMDD_YYYYMMDD_hash.csv. The first eight-digit group
    is returned as a pandas Timestamp; None is returned when the basename
    does not follow that shape.
    """
    base_name = os.path.basename(filename)
    found = re.search(r'SA_Campaign_List_(\d{8})_\d{8}_.*\.csv', base_name)
    if found is None:
        return None
    return pd.to_datetime(found.group(1), format='%Y%m%d')

def clean_currency_column(column):
    """
    Strip currency formatting from a text column and convert it to numbers.

    Dollar signs and thousands separators (commas) are removed, then the
    values are parsed with ``pd.to_numeric``. Anything that cannot be parsed
    (empty strings, 'nan', stray text) becomes NaN.

    Args:
        column: pandas Series to clean.

    Returns:
        A numeric Series when ``column`` holds text (object or pandas string
        dtype); otherwise ``column`` itself, unchanged.
    """
    # Detect text through the dtype helpers instead of ``dtype == 'object'``:
    # pandas' dedicated string dtypes do not compare equal to 'object', so
    # such columns would otherwise be returned uncleaned.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_text:
        return column

    cleaned = column.astype(str).str.replace(r'[$,]', '', regex=True)
    # errors='coerce' already maps '', 'nan' and other unparseable text to NaN
    return pd.to_numeric(cleaned, errors='coerce')

def convert_to_float(column):
    """
    Convert a text column of numbers / percentages to numeric values.

    Percent signs and thousands separators (commas) are removed before
    parsing. The percent sign is only stripped: '12.5%' becomes 12.5, it is
    not divided by 100. Placeholders such as '', '--' or 'N/A' and any other
    unparseable text become NaN.

    Args:
        column: pandas Series to convert.

    Returns:
        A numeric Series when ``column`` holds text (object or pandas string
        dtype); otherwise ``column`` itself, unchanged.
    """
    # Detect text through the dtype helpers instead of ``dtype == 'object'``:
    # pandas' dedicated string dtypes do not compare equal to 'object', so
    # such columns would otherwise be returned unconverted.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_text:
        return column

    cleaned = column.astype(str).str.replace(r'[%,]', '', regex=True)
    # errors='coerce' turns '', 'nan', '--', 'N/A' and similar into NaN
    return pd.to_numeric(cleaned, errors='coerce')

def convert_to_int(column):
    """
    Convert a text column of whole numbers to pandas' nullable integer type.

    Thousands separators (commas) are removed before parsing. Placeholders
    such as '', '--' or 'N/A' and any other unparseable text become <NA>.

    Args:
        column: pandas Series to convert.

    Returns:
        An ``Int64`` Series when ``column`` holds text (object or pandas
        string dtype); otherwise ``column`` itself, unchanged.

    Raises:
        TypeError: if a parsed value has a fractional part (e.g. '1.5'),
            because the cast to ``Int64`` refuses to truncate.
    """
    # Detect text through the dtype helpers instead of ``dtype == 'object'``:
    # pandas' dedicated string dtypes do not compare equal to 'object', so
    # such columns would otherwise be returned unconverted.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_text:
        return column

    cleaned = column.astype(str).str.replace(r'[,]', '', regex=True)
    # Parse to a float-capable numeric first so missing values survive,
    # then cast to the nullable integer dtype.
    numeric = pd.to_numeric(cleaned, errors='coerce')
    return numeric.astype('Int64')

def extract_asin_from_portfolio(portfolio_str):
    """
    Extract a 10-character ASIN-like code from a Portfolio string.

    Candidates are tried in this order and the first hit is returned:
    1. A standalone token (not glued to other letters/digits) made of 'B'
       followed by 9 uppercase alphanumerics, e.g. B08XXXXXXX.
    2. 'B' followed by 9 uppercase alphanumerics anywhere in the string.
    3. An uppercase letter followed by 9 uppercase alphanumerics.
    4. Any 10 consecutive uppercase alphanumerics.
    5. Any 10 consecutive alphanumerics of either case (upper-cased).
    6. The first 10 alphanumerics left after removing every other character
       (upper-cased).
    7. The first 10 characters of the raw string (the whole string if it is
       shorter).

    Args:
        portfolio_str: Portfolio value; may be NaN/None or a non-string.

    Returns:
        The extracted code, or None for missing/empty input.
    """
    if pd.isna(portfolio_str) or portfolio_str == '':
        return None

    portfolio_str = str(portfolio_str)

    # Prefer a standalone token: without the boundary check a longer
    # uppercase word starting with 'B' (e.g. 'BLUETOOTHSPEAKER') would be
    # cut to its first 10 characters and win over a real ASIN later on.
    standalone = re.search(
        r'(?<![A-Za-z0-9])B[A-Z0-9]{9}(?![A-Za-z0-9])', portfolio_str
    )
    if standalone:
        return standalone.group()

    # Unanchored uppercase patterns, from most to least specific
    for pattern in (r'B[A-Z0-9]{9}', r'[A-Z][A-Z0-9]{9}', r'[A-Z0-9]{10}'):
        match = re.search(pattern, portfolio_str)
        if match:
            return match.group()

    # Mixed-case run of 10 alphanumerics, normalised to uppercase
    mixed_case = re.search(r'[A-Za-z0-9]{10}', portfolio_str)
    if mixed_case:
        return mixed_case.group().upper()

    # Fallback: squeeze out separators and take the first 10 alphanumerics
    clean_str = re.sub(r'[^A-Za-z0-9]', '', portfolio_str)
    if len(clean_str) >= 10:
        return clean_str[:10].upper()

    # Last resort: slicing already returns the whole string when it is short
    return portfolio_str[:10]

def normalize_campaign_types(text):
    """
    Shorten spelled-out sponsored-ad campaign types to two-letter codes.

    Occurrences of the Sponsored Brands / Display / Products spellings
    listed below are replaced by SB / SD / SP. Missing or empty input is
    returned untouched; any other input is converted to a string first.
    """
    if pd.isna(text) or text == '':
        return text

    # (spelling to look for, code to substitute), applied in this order
    replacements = (
        ('sponsoredBrands', 'SB'),
        ('sponsoredDisplay', 'SD'),
        ('sponsoredProducts', 'SP'),
        ('sponsoredbrands', 'SB'),
        ('sponsoreddisplay', 'SD'),
        ('sponsoredproducts', 'SP'),
        ('Sponsored Brands', 'SB'),
        ('Sponsored Display', 'SD'),
        ('Sponsored Products', 'SP'),
    )

    result = str(text)
    for long_form, short_code in replacements:
        result = result.replace(long_form, short_code)
    return result

def process_single_csv(file_path):
    """
    Load one campaign-list CSV and reshape it for the combined dataset.

    Steps: read the file, drop the Profile / Labels / Budget group columns,
    prepend an ASIN column derived from Portfolio (when present), insert a
    Date column taken from the filename, shorten campaign type names and
    convert the known currency, float and integer columns.

    Returns the processed DataFrame, or None if anything goes wrong (the
    error is printed).
    """
    try:
        frame = pd.read_csv(file_path, encoding='utf-8')
        report_date = extract_date_from_filename(file_path)

        # Discard bookkeeping columns that are not carried into the output
        unwanted = [
            name for name in ('Profile', 'Labels', 'Budget group')
            if name in frame.columns
        ]
        if unwanted:
            frame = frame.drop(columns=unwanted)

        # ASIN goes first, derived from the Portfolio text
        if 'Portfolio' in frame.columns:
            asin_values = frame['Portfolio'].apply(extract_asin_from_portfolio)
            frame.insert(0, 'ASIN', asin_values)

        # Date is always placed at position 1
        frame.insert(1, 'Date', report_date)

        if 'Campaign type' in frame.columns:
            frame['Campaign type'] = frame['Campaign type'].apply(normalize_campaign_types)

        # Each converter is applied to whichever of its columns exist
        conversions = (
            (clean_currency_column, ('Daily Budget', 'Current Budget')),
            (convert_to_float, ('Avg.time in Budget', 'Top-of-search IS', 'CPC', 'CVR', 'ACOS', 'ROAS')),
            (convert_to_int, ('Orders Other SKU', 'Units Other SKU')),
        )
        for converter, column_names in conversions:
            for name in column_names:
                if name in frame.columns:
                    frame[name] = converter(frame[name])

        print(f"Successfully processed: {os.path.basename(file_path)}")
        return frame

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def process_folder(folder_path):
    """
    Process every *.csv directly inside folder_path and stack the results.

    Files are handled in sorted name order; files that fail to process or
    yield no rows are skipped. Returns the concatenated DataFrame, or an
    empty DataFrame when the folder has no CSV files or none produced data.
    """
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

    if len(csv_files) == 0:
        print(f"No CSV files found in {folder_path}")
        return pd.DataFrame()

    print(f"Found {len(csv_files)} CSV files in {folder_path}")

    # Sorted so the row order of the result is reproducible
    frames = []
    for csv_path in sorted(csv_files):
        parsed = process_single_csv(csv_path)
        if parsed is None or parsed.empty:
            continue
        frames.append(parsed)

    if not frames:
        print(f"No valid data found in {folder_path}")
        return pd.DataFrame()

    combined_df = pd.concat(frames, ignore_index=True, sort=False)
    print(f"Combined {len(frames)} files from {folder_path}")
    return combined_df

In [3]:
def main(base_path="C:/Users/admin1/Desktop/Performance-Tracking",
         folder_names=("Ads M7", "Ads M8")):
    """
    Combine the daily campaign CSVs of the monthly ad folders into one dataset.

    Every folder ``<base_path>/Ads-XNurta/<folder_name>`` that exists is run
    through ``process_folder``. The combined rows are sorted, written to a
    timestamped ``Combined_Ads_Data_*.csv`` in the current working directory,
    summarised on stdout and returned. The pandas display options
    ``display.max_columns`` and ``display.width`` are set to None as a side
    effect of printing the sample.

    Args:
        base_path: Directory that contains the ``Ads-XNurta`` folder.
            Adjust this path as needed.
        folder_names: Names of the monthly sub-folders to process.

    Returns:
        The combined DataFrame, or an empty DataFrame when none of the
        folders exists or none of them yields data.
    """
    # Keep only the folders that actually exist, warning about the rest
    folders_to_process = []
    for folder_name in folder_names:
        folder_path = os.path.join(base_path, "Ads-XNurta", folder_name)
        if os.path.exists(folder_path):
            folders_to_process.append((folder_name, folder_path))
        else:
            print(f"Warning: {folder_path} not found")

    if not folders_to_process:
        print("No valid folders found. Please check your paths.")
        # Same "nothing to report" value as the no-data branch below, so
        # callers always receive a DataFrame.
        return pd.DataFrame()

    # Process each folder
    all_dataframes = []
    for folder_name, folder_path in folders_to_process:
        print(f"\n=== Processing {folder_name} ===")
        df = process_folder(folder_path)
        if not df.empty:
            all_dataframes.append(df)

    if not all_dataframes:
        print("No data to process.")
        return pd.DataFrame()

    # Combine all data from the processed folders
    final_df = pd.concat(all_dataframes, ignore_index=True, sort=False)

    # Sort by Date and ASIN for better organization; ASIN only exists when
    # the source files had a Portfolio column, so sort by what is present.
    sort_columns = [col for col in ('Date', 'ASIN') if col in final_df.columns]
    if sort_columns:
        final_df = final_df.sort_values(sort_columns, na_position='last')
    final_df = final_df.reset_index(drop=True)

    print("\n=== Final Results ===")
    print(f"Total rows: {len(final_df)}")
    print(f"Date range: {final_df['Date'].min()} to {final_df['Date'].max()}")
    print(f"Columns: {list(final_df.columns)}")

    # Save combined data
    output_filename = f"Combined_Ads_Data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    final_df.to_csv(output_filename, index=False)
    print(f"\nData saved to: {output_filename}")

    # Display sample data
    print("\nSample data (first 5 rows):")
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    print(final_df.head())

    return final_df

# Additional utility functions for ongoing updates
def update_with_new_file(existing_df, new_file_path):
    """
    Add the rows of one new campaign CSV to an existing combined DataFrame.

    Rows already present in ``existing_df`` for the report date(s) of the new
    file are replaced by the new file's rows, so loading the same day twice
    does not duplicate it. Rows are NOT de-duplicated on (Date, ASIN): the
    source is a campaign list, so several rows can legitimately share the
    same ASIN on the same day, and collapsing them would lose data.

    Args:
        existing_df: Previously combined data (may be an empty DataFrame).
        new_file_path: Path of the CSV to add.

    Returns:
        The updated DataFrame sorted by Date and ASIN (where present), or
        ``existing_df`` unchanged when the new file could not be processed.
    """
    new_df = process_single_csv(new_file_path)
    if new_df is None or new_df.empty:
        print(f"Failed to process new file: {new_file_path}")
        return existing_df

    # Drop the old rows of any date that the new file re-delivers
    kept_df = existing_df
    if 'Date' in existing_df.columns and 'Date' in new_df.columns:
        new_dates = pd.to_datetime(new_df['Date'], errors='coerce').dropna().unique()
        existing_dates = pd.to_datetime(existing_df['Date'], errors='coerce')
        kept_df = existing_df[~existing_dates.isin(new_dates)]

    updated_df = pd.concat([kept_df, new_df], ignore_index=True, sort=False)

    # ASIN is only present when the source had a Portfolio column
    sort_columns = [col for col in ('Date', 'ASIN') if col in updated_df.columns]
    if sort_columns:
        updated_df = updated_df.sort_values(sort_columns, na_position='last')
    updated_df = updated_df.reset_index(drop=True)

    print(f"Successfully added data from {os.path.basename(new_file_path)}")
    return updated_df

def daily_update(base_df_path, new_file_path):
    """
    Daily update: merge one new campaign CSV into the saved combined CSV.

    Args:
        base_df_path: Path of the combined CSV. It is read if it exists and
            rewritten with the updated data.
        new_file_path: Path of the new campaign CSV to add.

    Returns:
        The updated DataFrame. When there is nothing to save (no existing
        data and the new file could not be processed) the file on disk is
        left untouched and an empty DataFrame is returned.
    """
    # Load existing data
    if os.path.exists(base_df_path):
        try:
            existing_df = pd.read_csv(base_df_path)
        except pd.errors.EmptyDataError:
            # A file without any columns cannot be parsed; start fresh
            existing_df = pd.DataFrame()
        if 'Date' in existing_df.columns:
            existing_df['Date'] = pd.to_datetime(existing_df['Date'])
    else:
        existing_df = pd.DataFrame()

    # Add new file data
    updated_df = update_with_new_file(existing_df, new_file_path)

    # A column-less frame would be written as a file that read_csv cannot
    # load again on the next run, so skip the write in that case.
    if updated_df.shape[1] == 0:
        print(f"No data to save; {base_df_path} left unchanged")
        return updated_df

    # Save updated data
    updated_df.to_csv(base_df_path, index=False)
    print(f"Updated data saved to: {base_df_path}")

    return updated_df

if __name__ == "__main__":
    # Run the full folder processing. The combined DataFrame is kept in
    # result_df because the Google Sheets upload further down reads it.
    result_df = main()
    
    # Example of how to use daily update (combined CSV path, new report path):
    # daily_update("Combined_Ads_Data_20241201_120000.csv", "path/to/new/file.csv")


=== Processing Ads M7 ===
Found 31 CSV files in C:/Users/admin1/Desktop/Performance-Tracking\Ads-XNurta\Ads M7
Successfully processed: SA_Campaign_List_20250701_20250701_K2WWz0.csv
Successfully processed: SA_Campaign_List_20250702_20250702_Z3Xspn.csv
Successfully processed: SA_Campaign_List_20250703_20250703_3QMabM.csv
Successfully processed: SA_Campaign_List_20250704_20250704_VYujNl.csv
Successfully processed: SA_Campaign_List_20250705_20250705_xZBIxL.csv
Successfully processed: SA_Campaign_List_20250706_20250706_vxMjEt.csv
Successfully processed: SA_Campaign_List_20250707_20250707_HLFsmI.csv
Successfully processed: SA_Campaign_List_20250708_20250708_jykgJT.csv
Successfully processed: SA_Campaign_List_20250709_20250709_gyLSeV.csv
Successfully processed: SA_Campaign_List_20250710_20250710_EnigeZ.csv
Successfully processed: SA_Campaign_List_20250711_20250711_sVkMfE.csv
Successfully processed: SA_Campaign_List_20250712_20250712_tRJ1lL.csv
Successfully processed: SA_Campaign_List_2025071

In [4]:
# Authorise against the Google Sheets and Drive APIs with a service account
scopes = ["https://www.googleapis.com/auth/spreadsheets",
          "https://www.googleapis.com/auth/drive"]
creds = Credentials.from_service_account_file("c:/Users/admin1/Downloads/new_credential.json", scopes=scopes)
client = gspread.authorize(creds)

# Target Google Sheet
sheet_id = "1lZ4dsi94HaeWshsEizKTyNHeOOG0tpLJhzL9pMxvd6k"

# Open the spreadsheet once and look the worksheet up on that object
# instead of opening the same spreadsheet a second time.
spreadsheet = client.open_by_key(sheet_id)
sheet1 = spreadsheet.worksheet("Raw_XNurta_H2")

# main() may have produced nothing (missing folders / no data); only upload
# when there is a non-empty DataFrame to write.
if result_df is None or result_df.empty:
    print("No XNurta data to upload.")
else:
    set_with_dataframe(sheet1, result_df)

# SellerBoard

In [5]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
import glob
import re

import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2.service_account import Credentials

import matplotlib.pyplot as plt

In [6]:
import os
import re
import pandas as pd
from datetime import datetime

def extract_date_from_filename(filename):
    """Extract the first valid DD_MM_YYYY date from filename.

    Digit groups that have the DD_MM_YYYY shape but are not a real calendar
    date (e.g. 99_99_2025) are skipped instead of raising, so one oddly
    named file cannot abort a whole folder scan.

    Returns a ``datetime.date``, or None when no valid date is found.
    """
    for match in re.finditer(r"(\d{2}_\d{2}_\d{4})", filename):
        try:
            return datetime.strptime(match.group(1), "%d_%m_%Y").date()
        except ValueError:
            # Right shape, impossible date: try the next candidate
            continue
    return None

def process_single_excel(file_path):
    """Process a single Excel file and return DataFrame with Date column.

    Columns that are entirely empty are dropped and string headers are
    stripped of surrounding whitespace. When the filename contains a date,
    a Date column holding it is added (overwriting an existing Date column)
    and placed directly after ASIN when an ASIN column exists.

    Returns the processed DataFrame, or an empty DataFrame if anything goes
    wrong (the error is printed).
    """
    try:
        df = pd.read_excel(file_path)
        df = df.dropna(axis=1, how="all")
        # Headers can be non-strings (e.g. numbers); only strings are
        # stripped so such a file is not rejected as a whole.
        df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

        # Extract date from filename
        date_val = extract_date_from_filename(os.path.basename(file_path))
        if date_val:
            df["Date"] = pd.to_datetime(date_val)

        # Move Date column directly after ASIN. Date is removed before the
        # ASIN position is looked up, so the result is correct even when
        # Date originally sat to the left of ASIN.
        if "ASIN" in df.columns and "Date" in df.columns:
            cols = [c for c in df.columns if c != "Date"]
            cols.insert(cols.index("ASIN") + 1, "Date")
            df = df[cols]

        return df
    except Exception as e:
        print(f"⚠️ Error processing {file_path}: {e}")
        return pd.DataFrame()

def append_new_files(base_folder, master_csv_path="AggSB_master.csv"):
    """
    Process ALL subfolders inside Agg-SB
    Append new Excel files into one master DataFrame

    A file is skipped when the date in its name is already present in the
    master (including dates added earlier in the same run). Excel lock
    files (``~$...``) are ignored. Folders and files are visited in sorted
    order so repeated runs append rows in the same order.

    Args:
        base_folder: Root folder that is walked recursively for .xlsx files.
        master_csv_path: CSV holding the accumulated data. It is read if it
            exists and rewritten at the end.

    Returns:
        The updated master DataFrame. When it has no columns at all (no
        previous master and nothing new) the CSV is not written, because
        such a file could not be read back on the next run.
    """
    # Load existing master
    if os.path.exists(master_csv_path):
        try:
            master_df = pd.read_csv(master_csv_path)
        except pd.errors.EmptyDataError:
            # A file without any columns cannot be parsed; start fresh
            master_df = pd.DataFrame()
        if "Date" in master_df.columns:
            master_df["Date"] = pd.to_datetime(master_df["Date"], errors="coerce")
    else:
        master_df = pd.DataFrame()

    # Dates already covered, built once instead of once per file
    known_dates = set()
    if "Date" in master_df.columns:
        known_dates = set(master_df["Date"].dropna().dt.date)

    # Loop through subfolders, collecting new frames for a single concat
    new_frames = []
    for root, dirs, files in os.walk(base_folder):
        dirs.sort()  # in-place sort makes os.walk descend deterministically
        for file in sorted(files):
            if not file.lower().endswith(".xlsx") or file.startswith("~$"):
                continue

            file_path = os.path.join(root, file)
            file_date = extract_date_from_filename(file)

            # Skip if this date already exists
            if file_date and file_date in known_dates:
                print(f"⏭️ Skipping {file}, already in master")
                continue

            print(f"➕ Adding {file}")
            new_df = process_single_excel(file_path)
            if not new_df.empty:
                new_frames.append(new_df)
                if file_date:
                    known_dates.add(file_date)

    if new_frames:
        master_df = pd.concat([master_df, *new_frames], ignore_index=True, sort=False)

    if master_df.shape[1] == 0:
        print(f"⚠️ No data found; {master_csv_path} not written")
        return master_df

    # Save updated master
    master_df.to_csv(master_csv_path, index=False)
    print(f"✅ Master file updated: {master_csv_path}")
    return master_df

# Example usage: scan the Agg-SB folder tree and refresh AggSB_master.csv in
# the current working directory. updated_master is kept at module level
# because the Google Sheets upload further down reads it.
if __name__ == "__main__":
    agg_sb_folder = "C:/Users/admin1/Desktop/Performance-Tracking/Agg-SB"
    updated_master = append_new_files(agg_sb_folder, "AggSB_master.csv")


⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_01_07_2025-01_07_2025_(02_16_59_866).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_02_07_2025-02_07_2025_(02_17_19_401).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_03_07_2025-03_07_2025_(02_17_35_203).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_04_07_2025-04_07_2025_(02_17_52_472).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_05_07_2025-05_07_2025_(02_18_20_680).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_06_07_2025-06_07_2025_(02_18_35_117).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_07_07_2025-07_07_2025_(02_18_54_289).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_08_07_2025-08_07_2025_(02_19_15_601).xlsx, already in master
⏭️ Skipping NewEleven_Dashboard Products Group by ASIN_09_07_2025-09_07_

In [7]:
# Authorise against the Google Sheets and Drive APIs with a service account
scopes = ["https://www.googleapis.com/auth/spreadsheets",
          "https://www.googleapis.com/auth/drive"]
creds = Credentials.from_service_account_file("c:/Users/admin1/Downloads/new_credential.json", scopes=scopes)
client = gspread.authorize(creds)

# Target Google Sheet
sheet_id = "1lZ4dsi94HaeWshsEizKTyNHeOOG0tpLJhzL9pMxvd6k"

# Open the spreadsheet once and look the worksheet up on that object
# instead of opening the same spreadsheet a second time.
spreadsheet = client.open_by_key(sheet_id)
sheet1 = spreadsheet.worksheet("Raw_SellerBoard_H2")

# Only upload when the master actually contains data
if updated_master is None or updated_master.empty:
    print("No SellerBoard data to upload.")
else:
    set_with_dataframe(sheet1, updated_master)