In [0]:
# Declare a text widget named "params" whose second argument (the default) is an
# empty string; its value is read back and parsed as JSON further down.
dbutils.widgets.text("params","")

In [0]:
# Raw text currently held by the "params" widget (parsed with json.loads below).
params_str = dbutils.widgets.get("params")

In [0]:
import json
# The widget defaults to an empty string, which json.loads rejects with an
# opaque "Expecting value" error; fail early with an actionable message instead.
if not params_str.strip():
    raise ValueError(
        "The 'params' widget is empty; expected a JSON object providing "
        "'dataset', 'table_name', 'id_column' and 'datetime_columns'."
    )
params = json.loads(params_str)
# The parsed value is expanded with ** into the loader/exporter calls below,
# so it must be a JSON object rather than a list or scalar.
if not isinstance(params, dict):
    raise ValueError(f"'params' must be a JSON object, got {type(params).__name__}")

In [0]:
# Pipeline label; it is interpolated into the start banner here and into the
# completion banner printed by the last cell.
PIPELINE_NAME = "sku_master"
print(f"-----------Starting pipeline: {PIPELINE_NAME}-----------")

In [0]:
import sys
import os

# Absolute location of the mage-ai sources that must be importable.
# The previous version built this with os.path.join(notebook_dir, <absolute path>),
# but os.path.join discards every component before an absolute one, and
# notebook_dir itself was derived from the string literal '__file__' (i.e. the
# current working directory), so the path is now stated directly.
mage_ai_path = '/Volumes/dealshare_prod/default/mage-ai/mage-ai/mage-ai-master/'
print(f"Adding to sys.path: {mage_ai_path}")
if mage_ai_path not in sys.path:
    # Prepend so this copy of mage_ai is found before any other on sys.path.
    sys.path.insert(0, mage_ai_path)
print(f"Added to path: {mage_ai_path}")

# Import only after sys.path has been extended; fails here if the path is wrong.
import mage_ai
print(f"Successfully imported mage_ai from: {mage_ai.__file__}")

**Mage block:** bq_std_generic_last_modified
**Type:** data_loader

In [0]:
from mage_ai.settings.repo import get_repo_path
from mage_ai.io.bigquery import BigQuery
from mage_ai.io.config import ConfigFileLoader
from mage_ai.io.mysql import MySQL
from os import path
from datetime import datetime,timedelta
import pandas as pd
import time

if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader

@data_loader
def load_data_from_big_query(*args, **kwargs):
    """Return the watermark used to bound the incremental MySQL extract.

    Queries ``max(modified_date)`` from ``{dataset}.{table_name}`` in BigQuery
    and subtracts 30 seconds from it (presumably a safety overlap so rows
    written close to the previous cut-off are read again -- confirm).

    Keyword Args:
        dataset: BigQuery dataset that holds the target table.
        table_name: Name of the target table.

    Returns:
        str: The watermark formatted as ``%Y-%m-%d %H:%M:%S``. When the table
        has no ``modified_date`` value at all, the literal ``'2015-01-01'`` is
        returned; ``load_data_from_mysql`` compares against that exact string
        to skip its WHERE clause and perform a full load.
    """
    query = f"SELECT max(modified_date) as dt FROM {kwargs['dataset']}.{kwargs['table_name']}"
    # Used as-is: joining an absolute path onto get_repo_path() had no effect,
    # because os.path.join drops everything before an absolute component.
    config_path = '/Volumes/dealshare_prod/default/io_config/io_config.yaml'
    config_profile = 'default'
    latest = BigQuery.with_config(ConfigFileLoader(config_path, config_profile)).load(query)['dt'][0]
    if pd.isna(latest):
        # max() over an empty table is NULL (None/NaT); subtracting a timedelta
        # and calling strftime on it would raise, so fall back to a full load.
        return '2015-01-01'
    dt = latest - timedelta(seconds=30)
    return dt.strftime("%Y-%m-%d %H:%M:%S")

In [0]:
# Watermark string computed from the BigQuery target table; it is passed to the
# MySQL loader below to restrict the extract.
last_modified_date = load_data_from_big_query(**params)

**Mage block:** sku_master_loader
**Type:** data_loader

In [0]:
if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader

@data_loader
def load_data_from_mysql(last_modified_date, *args, **kwargs):
    """Load the flattened item / SKU / loose-SKU rows from MySQL.

    Args:
        last_modified_date: Watermark string interpolated into the SQL. Unless
            it equals exactly ``'2015-01-01'``, only items for which the item,
            one of its SKUs or one of its loose SKUs has a ``modified_date``
            later than this value are returned; otherwise no filter is applied.

    Returns:
        Whatever ``loader.load(query)`` returns for the generated query
        (the caller treats it as a pandas DataFrame).
    """
    # JSON path inside items.attributes -> column alias in the result set.
    attribute_aliases = {
        'name': 'name',
        'categoryL1Id': 'category_l1',
        'subCategoryL2Id': 'category_l2',
        'subCategoryL3Id': 'category_l3',
        'subCategoryL4Id': 'category_l4',
        'brandId': 'brand_id',
        'weight.uom': 'weight_uom',
        'weight.weight': 'weight',
        'pricing.gst': 'gst',
        'pricing.cgst': 'cgst',
        'pricing.sgst': 'sgst',
        'pricing.igst': 'igst'
    }
    json_selects = []
    for json_path, alias in attribute_aliases.items():
        json_selects.append(f"JSON_EXTRACT(i.attributes, '$.{json_path}') AS {alias}")
    extract_query = ',\n\t'.join(json_selects)

    # Incremental filter: keep items touched (directly or via their SKUs /
    # loose SKUs) after the watermark.
    condition = f"""
        WHERE i.item_id in (
            SELECT 
                i.item_id
            FROM items i
            LEFT OUTER JOIN skus s ON s.item_id = i.item_id
            LEFT JOIN loose_skus ls ON ls.sku_id = s.sku_id
            WHERE i.modified_date>'{last_modified_date}' 
            OR s.modified_date>'{last_modified_date}' 
            OR ls.modified_date>'{last_modified_date}'
            )
    """
    # The trailing placeholder drops the filter when the watermark is the
    # '2015-01-01' sentinel, which turns the run into a full load.
    query = f"""
        SELECT
            i.item_id,
            i.status AS item_status,
            {extract_query},
            s.sku_id,
            s.quantity AS sku_quantity,
            s.vehicle_type,
            s.attributes AS sku_attributes,
            s.status AS sku_status,
            s.channel,
            s.is_loosely_sellable,
            s.food_type,
            c.color as sku_color,
            f.value as sku_flavour,
            sz.size as sku_size,
            sz.uom as sku_size_uom,
            w.weight as sku_weight,
            w.uom as sku_weight_uom,
            ls.parent_sku_id,
            ls.conversion_factor,
            ls.status AS loose_skus_status,
            GREATEST(
                COALESCE(i.modified_date, '2015-01-01 00:00:00'),
                COALESCE(s.modified_date, '2015-01-01 00:00:00'),
                COALESCE(ls.modified_date, '2015-01-01 00:00:00')
            ) AS modified_date,
            JSON_VALUE(s.expiry_attributes, '$.shelfLife.value') AS ShelfLife,
            JSON_VALUE(s.expiry_attributes, '$.shelfLife.unit') AS ShelfLife_unit,
            JSON_VALUE(s.expiry_attributes, '$.sellableLife.value') AS SellableLife,
            JSON_VALUE(s.expiry_attributes, '$.sellableLife.unit') AS SellableLife_unit
        FROM
            items i
            LEFT OUTER JOIN skus s ON s.item_id = i.item_id
            LEFT JOIN loose_skus ls ON ls.sku_id = s.sku_id
            LEFT JOIN sku_color c ON c.sku_id = s.sku_id
            LEFT JOIN sku_flavor f ON f.sku_id = s.sku_id
            LEFT JOIN sku_size sz ON sz.sku_id = s.sku_id
            LEFT JOIN sku_weight w ON w.sku_id = s.sku_id
        {condition if last_modified_date!='2015-01-01' else ''}
    """
    # NOTE(review): this block resolves io_config.yaml under get_repo_path(),
    # whereas the BigQuery blocks read it from /Volumes/... -- confirm intended.
    mysql_config_file = path.join(get_repo_path(), 'io_config.yaml')
    mysql_profile = 'items'
    mysql_settings = ConfigFileLoader(mysql_config_file, mysql_profile)

    with MySQL.with_config(mysql_settings) as loader:
        result = loader.load(query)
        return result

In [0]:
# Run the MySQL extract bounded by the watermark computed above.
source_df = load_data_from_mysql(last_modified_date)

**Mage block:** sku_master_transform
**Type:** transformer

In [0]:
if 'transformer' not in globals():
    from mage_ai.data_preparation.decorators import transformer
import pandas as pd
import json

def extract_weights(sku_attributes):
    """Extract the SCM weight value and key from a SKU's attribute JSON.

    ``sku_attributes`` is expected to be a JSON array of objects. The first
    object whose ``attributeName`` is ``"SCM weight"`` holds, in its ``value``
    field, a second JSON-encoded array; the ``value`` of its ``"Weight Value"``
    and ``"Weight Key"`` entries is returned.

    Args:
        sku_attributes: JSON text, or a missing value (``None``, NaN, empty
            string, or the JSON literal ``null``).

    Returns:
        tuple: ``(weight_value, weight_key)``; each element is ``None`` when it
        cannot be found. ``(None, None)`` for missing input or when no
        "SCM weight" attribute exists.
    """
    # NULLs can surface as None or as float NaN once the data is in pandas;
    # blank strings carry no JSON either. None of them can be parsed.
    if sku_attributes is None:
        return None, None
    if isinstance(sku_attributes, float) and pd.isna(sku_attributes):
        return None, None
    if isinstance(sku_attributes, (str, bytes)) and not sku_attributes.strip():
        return None, None

    # `or []` covers a column that stores the JSON literal null.
    for attr in json.loads(sku_attributes) or []:
        if attr.get("attributeName") != "SCM weight":
            continue
        # Single pass over the nested list; setdefault keeps the first
        # occurrence of each name, matching a first-match lookup.
        first_by_name = {}
        for item in json.loads(attr["value"]) or []:
            first_by_name.setdefault(item.get("attributeName"), item.get("value"))
        return first_by_name.get("Weight Value"), first_by_name.get("Weight Key")
    return None, None

def combine_attributes(columns, group):
    """Return the selected ``columns`` of ``group`` as a list of per-row dicts."""
    selected = group.loc[:, columns]
    records = selected.to_dict(orient="records")
    return records

@transformer
def transform(df):
    """Nest the flat MySQL extract into item rows with embedded SKU details.

    Steps:
      1. Derive ``scm_weight_value`` / ``scm_weight_key`` from the
         ``sku_attributes`` JSON and drop the raw column.
      2. Coerce columns to int / float / str (missing ints become 0, missing
         strings become '').
      3. Give every row of an item that item's latest ``modified_date``.
      4. Fold the loose-SKU columns into a ``loose_skus_details`` list per
         ``sku_id``.
      5. Fold the SKU columns (including ``loose_skus_details``) into a
         ``skus_details`` list per ``item_id``.

    The input frame is not modified, so the calling cell can be re-run on the
    same ``source_df``.

    Args:
        df: DataFrame with the columns selected by the MySQL loader.

    Returns:
        DataFrame of de-duplicated item-level columns plus ``skus_details``.
    """
    weight_names = ['scm_weight_value', 'scm_weight_key']
    # Build both weight columns in one go; constructing a pd.Series per row
    # inside apply() is slow and breaks the two-column assignment on an empty frame.
    weights = pd.DataFrame(
        [extract_weights(raw) for raw in df['sku_attributes']],
        columns=weight_names,
        index=df.index,
    )
    # drop() returns a new frame, so the assignments below never touch the caller's data.
    df = df.drop(columns=['sku_attributes'])
    df[weight_names] = weights

    int_cols = ['category_l1', 'category_l2', 'category_l3', 'category_l4',
                'brand_id', 'sku_quantity', 'is_loosely_sellable',
                'conversion_factor', 'loose_skus_status', 'ShelfLife', 'SellableLife']
    float_cols = ['weight', 'gst', 'cgst', 'sgst', 'igst','scm_weight_value','sku_weight']
    str_cols = ['item_status', 'name', 'weight_uom','channel','food_type','sku_color',
                'sku_flavour','sku_size','sku_size_uom','sku_weight_uom', 'parent_sku_id',
                'scm_weight_key','sku_id','sku_status','vehicle_type','ShelfLife_unit','SellableLife_unit']

    for col in int_cols:
        df[col] = df[col].fillna(0).astype(int)
    for col in float_cols:
        df[col] = df[col].astype(float)
    for col in str_cols:
        # Strip double quotes (the JSON-extracted values presumably arrive quoted)
        # and represent missing values as ''.
        df[col] = df[col].str.replace('"', '').fillna('').astype(str)

    loose_skus_columns = ['conversion_factor', 'loose_skus_status', 'parent_sku_id']
    skus_columns = ['sku_id', 'sku_quantity', 'vehicle_type', 'sku_status',
                    'channel', 'is_loosely_sellable', 'scm_weight_value',
                    'scm_weight_key', 'food_type','sku_color','sku_flavour',
                    'sku_size','sku_size_uom','sku_weight','sku_weight_uom','loose_skus_details','ShelfLife','ShelfLife_unit',
                    'SellableLife','SellableLife_unit']

    # Replace each row's modified_date with the maximum over its item.
    max_modified_date = df.groupby('item_id')['modified_date'].max().reset_index()
    df = df.drop(columns=['modified_date'])
    df = pd.merge(df, max_modified_date, on='item_id', how='left')

    # One list of loose-SKU dicts per real sku_id ('' marks rows without a SKU).
    loose_skus_details = df[df['sku_id']!=''].groupby(['sku_id']).apply(
        lambda group: combine_attributes(loose_skus_columns, group)).reset_index(name='loose_skus_details')
    # Placeholder entry so rows with sku_id == '' still get a value from the merge below.
    loose_skus_details.loc[len(loose_skus_details)] = {
        'sku_id':'',
        'loose_skus_details': [{'conversion_factor': 0, 'loose_skus_status': 0, 'parent_sku_id': ''}]
    }
    df = df.drop(columns=loose_skus_columns).drop_duplicates(keep='first')
    df = df.merge(loose_skus_details, on='sku_id',how='left')
    df = df.reset_index(drop=True)

    # One list of SKU dicts per item, then collapse to the item-level columns.
    skus_details = df.groupby(['item_id']).apply(
        lambda group: combine_attributes(skus_columns, group)).reset_index(name='skus_details')
    df = df.drop(columns=skus_columns).drop_duplicates(keep='first')
    df = df.merge(skus_details, on='item_id',how='left')
    df = df.reset_index(drop=True)

    return df


In [0]:
# Nest the flat extract into item rows for export.
# NOTE(review): "tarnsformed_df" is a misspelling of "transformed_df"; the export
# cell uses the same spelling, so any rename has to change both places together.
tarnsformed_df = transform(source_df)

**Mage block:** bq_std_generic_exporter
**Type:** data_exporter

In [0]:
if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter


@data_exporter
def export_data_to_big_query(df: pd.DataFrame, **kwargs):
    """Upsert ``df`` into a BigQuery table through a staging table.

    The staging table ``temp_<table_name>`` is truncated, ``df`` is appended to
    it, and a transactional MERGE keyed on ``id_column`` then updates matching
    rows / inserts new ones in the main table before truncating the staging
    table again.

    Args:
        df: Frame to export; the columns named in ``datetime_columns`` are
            converted to datetimes on this same object.

    Keyword Args:
        dataset: BigQuery dataset of both tables.
        table_name: Main table name (staging table is ``temp_`` + this name).
        datetime_columns: Columns parsed with format ``%Y-%m-%d %H:%M:%S``.
        id_column: Column used as the MERGE key.
    """
    dataset = kwargs['dataset']
    table_name = kwargs['table_name']
    temp_table = f"dealshare-d82f7.{dataset}.temp_{table_name}"
    main_table = f"dealshare-d82f7.{dataset}.{table_name}"
    config_path = '/Volumes/dealshare_prod/default/io_config/io_config.yaml'
    config_profile = 'default'

    def _new_client():
        # A fresh client is built for every statement.
        return BigQuery.with_config(ConfigFileLoader(config_path, config_profile))

    for dt_col in kwargs['datetime_columns']:
        df[dt_col] = pd.to_datetime(df[dt_col], format='%Y-%m-%d %H:%M:%S')

    key_column = kwargs['id_column']
    insert_list = df.columns.tolist()
    update_list = [name for name in insert_list if name != key_column]
    update_string = ', '.join(f'target.{name}=source.{name}' for name in update_list)
    insert_target_string = ', '.join(insert_list)
    insert_source_string = ', '.join(f'source.{name}' for name in insert_list)

    trunc_query = f"TRUNCATE TABLE {temp_table}"
    merge_query = f"""
        BEGIN TRANSACTION;
        MERGE INTO {main_table} AS target USING {temp_table} AS source 
        ON target.{kwargs['id_column']} = source.{kwargs['id_column']} 
        WHEN MATCHED THEN UPDATE SET {update_string} 
        WHEN NOT MATCHED THEN INSERT ({insert_target_string}) VALUES ({insert_source_string});
        TRUNCATE TABLE {temp_table};
        COMMIT TRANSACTION;
    """

    # 1) empty the staging table, 2) load the frame into it, 3) merge into main.
    _new_client().execute(trunc_query)
    time.sleep(2)
    _new_client().export(df, temp_table, if_exists='append')
    time.sleep(2)
    _new_client().execute(merge_query)

In [0]:
# Upsert the transformed frame into BigQuery; dataset, table and key settings
# come from the parsed "params" widget.
export_data_to_big_query(tarnsformed_df, **params)

In [0]:
# Reached only when every cell above ran without raising.
print(f"-----------Pipeline {PIPELINE_NAME} completed successfully-----------")