In [3]:
import os
import pandas as pd

# Locations of the raw CSV exports, given relative to the working directory.
DATA_FOLDER = "data"
CRM_PATH, WEB_TRACKING_PATH = (
    os.path.join(DATA_FOLDER, file_name)
    for file_name in ("crm_data.csv", "web_tracking.csv")
)

In [4]:
# Read both raw exports (CRM first, then web tracking) and expose them under the
# source names that the transformation code looks up: 'crm' and 'web_tracking'.
df_crm, df_web = (
    pd.read_csv(csv_path) for csv_path in (CRM_PATH, WEB_TRACKING_PATH)
)

data = dict(crm=df_crm, web_tracking=df_web)

In [6]:
# Code created together with ChatGPT.
#
# Builds a star schema from the two raw frames in `data`: one lookup
# ("dimension") table per descriptive column, a user dimension keyed by
# user_id, and a fact table of web-tracking rows expressed through the
# dimension ids.


def _make_dimension(frame, column, id_column=None):
    """Return the distinct values of ``column`` with a 1-based surrogate key.

    Args:
        frame: DataFrame that contains ``column``.
        column: Name of the column whose distinct values form the dimension.
        id_column: Name of the key column to add; defaults to ``f"{column}_id"``.

    Returns:
        A new DataFrame holding ``column`` (first occurrence of each value, in
        the order it appears in ``frame``) followed by ``id_column``.
    """
    id_column = id_column or f"{column}_id"
    dimension = frame[[column]].drop_duplicates().reset_index(drop=True)
    dimension[id_column] = dimension.index + 1  # ids start at 1, not 0
    return dimension


def _add_calendar_columns(dimension, datetime_column, prefix=""):
    """Add ``<prefix>year/month/week/weekday`` columns derived from a datetime column.

    ``week`` is the ISO calendar week; ``weekday`` is pandas' ``dt.weekday``
    (Monday == 0). The frame is modified in place and also returned.
    """
    stamps = dimension[datetime_column].dt
    dimension[f"{prefix}year"] = stamps.year
    dimension[f"{prefix}month"] = stamps.month
    dimension[f"{prefix}week"] = stamps.isocalendar().week
    dimension[f"{prefix}weekday"] = stamps.weekday
    return dimension


# Parse the date columns on copies (`assign` returns a new frame) so the raw
# frames stored in `data` are not modified and the cell can be re-run safely.
crm_df = data['crm'].assign(
    sign_up_datetime=lambda frame: pd.to_datetime(frame['sign_up_date'])
)
web_tracking_df = data['web_tracking'].assign(
    timestamp_datetime=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Dimensions sourced from the CRM export.
lead_status_dim = _make_dimension(crm_df, 'lead_status')
company_dim = _make_dimension(crm_df, 'company')
industry_dim = _make_dimension(crm_df, 'industry')
country_dim = _make_dimension(crm_df, 'country')

# Dimensions sourced from the web-tracking export.
page_dim = _make_dimension(web_tracking_df, 'page')
utm_source_dim = _make_dimension(web_tracking_df, 'utm_source')
utm_medium_dim = _make_dimension(web_tracking_df, 'utm_medium')
device_type_dim = _make_dimension(web_tracking_df, 'device_type')

# Time dimensions: distinct timestamps plus their calendar breakdown.
sign_up_time_dim = _add_calendar_columns(
    _make_dimension(crm_df, 'sign_up_datetime', 'sign_up_time_id'),
    'sign_up_datetime',
    prefix='sign_up_',
)
datetime_dim = _add_calendar_columns(
    _make_dimension(web_tracking_df, 'timestamp_datetime', 'datetime_id'),
    'timestamp_datetime',
)

# User dimension: swap each descriptive CRM column for its dimension id.
# `merge` defaults to an inner join; when a user_id occurs on several CRM rows
# only the first one is kept.
user_dim_columns = [
    "user_id", "lead_status_id", "industry_id",
    "company_id", "country_id", "sign_up_time_id",
]
crm_with_ids = (
    crm_df
    .merge(company_dim, on="company")
    .merge(industry_dim, on="industry")
    .merge(country_dim, on="country")
    .merge(sign_up_time_dim, on="sign_up_datetime")
    .merge(lead_status_dim, on="lead_status")
)
user_dim = crm_with_ids[user_dim_columns].drop_duplicates(subset='user_id')

# Fact table: one row per web-tracking record. The inner join against
# `user_dim` drops records whose user_id has no CRM entry.
fact_columns = [
    'session_id', 'user_id', 'datetime_id', 'page_id',
    'utm_source_id', 'utm_medium_id', 'device_type_id', 'time_spent_sec',
]
tracking_with_ids = (
    web_tracking_df
    .merge(user_dim, on="user_id")
    .merge(datetime_dim, on="timestamp_datetime")
    .merge(page_dim, on="page")
    .merge(utm_source_dim, on="utm_source")
    .merge(utm_medium_dim, on="utm_medium")
    .merge(device_type_dim, on="device_type")
)
fact_table = tracking_with_ids[fact_columns]

# `return` is only legal inside a function, so at cell level the result is
# bound to a name and left as the final expression for the notebook to display.
star_schema = {
    'user_dim': user_dim,
    'company_dim': company_dim,
    'country_dim': country_dim,
    'industry_dim': industry_dim,
    'lead_status_dim': lead_status_dim,
    'sign_up_time_dim': sign_up_time_dim,
    'page_dim': page_dim,
    'datetime_dim': datetime_dim,
    'utm_source_dim': utm_source_dim,
    'utm_medium_dim': utm_medium_dim,
    'device_type_dim': device_type_dim,
    'fact_table': fact_table,
}
star_schema

{'user_dim':     user_id  lead_status_id  industry_id  company_id  country_id  \
0       137               1            1           1           1   
1       163               1            2           2           2   
2       103               2            1           3           3   
4       171               3            1           5           4   
5       151               1            2           6           1   
6       141               2            4           7           4   
7       117               3            2           8           3   
8       114               3            2           9           2   
10      102               2            4          11           2   
11      154               3            2          12           1   
12      162               3            2          13           4   
13      147               2            2          14           2   
14      181               3            3          15           1   
15      192               1        

In [None]:
 # Star-schema transformation packaged as a function (intended for a Mage block).
def transform(data, *args, **kwargs):
    """Build star-schema dimension tables and a fact table from the raw frames.

    Args:
        data: Mapping with two DataFrames:
            ``'crm'`` with columns ``user_id``, ``lead_status``, ``company``,
            ``industry``, ``country`` and ``sign_up_date``;
            ``'web_tracking'`` with columns ``session_id``, ``user_id``,
            ``timestamp``, ``page``, ``utm_source``, ``utm_medium``,
            ``device_type`` and ``time_spent_sec``.
        *args, **kwargs: Accepted and ignored. Presumably Mage passes extra
            block arguments here -- TODO confirm against the Mage version in use.

    Returns:
        dict mapping table name to DataFrame: ``user_dim``, ``company_dim``,
        ``country_dim``, ``industry_dim``, ``lead_status_dim``,
        ``sign_up_time_dim``, ``page_dim``, ``datetime_dim``,
        ``utm_source_dim``, ``utm_medium_dim``, ``device_type_dim`` and
        ``fact_table``.

    Raises:
        KeyError: If ``data`` lacks one of the two frames or a frame lacks one
            of the columns listed above.

    Notes:
        The input frames are not modified. All joins are inner joins, so
        web-tracking rows whose ``user_id`` is absent from the CRM frame do not
        appear in ``fact_table``.
    """

    # Helpers are nested so the function stays self-contained when copied
    # out of the notebook.
    def build_dimension(frame, column, id_column=None):
        """Distinct values of ``column`` plus a 1-based id column."""
        id_column = id_column or f"{column}_id"
        dimension = frame[[column]].drop_duplicates().reset_index(drop=True)
        dimension[id_column] = dimension.index + 1  # ids start at 1, not 0
        return dimension

    def add_calendar_columns(dimension, datetime_column, prefix=""):
        """Add year / month / ISO week / weekday (Monday == 0) columns in place."""
        stamps = dimension[datetime_column].dt
        dimension[f"{prefix}year"] = stamps.year
        dimension[f"{prefix}month"] = stamps.month
        dimension[f"{prefix}week"] = stamps.isocalendar().week
        dimension[f"{prefix}weekday"] = stamps.weekday
        return dimension

    # `assign` returns new frames, so the caller's data is left untouched.
    crm_df = data['crm'].assign(
        sign_up_datetime=lambda frame: pd.to_datetime(frame['sign_up_date'])
    )
    web_tracking_df = data['web_tracking'].assign(
        timestamp_datetime=lambda frame: pd.to_datetime(frame['timestamp'])
    )

    # Create user sub-dimension tables
    lead_status_dim = build_dimension(crm_df, 'lead_status')
    company_dim = build_dimension(crm_df, 'company')
    industry_dim = build_dimension(crm_df, 'industry')
    country_dim = build_dimension(crm_df, 'country')

    # Sign-up time breakdown
    sign_up_time_dim = add_calendar_columns(
        build_dimension(crm_df, 'sign_up_datetime', 'sign_up_time_id'),
        'sign_up_datetime',
        prefix='sign_up_',
    )

    # User dimension table: descriptive columns replaced by dimension ids;
    # the first CRM row wins when a user_id appears more than once.
    user_dim_columns = [
        "user_id", "lead_status_id", "industry_id",
        "company_id", "country_id", "sign_up_time_id",
    ]
    crm_with_ids = (
        crm_df
        .merge(company_dim, on="company")
        .merge(industry_dim, on="industry")
        .merge(country_dim, on="country")
        .merge(sign_up_time_dim, on="sign_up_datetime")
        .merge(lead_status_dim, on="lead_status")
    )
    user_dim = crm_with_ids[user_dim_columns].drop_duplicates(subset='user_id')

    # Prepare session data dimensions
    datetime_dim = add_calendar_columns(
        build_dimension(web_tracking_df, 'timestamp_datetime', 'datetime_id'),
        'timestamp_datetime',
    )
    page_dim = build_dimension(web_tracking_df, 'page')
    utm_source_dim = build_dimension(web_tracking_df, 'utm_source')
    utm_medium_dim = build_dimension(web_tracking_df, 'utm_medium')
    device_type_dim = build_dimension(web_tracking_df, 'device_type')

    # Fact table: the join against user_dim keeps only rows of known users.
    fact_columns = [
        'session_id', 'user_id', 'datetime_id', 'page_id',
        'utm_source_id', 'utm_medium_id', 'device_type_id', 'time_spent_sec',
    ]
    tracking_with_ids = (
        web_tracking_df
        .merge(user_dim, on="user_id")
        .merge(datetime_dim, on="timestamp_datetime")
        .merge(page_dim, on="page")
        .merge(utm_source_dim, on="utm_source")
        .merge(utm_medium_dim, on="utm_medium")
        .merge(device_type_dim, on="device_type")
    )
    fact_table = tracking_with_ids[fact_columns]

    return {
        'user_dim': user_dim,
        'company_dim': company_dim,
        'country_dim': country_dim,
        'industry_dim': industry_dim,
        'lead_status_dim': lead_status_dim,
        'sign_up_time_dim': sign_up_time_dim,
        'page_dim': page_dim,
        'datetime_dim': datetime_dim,
        'utm_source_dim': utm_source_dim,
        'utm_medium_dim': utm_medium_dim,
        'device_type_dim': device_type_dim,
        'fact_table': fact_table,
    }
