# Import Libraries

In [8]:
import functools
import tracemalloc
import psutil
import os
from pathlib import Path
import time

import httpx
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Utilities

In [3]:
def profiler(func):
    """Decorator that reports memory usage and wall-clock time of a call.

    After each successful call the wrapper prints:

    * the process resident set size (RSS) before and after the call,
      as reported by ``psutil``;
    * the peak size of Python allocations traced by ``tracemalloc``;
    * the elapsed time of the call.

    The wrapped function's return value is passed through unchanged.

    If the wrapped function raises, the exception propagates and no report
    is printed, but tracing that this wrapper started is still stopped so it
    does not leak into later code.

    When ``tracemalloc`` is already tracing (for example, a profiled function
    calling another profiled function), the wrapper neither starts nor stops
    it; the reported peak then covers everything since the outermost tracing
    began rather than this call alone.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        process = psutil.Process(os.getpid())

        # Only the call that starts tracing is allowed to stop it; otherwise
        # an inner profiled call would wipe the outer call's measurements.
        owns_tracing = not tracemalloc.is_tracing()

        # Start memory + time tracking
        start_mem = process.memory_info().rss / 1024**2
        if owns_tracing:
            tracemalloc.start()
        # perf_counter is monotonic, so the duration cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        try:
            result = func(*args, **kwargs)  # run target function
        finally:
            # Collect measurements and release tracing even on failure.
            _, peak = tracemalloc.get_traced_memory()
            end_mem = process.memory_info().rss / 1024**2
            end_time = time.perf_counter()
            if owns_tracing:
                tracemalloc.stop()

        print(f"\n--- Memory Profile for `{func.__name__}` ---")
        print(f"Start memory   : {start_mem:.2f} MB")
        print(f"End memory     : {end_mem:.2f} MB")
        print(f"Peak (tracked) : {peak / 1024**2:.2f} MB")
        print(f"Execution time : {end_time - start_time:.2f} sec")
        print("------------------------------------------\n")

        return result
    return wrapper

In [16]:
# Constants

# Working directory of the kernel; all relative artefacts are stored under it.
ROOT_PATH = Path.cwd()
# Direct-download link of the CSV dataset hosted on Google Drive.
DATASET_URL = "https://drive.usercontent.google.com/download?id=1N1xoxgcw2K3d-49tlchXAWw4wuxLj7EV&export=download"
# Local file the downloaded dataset is written to and later read from.
DATASET_OUTPUT_PATH = ROOT_PATH.joinpath("dataset.csv")

In [19]:
# Utilities
@profiler
def download_data(url: str, output_path: Path) -> None:
    """Stream the resource at ``url`` to ``output_path``.

    The response body is written chunk by chunk to a sibling ``*.part`` file,
    which is moved onto ``output_path`` only after the whole body has been
    received. A failed or interrupted download therefore never leaves a
    truncated file at ``output_path``.

    Args:
        url: Address of the file to download. Redirects are followed.
        output_path: Destination file; overwritten if it already exists.

    Raises:
        httpx.HTTPStatusError: If the final response has an error status.
        httpx.HTTPError: On other transport-level failures raised by httpx.
        OSError: If the destination cannot be written.
    """
    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        # httpx does not follow redirects unless asked to, and
        # raise_for_status() rejects a 3xx response, so enable them explicitly.
        with httpx.stream("GET", url, follow_redirects=True) as response:
            response.raise_for_status()  # check for HTTP errors
            with open(partial_path, "wb") as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)

        # Publish the file only once the body is complete.
        partial_path.replace(output_path)
    finally:
        # Drop the leftover partial file when the download did not finish.
        if partial_path.exists():
            partial_path.unlink()

    print(f"Downloaded to {output_path}")

@profiler
def load_data(file_path: Path, **kwargs) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame.

    Any extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path, **kwargs)
    return frame

# EDA

## Data Loading

In [21]:
# Data Loading
# Fetch the CSV from DATASET_URL and store it at DATASET_OUTPUT_PATH.
# NOTE(review): this runs unconditionally, so every Restart & Run All
# downloads the dataset again.
download_data(DATASET_URL, DATASET_OUTPUT_PATH)

Downloaded to C:\Users\Dary\Documents\06 Projects\03 AI-ML-Data Science\Shopee AI Engineer Technical Test\src\Q1 CSV Parsing\dataset.csv

--- Memory Profile for `download_data` ---
Start memory   : 216.87 MB
End memory     : 217.22 MB
Peak (tracked) : 0.13 MB
Execution time : 7.73 sec
------------------------------------------



In [30]:
# Data Parsing
# index_col=0 makes the first CSV column (named "Index" in this dataset)
# the DataFrame index instead of a regular column.
df = load_data(DATASET_OUTPUT_PATH, index_col=0)


--- Memory Profile for `load_data` ---
Start memory   : 230.51 MB
End memory     : 339.88 MB
Peak (tracked) : 61.37 MB
Execution time : 1.20 sec
------------------------------------------



## Data Understanding

In [33]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0_level_0,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,ffeCAb7AbcB0f07,Jared,Jarvis,Sanchez-Fletcher,Hatfieldshire,Eritrea,274.188.8773x41185,001-215-760-4642x969,gabriellehartman@benjamin.com,2021-11-11,https://www.mccarthy.info/
2,b687FfC4F1600eC,Marie,Malone,Mckay PLC,Robertsonburgh,Botswana,283-236-9529,(189)129-8356x63741,kstafford@sexton.com,2021-05-14,http://www.reynolds.com/
3,9FF9ACbc69dcF9c,Elijah,Barrera,Marks and Sons,Kimbury,Barbados,8252703789,459-916-7241x0909,jeanettecross@brown.com,2021-03-17,https://neal.com/
4,b49edDB1295FF6E,Sheryl,Montgomery,"Kirby, Vaughn and Sanders",Briannaview,Antarctica (the territory South of 60 deg S),425.475.3586,(392)819-9063,thomassierra@barrett.com,2020-09-23,https://www.powell-bryan.com/
5,3dcCbFEB17CCf2E,Jeremy,Houston,Lester-Manning,South Brianna,Micronesia,+1-223-666-5313x4530,252-488-3850x692,rubenwatkins@jacobs-wallace.info,2020-09-18,https://www.carrillo.com/


In [27]:
# Column dtypes, non-null counts and memory footprint of the frame.
# NOTE(review): the saved output of this cell shows a RangeIndex with
# "Index" as a regular column, which does not match a frame loaded with
# index_col=0. The cell appears to have been run before the current load
# cell; re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Index              100000 non-null  int64 
 1   Customer Id        100000 non-null  object
 2   First Name         100000 non-null  object
 3   Last Name          100000 non-null  object
 4   Company            100000 non-null  object
 5   City               100000 non-null  object
 6   Country            100000 non-null  object
 7   Phone 1            100000 non-null  object
 8   Phone 2            100000 non-null  object
 9   Email              100000 non-null  object
 10  Subscription Date  100000 non-null  object
 11  Website            100000 non-null  object
dtypes: int64(1), object(11)
memory usage: 9.2+ MB


## Data Cleaning

## Descriptive Analysis

## Univariate Analysis

## Bivariate Analysis