In [14]:
import pandas as pd
import numpy as np
import os

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Load Dataset

In [15]:
# Output directory prefix; later cells build CSV paths as SAVE_PATH + "<name>.csv",
# so the trailing slash is required.
SAVE_PATH = "data/task2/"

In [16]:
# Create the output directory (and any missing parents). exist_ok=True replaces
# the separate exists() check, so there is no check-then-create race and the
# cell can be re-run safely.
os.makedirs(SAVE_PATH, exist_ok=True)


In [17]:
def load_data(task):
    """
    Read the data file and the train/valid/test label files for one task.

    File names are built by appending a suffix to `task`, so `task` also acts
    as the path prefix. The data file is read with pandas' default CSV
    settings; the three label files are tab-separated and have no header row.

    Args:
        task (str): File-name prefix. When it equals "task2" the label files
            are keyed by (order, product); for any other value by order only.
    Returns:
        tuple: (df, train_labels, valid_labels, test_query). `test_query` has
        no label in its file and gets a placeholder `label` column of -1.
    """
    # Key columns that precede the label in each label file.
    key_columns = ["order", "product"] if task == "task2" else ["order"]

    def _read_label_file(suffix, columns):
        # All label files share the same layout: tab-separated, header-less.
        return pd.read_csv(task + suffix, header=None, sep="\t", names=columns)

    df = pd.read_csv(task + "_data.txt")
    train_labels = _read_label_file("_train_label.txt", key_columns + ["label"])
    valid_labels = _read_label_file("_valid_label.txt", key_columns + ["label"])
    test_query = _read_label_file("_test_query.txt", list(key_columns))
    test_query["label"] = -1
    return df, train_labels, valid_labels, test_query

In [18]:
# Task identifier; create_masks uses it to decide whether masks are keyed by
# (order, product) ("task2") or by order only.
task = "task2"

In [19]:

# Load with the `task` variable rather than a repeated literal so the loaded
# files and the later create_masks(labels, task) call cannot drift apart.
df, train_labels, valid_labels, test_query = load_data(task=task)
# Row order (train, then valid, then test) matters: the positional masks built
# later assume exactly this concatenation order.
labels = pd.concat([train_labels, valid_labels, test_query])


In [20]:
def split_df(df, labels, train_labels, valid_labels, test_query):
    """
    Attach labels to the raw rows, once per split and once for all splits.

    Every merge uses pandas' default behaviour: an inner join on the columns
    the two frames have in common.

    Args:
        df (pandas.DataFrame): Raw data rows.
        labels (pandas.DataFrame): All label rows (train, valid and test).
        train_labels (pandas.DataFrame): Training label rows.
        valid_labels (pandas.DataFrame): Validation label rows.
        test_query (pandas.DataFrame): Test query rows.
    Returns:
        tuple: (df merged with `labels`, train_df, valid_df, test_df).
    """
    train_df, valid_df, test_df = (
        df.merge(split) for split in (train_labels, valid_labels, test_query)
    )
    return df.merge(labels), train_df, valid_df, test_df

In [21]:
def create_masks(labels, task, train_labels=None, valid_labels=None, test_query=None):
    """
    Build positional train/valid/test indicator columns for a concatenated label table.

    `labels` must be the row-wise concatenation of the train, valid and test
    tables, in that order: the first len(train_labels) rows are flagged as
    train, the next len(valid_labels) rows as valid and the remaining rows as
    test.

    Args:
        labels (pandas.DataFrame): Concatenated label table with an `order`
            column (and a `product` column when task is "task2").
        task (str): "task2" adds the `product` key column to the result; any
            other value keeps `order` only.
        train_labels, valid_labels, test_query (optional): The three splits;
            only their lengths are used. When omitted, the module-level
            variable of the same name is used, which keeps the original
            two-argument call working.
    Returns:
        pandas.DataFrame: Key column(s) plus `train_masks`, `valid_masks` and
        `test_masks` holding 1.0/0.0. The index is inherited from `labels`.
    Raises:
        ValueError: If the three split sizes do not add up to len(labels).
    """
    # Backward compatibility: fall back to the notebook-level splits when the
    # caller does not pass them explicitly.
    module_scope = globals()
    if train_labels is None:
        train_labels = module_scope["train_labels"]
    if valid_labels is None:
        valid_labels = module_scope["valid_labels"]
    if test_query is None:
        test_query = module_scope["test_query"]

    total_length = len(labels)
    n_train, n_valid, n_test = len(train_labels), len(valid_labels), len(test_query)
    if n_train + n_valid + n_test != total_length:
        raise ValueError(
            "labels has %d rows but the splits have %d + %d + %d rows"
            % (total_length, n_train, n_valid, n_test)
        )

    # Row positions decide the split: [0, n_train) is train,
    # [n_train, n_train + n_valid) is valid, everything after is test.
    position = np.arange(total_length)
    valid_start, test_start = n_train, n_train + n_valid

    key_columns = ["order", "product"] if task == "task2" else ["order"]
    columns = {key: labels[key] for key in key_columns}
    columns["train_masks"] = (position < valid_start).astype(float)
    columns["valid_masks"] = ((position >= valid_start) & (position < test_start)).astype(float)
    columns["test_masks"] = (position >= test_start).astype(float)
    return pd.DataFrame(columns)

In [23]:
# Concatenate the label tables in train, valid, test order; create_masks relies
# on this row order when it assigns rows to splits by position.
# NOTE(review): the cell that calls load_data already builds `labels` the same way.
labels = pd.concat([train_labels, valid_labels, test_query])
masks = create_masks(labels,task)
# `df` is rebound here to the raw rows merged with all labels; the per-split
# frames come from merging the raw rows with each label table separately.
df,train_df, valid_df, test_df = split_df(df, labels,train_labels, valid_labels, test_query)


# Create Customer Edgelist

In [None]:
def get_customer_edgelist(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the customer-order edgelist.

    One output row is produced per distinct (order, customer, label)
    combination; `n_products` is the number of input rows in that combination.

    Args:
        df (pandas.DataFrame): Frame with at least `order`, `customer` and
            `label` columns.
    Returns:
        pandas.DataFrame: Columns customer, order, n_products, label, sorted
        by customer.
    """
    # Count the rows of every (order, customer, label) combination.
    edge_counts = (
        df.groupby(['order', 'customer', 'label'])
        .size()
        .reset_index(name='n_products')
    )
    edge_columns = ['customer', 'order', 'n_products', 'label']
    return edge_counts[edge_columns].sort_values(by='customer')

def save_customer_edgelist(df: pd.DataFrame, filename: str) -> None:
    """
    Build the customer edgelist from `df` and write it as a CSV file.

    Args:
        df (pandas.DataFrame): Input frame that is passed to
            get_customer_edgelist (not an already-built edgelist).
        filename (str): Destination path of the comma-separated file; the
            index is not written.
    """
    get_customer_edgelist(df).to_csv(filename, sep=',', index=False)

In [None]:
# Display the split-mask table returned by create_masks.
masks

In [None]:
# save_customer_edgelist(train_df, 'train_customer_edgelist.csv')
customer_edge_list = get_customer_edgelist(df)
# Join the split masks on the `order` key rather than on the row index.
# The index of `masks` comes from concatenating the label tables (so it
# repeats), while the edgelist index is just a group counter; an index join
# pairs unrelated rows and splits `order` into order_x/order_y.
# Masks are first collapsed to one row per order: an order is flagged for a
# split when any of its label rows belongs to that split.
# NOTE(review): for task2 the masks are per (order, product); confirm that all
# products of an order always fall into the same split.
mask_columns = ["train_masks", "valid_masks", "test_masks"]
order_masks = masks.groupby("order", as_index=False)[mask_columns].max()
customer_edge_list = customer_edge_list.merge(
    order_masks, on="order", how="left", validate="many_to_one"
)
customer_edge_list.to_csv(SAVE_PATH + "customer_edgelist.csv", index=False)

# Create Order Edgelist

In [None]:
def get_order_edgelist(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the order-product edgelist.

    One output row is produced per distinct (order, product, label, color,
    size) combination. The per-combination row count is computed internally
    as `n_customers` but is not part of the returned frame.

    Args:
        df (pandas.DataFrame): Frame with at least `order`, `product`,
            `label`, `color` and `size` columns.
    Returns:
        pandas.DataFrame: Columns order, product, label, color, size, sorted
        by order.
    """
    edge_keys = ['order', 'product', 'label', 'color', 'size']
    # Grouping collapses repeated rows; the count column is dropped again below.
    edge_counts = df.groupby(edge_keys).size().reset_index(name='n_customers')
    return edge_counts[edge_keys].sort_values(by='order')

def save_order_edgelist(df: pd.DataFrame, filename: str) -> None:
    """
    Build the order edgelist from `df` and write it as a CSV file.

    Args:
        df (pandas.DataFrame): Input frame that is passed to
            get_order_edgelist (not an already-built edgelist).
        filename (str): Destination path of the comma-separated file; the
            index is not written.
    """
    get_order_edgelist(df).to_csv(filename, sep=',', index=False)

In [None]:
# save_order_edgelist(train_df, 'train_order_edgelist.csv')
# Write the order edgelist for the frame that holds train, valid and test rows.
save_order_edgelist(df, SAVE_PATH+'order_edgelist.csv')

# Create Customer Node Attributes
Possible extension: add the average size of the products each customer orders, computed separately per product type.

In [None]:
def get_customer_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate per-customer order and return statistics.

    Rows are first grouped by (customer, order, label); each such group
    contributes to exactly one of the three return counters according to its
    label (0, 1 or 2). Groups with any other label (for example the -1
    placeholder) count towards n_orders and n_products only.

    Args:
        df (pandas.DataFrame): Frame with at least `customer`, `order` and
            `label` columns.
    Returns:
        pandas.DataFrame: One row per customer with the columns customer,
        n_orders, n_not_returned, n_partial_returned, n_fully_returned and
        n_products.
    """
    # Number of input rows per (customer, order, label) group.
    per_order = (
        df.groupby(['customer', 'order', 'label'])
        .size()
        .reset_index(name='n_products')
    )
    # Vectorised 0/1 flags instead of a Python-level apply per row.
    per_order = per_order.assign(
        n_not_returned=per_order['label'].eq(0).astype(int),
        n_partial_returned=per_order['label'].eq(1).astype(int),
        n_fully_returned=per_order['label'].eq(2).astype(int),
    )
    # `nunique` (not `count`): an order that appears with several labels forms
    # several groups above but must still be counted as a single order.
    return (
        per_order.groupby('customer')
        .agg({
            'order': 'nunique',
            'n_not_returned': 'sum',
            'n_partial_returned': 'sum',
            'n_fully_returned': 'sum',
            'n_products': 'sum',
        })
        .reset_index()
        .rename(columns={'order': 'n_orders'})
    )

def save_customer_node_attributes(df: pd.DataFrame, filename: str) -> None:
    """
    Compute the customer node attributes from `df` and write them as a CSV file.

    Args:
        df (pandas.DataFrame): Input frame that is passed to
            get_customer_node_attributes.
        filename (str): Destination path of the comma-separated file; the
            index is not written.
    """
    get_customer_node_attributes(df).to_csv(filename, sep=',', index=False)

In [None]:
# Write the customer attributes computed on the frame that holds train, valid and test rows.
save_customer_node_attributes(df, SAVE_PATH+'customer_node_attributes.csv')

# Create Order Node Attributes

In [None]:
def get_order_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count the rows of every (order, label) combination.

    Args:
        df (pandas.DataFrame): Frame with at least `order` and `label` columns.
    Returns:
        pandas.DataFrame: Columns order, label and n_products, where
        n_products is the number of input rows in that combination.
    """
    return df.groupby(['order', 'label']).size().reset_index(name='n_products')

def save_order_node_attributes(df: pd.DataFrame, filename: str) -> None:
    """
    Compute the order node attributes from `df` and write them as a CSV file.

    Args:
        df (pandas.DataFrame): Input frame that is passed to
            get_order_node_attributes.
        filename (str): Destination path of the comma-separated file; the
            index is not written.
    """
    get_order_node_attributes(df).to_csv(filename, sep=',', index=False)

In [None]:
# save_order_node_attributes(train_df, 'train_order_node_attributes.csv')
# Write the order attributes computed on the frame that holds train, valid and test rows.
save_order_node_attributes(df, SAVE_PATH+'order_node_attributes.csv')

# Create Product Node Attributes

In [None]:
# Display the training split (raw rows merged with the training labels).
train_df

In [None]:
#todo: Finish this
def get_product_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate per-product row counts and return rates.

    Parameters:
    df (pd.DataFrame): Frame with at least `product` and `label` columns.

    Returns:
    pd.DataFrame: One row per product, indexed by the actual product id
    (index name `product`), with the columns total_count, return_1, return_2,
    return1_rate and return2_rate. return_1 / return_2 count the rows whose
    label equals 1 / 2; the rates divide those counts by total_count.
    """
    # 0/1 flags so the per-product counts become plain vectorised sums.
    flagged = df.assign(
        return_1=df['label'].eq(1).astype(int),
        return_2=df['label'].eq(2).astype(int),
    )
    # Keep `product` as the index: resetting it with drop=True would replace
    # the real product ids with positions 0..n-1.
    product_stats = flagged.groupby('product').agg(
        total_count=('label', 'size'),
        return_1=('return_1', 'sum'),
        return_2=('return_2', 'sum'),
    )
    product_stats['return1_rate'] = product_stats['return_1'] / product_stats['total_count']
    product_stats['return2_rate'] = product_stats['return_2'] / product_stats['total_count']
    return product_stats

def save_product_node_attributes(df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """
    Compute the product node attributes from `df` and save them to a CSV file.

    Parameters:
    - df (pd.DataFrame): Input frame that is passed to
      get_product_node_attributes.
    - filename (str): The name of the CSV file to save the attributes to. The
      index is written as the first column.

    Returns:
    - pd.DataFrame: The DataFrame containing the product node attributes that
      were written.

    """
    output = get_product_node_attributes(df)
    output.to_csv(filename, index=True)
    # Return the frame as the signature and docstring promise; previously the
    # function fell off the end and returned None.
    return output



In [None]:
# Returns calculated only on training data
# save_product_node_attributes(train_df, SAVE_PATH+"product_node_attributes.csv")
# The result is only kept in memory as `pa`; nothing is written to disk here
# because the save call above is commented out.
pa = get_product_node_attributes(train_df)

In [None]:
# Display the per-product statistics computed from the training split.
pa