In [12]:
import pandas as pd
import numpy as np
import os

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Load Dataset

In [13]:
# Task prefix: selects which "<task>_*.txt" input files are read.
task = "task2"
# Folder prefix (with trailing slash) prepended to the output file names.
SAVE_PATH = f"data/{task}/"


In [14]:
# Create the output folder (and any missing parents). exist_ok=True replaces the
# separate exists() check, so there is no check-then-create race and the cell is
# safe to re-run.
os.makedirs(SAVE_PATH, exist_ok=True)


In [15]:
def load_data(task):
    """
    Read the raw data table and the train/valid/test label files of a task.

    Files are opened by relative name: ``<task>_data.txt`` (read with the
    pandas CSV defaults) plus ``<task>_train_label.txt``,
    ``<task>_valid_label.txt`` and ``<task>_test_query.txt`` (tab-separated,
    no header row).

    Args:
        task (str): File-name prefix. For "task2" the label/query files are
            keyed by (order, product); for any other value by order only.
    Returns:
        tuple: ``(df, train_labels, valid_labels, test_query)``. ``test_query``
        receives an extra ``label`` column filled with -1.
    """
    key_columns = ["order", "product"] if task == "task2" else ["order"]
    labelled_columns = key_columns + ["label"]

    def read_headerless(suffix, columns):
        # Label and query files carry no header, so column names are supplied here.
        return pd.read_csv(task + suffix, header=None, sep="\t", names=columns)

    data = pd.read_csv(task + "_data.txt")
    train = read_headerless("_train_label.txt", labelled_columns)
    valid = read_headerless("_valid_label.txt", labelled_columns)
    query = read_headerless("_test_query.txt", key_columns)
    query["label"] = -1
    return data, train, valid, query

In [16]:
# The raw table is loaded by load_data() in the next cell, which reads the same
# "<task>_data.txt" file and assigns df. Reading it here as well only duplicated
# that work, so this cell intentionally does nothing.

In [17]:

# Load the raw table and the train/valid/test label frames for the selected task.
(df,
 train_labels,
 valid_labels,
 test_query) = load_data(task)


In [18]:
def split_df(df, labels, train_labels, valid_labels, test_query):
    """
    Join the data table against each label frame.

    Every join is a default ``DataFrame.merge`` (inner join on the columns the
    two frames share), so only rows whose keys occur in the label frame are kept.

    Args:
        df (pandas.DataFrame): Data table to join.
        labels (pandas.DataFrame): All label rows.
        train_labels (pandas.DataFrame): Training label rows.
        valid_labels (pandas.DataFrame): Validation label rows.
        test_query (pandas.DataFrame): Test query rows.
    Returns:
        tuple: ``(df merged with labels, train_df, valid_df, test_df)``.
    """
    label_frames = (labels, train_labels, valid_labels, test_query)
    merged_all, train_df, valid_df, test_df = [df.merge(frame) for frame in label_frames]
    return merged_all, train_df, valid_df, test_df

In [19]:
def create_masks(labels, task, n_train=None, n_valid=None, n_test=None):
    """
    Build 0/1 indicator columns marking which rows of ``labels`` belong to the
    train, validation and test splits.

    The masks are assigned purely by position, so ``labels`` must be the
    row-wise concatenation train -> valid -> test.

    Args:
        labels (pandas.DataFrame): Concatenated label rows with an ``order``
            column (and a ``product`` column when ``task == "task2"``).
        task (str): "task2" adds the ``product`` key column to the result.
        n_train (int, optional): Number of training rows. Defaults to
            ``len(train_labels)`` of the notebook-level variable.
        n_valid (int, optional): Number of validation rows. Defaults to
            ``len(valid_labels)`` of the notebook-level variable.
        n_test (int, optional): Number of test rows. Defaults to
            ``len(test_query)`` of the notebook-level variable.
    Returns:
        pandas.DataFrame: key column(s) plus float columns ``train_masks``,
        ``valid_masks`` and ``test_masks``.
    Raises:
        ValueError: If a split size is negative or the sizes do not add up to
            ``len(labels)``.
    """
    # Fall back to the notebook globals only when a size is not supplied, so
    # existing create_masks(labels, task) calls keep working unchanged.
    if n_train is None:
        n_train = len(train_labels)
    if n_valid is None:
        n_valid = len(valid_labels)
    if n_test is None:
        n_test = len(test_query)

    total_length = len(labels)
    if min(n_train, n_valid, n_test) < 0 or n_train + n_valid + n_test != total_length:
        raise ValueError(
            "split sizes (%d, %d, %d) do not add up to len(labels)=%d"
            % (n_train, n_valid, n_test, total_length)
        )

    positions = np.arange(total_length)
    valid_end = n_train + n_valid
    columns = {"order": labels["order"]}
    if task == "task2":
        columns["product"] = labels["product"]
    columns["train_masks"] = (positions < n_train).astype(float)
    columns["valid_masks"] = ((positions >= n_train) & (positions < valid_end)).astype(float)
    columns["test_masks"] = (positions >= valid_end).astype(float)
    return pd.DataFrame(columns)

In [20]:
# Stack the label frames in train -> valid -> test order; create_masks assigns the
# split masks by position and relies on this order. ignore_index=True gives the
# stacked frame a unique 0..n-1 index instead of repeating each part's row numbers.
labels = pd.concat([train_labels, valid_labels, test_query], ignore_index=True)
masks = create_masks(labels, task)
# Note: df is rebound to the data table inner-joined with all label rows.
df, train_df, valid_df, test_df = split_df(df, labels, train_labels, valid_labels, test_query)


In [21]:
# Display the split-membership table returned by create_masks.
masks

Unnamed: 0,order,product,train_masks,valid_masks,test_masks
0,152158,43939,1.0,0.0,0.0
1,78524,57599,1.0,0.0,0.0
2,192868,12958,1.0,0.0,0.0
3,325485,17062,1.0,0.0,0.0
4,723734,48067,1.0,0.0,0.0
...,...,...,...,...,...
399934,115842,31870,0.0,0.0,1.0
399935,504278,53317,0.0,0.0,1.0
399936,144537,33864,0.0,0.0,1.0
399937,622382,22404,0.0,0.0,1.0


In [22]:
# Display the data table after split_df merged it with the label rows.
df

Unnamed: 0,order,product,customer,color,size,group,label
0,391395,20019,106251,557,21,15,0
1,391395,20114,106251,396,21,15,-1
2,112116,45195,135056,243,24,23,0
3,112116,6117,135056,338,24,23,1
4,739883,622,233090,586,25,23,-1
...,...,...,...,...,...,...,...
2666257,576096,9045,15446,462,24,8,1
2666258,314874,20257,217058,437,21,15,0
2666259,314874,40360,217058,65,25,15,0
2666260,314874,46311,217058,460,25,8,0


# Create Customer Edgelist

In [52]:
def get_customer_edgelist(df: pd.DataFrame, masks, task) -> pd.DataFrame:
    """
    Build the customer-order edge list: one row per (customer, order) group.

    ``n_products`` is the number of rows of ``df`` that fall in the group.

    Args:
        df (pandas.DataFrame): Rows with ``customer`` and ``order`` columns
            (and a ``label`` column when ``task`` is not "task2").
        masks (pandas.DataFrame): Split-membership table with an ``order``
            column; only used when ``task`` is not "task2".
        task (str): "task2" groups by (order, customer) only; any other value
            also groups by ``label`` and attaches the mask columns.
    Returns:
        pandas.DataFrame: For "task2", columns ``customer, order, n_products``
        sorted by customer. Otherwise ``customer, order, n_products, label``
        followed by the columns of ``masks`` joined on ``order``.
    """
    if task == "task2":
        # The label is not part of the grouping here, so this edge list carries
        # neither a label nor mask columns.
        edges = df.groupby(['order', 'customer']).size().reset_index(name='n_products')
        return edges[['customer', 'order', 'n_products']].sort_values(by='customer')

    edges = df.groupby(['order', 'customer', 'label']).size().reset_index(name='n_products')
    edges = edges[['customer', 'order', 'n_products', 'label']].sort_values(by='customer')
    # Default (inner) join: edges whose order is missing from masks are dropped.
    return edges.merge(masks, on="order")

def save_customer_edgelist(df: pd.DataFrame,masks,task, filename: str) -> pd.DataFrame:
    """
    Build the customer edgelist, write it to a CSV file and return it.

    Args:
        df (pandas.DataFrame): Input rows passed on to get_customer_edgelist.
        masks (pandas.DataFrame): Split-membership table passed on to
            get_customer_edgelist.
        task (str): Task name passed on to get_customer_edgelist.
        filename (str): Path of the CSV file to write.
    Returns:
        pandas.DataFrame: The customer edgelist that was written.
    """
    output = get_customer_edgelist(df,masks, task)
    # index=False: the row index is not written to the file.
    output.to_csv(filename, sep=',', index=False)
    return output

In [64]:
# Build the customer edge list, write it to a CSV and keep the frame for inspection.
customer_edge_list = save_customer_edgelist(
    df, masks, task, f"{SAVE_PATH}customer_edgelist.csv"
)

Gi


In [54]:
# Rows of the table whose customer id is 0.
df.loc[df["customer"] == 0]

Unnamed: 0,order,product,customer,color,size,group,label
988929,679928,44672,0,348,24,11,-1


In [55]:
# Display the customer edge list returned by save_customer_edgelist.
customer_edge_list

Unnamed: 0,customer,order,n_products,label,train_masks,valid_masks,test_masks
0,1,390185,1,2,1.0,0.0,0.0
1,1,71090,5,2,1.0,0.0,0.0
2,1,155095,4,1,1.0,0.0,0.0
3,2,803833,5,1,1.0,0.0,0.0
4,3,537236,1,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
594425,342033,100290,2,2,1.0,0.0,0.0
594426,342033,801674,1,2,1.0,0.0,0.0
594427,342033,51518,1,2,1.0,0.0,0.0
594428,342035,770601,2,1,1.0,0.0,0.0


# Create Order Edgelist

In [32]:
def get_order_edgelist(df: pd.DataFrame,task, masks) -> pd.DataFrame:
    """
    Build the order-product edge list: one row per distinct
    (order, product, label, color, size) combination in ``df``.

    Args:
        df (pandas.DataFrame): Rows with ``order``, ``product``, ``label``,
            ``color`` and ``size`` columns.
        task (str): "task2" attaches the mask columns; any other value returns
            only the key columns.
        masks (pandas.DataFrame): Split-membership table; only used for "task2".
    Returns:
        pandas.DataFrame: For "task2", the key columns plus ``n_customers``
        (the number of ``df`` rows in the group) joined with ``masks`` on their
        shared columns. Otherwise only the five key columns, sorted by order;
        ``n_customers`` is not part of that result.
    """
    edge_keys = ['order', 'product', 'label', 'color', 'size']
    edges = df.groupby(edge_keys).size().reset_index(name='n_customers')

    if task != "task2":
        return edges[edge_keys].sort_values(by='order')

    # Default merge: inner join on every column the two frames have in common.
    return edges.merge(masks)

def save_order_edgelist(df: pd.DataFrame,task, masks, filename: str) -> None:
    """
    Build the order edgelist and write it to a CSV file.

    Args:
        df (pandas.DataFrame): Input rows passed on to get_order_edgelist.
        task (str): Task name passed on to get_order_edgelist.
        masks (pandas.DataFrame): Split-membership table passed on to
            get_order_edgelist.
        filename (str): Path of the CSV file to write.
    """
    # Comma-separated output; the row index is not written.
    get_order_edgelist(df, task, masks).to_csv(filename, sep=',', index=False)

In [34]:
# Write the order edge list built from the merged table to a CSV.
save_order_edgelist(df, task, masks, f"{SAVE_PATH}order_edgelist.csv")

# Create Customer Node Attributes
TODO: consider adding the average size of the products each customer ordered, computed per product type rather than across all products.

In [58]:
def get_customer_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate per-customer node attributes.

    Rows are first collapsed to one row per (customer, order, label) group and
    then summed per customer.

    Args:
        df (pandas.DataFrame): Rows with ``customer``, ``order`` and ``label``
            columns.
    Returns:
        pandas.DataFrame: One row per customer with the columns
        ``customer``, ``n_orders`` (distinct orders),
        ``n_not_returned`` / ``n_partial_returned`` / ``n_fully_returned``
        (number of (order, label) groups with label 0 / 1 / 2) and
        ``n_products`` (number of ``df`` rows of the customer).
    """
    per_order = df.groupby(['customer', 'order', 'label']).size().reset_index(name='n_products')
    label = per_order['label']
    per_order = per_order.assign(
        n_not_returned=label.eq(0).astype('int64'),
        n_partial_returned=label.eq(1).astype('int64'),
        n_fully_returned=label.eq(2).astype('int64'),
    )
    aggregations = {
        # nunique, not count: an order whose rows carry several labels yields
        # several (order, label) groups and must still count as one order.
        'order': 'nunique',
        'n_not_returned': 'sum',
        'n_partial_returned': 'sum',
        'n_fully_returned': 'sum',
        'n_products': 'sum',
    }
    return (
        per_order.groupby('customer')
        .agg(aggregations)
        .reset_index()
        .rename(columns={'order': 'n_orders'})
    )

def save_customer_node_attributes(df: pd.DataFrame, filename: str) -> None:
    """
    Compute the customer node attributes and write them to a CSV file.

    Args:
        df (pandas.DataFrame): Input rows passed on to
            get_customer_node_attributes.
        filename (str): Path of the CSV file to write.
    """
    # Comma-separated output; the row index is not written.
    get_customer_node_attributes(df).to_csv(filename, sep=',', index=False)

In [59]:
# Write the per-customer aggregates of the merged table to a CSV.
save_customer_node_attributes(df, f"{SAVE_PATH}customer_node_attributes.csv")

# Create Order Node Attributes

In [60]:
def get_order_node_attributes(df: pd.DataFrame, task) -> pd.DataFrame:
    """
    Count the rows of ``df`` per order.

    Args:
        df (pandas.DataFrame): Rows with an ``order`` column (and a ``label``
            column when ``task`` is not "task2").
        task (str): "task2" groups by ``order`` only; any other value groups by
            (``order``, ``label``) so the label is kept as a column.
    Returns:
        pandas.DataFrame: The grouping column(s) plus ``n_products``, the
        number of ``df`` rows in each group.
    """
    group_keys = ['order'] if task == "task2" else ['order', 'label']
    return df.groupby(group_keys).size().reset_index(name='n_products')

def save_order_node_attributes(df: pd.DataFrame,task, filename: str) -> None:
    """
    Compute the order node attributes and write them to a CSV file.

    Args:
        df (pandas.DataFrame): Input rows passed on to get_order_node_attributes.
        task (str): Task name passed on to get_order_node_attributes.
        filename (str): Path of the CSV file to write.
    """
    # Comma-separated output; the row index is not written.
    get_order_node_attributes(df, task).to_csv(filename, sep=',', index=False)

In [61]:
# Write the per-order row counts of the merged table to a CSV.
save_order_node_attributes(df, task, f"{SAVE_PATH}order_node_attributes.csv")

# Create Product Node Attributes

In [62]:
#todo: Finish this
def get_product_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate per-product counts and label rates.

    Parameters:
    df (pd.DataFrame): Rows with ``product`` and ``label`` columns.

    Returns:
    pd.DataFrame: One row per product, indexed by the actual ``product`` id,
    with ``total_count`` (rows of the product), ``return_1`` / ``return_2``
    (rows with label 1 / 2) and ``return1_rate`` / ``return2_rate``
    (those counts divided by ``total_count``).
    """
    label = df['label']
    product_stats = (
        # Integer indicator columns let the label counts be plain group sums.
        df.assign(_is_label_1=label.eq(1).astype('int64'),
                  _is_label_2=label.eq(2).astype('int64'))
        .groupby('product')
        .agg(
            total_count=('product', 'count'),
            return_1=('_is_label_1', 'sum'),
            return_2=('_is_label_2', 'sum'),
        )
    )
    # The groupby index (named 'product') is kept on purpose: resetting it with
    # drop=True would replace the real product ids with positions 0..n-1.
    product_stats['return1_rate'] = product_stats['return_1'] / product_stats['total_count']
    product_stats['return2_rate'] = product_stats['return_2'] / product_stats['total_count']
    return product_stats

def save_product_node_attributes(df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """
    Compute the product node attributes for ``df`` and save them to a CSV file.

    Parameters:
    - df (pd.DataFrame): Input rows passed on to get_product_node_attributes.
    - filename (str): The name of the CSV file to save the attributes to.

    Returns:
    - pd.DataFrame: The product node attributes that were written.

    """
    output = get_product_node_attributes(df)
    # index=True writes the frame's index as the first CSV column.
    output.to_csv(filename, index=True)
    # Return the frame as the annotation and docstring promise.
    return output



In [63]:
# Product statistics are computed from the training rows only.
# The trailing semicolon keeps the cell from echoing a return value.
save_product_node_attributes(train_df, f"{SAVE_PATH}product_node_attributes.csv");