In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

# Load Dataset

In [2]:
df = pd.read_csv("task1_data.txt")
train_labels = pd.read_csv("task1_train_label.txt", header=None, sep="\t", names=["order", "label"])
valid_labels = pd.read_csv("task1_valid_label.txt", header=None, sep="\t", names=["order", "label"])
test_query = pd.read_csv("task1_test_query.txt",header=None, names=["order"])

In [23]:
train_df = df.merge(train_labels, on="order")
valid_df = df.merge(valid_labels, on="order")
test_df = df.merge(test_query, on="order")

In [4]:
train_df.head()

Unnamed: 0,order,product,customer,color,size,group,label
0,298957,54654,192219,169,10,11,0
1,298957,57127,192219,611,10,11,0
2,570617,23677,322757,43,24,19,1
3,570617,54993,322757,543,24,19,1
4,654410,53969,305375,313,17,27,2


# Create Customer Edgelist

In [5]:
def get_customer_edgelist(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns the customer edgelist with the following edge attributes: customer, order, n_products, label
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer order information.
    Returns:
        pandas.DataFrame: DataFrame containing customer edgelist.
    """
    # Group by customer
    customer_groups = df.groupby(['order', 'customer', 'label']).size().reset_index().rename(columns={0: 'n_products'})
    return customer_groups[['customer', 'order', 'n_products', 'label']].sort_values(by='customer')

def save_customer_edgelist(df: pd.DataFrame, filename: str) -> None:
    """
    Saves the customer edgelist to a file.
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer edgelist.
        filename (str): Filename to save the customer edgelist to.
    """
    output = get_customer_edgelist(df)
    output.to_csv(filename, sep=',', index=False)

In [6]:
save_customer_edgelist(train_df, 'train_customer_edgelist.csv')

# Create Order Edgelist

In [13]:
def get_order_edgelist(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns the order edgelist with the following edge attributes: order, product, n_customers, label
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer order information.
    Returns:
        pandas.DataFrame: DataFrame containing order edgelist.
    """
    # Group by order
    order_groups = df.groupby(['order', 'product', 'label']).size().reset_index().rename(columns={0: 'n_customers'})
    return order_groups[['order', 'product', 'label']].sort_values(by='order')

def save_order_edgelist(df: pd.DataFrame, filename: str) -> None:
    """
    Saves the order edgelist to a file.
    
    Args:
        df (pandas.DataFrame): DataFrame containing order edgelist.
        filename (str): Filename to save the order edgelist to.
    """
    output = get_order_edgelist(df)
    output.to_csv(filename, sep=',', index=False)

In [8]:
save_order_edgelist(train_df, 'train_order_edgelist.csv')

In [12]:
train_df[train_df['product'] == 3]

Unnamed: 0,order,product,customer,color,size,group,label
1266235,390189,3,100385,581,1,11,1
1271929,255843,3,7806,581,1,11,0
1284389,310636,3,75451,581,1,11,1
1292920,352676,3,172144,581,1,11,0
1294842,795723,3,190483,581,1,11,1
1294851,501180,3,100385,581,1,11,1
1294918,11302,3,68625,581,1,11,1
1294967,564757,3,96900,581,1,11,0
1304616,582173,3,249054,581,1,11,1
1316562,549270,3,12780,581,1,11,1


# Create Customer Node Attributes
Possibly find some way to find the average size of product ordered, but also linked to product type.

In [69]:
def get_customer_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns the customer node attributes with the following attributes: customer, n_orders, n_products, label
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer order information.
    Returns:
        pandas.DataFrame: DataFrame containing customer node attributes.
    """
    # Group by customer
    customer_groups = df.groupby(['customer', 'order', 'label']).size().reset_index().rename(columns={0: 'n_products'})
    customer_groups['n_not_returned'] = customer_groups['label'].apply(lambda x: 1 if x == 0 else 0)
    customer_groups['n_partial_returned'] = customer_groups['label'].apply(lambda x: 1 if x == 1 else 0)
    customer_groups['n_fully_returned'] = customer_groups['label'].apply(lambda x: 1 if x == 2 else 0)
    customer_groups = customer_groups.groupby('customer').agg({'order': 'count', 'n_not_returned': 'sum', 'n_partial_returned': 'sum', 'n_fully_returned': 'sum', 'n_products': 'sum'}).reset_index().rename(columns={'order': 'n_orders'})
    return customer_groups

def save_customer_node_attributes(df: pd.DataFrame, filename: str) -> None:
    """
    Saves the customer node attributes to a file.
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer node attributes.
        filename (str): Filename to save the customer node attributes to.
    """
    output = get_customer_node_attributes(df)
    output.to_csv(filename, sep=',', index=False)

In [70]:
save_customer_node_attributes(train_df, 'train_customer_node_attributes.csv')

# Create Order Node Attributes

In [74]:
def get_order_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns the order node attributes with the following attributes: customer, n_orders, n_products, label
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer order information.
    Returns:
        pandas.DataFrame: DataFrame containing customer node attributes.
    """
    customer_groups = df.groupby(['order', 'label']).size().reset_index().rename(columns={0: 'n_products'})
    return customer_groups

def save_order_node_attributes(df: pd.DataFrame, filename: str) -> None:
    """
    Saves the order node attributes to a file.
    
    Args:
        df (pandas.DataFrame): DataFrame containing order node attributes.
        filename (str): Filename to save the order node attributes to.
    """
    output = get_order_node_attributes(df)
    output.to_csv(filename, sep=',', index=False)

In [75]:
save_order_node_attributes(train_df, 'train_order_node_attributes.csv')

# Create Product Node Attributes

In [105]:
#todo: Finish this
def get_customer_node_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns the customer node attributes with the following attributes: customer, n_orders, n_products, label
    
    Args:
        df (pandas.DataFrame): DataFrame containing customer order information.
    Returns:
        pandas.DataFrame: DataFrame containing customer node attributes.
    """
    # Group by customer
    customer_groups = df.groupby(['product', 'order', 'label']).size().reset_index()
    customer_groups['n_not_returned'] = customer_groups['label'].apply(lambda x: 1 if x == 0 else 0)
    customer_groups['n_partial_returned'] = customer_groups['label'].apply(lambda x: 1 if x == 1 else 0)
    customer_groups['n_fully_returned'] = customer_groups['label'].apply(lambda x: 1 if x == 2 else 0)
    customer_groups = customer_groups.agg({'order': 'count', 'n_not_returned': 'sum', 'n_partial_returned': 'sum', 'n_fully_returned': 'sum'}).reset_index().rename(columns={'order': 'n_orders'})
    return customer_groups

get_customer_node_attributes(train_df)

Unnamed: 0,product,order,label,0,n_not_returned,n_partial_returned,n_fully_returned
0,0,4932,1,1,0,1,0
1,0,31563,1,1,0,1,0
2,0,82040,1,1,0,1,0
3,0,102027,2,1,0,0,1
4,0,119032,0,1,1,0,0
...,...,...,...,...,...,...,...
1893389,58413,652528,1,1,0,1,0
1893390,58413,714235,1,1,0,1,0
1893391,58413,768522,1,1,0,1,0
1893392,58413,825964,1,1,0,1,0
