In [46]:
import pandas as pd
import json
import numpy as np
import os
import re
import time
from matplotlib import pyplot as plt
import ast
from PIL import Image
import datetime
from collections import defaultdict

- Partition 1: March 22, 2024 – May 29, 2024
- Partition 2: May 30, 2024 – August 6, 2024
- Partition 3: August 7, 2024 – October 14, 2024

In [None]:
# Input/output location: absolute, machine-specific path to the data folder.
# NOTE(review): consider a configurable or relative path so the notebook runs on other machines.
directory = '/Users/HP/Desktop/UNI/LM_1/ACN/ACN_project/data/'
file_name = 'filter_df_gian.csv' # already-filtered dataset (the one with 3700 users)


# Define parameters
# start_date and end_date delimit the processed days (both ends inclusive);
# they are equal here, so a single day is processed.
start_date = datetime.date(2024, 4, 1)
end_date = datetime.date(2024, 4, 1)
radius = 10.0  # Maximum distance (inclusive) for two users to be linked by an edge
n = 20  # Temporal window of 20 minutes

### Importing dataset

In [None]:
# Load the pre-filtered CSV from the configured data folder into a DataFrame.
df = pd.read_csv(f'{directory}{file_name}')

In [None]:
# Reduce the 'date' column to plain datetime.date values so rows can be compared
# for equality against the datetime.date objects used for start_date / end_date.
df['date'] = pd.to_datetime(df['date']).dt.date

### Functions

In [None]:
from collections import defaultdict

def register_edge_weights_by_window(edge_dict, n):
    """
    Bucket per-minute edge lists into windows of n minutes and weight each edge.

    An edge is treated as undirected: its endpoints are sorted before counting,
    so (a, b) and (b, a) are the same edge. The weight of an edge inside a window
    is the number of times it occurs in that window divided by n.

    Parameters:
        edge_dict (dict): Maps a rounded time string "HH:MM" to a list of edges
            (each edge is a pair of ids).
        n (int): Width of a temporal window, in minutes.

    Returns:
        dict: Maps a window label "HH:MM - HH:MM" (start - end) to a list of
            (edge, weight) tuples, where edge is a sorted tuple of ids.
    """
    def _as_clock(total_minutes):
        # Render a minute-of-day offset as zero-padded HH:MM.
        return f"{total_minutes // 60:02}:{total_minutes % 60:02}"

    # Collect the raw edges of every timestamp under the index of its window.
    buckets = defaultdict(list)
    for stamp, stamp_edges in edge_dict.items():
        hh, mm = (int(part) for part in stamp.split(':'))
        buckets[(hh * 60 + mm) // n].extend(stamp_edges)

    weighted_by_window = {}
    for index, bucket in buckets.items():
        # Tally each undirected edge once per occurrence.
        tally = defaultdict(int)
        for pair in bucket:
            tally[tuple(sorted(pair))] += 1

        label = f"{_as_clock(index * n)} - {_as_clock((index + 1) * n)}"
        weighted_by_window[label] = [(pair, hits / n) for pair, hits in tally.items()]

    return weighted_by_window


def get_distance(position1, position2):
    """
    Return the straight-line distance between two positions, measured only
    along the components at index 0 and index 2.

    The component at index 1 is ignored (presumably the vertical axis; confirm
    against the data source).
    """
    delta_first = position1[0] - position2[0]
    delta_third = position1[2] - position2[2]
    return np.sqrt(delta_first ** 2 + delta_third ** 2)

def get_edges_with_window(df, date, radius, n):
    """
    Build the weighted proximity edges of a single day, grouped into temporal windows.

    For every distinct 'rounded_time' of the given date, each pair of users whose
    distance (as computed by get_distance) is at most `radius` yields one edge.
    The per-time edge lists are then aggregated by register_edge_weights_by_window.

    Parameters:
        df (pd.DataFrame): Must contain the columns 'date', 'rounded_time',
            'address' and 'position'. Each 'position' value is a string that
            ast.literal_eval can parse into a coordinate sequence.
        date: Value compared for equality against the 'date' column.
        radius (float): Maximum distance (inclusive) for two users to be connected.
        n (int): Size of the temporal window in minutes.

    Returns:
        dict: Keys are window ranges ("HH:MM - HH:MM"), values are lists of
            (edge, weight) tuples as produced by register_edge_weights_by_window.
    """
    # Filter the DataFrame for the specified date
    df_filtered = df[df['date'] == date]
    edges_dict = {}

    # groupby hands over each time slice in a single pass instead of re-filtering
    # the whole day once per time; sort=False keeps first-appearance order.
    # Rows whose 'rounded_time' is missing are skipped by groupby.
    for slot, df_slot in df_filtered.groupby('rounded_time', sort=False):
        # A user may have several rows in one slice: keep the first one only,
        # obtained in one pass rather than one frame filter per user.
        first_rows = df_slot.drop_duplicates(subset='address', keep='first')
        user_ids = first_rows['address'].tolist()
        coords = [np.array(ast.literal_eval(raw)) for raw in first_rows['position']]

        # Compare every unordered pair of users once (i < j).
        n_users = len(user_ids)
        edges_dict[slot] = [
            [user_ids[i], user_ids[j]]
            for i in range(n_users)
            for j in range(i + 1, n_users)
            if get_distance(coords[i], coords[j]) <= radius
        ]

    return register_edge_weights_by_window(edges_dict, n)

def get_edges_date(df, radius, start_date, end_date, n):
    """
    Compute the windowed, weighted edges for every day of a date range.

    Parameters:
        df (pd.DataFrame): DataFrame containing columns 'date', 'rounded_time', 'address', and 'position'.
        radius (float): Maximum distance to consider two users connected (edge).
        start_date (datetime.date): Start of the date range (inclusive).
        end_date (datetime.date): End of the date range (inclusive).
        n (int): Size of the temporal window in minutes.

    Returns:
        dict: Keys are the dates of the range, values are the per-window
            dictionaries returned by get_edges_with_window for that date.
    """
    # Every calendar day between the two bounds, as datetime.date objects
    days = pd.date_range(start=start_date, end=end_date).date

    # Restrict the frame to the requested range once, before the per-day work
    rows_in_range = df[df['date'].isin(days)]

    per_day_edges = {}
    for day in days:
        print(f"Processing date: {day}")
        per_day_edges[day] = get_edges_with_window(rows_in_range, day, radius, n)

    return per_day_edges


In [None]:
import matplotlib.pyplot as plt

# Example input dictionary (illustrative only). It shows the nested shape
# {date string: {window range: [((id_a, id_b), weight), ...]}} that
# extract_weights_for_day iterates over.
data = {
    '2024-04-01 00:00:00': {
        '00:00 - 00:10': [
            (('0x68c86c2a1edfbf93ef160fb61bcd174f9ce4e649', '0xa6c6dc29b99e8e7c919a5d2ea426874ad15ea0ed'), 0.9),
            (('0x460ac2abff955c3110014496a81bbc0867619c06', '0x79e29d9ab0dcb38ab9f473c30757fc64b87b0a19'), 0.5),
            (('0x460ac2abff955c3110014496a81bbc0867619c06', '0x5642e6fd2a6d393e774d11fd528862f1b5f8c8d3'), 0.2),
            (('0x460ac2abff955c3110014496a81bbc0867619c06', '0xd140392e8741ef75329094a93e519597d1afc88d'), 0.4),
        ],
        '00:10 - 00:20': [
            (('0x123', '0x456'), 0.7),
            (('0x789', '0xabc'), 0.3)
        ]
    }
}

def extract_weights_for_day(data):
    """
    Flatten every edge weight found in the nested results dictionary.

    Parameters:
        data (dict): Maps a date key to a dictionary of time windows; each time
            window maps to a list of (edge, weight) pairs.

    Returns:
        list: All weights, in iteration order, across every date key and every
            window present in `data`.
    """
    return [
        weight
        for windows in data.values()
        for weighted_edges in windows.values()
        for _, weight in weighted_edges
    ]

# # Extract weights for the day
# weights = extract_weights_for_day(serializable_data)

# # Plot the distribution of weights
# plt.figure(figsize=(10, 6))
# plt.hist(weights, bins=1440, color='skyblue', edgecolor='black', alpha=0.7)
# plt.title('Distribution of Edge Weights for the Day', fontsize=16)
# plt.xlabel('Weight', fontsize=14)
# plt.ylabel('Frequency', fontsize=14)
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.axvline(np.mean(weights), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(weights):.4f}')
# plt.legend()
# plt.show()


### Main

In [None]:
# Build the per-date, per-window weighted edge lists for the configured date range
edges_result = get_edges_date(df, radius, start_date, end_date, n)

# Inspect results: number of distinct edges found in each temporal window
for date, windows in edges_result.items():
    print(f"Date: {date}")
    for window, edges in windows.items():
        print(f"  Window {window}: {len(edges)} edges")


In [None]:
# The result is keyed by date objects, which json cannot use as object keys:
# turn every key into a formatted string before dumping.
serializable_data = {key.strftime("%Y-%m-%d %H:%M:%S"): value for key, value in edges_result.items()}

In [None]:
# Save the dictionary to a JSON file named after the processed date range,
# written into the same data directory the input was read from
result_file_name = f'edges_from_{start_date.strftime("%Y-%m-%d")}_to_{end_date.strftime("%Y-%m-%d")}.json'
with open(f'{directory}{result_file_name}', "w") as json_file:
    json.dump(serializable_data, json_file, indent=4)