In [4]:
import sys
import os
import textwrap
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import tqdm
import numpy as np
from collections import defaultdict, Counter
import networkx as nx
from scipy import stats
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, T5Config


  from .autonotebook import tqdm as notebook_tqdm


In [6]:

def tokenize_line(line):
    """Convert a line to lowercase and extract tokens (words)."""
    import re
    tokens = re.findall(r'\b\w+\b', line.lower())
    return tokens

def extract_original_id(line):
    """Extract the original ID from a line (e.g., 'i13585' from 'i13585 musica catalana')."""
    import re
    match = re.match(r'^(\w+)', line.strip())
    return match.group(1) if match else None

def lines_are_similar(line1, line2):
    """Check if two lines have similar word sets (order doesn't matter)."""
    words1 = set(tokenize_line(line1))
    words2 = set(tokenize_line(line2))
    return words1 == words2

def analyze_file_differences(file1, file2):
    """Analyze differences between two files and return percentage statistics."""
    
    # Check if files exist
    if not os.path.exists(file1):
        print(f"Error: File '{file1}' does not exist.")
        return None
    
    if not os.path.exists(file2):
        print(f"Error: File '{file2}' does not exist.")
        return None
    
    
    # Read both files
    try:
        with open(file1, 'r', encoding='utf-8') as f:
            lines1 = f.readlines()
        with open(file2, 'r', encoding='utf-8') as f:
            lines2 = f.readlines()
    except Exception as e:
        print(f"Error reading files: {e}")
        return None
    
    # Strip whitespace and get line counts
    lines1 = [line.strip() for line in lines1]
    lines2 = [line.strip() for line in lines2]
    file1_lines = len(lines1)
    file2_lines = len(lines2)

    
    # Compare lines using word-based similarity
    max_lines = max(file1_lines, file2_lines)
    min_lines = min(file1_lines, file2_lines)
    
    similar_lines = 0
    different_lines = 0
    different_original_ids = []  # Track original IDs with different textual content
    
    # Compare lines up to the minimum length
    for i in range(min_lines):
        if lines_are_similar(lines1[i], lines2[i]):
            similar_lines += 1
        else:
            different_lines += 1
            # Extract original ID from the line with differences
            original_id = extract_original_id(lines1[i])
            if original_id:
                different_original_ids.append(original_id)
                
                # print(lines1[i])
                # print(lines2[i])
    
    # Count extra lines in the longer file as differences
    extra_lines = max_lines - min_lines
    different_lines += extra_lines
    
    # For extra lines, also track their original IDs
    if extra_lines > 0:
        longer_file_lines = lines1 if file1_lines > file2_lines else lines2
        for i in range(min_lines, max_lines):
            original_id = extract_original_id(longer_file_lines[i])
            if original_id:
                different_original_ids.append(original_id)

    
    # Calculate percentages
    if max_lines > 0:
        diff_percentage = (different_lines / max_lines) * 100
        similar_percentage = (similar_lines / max_lines) * 100
    
    return {
        'diff_percentage': diff_percentage if max_lines > 0 else 0,
        'different_original_ids': different_original_ids
    }

def compare_files(file1, file2):
    """Compare two files and print the differences with formatted output."""
    import textwrap
    if not os.path.isabs(file1):
        # Try current directory first
        if not os.path.exists(file1):
            # Try in the copy directory
            copy_path = f"{DATASET_BASE}/copy/" + file1
            if os.path.exists(copy_path):
                file1 = copy_path

    if not os.path.isabs(file2):
        # Try current directory first
        if not os.path.exists(file2):
            # Try in the copy directory
            copy_path = f"{DATASET_BASE}/copy/" + file2
            if os.path.exists(copy_path):
                file2 = copy_path

    # Run the analysis
    result = analyze_file_differences(file1, file2)

    # Print percentage with 2 decimal precision
    print(f"Difference percentage: {result['diff_percentage']:.2f}% with {len(result['different_original_ids'])} differences")
    #print(result['different_original_ids'])
    return result['different_original_ids']

def find_one_hop_neighbors(uid):
        """Find one-hop neighbors of a user in the social graph."""
        friend_sequence = f'{DATASET_BASE}/friend_sequence.txt'
        with open(friend_sequence, 'r') as f:
            for line in f:
                parts = line.split()
                if parts and parts[0] == uid:
                    return parts[1:]
        return []
    
def find_common_items(uid1, uid2):
    """Find common items between two users."""
    user_sequence = f'{DATASET_BASE}/user_sequence.txt'
    items1, items2 = None, None
    with open(user_sequence, 'r') as f:
        for line in f:
            parts = line.split()
            if parts and parts[0] == uid1:
                items1 = parts[1:]
            if parts and parts[0] == uid2:
                items2 = parts[1:]
    if items1 and items2:
        return set(items1) & set(items2)
    return set()

def get_textual_ids(user_ids, indexing_file):
    """Get textual representations for a list of user IDs from an indexing file."""
    textual_ids = {}
    with open(indexing_file, 'r') as f:
        for line in f:
            parts = line.split()
            if parts and parts[0] in user_ids:
                textual_ids[parts[0]] = " ".join(parts[1:])
    return textual_ids[user_ids[0]]


def find_two_hop_neighbors(uid):
    one_hop_neighbors = find_one_hop_neighbors(uid)
    two_hop_neighbors = []
    for neighbor in one_hop_neighbors:
        two_hop_neighbors.extend(find_one_hop_neighbors(neighbor))
    return list(set(two_hop_neighbors))

def find_three_hop_neighbors(uid):
    one_hop_neighbors = find_two_hop_neighbors(uid)
    three_hop_neighbors = []
    for neighbor in one_hop_neighbors:
        three_hop_neighbors.extend(find_one_hop_neighbors(neighbor))
    return list(set(three_hop_neighbors))

def find_four_hop_neighbors(uid):
    three_hop_neighbors = find_three_hop_neighbors(uid)
    four_hop_neighbors = []
    for neighbor in three_hop_neighbors:
        four_hop_neighbors.extend(find_one_hop_neighbors(neighbor))
    return list(set(four_hop_neighbors))


In [7]:
DATASET = 'yelp_30_2_5'  
DATASET_BASE = f'../rec_datasets/{DATASET}'
file1 = os.path.join(DATASET_BASE, "yelp_original/user_generative_index_phase_2.txt")
file2 = os.path.join(DATASET_BASE, "yelp_socialtoid/user_generative_index_phase_2.txt")

In [18]:
model_path = '/home/derrick/idgenrec_singlegpu/model/yelp_30_2_5_train_20251015_103424/model_rec_round2_final.pt'
def get_user_embedding(user_text, model_path, device='cpu'):
    if model_path.endswith('.pt'):
        base_model = "t5-small"
        config = T5Config.from_pretrained(base_model)
        tokenizer = AutoTokenizer.from_pretrained(base_model)
        model = T5ForConditionalGeneration.from_pretrained(base_model, config=config)
        # Load checkpoint to get vocab size
        checkpoint = torch.load(model_path, map_location='cpu')
        vocab_size = checkpoint['shared.weight'].shape[0]
        
        # Resize model to match checkpoint vocab size
        model.resize_token_embeddings(vocab_size)
        
        # Load the state dict
        model.load_state_dict(checkpoint)
    else:
        
        config = T5Config.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = T5ForConditionalGeneration.from_pretrained(model_path, config=config)
    
    model.to(device)
    model.eval()
    
    with torch.no_grad():
        inputs = tokenizer(user_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        encoder_outputs = model.encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], return_dict=True)
        last_hidden_states = encoder_outputs.last_hidden_state
        attention_mask = inputs['attention_mask'].unsqueeze(-1).expand_as(last_hidden_states)
        masked_hidden_states = last_hidden_states * attention_mask
        sum_hidden_states = torch.sum(masked_hidden_states, dim=1)
        sum_attention_mask = torch.clamp(torch.sum(attention_mask, dim=1), min=1e-9)
        return (sum_hidden_states / sum_attention_mask).cpu().numpy()


In [20]:
import torch
print(torch.cuda.is_available())

False


In [19]:
original =0 
socialtoid = 0
no_common=0
for u in tqdm.tqdm(compare_files(file1, file2)):
    fr_embeddings = []
    for fr in find_one_hop_neighbors(u):
        if fr != u:
            fr_embeddings.append(get_user_embedding([get_textual_ids([fr], file1)], model_path))
    avg_fr_embedding = np.mean(fr_embeddings, axis=0) if fr_embeddings else None
    print(avg_fr_embedding.shape)
#     user_embedding_original = [get_textual_ids([u], file1)])
#     user_embedding_socialtoid = sentence_transformer.encode([get_textual_ids([u], file2)])
    
#     if avg_fr_embedding is not None:
#         original_similarity = cosine_similarity(user_embedding_original, avg_fr_embedding)[0]
#         socialtoid_similarity = cosine_similarity(user_embedding_socialtoid, avg_fr_embedding)[0]
#         if original_similarity > socialtoid_similarity:
#             original += 1
#         else:
#             socialtoid += 1
# print(f"original: {original}, socialtoid: {socialtoid}")

Difference percentage: 8.18% with 49 differences


  2%|▏         | 1/49 [00:16<13:21, 16.69s/it]

(1, 512)


  4%|▍         | 2/49 [00:45<18:38, 23.81s/it]

(1, 512)


  4%|▍         | 2/49 [01:37<38:08, 48.69s/it]


KeyboardInterrupt: 

In [26]:

def load_social_graph(friend_sequence_file):
    """Load social graph from friend sequence file."""
    G = nx.Graph()
    degrees = []
    
    with open(friend_sequence_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) > 1:
                user = parts[0]
                friends = parts[1:]
                degrees.append(len(friends))
                
                # Add edges (undirected)
                for friend in friends:
                    G.add_edge(user, friend)
    
    return G, degrees

def calculate_degree_statistics(degrees):
    """Calculate degree statistics."""
    degrees = np.array(degrees)
    
    mean_degree = np.mean(degrees)
    median_degree = np.median(degrees)
    percentile_90 = np.percentile(degrees, 90)
    
    return {
        'mean': mean_degree,
        'median': median_degree,
        '90th_percentile': percentile_90,
        'min': np.min(degrees),
        'max': np.max(degrees),
        'std': np.std(degrees)
    }

def calculate_density(G):
    """Calculate graph density."""
    n = G.number_of_nodes()
    m = G.number_of_edges()
    
    if n <= 1:
        return 0.0
    
    # For undirected graph: density = 2*|E| / (|V| * (|V| - 1))
    max_possible_edges = n * (n - 1) / 2
    density = (2 * m) / (n * (n - 1)) if max_possible_edges > 0 else 0.0
    
    return density

def calculate_clustering_coefficient(G):
    """Calculate average clustering coefficient."""
    if G.number_of_nodes() == 0:
        return 0.0
    
    clustering_coeffs = nx.clustering(G)
    avg_clustering = np.mean(list(clustering_coeffs.values()))
    
    return avg_clustering

def calculate_expansion_rate(G):
    """Calculate 2-hop expansion rate for each user."""
    expansion_rates = []
    
    for node in G.nodes():
        # Get 1-hop neighbors
        one_hop = set(G.neighbors(node))
        
        if len(one_hop) == 0:
            continue
            
        # Get 2-hop neighbors (excluding 1-hop neighbors and the node itself)
        two_hop = set()
        for neighbor in one_hop:
            two_hop.update(G.neighbors(neighbor))
        
        # Remove 1-hop neighbors and the node itself
        two_hop = two_hop - one_hop - {node}
        
        # Calculate expansion rate
        expansion_rate = len(two_hop) / len(one_hop) if len(one_hop) > 0 else 0
        expansion_rates.append(expansion_rate)
    
    return expansion_rates

def analyze_social_graph(friend_sequence_file):
    """Comprehensive social graph analysis."""
    print(f"Analyzing social graph from: {friend_sequence_file}")
    print("=" * 60)
    
    # Load graph
    G, degrees = load_social_graph(friend_sequence_file)
    
    print(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    print()
    
    # 1. Degree statistics
    print("1. DEGREE STATISTICS")
    print("-" * 30)
    degree_stats = calculate_degree_statistics(degrees)
    print(f"Mean degree: {degree_stats['mean']:.2f}")
    print(f"Median degree: {degree_stats['median']:.2f}")
    print(f"90th percentile degree: {degree_stats['90th_percentile']:.2f}")
    print(f"Min degree: {degree_stats['min']}")
    print(f"Max degree: {degree_stats['max']}")
    print(f"Std deviation: {degree_stats['std']:.2f}")
    print()
    
    # 2. Graph density
    print("2. GRAPH DENSITY")
    print("-" * 30)
    density = calculate_density(G)
    print(f"Density: {density:.6f}")
    print(f"Density percentage: {density * 100:.4f}%")
    print()
    
    # 3. Clustering coefficient
    print("3. CLUSTERING COEFFICIENT")
    print("-" * 30)
    avg_clustering = calculate_clustering_coefficient(G)
    print(f"Average clustering coefficient: {avg_clustering:.4f}")
    print()
    
    # 4. 2-hop expansion rate
    print("4. 2-HOP EXPANSION RATE")
    print("-" * 30)
    expansion_rates = calculate_expansion_rate(G)
    if expansion_rates:
        mean_expansion = np.mean(expansion_rates)
        median_expansion = np.median(expansion_rates)
        percentile_90_expansion = np.percentile(expansion_rates, 90)
        
        print(f"Mean expansion rate: {mean_expansion:.2f}")
        print(f"Median expansion rate: {median_expansion:.2f}")
        print(f"90th percentile expansion rate: {percentile_90_expansion:.2f}")
        print(f"Max expansion rate: {np.max(expansion_rates):.2f}")
    else:
        print("No expansion data available")
    print()
    
    # Additional useful metrics
    print("5. ADDITIONAL METRICS")
    print("-" * 30)
    
    # Connected components
    num_components = nx.number_connected_components(G)
    largest_component_size = len(max(nx.connected_components(G), key=len)) if num_components > 0 else 0
    
    print(f"Number of connected components: {num_components}")
    print(f"Largest component size: {largest_component_size}")
    print(f"Largest component percentage: {largest_component_size / G.number_of_nodes() * 100:.2f}%")
    
    # Degree distribution summary
    degree_counts = Counter(degrees)
    print(f"Users with 0 friends: {degree_counts[0] if 0 in degree_counts else 0}")
    print(f"Users with 1 friend: {degree_counts[1] if 1 in degree_counts else 0}")
    print(f"Users with 2+ friends: {sum(count for deg, count in degree_counts.items() if deg >= 2)}")
    
    return {
        'graph': G,
        'degrees': degrees,
        'degree_stats': degree_stats,
        'density': density,
        'clustering': avg_clustering,
        'expansion_rates': expansion_rates,
        'num_components': num_components,
        'largest_component_size': largest_component_size
    }

def visualize_social_graph(friend_sequence_file):
    G, degrees = load_social_graph(friend_sequence_file)
    nx.draw(G, node_size=10, node_color='lightblue', edge_color='gray', width=0.5)
    plt.title(f'Social Graph for {DATASET}', fontsize=24)
    nx.layout.planar_layout(G)
    plt.show()

# Run analysis for the dataset
DATASET = 'lastfm_full_10_10'  
DATASET_BASE = f'../rec_datasets/{DATASET}'
friend_sequence_file = os.path.join(DATASET_BASE, "friend_sequence.txt")
results = analyze_social_graph(friend_sequence_file)
#visualize_social_graph(friend_sequence_file)


Analyzing social graph from: ../rec_datasets/lastfm_full_10_10/friend_sequence.txt
Graph loaded: 324 nodes, 1526 edges

1. DEGREE STATISTICS
------------------------------
Mean degree: 9.42
Median degree: 5.00
90th percentile degree: 22.00
Min degree: 1
Max degree: 44
Std deviation: 8.60

2. GRAPH DENSITY
------------------------------
Density: 0.029163
Density percentage: 2.9163%

3. CLUSTERING COEFFICIENT
------------------------------
Average clustering coefficient: 0.2427

4. 2-HOP EXPANSION RATE
------------------------------
Mean expansion rate: 12.98
Median expansion rate: 12.50
90th percentile expansion rate: 20.00
Max expansion rate: 43.00

5. ADDITIONAL METRICS
------------------------------
Number of connected components: 1
Largest component size: 324
Largest component percentage: 100.00%
Users with 0 friends: 0
Users with 1 friend: 20
Users with 2+ friends: 304


In [27]:
DATASET = 'yelp_30_2_5'  
DATASET_BASE = f'../rec_datasets/{DATASET}'
friend_sequence_file = os.path.join(DATASET_BASE, "friend_sequence.txt")
results = analyze_social_graph(friend_sequence_file)

Analyzing social graph from: ../rec_datasets/yelp_30_2_5/friend_sequence.txt
Graph loaded: 598 nodes, 5743 edges

1. DEGREE STATISTICS
------------------------------
Mean degree: 19.21
Median degree: 7.00
90th percentile degree: 52.00
Min degree: 1
Max degree: 244
Std deviation: 26.95

2. GRAPH DENSITY
------------------------------
Density: 0.032173
Density percentage: 3.2173%

3. CLUSTERING COEFFICIENT
------------------------------
Average clustering coefficient: 0.4208

4. 2-HOP EXPANSION RATE
------------------------------
Mean expansion rate: 18.77
Median expansion rate: 10.00
90th percentile expansion rate: 39.00
Max expansion rate: 243.00

5. ADDITIONAL METRICS
------------------------------
Number of connected components: 7
Largest component size: 538
Largest component percentage: 89.97%
Users with 0 friends: 0
Users with 1 friend: 102
Users with 2+ friends: 496
