In [1]:
import pickle

In [2]:
# Load entities list A (0 - 249)    
#with open('../data/extracted_entities_250.pkl', 'rb') as f:
#    entities_250 = pickle.load(f)
#
# Load entities list B (251 - 608)
#with open('../data/extracted_entities_608.pkl', 'rb') as f:
#    entities_608 = pickle.load(f)
#
# Merge both dictionaries
#entities = {**entities_250, **entities_608}
#
## Save the merged dictionary
#with open('../data/multihop_dataset_entities.pkl', 'wb') as f:
#    pickle.dump(entities, f)

In [3]:
# Read the pickled raw-entities object from disk into `entities`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at files from a trusted source.
with open('../data/multihop_dataset_raw_entities.pkl', 'rb') as raw_file:
    entities = pickle.load(raw_file)

In [4]:
#---------------
# 1. Unify the entities: collapse duplicates within and across articles,
#    remembering which articles mention each entity
#---------------
# Total number of entity mentions over all articles (duplicates included)
total_entities = sum(
    len(article_data["entities"]) for article_data in entities.values()
)

# Maps (word, entity_group) -> set of ids of the articles mentioning it
entity_appearances = {}
for article_id, article_data in entities.items():
    for mention in article_data["entities"]:
        mention_key = (mention["word"], mention["entity_group"])
        # setdefault creates the set on first sight of the key; a set keeps
        # repeated mentions inside one article from being counted twice
        entity_appearances.setdefault(mention_key, set()).add(article_id)

# Flatten the mapping into (entity, type, set_of_articles) tuples,
# keeping first-seen order
unique_entities_with_articles = [
    (*mention_key, seen_in) for mention_key, seen_in in entity_appearances.items()
]

In [5]:
#---------------
# 2. Entity filtering: one on/off switch per stage
#---------------
FILTER_HASHTAGS = True        # Stage 2.1: drop entities containing '#'
FILTER_MISC_TYPE = True       # Stage 2.2: drop entities typed 'MISC'
FILTER_SINGLE_ARTICLE = True  # Stage 2.3: drop entities seen in only one article

# Each stage consumes the previous stage's list. A disabled stage simply
# passes its input list through unchanged (same list object).

#---------------
# 2.1 Hashtag filter
#---------------
no_hashtag_entities = (
    [
        (name, kind, ids)
        for name, kind, ids in unique_entities_with_articles
        if '#' not in name
    ]
    if FILTER_HASHTAGS
    else unique_entities_with_articles
)

#---------------
# 2.2 'MISC' type filter
#---------------
no_misc_entities = (
    [
        (name, kind, ids)
        for name, kind, ids in no_hashtag_entities
        if kind != 'MISC'
    ]
    if FILTER_MISC_TYPE
    else no_hashtag_entities
)

#---------------
# 2.3 Single-article filter
#---------------
filtered_entities = (
    [
        (name, kind, ids)
        for name, kind, ids in no_misc_entities
        if len(ids) >= 2
    ]
    if FILTER_SINGLE_ARTICLE
    else no_misc_entities
)

# Report how many entities survive each enabled stage
report_lines = [
    "\n=== FILTERING RESULTS ===",
    f"1. Original entities: {len(unique_entities_with_articles)}",
]
if FILTER_HASHTAGS:
    report_lines.append(f"2. After removing hashtags: {len(no_hashtag_entities)}")
if FILTER_MISC_TYPE:
    report_lines.append(f"3. After removing 'MISC' type: {len(no_misc_entities)}")
if FILTER_SINGLE_ARTICLE:
    report_lines.append(f"4. After removing single-article entities: {len(filtered_entities)}")
report_lines.append(f"\nFinal number of entities: {len(filtered_entities)}")
print("\n".join(report_lines))


=== FILTERING RESULTS ===
1. Original entities: 7699
2. After removing hashtags: 7452
3. After removing 'MISC' type: 3254
4. After removing single-article entities: 743


In [8]:
# Write the filtered entities (a list of (entity, type, article_ids) tuples)
# to disk as a pickle
with open('../data/multihop_dataset_filtered_entities.pkl', 'wb') as out_file:
    pickle.dump(filtered_entities, out_file)

In [9]:
# Summary of the final (filtered) set of entities.
# Reads total_entities, unique_entities_with_articles and filtered_entities
# produced by the earlier cells and prints descriptive statistics.

# Number of filtered entities per entity type
entity_type_counts = {}
for _, entity_type, _ in filtered_entities:
    entity_type_counts[entity_type] = entity_type_counts.get(entity_type, 0) + 1

print("\n=== ENTITY ANALYSIS SUMMARY ===")
print(f"1. Total entities found (including duplicates): {total_entities}")
print(f"2. Total unique entities (without duplicates): {len(unique_entities_with_articles)}")
print(f"3. Total entities after filtering: {len(filtered_entities)}")

# Examples come from the unfiltered list, so they may include entities that
# the filters later remove; only the first three article ids are shown.
print("\n4. Examples of unique entities and their appearances (first 5):")
for entity, entity_type, article_ids in unique_entities_with_articles[:5]:
    print(f"   - {entity} ({entity_type}) appears in {len(article_ids)} articles: {sorted(article_ids)[:3]}...")

print("\n5. Distribution by entity type after filtering:")
for entity_type, count in sorted(entity_type_counts.items()):
    print(f"   - {entity_type}: {count} entities")

# Calculate overall average of appearances (articles per filtered entity)
total_appearances = sum(len(article_ids) for _, _, article_ids in filtered_entities)
# The filters can legitimately remove every entity; report 0.0 in that case
# instead of raising ZeroDivisionError.
average_appearances = (
    total_appearances / len(filtered_entities) if filtered_entities else 0.0
)

# Calculate averages by type: accumulate, per entity type, the summed article
# counts and the number of entities of that type
type_stats = {}
for _, entity_type, article_ids in filtered_entities:
    if entity_type not in type_stats:
        type_stats[entity_type] = {
            'total_appearances': 0,
            'entity_count': 0
        }
    type_stats[entity_type]['total_appearances'] += len(article_ids)
    type_stats[entity_type]['entity_count'] += 1

# Print results
print(f"\n6. Overall average appearances per entity: {average_appearances:.2f} articles")
print("\n7. Average appearances by entity type:")
for entity_type, stats in sorted(type_stats.items()):
    # entity_count is at least 1 for every type present, so this division is safe
    avg = stats['total_appearances'] / stats['entity_count']
    count = stats['entity_count']
    print(f"   - {entity_type} ({count} entities): {avg:.2f} articles per entity")


=== ENTITY ANALYSIS SUMMARY ===
1. Total entities found (including duplicates): 13852
2. Total unique entities (without duplicates): 7699
3. Total entities after filtering: 743

4. Examples of unique entities and their appearances (first 5):
   - Amazon (ORG) appears in 35 articles: [0, 1, 2]...
   - Cyber Monday (MISC) appears in 4 articles: [0, 91, 222]...
   - Black Friday (MISC) appears in 9 articles: [0, 221, 222]...
   - Echo (MISC) appears in 4 articles: [0, 188, 222]...
   - Fire TV (MISC) appears in 1 articles: [0]...

5. Distribution by entity type after filtering:
   - LOC: 90 entities
   - ORG: 314 entities
   - PER: 339 entities

6. Overall average appearances per entity: 3.64 articles

7. Average appearances by entity type:
   - LOC (90 entities): 4.81 articles per entity
   - ORG (314 entities): 4.08 articles per entity
   - PER (339 entities): 2.91 articles per entity


In [11]:
#filtered_entities