In [1]:
import ast
from collections import defaultdict

def count_triplets(file_path):
    """Count the tweets and sentiment triplets stored in an annotation file.

    Every non-blank line counts as one tweet.  A line is expected to hold
    the tweet text, the separator ``#### #### ####`` and then a Python
    literal describing the triplets: either a list of triplets such as
    ``[(aspect, opinion, 'POS'), ...]`` or one bare triplet
    ``(aspect, opinion, 'POS')``.  The polarity is the third element of a
    triplet and must be one of ``'POS'``, ``'NEU'`` or ``'NEG'``.

    Counting is best-effort: a line without the separator, with an
    unparsable literal, or with malformed triplets still counts as a tweet,
    and only its well-formed triplets are added to the totals.

    Args:
        file_path: Path of the UTF-8 encoded annotation file.

    Returns:
        A tuple ``(tweet_count, total_triplets, triplet_counts)`` where
        ``triplet_counts`` maps ``'POS'``, ``'NEU'`` and ``'NEG'`` to the
        number of triplets carrying that polarity.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    tweet_count = 0
    triplet_counts = {'POS': 0, 'NEU': 0, 'NEG': 0}

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            tweet_count += 1
            parts = line.split('#### #### ####')
            if len(parts) < 2:
                continue

            try:
                triplets = ast.literal_eval(parts[1].strip())
            except (ValueError, SyntaxError, TypeError,
                    MemoryError, RecursionError):
                # Unparsable annotation: keep the tweet, skip its triplets.
                continue

            if not isinstance(triplets, (list, tuple)) or not triplets:
                continue

            # A bare triplet has the polarity string in third position; a
            # list of triplets has another triplet there (or is shorter).
            # Checking the first element instead would misread a triplet
            # whose aspect is itself a list, e.g. ([0], [1], 'POS').
            if len(triplets) >= 3 and isinstance(triplets[2], str):
                triplets = [triplets]

            for triplet in triplets:
                if not isinstance(triplet, (list, tuple)) or len(triplet) < 3:
                    continue
                polarity = triplet[2]
                # Skip unknown labels individually so the result does not
                # depend on the order of the triplets within the line.
                if isinstance(polarity, str) and polarity in triplet_counts:
                    triplet_counts[polarity] += 1

    total_triplets = sum(triplet_counts.values())
    return tweet_count, total_triplets, triplet_counts

# Locations of the three dataset splits
train_file, dev_file, test_file = 'train.txt', 'dev.txt', 'test.txt'

# Gather (tweets, triplets, per-polarity counts) for every split
train_tweets, train_total, train_polarities = count_triplets(train_file)
dev_tweets, dev_total, dev_polarities = count_triplets(dev_file)
test_tweets, test_total, test_polarities = count_triplets(test_file)

split_stats = (
    (train_tweets, train_total, train_polarities),
    (dev_tweets, dev_total, dev_polarities),
    (test_tweets, test_total, test_polarities),
)

# Aggregate the per-split numbers into corpus-wide totals
total_tweets = sum(tweets for tweets, _, _ in split_stats)
total_triplets = sum(triplets for _, triplets, _ in split_stats)
total_pos, total_neu, total_neg = (
    sum(polarities[label] for _, _, polarities in split_stats)
    for label in ('POS', 'NEU', 'NEG')
)

# One table row per split, followed by the totals row
table_data = [
    [name, tweets, triplets, polarities['POS'], polarities['NEU'], polarities['NEG']]
    for name, (tweets, triplets, polarities) in zip(("Train", "Dev", "Test"), split_stats)
]
table_data.append(["Total", total_tweets, total_triplets, total_pos, total_neu, total_neg])

# Emit the table as markdown
print("|    | Tweet | Triplet | Positif | Netral | Negatif |")
print("|---|---|---|---|---|---|")
for row in table_data:
    print("| " + " | ".join(str(cell) for cell in row) + " |")

# For LaTeX table format (uncomment if needed)
# print("\nLaTeX format:")
# print("\\begin{tabular}{|l|r|r|r|r|r|}")
# print("\\hline")
# print(" & Tweet & Triplet & Positif & Netral & Negatif \\\\")
# print("\\hline")
# for row in table_data:
#     print(f"{row[0]} & {row[1]} & {row[2]} & {row[3]} & {row[4]} & {row[5]} \\\\")
# print("\\hline")
# print("\\end{tabular}")

|    | Tweet | Triplet | Positif | Netral | Negatif |
|---|---|---|---|---|---|
| Train | 1764 | 2549 | 742 | 724 | 1083 |
| Dev | 379 | 568 | 161 | 157 | 250 |
| Test | 379 | 559 | 155 | 154 | 250 |
| Total | 2522 | 3676 | 1058 | 1035 | 1583 |
