In [1]:
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
train_dir = Path('/home/aatman/Aatman/Study/Challenges in Computatinal Linguistics/code/data/dev_phase/subtask1/train')
dev_dir = Path('/home/aatman/Aatman/Study/Challenges in Computatinal Linguistics/code/data/dev_phase/subtask1/dev')

In [3]:
# Load all data
train_dfs = {f.stem: pd.read_csv(f) for f in train_dir.glob('*.csv')}
dev_dfs = {f.stem: pd.read_csv(f) for f in dev_dir.glob('*.csv')}

In [4]:
# Combine for stats
train_all = pd.concat(train_dfs.values(), ignore_index=True)
dev_all = pd.concat(dev_dfs.values(), ignore_index=True)

In [None]:
print(f"Total train samples: {len(train_all):,}")
print(f"Total dev samples: {len(dev_all):,}")
print(f"Number of languages: {len(train_dfs)}")
print(f"\nClass distribution in train:")
print(train_all['polarization'].value_counts().to_dict())
print(f"Class balance: {train_all['polarization'].value_counts(normalize=True).to_dict()}")

Total train samples: 73,681
Total dev samples: 3,687
Number of languages: 22

Class distribution in train:
{1: 39145, 0: 34536}
Class balance: {1: 0.5312767199142249, 0: 0.4687232800857752}


In [6]:
# Per-language stats
lang_stats = []
for lang in sorted(train_dfs.keys()):
    train_size = len(train_dfs[lang])
    dev_size = len(dev_dfs[lang])

    # Train class distribution
    class_dist = train_dfs[lang]['polarization'].value_counts().to_dict()
    class_0 = class_dist.get(0, 0)
    class_1 = class_dist.get(1, 0)
    total = class_0 + class_1

    # Dev class distribution
    class_dist_dev = dev_dfs[lang]['polarization'].value_counts().to_dict()
    class_0_dev = class_dist_dev.get(0, 0)
    class_1_dev = class_dist_dev.get(1, 0)
    total_dev = class_0_dev + class_1_dev

    lang_stats.append({
        'language': lang,

        # Train stats
        'train': train_size,
        'train_class_0': class_0,
        'train_class_1': class_1,
        'train_pct_class_0': class_0 / total if total > 0 else 0,
        'train_pct_class_1': class_1 / total if total > 0 else 0,

        # Dev stats
        'dev': dev_size,
        'dev_class_0': class_0_dev,
        'dev_class_1': class_1_dev,
        'dev_pct_class_0': class_0_dev / total_dev if total_dev > 0 else 0,
        'dev_pct_class_1': class_1_dev / total_dev if total_dev > 0 else 0,
    })

stats_df = pd.DataFrame(lang_stats)
stats_df

Unnamed: 0,language,train,train_class_0,train_class_1,train_pct_class_0,train_pct_class_1,dev,dev_class_0,dev_class_1,dev_pct_class_0,dev_pct_class_1
0,amh,3332,814,2518,0.244298,0.755702,166,0,0,0,0
1,arb,3380,1868,1512,0.552663,0.447337,169,0,0,0,0
2,ben,3333,1909,1424,0.572757,0.427243,166,0,0,0,0
3,deu,3180,1668,1512,0.524528,0.475472,159,0,0,0,0
4,eng,3222,2047,1175,0.63532,0.36468,160,0,0,0,0
5,fas,3295,855,2440,0.259484,0.740516,164,0,0,0,0
6,hau,3651,3259,392,0.892632,0.107368,182,0,0,0,0
7,hin,2744,398,2346,0.145044,0.854956,137,0,0,0,0
8,ita,3334,1966,1368,0.589682,0.410318,166,0,0,0,0
9,khm,6640,611,6029,0.092018,0.907982,332,0,0,0,0


In [7]:
train_all.head()

Unnamed: 0,id,text,polarization
0,hin_b6e7e6873bbf9764781b10c5b5c0fb57,#‡§ú‡•Å‡§Æ‡•ç‡§Æ‡§æ ‡§ï‡•á ‡§¢‡•Ä‡§≤ ‡§¶ #‡§∂‡§®‡§ø‡§ö‡§∞ ‡§ï‡•á ‡§õ‡§ø‡§≤ ‡§¶.....üòÖüòÇ #‡§¨‡§ú‡§∞‡§Ç‡§ó...,1
1,hin_a32097a7013fd2ffd8ca98c02c85ab99,‡§≠‡§æ‡§à ‡§§‡•Ç #‡§¨‡§ú‡§∞‡§Ç‡§ó_‡§¶‡§≤ ‡§ï‡§æ ‚Äú‡§ó‡•Å‡§Ç‡§°‡§æüëπ‚Äù ‡§Æ‡§æ‡§§‡•ç‡§∞ ‡§π‡•à .. ‡§î‡§∞ ‡§Ø‡•á...,1
2,hin_9fcf6d4be5c7258e9fe11ff15620dcf6,#‡§Æ‡•ã‡§¶‡•Ä_‡§π‡§ü‡§æ‡§ì_‡§¶‡•á‡§∂_‡§¨‡§ö‡§æ‡§ì,1
3,hin_e4528857d186c19e65c99f6ba098688d,‡§Ö‡§™‡§®‡•á ‡§π‡•Ä ‡§≤‡•ã‡§ó ‡§ó‡§¶‡•ç‡§¶‡§æ‡§∞ ‡§π‡•à‡•§ #‡§ú‡§Ø_‡§≠‡•Ä‡§Æ üíôüåπ,1
4,hin_2c08742760a726dd48ea861062bfa020,‡§°‡•Ä‡§≤‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§≤‡§ø‡§°‡§∞ ‡§™‡•à‡§¶‡§æ ‡§ï‡§∞‡•ã ‡§∏‡§Æ‡§æ‡§ú ‡§Æ‡•á‡§Ç‡•§ #‡§ú‡§Ø_‡§≠‡•Ä‡§Æ,0
