<a href="https://colab.research.google.com/github/ektaj247/Fall_2025_NLP_Project_Team_44/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd

# Load the social_2k CSV published under the ESGBERT namespace, read directly
# through the hf:// URL scheme (so this cell needs network access).
df = pd.read_csv("hf://datasets/ESGBERT/social_2k/social_2k.csv")
# Print the frame as a quick sanity check of its size and columns.
# NOTE(review): the printed 'Unnamed: 0' column does not track the row index
# (e.g. row 1995 holds 233); presumably a leftover index from an earlier
# export -- confirm before relying on it.
print(df)

      Unnamed: 0                                               text  soc
0              0  The balance included in-kind donations prepare...    1
1              1  Internal controls and risk management The Boar...    0
2              2  This analysis does not take into account the e...    0
3              3  Change in Chief Financial Officer and plc Boar...    0
4              4  Consideration and prioritisation of climate ri...    0
...          ...                                                ...  ...
1995         233  Performed a retrospective test over the NAV va...    0
1996          31  Risk oversees the application of the liquidity...    0
1997         210  The Board carried out a skills audit during th...    0
1998         113  Outlook Your Company’s portfolio (including ir...    0
1999          25  Lease financing is recorded at the value of am...    0

[2000 rows x 3 columns]


In [9]:
# List the column names; 'text' and 'soc' are the columns used by the later cells.
print(df.columns)

Index(['Unnamed: 0', 'text', 'soc'], dtype='object')


In [7]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import numpy as np

In [10]:
# --- 1. Dataset Splitting ---
# The social_2k dataset is small and already pre-processed.
# We perform a stratified 80/10/10 split into train, dev, and test sets so
# that every split keeps the 'soc' label ratio of the full dataset.

# Step 1: hold out 10% of all rows as the test set.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.1,  # 10% of the full dataset
    random_state=42,
    stratify=df['soc']
)

# Step 2: take a dev set from the remaining 90% with exactly as many rows as
# the test set. An absolute row count (int) is passed instead of the rounded
# fraction 0.111 (~= 10% / 90%), so dev and test stay the same size for any
# dataset size rather than depending on how the approximation rounds.
train_df, dev_df = train_test_split(
    train_val_df,
    test_size=len(test_df),
    random_state=42,
    stratify=train_val_df['soc']
)

In [11]:
# --- 2. Tokenization and Length Calculation ---
# Token counts are measured with the tokenizer of the 'distilbert-base-uncased'
# checkpoint so that the reported lengths match what a DistilBERT model would see.
# The tokenizer files are fetched on first use (see the download progress output).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_and_get_length(text_series, tok=None):
    """Return the mean and standard deviation of per-text token counts.

    Args:
        text_series: pandas Series of strings to tokenize.
        tok: Optional object exposing ``tokenize(text) -> sequence``. When
            omitted, the module-level ``tokenizer`` is used, which keeps
            existing single-argument calls working unchanged.

    Returns:
        Tuple ``(mean, std)`` of the token counts. ``std`` is the pandas
        default (sample standard deviation), so it is NaN for a single text;
        both values are NaN for an empty series.

    NOTE(review): the count is ``len(tok.tokenize(text))``; for Hugging Face
    tokenizers this presumably excludes special tokens such as [CLS]/[SEP]
    -- confirm if model input lengths are what is needed.
    """
    if tok is None:
        # Fall back to the notebook-level tokenizer created in the cell above.
        tok = tokenizer

    token_lengths = text_series.apply(
        lambda text: len(tok.tokenize(text))
    ).astype('float64')  # numeric dtype even when the series is empty
    return token_lengths.mean(), token_lengths.std()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [16]:
# --- 3. Gather Statistics ---
# Maps each split name to a dict of summary statistics for that split.
data_stats = {}

for split_name, split_df in [('Train', train_df), ('Dev', dev_df), ('Test', test_df)]:
    num_instances = len(split_df)

    # Share of each 'soc' label as a percentage of the split; .get(..., 0)
    # covers a split in which one of the two labels never occurs.
    label_counts = split_df['soc'].value_counts(normalize=True) * 100
    soc_percentage = label_counts.get(1, 0)
    not_soc_percentage = label_counts.get(0, 0)

    # Mean and standard deviation of the per-text token counts.
    avg_tokens, std_tokens = tokenize_and_get_length(split_df['text'])

    # Keys name the dataset's actual classes (soc = 1, not-soc = 0) rather
    # than the unrelated "Substantive"/"Vague" labels used previously.
    data_stats[split_name] = {
        '# instances': num_instances,
        'soc (%)': soc_percentage,
        'not-soc (%)': not_soc_percentage,
        'Avg. tokens (± sd)': f'{avg_tokens:.2f} ± {std_tokens:.2f}'
    }

In [17]:
# --- 4. Format for the LaTeX Table ---
# Build a table with one column per split and one row per statistic.
stats_df = pd.DataFrame(data_stats)

# "Substantive" / "Vague" row labels do not describe this dataset, whose
# classes are soc / not-soc. Relabel by name rather than by position so that
# a change in row order can never attach a label to the wrong row; labels
# that are not present are simply left untouched.
stats_df = stats_df.rename(index={
    'Substantive (%)': 'soc (%)',
    'Vague (%)': 'not-soc (%)',
})
print(stats_df)

                            Train            Dev           Test
# instances                  1600            200            200
soc (%)                     40.25           40.0           40.0
not-soc (%)                 59.75           60.0           60.0
Avg. tokens (± sd)  32.90 ± 17.08  32.34 ± 18.64  31.16 ± 14.56
