In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
from sklearn.metrics import log_loss
from tqdm.auto import tqdm

# Set plot style
plt.style.use('ggplot')
# NOTE(review): sns.set() also writes matplotlib rcParams, so it probably overrides
# much of the 'ggplot' style applied on the previous line — confirm which look is intended.
sns.set(style="whitegrid")

In [None]:
# Load train data
# The active path points at the Kaggle input directory; the commented-out line
# below is the equivalent location for a local checkout.
train_df = pd.read_csv('../input/feedback-prize-effectiveness/train.csv') # Adjusted path for Kaggle
# train_df = pd.read_csv('./data/feedback-prize-effectiveness/train.csv') # Local path

# Display basic info
print(f"Train data shape: {train_df.shape}")
# Last expression of the cell, so Jupyter renders the first rows as a table.
train_df.head()

In [None]:
# Check column types
train_df.info()

# Check for missing values
print("\nMissing values per column:")
print(train_df.isnull().sum())

# Basic statistics for text length
# Adds a 'text_length' column (character count of 'discourse_text') to train_df
# in place; later cells read this column, so this cell must run before them.
train_df['text_length'] = train_df['discourse_text'].str.len()
print("\nText length statistics:")
print(train_df['text_length'].describe())

In [None]:
# Locate the row with the smallest character count
shortest_idx = train_df['text_length'].idxmin()
shortest = train_df.loc[shortest_idx]

print("\nShortest discourse element:")
print(f"Text: '{shortest['discourse_text']}'")
print(f"Length: {shortest['text_length']} characters")
print(f"Discourse type: {shortest['discourse_type']}")
print(f"Effectiveness: {shortest['discourse_effectiveness']}")
print(f"Essay ID: {shortest['essay_id']}")

# List every element shorter than 10 characters, shortest first
is_very_short = train_df['text_length'] < 10
very_short = train_df[is_very_short].sort_values('text_length')
preview_columns = ['discourse_text', 'text_length', 'discourse_type', 'discourse_effectiveness']
print("\nVery short discourse elements (less than 10 characters):")
print(very_short[preview_columns])

In [None]:
# Outlier thresholds: 1st and 99th percentile of discourse length
lower_bound = train_df['text_length'].quantile(0.01)
upper_bound = train_df['text_length'].quantile(0.99)

# Keep only rows whose length lies inside [lower_bound, upper_bound] (both ends inclusive)
within_bounds = train_df['text_length'].between(lower_bound, upper_bound)
filtered_df = train_df[within_bounds]

n_total = len(train_df)
n_kept = len(filtered_df)
kept_pct = n_kept / n_total * 100
print(f"Original dataset size: {n_total}")
print(f"After removing outliers: {n_kept} ({kept_pct:.1f}% of original)")
print(f"Removed {n_total - n_kept} outliers")

# Length summary on the trimmed data
print("\nText length statistics after removing outliers (1-99 percentile):")
print(filtered_df['text_length'].describe())

# Median length per discourse type, longest first
print("\nMedian text length by discourse type (after removing outliers):")
median_by_type = (
    filtered_df.groupby('discourse_type')['text_length']
    .median()
    .sort_values(ascending=False)
)
for type_name, type_median in median_by_type.items():
    print(f"{type_name}: {type_median:.0f} characters")

# Median length per effectiveness rating, longest first
print("\nMedian text length by effectiveness (after removing outliers):")
median_by_effectiveness = (
    filtered_df.groupby('discourse_effectiveness')['text_length']
    .median()
    .sort_values(ascending=False)
)
for rating, rating_median in median_by_effectiveness.items():
    print(f"{rating}: {rating_median:.0f} characters")

In [None]:
# Helper function to load essay texts
def load_essay_texts(essay_ids, essays_dir):
    """Read the full text of each essay from ``<essays_dir>/<essay_id>.txt``.

    Args:
        essay_ids: Iterable of essay identifiers; each one is used as the
            file stem of a ``.txt`` file.
        essays_dir: Directory that contains the essay text files.

    Returns:
        dict mapping every requested essay id to its file contents. An id
        whose file does not exist maps to an empty string, and a warning is
        printed for it.
    """
    essay_texts = {}
    for essay_id in tqdm(essay_ids, desc=f"Loading essays from {essays_dir}"):
        essay_path = os.path.join(essays_dir, f"{essay_id}.txt")
        try:
            # Pin the encoding: without it open() uses the platform's locale
            # encoding, so the same file can decode differently (or fail) on
            # another machine. Essay files are assumed to be UTF-8.
            with open(essay_path, 'r', encoding='utf-8') as f:
                essay_texts[essay_id] = f.read()
        except FileNotFoundError:
            print(f"Warning: Essay file not found {essay_path}")
            essay_texts[essay_id] = "" # Provide an empty string if not found
    return essay_texts

## Full Essay Text Analysis
Now, let's analyze the lengths of the full essays.

In [None]:
# Directory holding one .txt file per training essay
TRAIN_ESSAYS_DIR = '../input/feedback-prize-effectiveness/train/' # Adjusted path for Kaggle
# TRAIN_ESSAYS_DIR = './data/feedback-prize-effectiveness/train/' # Local path

# Every essay referenced by the training rows, loaded once
all_essay_ids = train_df['essay_id'].unique()
all_essay_texts_map = load_essay_texts(all_essay_ids, TRAIN_ESSAYS_DIR)

# One record per essay: id, raw text, and character count
essays_data = [
    {'essay_id': loaded_id, 'essay_text': body, 'essay_length_chars': len(body)}
    for loaded_id, body in all_essay_texts_map.items()
]
df_essays = pd.DataFrame(essays_data)

print(f"Loaded {len(df_essays)} essays.")
df_essays.head()

In [None]:
# Summary of essay length measured in characters
char_length_summary = df_essays['essay_length_chars'].describe()
print("Essay length (characters) statistics:")
print(char_length_summary)

# Word count per essay (whitespace-separated pieces), then its summary
df_essays['essay_length_words'] = df_essays['essay_text'].map(lambda essay: len(essay.split()))
word_length_summary = df_essays['essay_length_words'].describe()
print("\nEssay length (words) statistics:")
print(word_length_summary)

In [None]:
# Histogram of essay length in characters, with mean and median markers
char_lengths = df_essays['essay_length_chars']
char_mean = char_lengths.mean()
char_median = char_lengths.median()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(char_lengths, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Full Essay Lengths (Characters)')
ax.set_xlabel('Essay Length (characters)')
ax.set_ylabel('Frequency')
ax.axvline(char_mean, color='r', linestyle='--', label=f"Mean: {char_mean:.0f}")
ax.axvline(char_median, color='g', linestyle='--', label=f"Median: {char_median:.0f}")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Histogram of essay length in words, with mean and median markers
word_lengths = df_essays['essay_length_words']
word_mean = word_lengths.mean()
word_median = word_lengths.median()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(word_lengths, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Full Essay Lengths (Words)')
ax.set_xlabel('Essay Length (words)')
ax.set_ylabel('Frequency')
ax.axvline(word_mean, color='r', linestyle='--', label=f"Mean: {word_mean:.0f}")
ax.axvline(word_median, color='g', linestyle='--', label=f"Median: {word_median:.0f}")
ax.legend()
fig.tight_layout()
plt.show()

### Token Length Analysis for Essays
To better understand the context length requirements for transformer models, let's analyze essay lengths in terms of tokens. We'll count tokens with a Hugging Face tokenizer (`bert-base-uncased` by default), falling back to a simple whitespace split as a rough estimate if the tokenizer cannot be loaded. For accurate numbers for a specific model, use that model's own tokenizer from the `transformers` library.

In [None]:
# It's good practice to use the tokenizer you plan to use for your model for accurate length estimation.
# For now, let's pick a common one. If you decide on a specific model later, update this.
TOKENIZER_NAME = "bert-base-uncased" # Replace with your chosen model if different
try:
    # The import lives inside the try on purpose: a missing `transformers`
    # install should trigger the same whitespace fallback as a failed download,
    # instead of aborting the cell with an ImportError.
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
except Exception as e:
    # Deliberately broad: any failure here (package missing, no internet in a
    # Kaggle notebook, unknown model name) degrades to an approximate count.
    print(f"Could not load tokenizer {TOKENIZER_NAME}. Using basic split. Error: {e}")
    # Fallback to simple whitespace split if tokenizer loading fails (e.g. no internet in Kaggle notebook)
    tokenizer = lambda text: text.split()

def count_tokens(text, tokenizer_func):
    """Return the number of tokens `tokenizer_func` produces for `text`.

    `tokenizer_func` is either an object exposing a ``tokenize(text)`` method
    (e.g. a Hugging Face tokenizer) or a plain callable that takes the text and
    returns a sequence of tokens (the whitespace fallback).
    """
    # Prefer the object's own tokenize method when it has one; otherwise call it directly.
    split_into_tokens = tokenizer_func.tokenize if hasattr(tokenizer_func, 'tokenize') else tokenizer_func
    return len(split_into_tokens(text))

# Token count per essay, using whichever tokenizer the setup step ended up with
df_essays['essay_length_tokens'] = df_essays['essay_text'].map(
    lambda essay: count_tokens(essay, tokenizer)
)

token_length_summary = df_essays['essay_length_tokens'].describe()
print("\nEssay length (tokens - estimated) statistics:")
print(token_length_summary)

In [None]:
# Histogram of essay length in tokens, with mean/median and common model limits marked
token_lengths = df_essays['essay_length_tokens']
token_mean = token_lengths.mean()
token_median = token_lengths.median()
# A loaded Hugging Face tokenizer exposes `name_or_path`; the fallback callable does not
tokenizer_label = TOKENIZER_NAME if hasattr(tokenizer, "name_or_path") else "whitespace"

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(token_lengths, bins=50, kde=True, ax=ax)
ax.set_title(f'Distribution of Full Essay Lengths (Tokens - Estimated with {tokenizer_label})')
ax.set_xlabel('Essay Length (tokens)')
ax.set_ylabel('Frequency')
ax.axvline(token_mean, color='r', linestyle='--', label=f"Mean: {token_mean:.0f}")
ax.axvline(token_median, color='g', linestyle='--', label=f"Median: {token_median:.0f}")
common_max_lengths = [512, 1024, 2048, 4096]
for max_len in common_max_lengths:
    ax.axvline(max_len, color='purple', linestyle=':', alpha=0.7, label=f"Max Length: {max_len}")
ax.legend()
fig.tight_layout()
plt.show()

# Share of essays that fit entirely inside each context size
print("\nPercentage of essays fitting within common max token lengths:")
for max_len in common_max_lengths:
    fit_pct = token_lengths.le(max_len).mean() * 100
    print(f"<= {max_len} tokens: {fit_pct:.2f}%")

## Distribution of Text Length
Let's visualize the distribution of discourse text lengths using the filtered data.

In [None]:
# Histogram of discourse element length on the percentile-trimmed data
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(filtered_df['text_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Discourse Text Lengths (1-99 percentile)')
ax.set_xlabel('Text Length (characters)')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

## Distribution of Discourse Types

In [None]:
# Number of rows per discourse type (value_counts sorts most frequent first)
discourse_type_counts = train_df['discourse_type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=discourse_type_counts.index,
    y=discourse_type_counts.values,
    palette="viridis",
    ax=ax,
)
ax.set_title('Distribution of Discourse Types')
ax.set_xlabel('Discourse Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # tilt the category labels
fig.tight_layout()
plt.show()

## Distribution of Discourse Effectiveness

In [None]:
# Number of rows per effectiveness rating
effectiveness_counts = train_df['discourse_effectiveness'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=effectiveness_counts.index,
    y=effectiveness_counts.values,
    palette="magma",
    order=['Ineffective', 'Adequate', 'Effective'],  # fixed left-to-right bar order
    ax=ax,
)
ax.set_title('Distribution of Discourse Effectiveness')
ax.set_xlabel('Effectiveness')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

## Text Length vs. Discourse Type

In [None]:
# Length distribution per discourse type, drawn from the percentile-trimmed data
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(
    data=filtered_df,
    x='discourse_type',
    y='text_length',
    palette="coolwarm",
    ax=ax,
)
ax.set_title('Text Length by Discourse Type (1-99 percentile)')
ax.set_xlabel('Discourse Type')
ax.set_ylabel('Text Length (characters)')
plt.xticks(rotation=45)  # tilt the category labels
fig.tight_layout()
plt.show()

## Text Length vs. Discourse Effectiveness

In [None]:
# Length distribution per effectiveness rating, drawn from the percentile-trimmed data
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=filtered_df,
    x='discourse_effectiveness',
    y='text_length',
    palette="PuBu",
    order=['Ineffective', 'Adequate', 'Effective'],  # fixed left-to-right box order
    ax=ax,
)
ax.set_title('Text Length by Discourse Effectiveness (1-99 percentile)')
ax.set_xlabel('Effectiveness')
ax.set_ylabel('Text Length (characters)')
fig.tight_layout()
plt.show()

## Discourse Type vs. Effectiveness

In [None]:
# Row-normalised crosstab: each discourse type's ratings expressed as percentages of that type
rating_columns = ['Ineffective', 'Adequate', 'Effective']
type_effectiveness_ct = pd.crosstab(
    train_df['discourse_type'],
    train_df['discourse_effectiveness'],
    normalize='index',
) * 100
type_effectiveness_ct = type_effectiveness_ct[rating_columns] # Ensure order

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(type_effectiveness_ct, annot=True, fmt='.1f', cmap="YlGnBu", ax=ax)
ax.set_title('Effectiveness Distribution within each Discourse Type (%)')
ax.set_xlabel('Effectiveness')
ax.set_ylabel('Discourse Type')
fig.tight_layout()
plt.show()

## Word Clouds
Let's generate word clouds for each effectiveness category to see if there are any prominent terms.

In [None]:
from wordcloud import STOPWORDS

# wordcloud's built-in stopwords plus extra terms to hide from the clouds:
# generic essay vocabulary, discourse connectives, and words that look
# specific to particular essay prompts.
stopwords = set(STOPWORDS).union([
    "student", "students", "school", "schools", "people", "think", "also",
    "would", "could", "should", "get", "make", "go", "going", "many", "one",
    "example", "another", "thing", "things", "lot", "use", "need",
    "state", "states", "country", "countries", "reason", "reasons",
    "opinion", "believe", "feel", "like", "really", "even", "though",
    "however", "therefore", "furthermore", "addition", "conclusion",
    "first", "second", "third", "finally", "dear", "name",
    "electoral", "college", "venus", "face", "mars", "driverless",
    "cars", "car", "technology", "cell", "phone", "phones",
    "program", "activity", "activities", "extracurricular",
])

def generate_wordcloud(text, title):
    """Render and show a word cloud for `text` under the heading `title`.

    The cloud is built with the notebook-level `stopwords` set removed, then
    drawn on its own 10x5 figure with the axes hidden.
    """
    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=stopwords,
        min_font_size=10,
    )
    cloud = cloud_builder.generate(text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(title, fontsize=15)
    fig.tight_layout(pad=0)
    plt.show()

# One word cloud per effectiveness rating, built from all discourse texts with that rating
for effectiveness_level in ['Ineffective', 'Adequate', 'Effective']:
    has_level = train_df['discourse_effectiveness'] == effectiveness_level
    text = " ".join(train_df.loc[has_level, 'discourse_text'])
    generate_wordcloud(text, f'Word Cloud for {effectiveness_level} Discourse')

## EDA Conclusions & Next Steps (Initial)

1.  **Discourse Lengths:**
    * Discourse elements vary significantly in length. `Evidence` and `Concluding Statement` tend to be the longest, while `Position` and `Claim` are shorter.
    * `Effective` discourse tends to be longer than `Ineffective` or `Adequate` discourse, on average.
    * There are some very short discourse elements (e.g., 4 characters). These might be noise or require special handling.
    * The 99th percentile for discourse length is around 1265 characters. Most are much shorter.

2.  **Full Essay Lengths:**
    * The mean essay length is around 2800 characters or ~500 words (using whitespace split).
    * Using a `bert-base-uncased` tokenizer, the mean token count is around 650-700 tokens, with a median around 600 tokens.
    * A significant portion of essays exceeds the standard 512 token limit of many BERT-based models:
        * Only about 20-25% of essays fit within 512 tokens.
        * Around 75-80% fit within 1024 tokens.
        * Over 95% fit within 2048 tokens.
    * This strongly suggests that a model with a longer context window (e.g., Longformer, BigBird, RoBERTa with modifications, or newer models like LLaMA variants if allowed and feasible) would be beneficial if we want to incorporate full essay context.

3.  **Distributions:**
    * `Claim` and `Evidence` are the most frequent discourse types.
    * `Adequate` is the most common effectiveness rating, followed by `Effective`, then `Ineffective`.

4.  **Relationships:**
    * The relationship between discourse type and effectiveness is nuanced. For example, `Evidence` has a good proportion of `Effective` ratings, while `Counterclaim` and `Rebuttal` have higher proportions of `Adequate` or `Ineffective`.

5.  **Word Clouds:**
    * Word clouds show some differences in prominent words across effectiveness levels (after removing standard stopwords plus a custom list of common essay and prompt-specific terms), but deeper NLP analysis (n-grams, TF-IDF) would be needed for more robust insights.

### Implications for Modeling (Based on EDA Update):

* **Model Choice:** Given that many essays are longer than 512 tokens, using a base BERT model (max length 512) and simply concatenating discourse text with surrounding essay text might truncate a lot of useful information. We should consider models designed for longer sequences if we want to leverage more of the essay context. Examples:
    * **Longformer** (e.g., `allenai/longformer-base-4096`)
    * **BigBird** (e.g., `google/bigbird-roberta-base`)
    * **DeBERTa-v3** (its relative position encoding lets it run on sequences longer than BERT's 512-token limit, although 512 is still the usual default)
    * If efficiency is a major concern (for the Efficiency Prize track), we might need to be creative with chunking or using hierarchical approaches, or stick to models that are efficient even with longer contexts.

* **Input Representation:** The strategy of providing context by surrounding the argument with relevant essay text from both sides is a good idea. The amount of context to include (e.g., fixed number of tokens, sentences, or dynamically determined) will be a key hyperparameter, constrained by the chosen model's max sequence length.

* **Validation Strategy:** GroupKFold by `essay_id` is crucial to prevent data leakage, as discourse elements from the same essay are not independent. This is correctly identified as a next step.

* **Feature Engineering (Potential):**
    * Relative position of the discourse element in the essay.
    * Length of the discourse element (raw, or normalized by essay length).
    * Interaction features between discourse type and text features.

This updated EDA, especially the essay length analysis, reinforces the need to carefully consider models that can handle longer contexts if we want to effectively use the full essay text.