In [None]:
import pandas as pd
import spacy
import re
from collections import Counter
from nltk.util import ngrams
from utils.variables import *
import matplotlib.pyplot as plt
import seaborn as sn
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.renderers.default = "browser"

# Load the small spaCy pipelines for English, French, and German, keyed by the
# three-letter language codes used throughout this notebook.
# NOTE(review): in the code visible here, nlp_dict is only consulted for key
# membership (language validation in tokenize_text); the loaded pipelines
# themselves do not appear to be called — confirm before removing the loads.
nlp_dict = {
    "eng": spacy.load("en_core_web_sm"),
    "fra": spacy.load("fr_core_news_sm"),
    "deu": spacy.load("de_core_news_sm")
}

In [2]:
def counter_tokens_text(df, name):
    """
    Count the purely alphabetic tokens in the "text" column of a dataset.

    Each sentence is split with a regex into word runs and single punctuation
    marks; only tokens for which ``str.isalpha()`` is true are counted, so
    punctuation, numbers and mixed tokens such as "abc123" or "x_y" are ignored.
    Missing sentences (NaN/None) are skipped. Nothing is written to disk.

    Parameters:
        df (pd.DataFrame): Must contain a "text" column of strings.
        name (str): Label used in the printed summary line.

    Returns:
        int: Total number of alphabetic tokens.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]")
    total_tokens = 0

    for sentence in df["text"]:
        if pd.isna(sentence):
            continue
        # Keep a running count instead of storing every token in memory.
        total_tokens += sum(
            1 for token in token_pattern.findall(sentence) if token.isalpha()
        )

    print(f"Total tokens in {name}: {total_tokens}")
    return total_tokens

def counter_tokens_text_spacy(df, name, language):
    """
    Count spaCy tokens in the "text" column of a dataset, excluding whitespace
    and punctuation tokens.

    Missing sentences (NaN/None) are skipped, matching counter_tokens_text.
    Nothing is written to disk.

    Parameters:
        df (pd.DataFrame): Must contain a "text" column of strings.
        name (str): Label used in the printed summary line.
        language (str): 'eng' or 'fra'; any other value selects the German
            pipeline (this fallback is kept from the original implementation).

    Returns:
        int: Total number of non-space, non-punctuation tokens.
    """
    model_names = {"eng": "en_core_web_sm", "fra": "fr_core_news_sm"}
    # The parser and NER components are not needed for token counting.
    nlp = spacy.load(
        model_names.get(language, "de_core_news_sm"),
        disable=["parser", "ner"],
    )

    # nlp(NaN) raises, so drop missing rows before handing texts to spaCy.
    texts = (sentence for sentence in df["text"] if not pd.isna(sentence))

    total_tokens = 0
    # nlp.pipe processes the texts as a batched stream instead of one call per row.
    for doc in nlp.pipe(texts):
        total_tokens += sum(
            1 for token in doc if not token.is_space and not token.is_punct
        )

    print(f"Total tokens in {name}: {total_tokens}")
    return total_tokens

        

In [4]:
print('ENGLISH')
train_eng_childes = pd.read_csv(TRAINING_CHILDES_ENG)
valid_eng_childes = pd.read_csv(VALIDATION_CHILDES_ENG)


train_eng_c = counter_tokens_text_spacy(train_eng_childes, "train_eng_childes", "eng")
eval_eng_c = counter_tokens_text_spacy(valid_eng_childes, "valid_eng_childes",'eng')
perc_childes_eng = (eval_eng_c/train_eng_c)*100
print(f"Percentage of tokens in eval set compared to train set: {perc_childes_eng:.2f}%")


train_eng_wiki = pd.read_csv(TRAINING_WIKI_ENG)
valid_eng_wiki = pd.read_csv(VALIDATION_WIKI_ENG)

train_eng_w = counter_tokens_text_spacy(train_eng_wiki, "train_eng_wiki", 'eng')
eval_eng_w = counter_tokens_text_spacy(valid_eng_wiki, "valid_eng_wiki", 'eng')
perc_wiki_eng = (eval_eng_w/train_eng_w)*100
print(f"Percentage of tokens in eval set compared to train set: {perc_wiki_eng:.2f}%")

ENGLISH
Total tokens in train_eng_childes: 3942501
Total tokens in valid_eng_childes: 321683
Percentage of tokens in eval set compared to train set: 8.16%
Total tokens in train_eng_wiki: 3923045
Total tokens in valid_eng_wiki: 341139
Percentage of tokens in eval set compared to train set: 8.70%


In [7]:
TOTAL_TOKENS_ENG_CHILDES = train_eng_c + eval_eng_c
TOTAL_TOKENS_ENG_WIKI = train_eng_w + eval_eng_w

TOTAL_TOKENS_ENG_CHILDES = TOTAL_TOKENS_ENG_CHILDES / 1_000_000
TOTAL_TOKENS_ENG_WIKI = TOTAL_TOKENS_ENG_WIKI / 1_000_000
print(f"Total tokens in train and eval set: {TOTAL_TOKENS_ENG_CHILDES:.4f}M")
print(f"Total tokens in train and eval set: {TOTAL_TOKENS_ENG_WIKI:.4f}M")

Total tokens in train and eval set: 4.2642M
Total tokens in train and eval set: 4.2642M


In [None]:
print('FRENCH')
train_fr_childes = pd.read_csv(TRAINING_CHILDES_FR)
valid_fr_childes = pd.read_csv(VALIDATION_CHILDES_FR)

train_fr_c = counter_tokens_text_spacy(train_fr_childes, "train_fr_childes", 'fra')
eval_fr_c = counter_tokens_text_spacy(valid_fr_childes, "valid_fr_childes", 'fra')
perc_childes_fr = (eval_fr_c/train_fr_c)*100
print(f"Percentage of tokens in eval set compared to train set: {perc_childes_fr:.2f}%")

train_fr_wiki = pd.read_csv(TRAINING_WIKI_FR)
valid_fr_wiki = pd.read_csv(VALIDATION_WIKI_FR)
train_fr_w = counter_tokens_text_spacy(train_fr_wiki, "train_fr_wiki",'fra')
eval_fr_w = counter_tokens_text_spacy(valid_fr_wiki, "valid_fr_wiki",'fra')
perc_wiki_fr = (eval_fr_w/train_fr_w)*100
print(f"Percentage of tokens in eval set compared to train set: {perc_wiki_fr:.2f}%")

In [None]:
TOTAL_TOKENS_FR_CHILDES = train_fr_c + eval_fr_c
TOTAL_TOKENS_FR_WIKI = train_fr_w + eval_fr_w

TOTAL_TOKENS_FR_CHILDES = TOTAL_TOKENS_FR_CHILDES / 1_000_000
TOTAL_TOKENS_FR_WIKI = TOTAL_TOKENS_FR_WIKI / 1_000_000
print(f"Total tokens in train and eval set: {TOTAL_TOKENS_FR_CHILDES:.2f}M")
print(f"Total tokens in train and eval set: {TOTAL_TOKENS_FR_WIKI:.2f}M")


In [None]:
print('GERMAN')
train_de_childes = pd.read_csv(TRAINING_CHILDES_DE)
valid_de_childes = pd.read_csv(VALIDATION_CHILDES_DE)

train_de_c = counter_tokens_text_spacy(train_de_childes, "train_de_childes", 'deu')
eval_de_c = counter_tokens_text_spacy(valid_de_childes, "valid_de_childes", 'deu')
perc_childes_de = (eval_de_c/train_de_c)*100
print(f"Percentage of tokens in eval set compared to train set: {perc_childes_de:.2f}%")

train_de_wiki = pd.read_csv(TRAINING_WIKI_DE)
valid_de_wiki = pd.read_csv(VALIDATION_WIKI_DE)
train_de_w = counter_tokens_text_spacy(train_de_wiki, "train_de_wiki", 'deu')
eval_de_w = counter_tokens_text_spacy(valid_de_wiki, "valid_de_wiki", 'deu')
perc_wiki_de = (eval_de_w/train_de_w)*100
print(f"Percentage of tokens in eval set compared to train set: {perc_wiki_de:.2f}%")


In [None]:
TOTAL_TOKENS_DE_CHILDES = train_de_c + eval_de_c
TOTAL_TOKENS_DE_WIKI = eval_de_w + train_de_w

TOTAL_TOKENS_DE_CHILDES = TOTAL_TOKENS_DE_CHILDES / 1_000_000
TOTAL_TOKENS_DE_WIKI = TOTAL_TOKENS_DE_WIKI / 1_000_000
print(f"Total tokens in train and eval set: {TOTAL_TOKENS_DE_CHILDES:.2f}M")
print(f"Total tokens in train and eval set: {TOTAL_TOKENS_DE_WIKI:.2f}M")

### TOTAL TOKENS IN THE DATASETS overall 
The values below are those reported in the paper (Table 1)

In [None]:
datasets = {
     "AO_CHILDES_ENG": AO_CHILDES_ENGLISH,
    "AO_CHILDES_FR": AO_CHILDES_FRENCH,
    "AO_CHILDES_DE": AO_CHILDES_GERMAN,
    "WIKIPEDIA_ENG": WIKIPEDIA_ENG,
    "WIKIPEDIA_FR": WIKIPEDIA_FR,
    "WIKIPEDIA_DE": WIKIPEDIA_DE,
}

def ngram_ratio(file_path,name, n):
    """
    Return the share of distinct n-grams among all n-grams of a dataset.

    The CSV at ``file_path`` is read, every non-missing entry of its
    'sentences' column is regex-tokenized, and all tokens are concatenated
    into one stream, so n-grams may span sentence boundaries.

    Parameters:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        name (str): Dataset label; not used in the computation.
        n (int): N-gram order.

    Returns:
        Unique n-grams divided by total n-grams, or 0 if there are none.
    """
    frame = pd.read_csv(file_path)

    token_stream = [
        tok
        for text in frame['sentences']
        if not pd.isna(text)
        for tok in re.findall(r"\w+|[^\w\s]", text)
    ]

    all_ngrams = list(ngrams(token_stream, n))
    if len(all_ngrams) > 0:
        return len(set(all_ngrams)) / len(all_ngrams)
    return 0
    

for name, path in datasets.items():
    print(f"\nAnalyzing: {name}")
    unigram_ratio = ngram_ratio(path,name, 1)
    bigram_ratio = ngram_ratio(path,name, 2)
    trigram_ratio = ngram_ratio(path,name, 3)
    
    print(f"Unigram Ratio: {unigram_ratio:.3f}")
    print(f"Bigram Ratio: {bigram_ratio:.3f}")
    print(f"Trigram Ratio: {trigram_ratio:.3f}")


Different ways of computing token counts

In [6]:
import re 
def tokenize_text(text, lang="eng"):
    """
    Split an utterance into word and punctuation tokens.

    The split itself is a language-independent regex (runs of word characters,
    or single non-space, non-word characters); ``lang`` is only checked against
    the keys of ``nlp_dict``.

    Parameters:
        text (str): The input utterance.
        lang (str): Language code ('eng', 'fra', 'deu').

    Returns:
        list: The tokens, in order of appearance.

    Raises:
        ValueError: If ``lang`` is not a key of ``nlp_dict``.
    """
    if lang in nlp_dict:
        return re.findall(r"\w+|[^\w\s]", text)
    raise ValueError(f"Unsupported language: {lang}. Choose from 'eng', 'fra', 'deu'.")

def process_transcripts(transcripts, lang="eng"):
    """
    Tokenize every utterance and summarize the resulting token counts.

    Parameters:
        transcripts (list of str): Utterances to tokenize.
        lang (str): Language code passed through to tokenize_text.

    Returns:
        dict: "total_tokens" (int), "unique_tokens" (int) and
        "unigram_freq" (Counter mapping token -> frequency).
    """
    token_freq = Counter()
    for utterance in transcripts:
        token_freq.update(tokenize_text(utterance, lang))

    summary = {}
    summary["total_tokens"] = sum(token_freq.values())
    summary["unique_tokens"] = len(token_freq)
    summary["unigram_freq"] = token_freq
    return summary


In [None]:
import random
import numpy as np

def compute_ttr(sentences, lang="eng", sample_size=10000, num_samples=10, seed=None):
    """
    Estimate the Type-Token Ratio (TTR) by repeated sentence sampling.

    For each of ``num_samples`` rounds, ``sample_size`` sentences are drawn
    without replacement, tokenized with tokenize_text, and the ratio of unique
    tokens to total tokens is computed. The per-round ratios are averaged.

    Parameters:
        sentences (list of str): List of utterances.
        lang (str): Language code ('eng', 'fra', 'deu').
        sample_size (int): Number of sentences to sample per round.
        num_samples (int): Number of sampling rounds.
        seed (int, optional): Seed for a private random generator so the
            estimate is reproducible. With the default ``None`` the global
            ``random`` module state is used, as before.

    Returns:
        float: Mean TTR across the sampling rounds.

    Raises:
        ValueError: If there are fewer sentences than ``sample_size``.
    """
    if len(sentences) < sample_size:
        raise ValueError("Not enough sentences in dataset. Reduce sample_size or provide more data.")

    # A dedicated generator keeps seeded runs independent of global random state.
    rng = random.Random(seed) if seed is not None else random

    ttr_values = []

    for _ in range(num_samples):
        sampled_sentences = rng.sample(sentences, sample_size)
        token_types = set()
        total_tokens = 0

        for sentence in sampled_sentences:
            tokens = tokenize_text(sentence, lang)
            total_tokens += len(tokens)
            token_types.update(tokens)

        # An empty sample contributes a TTR of 0 rather than dividing by zero.
        ttr = len(token_types) / total_tokens if total_tokens > 0 else 0
        ttr_values.append(ttr)

    avg_ttr = np.mean(ttr_values)
    return avg_ttr

In [None]:
def count_questions(sentences):
    """
    Count the sentences whose last non-whitespace character is a question mark.

    Parameters:
        sentences (iterable of str): Utterances/sentences to inspect.

    Returns:
        int: Number of sentences ending in '?'.
    """
    question_count = 0
    for text in sentences:
        # Surrounding whitespace is ignored when checking the final character.
        if text.strip().endswith('?'):
            question_count += 1
    return question_count


In [None]:
def average_sentence_length(sentences, lang="eng"):
    """
    Mean number of tokens per sentence, using tokenize_text for tokenization.

    Parameters:
        sentences (list of str): List of utterances/sentences.
        lang (str): Language code ('eng', 'fra', 'deu').

    Returns:
        float: Total tokens divided by number of sentences; 0 for an empty list.
    """
    sentence_count = len(sentences)
    token_count = sum(len(tokenize_text(text, lang)) for text in sentences)

    if sentence_count > 0:
        return token_count / sentence_count
    return 0

def compute_sentence_lengths(sentences):
    """
    Length of each sentence in whitespace-separated words.

    Parameters:
        sentences (list of str): List of utterances/sentences.

    Returns:
        list of int: One word count per input sentence, in input order.
    """
    lengths = []
    for text in sentences:
        words = text.strip().split()
        lengths.append(len(words))
    return lengths

### ENGLISH CHILDES

In [None]:
childes_eng = pd.read_csv(AO_CHILDES_ENGLISH)
transcripts_eng_childes = childes_eng['sentences'].tolist()


In [None]:
childes_eng["sentence_length"] = compute_sentence_lengths(childes_eng["sentences"].tolist())

In [None]:
results_eng_childes = process_transcripts(transcripts_eng_childes, lang="eng")

In [None]:
unique_years_of_life = childes_eng['year_of_life'].nunique()
print(f"Unique years of life: {unique_years_of_life}")

# Unique transcript_ids
unique_transcript_ids_list = childes_eng['transcript_id'].nunique()
print(f"Unique transcript_id values: {unique_transcript_ids_list}")

# Unique age_in_days values
unique_age_in_days_list = childes_eng['age_in_days'].nunique()
print(f"Unique age_in_days values: {unique_age_in_days_list}")


In [None]:

unigram_freq_eng = pd.DataFrame(results_eng_childes['unigram_freq'].items(), columns=['token', 'freq'])


In [None]:
ttr_eng = compute_ttr(transcripts_eng_childes, lang="eng")

In [None]:
eng_questions = count_questions(transcripts_eng_childes)
print((eng_questions/len(transcripts_eng_childes))*100)

In [None]:
eng_avg_sent_length = average_sentence_length(transcripts_eng_childes, lang="eng")
eng_avg_sent_length

### GERMAN CHILDES

In [None]:
childes_deu = pd.read_csv(AO_CHILDES_GERMAN)
transcripts_deu_childes = childes_deu['sentences'].tolist()

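The `age_in_days` column of the German CSV is mis-encoded: it actually stores the age in months. We therefore rename it to `months`, recompute the year of life from it, and derive an approximate age in days (1 month ≈ 30.44 days).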

In [None]:
# The raw 'age_in_days' column holds months, so it is renamed before the age
# columns are rebuilt. The guard makes the cell safe to re-run: without it a
# second run would rename the freshly computed 'age_in_days' and create a
# duplicate 'months' column.
if 'months' not in childes_deu.columns:
    childes_deu.rename(columns={'age_in_days': 'months'}, inplace=True)
childes_deu['year_of_life_fine_grained'] = round(childes_deu['months'] / 12,2)
# Truncate (not round) to get the completed years.
childes_deu['year_of_life'] = childes_deu['year_of_life_fine_grained'].astype(int)
# 30.44 is the average number of days per month.
childes_deu['age_in_days'] = childes_deu['months'] * 30.44
# errors='ignore' keeps the cell from failing when the column is already gone.
childes_deu.drop(columns=['age_in_months'], inplace=True, errors='ignore')

childes_deu['sentence_length'] = compute_sentence_lengths(childes_deu["sentences"].tolist())

In [None]:
results_deu_childes = process_transcripts(transcripts_deu_childes, lang="deu")

In [None]:
unigram_freq_deu= pd.DataFrame(results_deu_childes['unigram_freq'].items(), columns=['token', 'freq'])

In [None]:
unique_years_of_life = childes_deu['year_of_life'].nunique()
print(f"Unique years of life: {unique_years_of_life}")

# Unique transcript_ids
unique_transcript_ids_list = childes_deu['transcript_id'].nunique()
print(f"Unique transcript_id values: {unique_transcript_ids_list}")

# Unique age_in_days values
unique_age_in_days_list = childes_deu['age_in_days'].nunique()
print(f"Unique age_in_days values: {unique_age_in_days_list}")

In [None]:
ttr_deu = compute_ttr(transcripts_deu_childes, lang="deu")

In [None]:
print(f"German TTR: {ttr_deu:.4f}")

In [None]:
deu_questions = count_questions(transcripts_deu_childes)
print((deu_questions/len(transcripts_deu_childes))*100)

In [None]:
deu_avg_sent_length = average_sentence_length(transcripts_deu_childes, lang="deu")
deu_avg_sent_length

### FRENCH CHILDES

In [None]:
childes_fra = pd.read_csv(AO_CHILDES_FRENCH)
transcripts_fra_childes = childes_fra['sentences'].tolist()

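The `age_in_days` column of the French CSV is mis-encoded: it actually stores the age in months. We therefore rename it to `months`, recompute the year of life from it, and derive an approximate age in days (1 month ≈ 30.44 days).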

In [None]:
# The raw 'age_in_days' column holds months, so it is renamed before the age
# columns are rebuilt. The guard makes the cell safe to re-run: without it a
# second run would rename the freshly computed 'age_in_days' and create a
# duplicate 'months' column.
if 'months' not in childes_fra.columns:
    childes_fra.rename(columns={'age_in_days': 'months'}, inplace=True)
childes_fra['year_of_life_fine_grained'] = round(childes_fra['months'] / 12, 2)
# Truncate (not round) to get the completed years.
childes_fra['year_of_life'] = childes_fra['year_of_life_fine_grained'].astype(int)
# 30.44 is the average number of days per month.
childes_fra['age_in_days'] = childes_fra['months'] * 30.44
# errors='ignore' keeps the cell from failing when the column is already gone.
childes_fra.drop(columns=['age_in_months'], inplace=True, errors='ignore')

In [None]:
childes_fra['sentence_length'] = compute_sentence_lengths(childes_fra["sentences"].tolist())

In [None]:
results_fra_childes = process_transcripts(transcripts_fra_childes, lang="fra")

In [None]:
unigram_freq_fra = pd.DataFrame(results_fra_childes['unigram_freq'].items(), columns=['token', 'freq'])

In [None]:
ttr_fra = compute_ttr(transcripts_fra_childes, lang="fra")

In [None]:
unique_years_of_life = childes_fra['year_of_life'].nunique()
print(f"Unique years of life: {unique_years_of_life}")

# Unique transcript_ids
unique_transcript_ids_list = childes_fra['transcript_id'].nunique()
print(f"Unique transcript_id values: {unique_transcript_ids_list}")

# Unique age_in_days values
unique_age_in_days_list = childes_fra['age_in_days'].nunique()
print(f"Unique age_in_days values: {unique_age_in_days_list}")

In [None]:
fra_questions = count_questions(transcripts_fra_childes)
print((fra_questions/len(transcripts_fra_childes))*100)

In [None]:
fra_avg_sent_length = average_sentence_length(transcripts_fra_childes, lang="fra")
fra_avg_sent_length

### ENGLISH WIKIPEDIA

In [None]:
wiki_eng = pd.read_csv(WIKIPEDIA_ENG)
transcripts_eng_wiki = wiki_eng['sentences'].tolist()

wiki_eng['sentence_length'] = compute_sentence_lengths(wiki_eng["sentences"].tolist())

In [None]:
results_eng_wiki = process_transcripts(transcripts_eng_wiki, lang="eng")

In [None]:
unigram_freq_eng = pd.DataFrame(results_eng_wiki['unigram_freq'].items(), columns=['token', 'freq'])

In [None]:
ttr_eng_wiki = compute_ttr(transcripts_eng_wiki, lang="eng")

In [None]:
print(f"English TTR Wiki: {ttr_eng_wiki:.4f}")

In [None]:
eng_questions_wiki = count_questions(transcripts_eng_wiki)
print(eng_questions_wiki)
print((eng_questions_wiki/len(transcripts_eng_wiki))*100)

In [None]:
eng_avg_sent_length = average_sentence_length(transcripts_eng_wiki, lang="eng")
eng_avg_sent_length

### GERMAN WIKIPEDIA

In [None]:
wiki_deu = pd.read_csv(WIKIPEDIA_DE)
transcripts_deu_wiki = wiki_deu['sentences'].tolist()

In [None]:
wiki_deu['sentence_length'] = compute_sentence_lengths(wiki_deu["sentences"].tolist())

In [None]:
results_deu_wiki = process_transcripts(transcripts_deu_wiki, lang="deu")

In [None]:
unigram_freq_deu = pd.DataFrame(results_deu_wiki['unigram_freq'].items(), columns=['token', 'freq'])

In [None]:
ttr_deu_wiki = compute_ttr(transcripts_deu_wiki, lang="deu")

In [None]:
print(f"German TTR Wiki: {ttr_deu_wiki:.4f}")

In [None]:
deu_questions_wiki = count_questions(transcripts_deu_wiki)
print(deu_questions_wiki)
print((deu_questions_wiki/len(transcripts_deu_wiki))*100)

In [None]:
deu_avg_sent_length = average_sentence_length(transcripts_deu_wiki, lang="deu")
deu_avg_sent_length

### FRENCH WIKIPEDIA

In [None]:
wiki_fra = pd.read_csv(WIKIPEDIA_FR)
transcripts_fra_wiki = wiki_fra['sentences'].tolist()

wiki_fra['sentence_length'] = compute_sentence_lengths(wiki_fra["sentences"].tolist())

In [None]:
results_fra_wiki = process_transcripts(transcripts_fra_wiki, lang="fra")
results_fra_wiki

In [None]:
unigram_freq_fra = pd.DataFrame(results_fra_wiki['unigram_freq'].items(), columns=['token', 'freq'])

In [None]:
ttr_fra_wiki = compute_ttr(transcripts_fra_wiki, lang="fra")

In [None]:
print(f"French TTR Wiki: {ttr_fra_wiki:.4f}")

In [None]:
fra_questions_wiki = count_questions(transcripts_fra_wiki)
print(fra_questions_wiki)
print((fra_questions_wiki/len(transcripts_fra_wiki))*100)

In [None]:
fra_avg_sent_length = average_sentence_length(transcripts_fra_wiki, lang="fra")
fra_avg_sent_length

### PLOT 1

In [None]:

# Sentence-length bin edges (right-inclusive) and their display labels
bins = [0, 3, 6, 12, 200]
bin_labels = ['1–3', '4–6', '7–12', '13–200']

def prepare_binned_counts(df, domain_name):
    """
    Count how many sentences fall into each sentence-length bin.

    Lengths are bucketed with the module-level ``bins`` (intervals closed on
    the right); lengths outside (0, 200] match no bin and are not counted.

    Parameters:
        df (pd.DataFrame): Must include a 'sentence_length' column.
        domain_name (str): Value written to the 'Domain' column of every row.

    Returns:
        pd.DataFrame: One row per bin with columns 'Length Bin', 'Count', 'Domain'.
    """
    length_bins = pd.cut(df['sentence_length'], bins=bins, labels=bin_labels, right=True)
    # sort_index orders the counts by bin, lining them up with bin_labels.
    per_bin = length_bins.value_counts().sort_index()

    table = pd.DataFrame({'Length Bin': bin_labels, 'Count': per_bin.values})
    table['Domain'] = domain_name
    return table


domain_color_map = {
    "CHILDES": "#e74c3c", 
    "Wikipedia": "#f1c40f"  
}

# English 
combined_eng = pd.concat([
    prepare_binned_counts(childes_eng, "CHILDES"),
    prepare_binned_counts(wiki_eng, "Wikipedia")
])

# French
combined_fra = pd.concat([
    prepare_binned_counts(childes_fra, "CHILDES"),
    prepare_binned_counts(wiki_fra, "Wikipedia")
])
# German
combined_deu = pd.concat([
    prepare_binned_counts(childes_deu, "CHILDES"),
    prepare_binned_counts(wiki_deu, "Wikipedia")
])


In [None]:
combined_data = [combined_eng, combined_fra, combined_deu]
language_titles = ['English', 'French', 'German']
global_max = max(df['Count'].max() for df in combined_data)

width_in = 6.875  # or 3.25 for 1-column width
height_in = 4.0
dpi = 300

width_px = int(width_in * dpi)
height_px = int(height_in * dpi)

# Set y-axis max
y_max = math.ceil(global_max / 20000) * 20000 

# Create 3-column subplot
fig = make_subplots(rows=1, cols=3, shared_yaxes=True,
                    subplot_titles=[f"{lang}" for lang in language_titles])

# Plot each language's grouped bar chart
for i, data in enumerate(combined_data):
    col = i + 1  # subplot column index
    domains = data['Domain'].unique()

    for domain in domains:
        domain_data = data[data['Domain'] == domain]
        fig.add_trace(
            go.Bar(
                x=domain_data["Length Bin"],
                y=domain_data["Count"],
                name=domain,
                marker_color=domain_color_map.get(domain, "#cccccc"),
                showlegend=(i == 0)
            ),
            row=1, col=col
        )

# Apply the same custom x-tick labels and tick font size to each of the three subplots
x_ticks = ['1-3 tok', '4-6 tok', '7-12 tok', '13-200 tok']
for subplot_col in (1, 2, 3):
    fig.update_xaxes(
        tickvals=[0, 1, 2, 3],
        ticktext=x_ticks,
        tickfont=dict(size=26),
        row=1, col=subplot_col
    )

# Update layout and aesthetics
fig.update_layout(
    height=500,
    width=1000,
    barmode='group',
    yaxis=dict(
        title="Number of Sentences", 
        range=[0, y_max], 
        tick0=0, 
        dtick=50000,
        title_font_size=28,  
        tickfont=dict(size=26)
    ),
    xaxis=dict(
        title="Sentence Length Bin (Tokens)",
        title_font_size=28, 
        tickfont=dict(size=26)  
    ),
    xaxis_title_standoff=28,
    showlegend=True,
    template='plotly_white',
    font=dict(size=14),
    legend=dict(
        title=dict(text='Model', font=dict(size=28)),
        font=dict(size=28),
        x=0.95, 
        y=0.95,
        xanchor='right', 
        yanchor='top',   
        bgcolor='rgba(255,255,255,0.8)', 
        bordercolor='gray',
        borderwidth=1
    ),
    annotations=[

        dict(
            text="English", font=dict(size=28), showarrow=False
        ),
        dict(
            text="French", font=dict(size=28), showarrow=False
        ),
        dict(
            text="German", font=dict(size=28), showarrow=False
        )
    ]
)

fig.show()
fig.write_image("sentence_lengths.png", width=width_px, height=height_px, scale=1)

### PLOT 2

In [None]:
# Define a function to assign buckets
def assign_age_bucket(age):
    """
    Map an age in years to a bucket index.

    Buckets: 0 for age <= 2, 1 for age <= 5, 2 for age <= 8, 3 for age > 8.
    Values that satisfy none of the comparisons (e.g. NaN) yield None.
    """
    for bucket, upper_bound in enumerate((2, 5, 8)):
        if age <= upper_bound:
            return bucket
    return 3 if age > 8 else None

In [None]:
def binned_distribution(df, lang):
    """
    Proportion of sentences per (rounded) year of life for one language.

    Rows whose 'year_of_life' cannot be read as a number are discarded; the
    remaining values are rounded to whole years and each year's share of the
    kept rows is computed. The input frame is not modified.

    Parameters:
        df (pd.DataFrame): Must include a 'year_of_life' column.
        lang (str): Value written to the 'Language' column of every row.

    Returns:
        pd.DataFrame: Columns 'Age (Years)', 'Total # of Sentences'
        (a proportion, despite the name) and 'Language'.
    """
    is_numeric = pd.to_numeric(df['year_of_life'], errors='coerce').notna()
    valid = df[is_numeric].copy()
    valid['year_of_life'] = valid['year_of_life'].astype(float)
    valid['year_binned'] = valid['year_of_life'].round().astype(int)

    per_year = valid['year_binned'].value_counts().sort_index()
    shares = per_year.values/len(valid)

    return pd.DataFrame({
        'Age (Years)': per_year.index,
        'Total # of Sentences': shares,
        'Language': lang
    })

# Apply to each language
eng_df = binned_distribution(childes_eng, 'English')
fra_df = binned_distribution(childes_fra, 'French')
deu_df = binned_distribution(childes_deu, 'German')

# Combine
combined_df = pd.concat([eng_df, fra_df, deu_df])

In [None]:
# One fixed line colour per language
color_map = {
    'English': '#E63946',
    'French': '#F4A261',
    'German': '#1f77b4'
}

# Line plot of the per-year proportion of utterances, one line per language.
# `markers` is a boolean switch; the original passed the list of German
# transcripts, which only worked because a non-empty list is truthy.
# The `labels` key now names the plotted column ('Total # of Sentences'
# holds proportions), so the mapping actually takes effect.
fig = px.line(
    combined_df,
    x='Age (Years)',
    y='Total # of Sentences',
    color='Language',
    markers=True,
    labels={'Total # of Sentences': 'Proportion of Utterances', 'Age (Years)': 'Age (Years)'},
    color_discrete_map=color_map
)

fig.update_traces(marker=dict(size=6))

# Integer age ticks from 0 to 12 years
tickvals = list(range(0, 13))
ticktext = [str(i) for i in range(0, 13)]
fig.update_layout(
    title_font_size=25,
    xaxis=dict(
        tickmode='array',
        tickvals=tickvals,
        ticktext=ticktext,
        tickfont=dict(size=18),
        title=dict(text='Age (Years)', font=dict(size=25)),
        range=[-0.50, 12.03]
    ),
    yaxis=dict(
        tickformat='.0%',  # show proportions as percentages
        tickfont=dict(size=22),
        title=dict(text='Proportion of Utterances', font=dict(size=25)),
        range=[-0.02, 0.5]
    ),
    legend=dict(
        x=0.7,
        y=0.9,
        traceorder='normal',
        font=dict(size=22),
        borderwidth=1,
        title=dict(font=dict(size=20))
    ),
    template='simple_white'
)

fig.show()
fig.write_image("utterances_by_age.png", width=800, height=500, scale=2)


### PLOT 3

In [None]:
# Add a 'bucket' column (age bucket index from assign_age_bucket) to each CHILDES frame.
# NOTE(review): in the cells visible here, 'year_of_life_fine_grained' is only
# derived for the German and French frames; for childes_eng it must already be
# a column of the English CSV — confirm, otherwise this loop raises KeyError.
for df in [childes_eng, childes_fra, childes_deu]:
    df['bucket'] = df['year_of_life_fine_grained'].apply(assign_age_bucket)

In [None]:
# Number of sentences per age bucket, one Series per language
eng_counts = childes_eng['bucket'].value_counts().sort_index().rename('English')
fra_counts = childes_fra['bucket'].value_counts().sort_index().rename('French')
deu_counts = childes_deu['bucket'].value_counts().sort_index().rename('German')

# The bar chart built from this table pairs its rows positionally with four
# fixed bucket labels, so force exactly the rows 0..3 in that order. Without
# the reindex, a bucket missing from every language (or a differently ordered
# outer join) would silently misalign bars and labels.
bucket_counts = (
    pd.concat([eng_counts, fra_counts, deu_counts], axis=1)
    .reindex([0, 1, 2, 3])
    .fillna(0)
    .astype(int)
)
bucket_counts.index.name = 'Age Bucket'
bucket_counts.reset_index(inplace=True)

In [None]:
childes_eng['language'] = 'English'
childes_fra['language'] = 'French'
childes_deu['language'] = 'German'

# Combine all
combined_df = pd.concat([childes_eng, childes_fra, childes_deu], ignore_index=True)

In [None]:
import plotly.graph_objects as go

# Define your custom tick labels
bucket_labels = ['0: 0–2 yrs', '1: 3–5 yrs', '2: 6–8 yrs', '3: 9+ yrs']
custom_xticks = ["0: 1-3 tokens", "1: 4-6 tokens", "2: 7-12 tokens", "3: 13-200 tokens"]
custom_tickvals = [0, 1, 2, 3]  # Assuming those are your bin indices

# Create the figure
fig = go.Figure()

width_in = 6.875
height_in = 4.0 
dpi = 300

# Convert to pixels
width_px = int(width_in * dpi)
height_px = int(height_in * dpi)

# Loop through each language and add a bar trace
languages = ['English', 'French', 'German']
colors = ['#E63946', '#F4A261', '#E9C46A']

for lang, color in zip(languages, colors):
    fig.add_trace(go.Bar(
        x=bucket_labels,
        y=bucket_counts[lang],
        name=lang,
        marker_color=color,
        text=bucket_counts[lang],
        textposition='auto',
        textfont=dict(size=26)
    ))

# Update the layout
fig.update_layout(
    barmode='group',
    xaxis=dict(
        title=dict(
            text='Age Buckets',
            font=dict(size=28)
        ),
        tickmode='array',
        tickvals=[0, 1, 2, 3],  # Use the correct tick values
        ticktext=bucket_labels,
        tickfont=dict(size=26)
    ),
    yaxis=dict(
        title=dict(
            text='Number of Sentences',
            font=dict(size=28)
        ),
        tickfont=dict(size=26)
    ),
    legend=dict(
    title=dict(text='Language', font=dict(size=28)),
    font=dict(size=28),
    x=0.95,  # X position (0 to 1)
    y=0.95,  # Y position (0 to 1)
    xanchor='right',  # Anchor the legend's x at the right
    yanchor='top',    # Anchor the legend's y at the top
    bgcolor='rgba(255,255,255,0.8)',  # Optional: semi-transparent white background
    bordercolor='gray',
    borderwidth=1
),
    template='plotly_white',
    font=dict(size=26),
    bargap=0.2
)

fig.show()
fig.write_image("sentence_counts_by_age.png", width=width_px, height=height_px, scale=1)


### PLOT 4 
Keep Unigram Frequency of the Training dataset or of the full dataset?

In [12]:
unigram_freq_deu_childes = pd.read_csv(UNIGRAM_CHILDES_DE)
unigram_freq_eng_childes = pd.read_csv(UNIGRAM_CHILDES_ENG)
unigram_freq_fra_childes = pd.read_csv(UNIGRAM_CHILDES_FR)

unigram_freq_deu_wiki = pd.read_csv(UNIGRAM_WIKI_DE)
unigram_freq_eng_wiki = pd.read_csv(UNIGRAM_WIKI_ENG)
unigram_freq_fra_wiki = pd.read_csv(UNIGRAM_WIKI_FR)

In [None]:
# Function to count words in each bin for a dataset
def count_words_in_bins(data, num_bins=10):
    """
    Count how many entries of ``data['bin']`` fall into each bin index.

    Parameters:
        data: Mapping/DataFrame with a 'bin' column of integer bin indices.
        num_bins (int): Number of bins.

    Returns:
        list of int: ``num_bins`` counts, where position i is the number of
        entries whose bin index is i.
    """
    occurrences = Counter(data['bin'])
    tally = [0 for _ in range(num_bins)]
    for bin_index, how_many in occurrences.items():
        tally[bin_index] += how_many
    return tally

In [None]:
# Function to assign bins logarithmically
def assign_bins_log(data, num_bins=10):
    """
    Assign each frequency in ``data['count']`` to one of ``num_bins``
    logarithmically spaced bins spanning the observed minimum and maximum.

    Parameters:
        data (pd.DataFrame): Must include a 'count' column of positive frequencies.
        num_bins (int): Number of bins.

    Returns:
        tuple: (array of bin indices in [0, num_bins - 1], array of the
        ``num_bins + 1`` bin edges).

    Raises:
        ValueError: If there are no counts or any count is not strictly
            positive (its logarithm would be undefined).
    """
    # Extract the frequency values
    freq_values = np.asarray(data['count'].values)
    if freq_values.size == 0 or freq_values.min() <= 0:
        raise ValueError("assign_bins_log requires a non-empty 'count' column of positive values.")

    # Determine bin edges based on logarithmic scale
    log_min = np.log10(freq_values.min())
    log_max = np.log10(freq_values.max())
    bin_edges = np.logspace(log_min, log_max, num_bins + 1)

    binned_data = np.digitize(freq_values, bin_edges) - 1
    # Clamp both ends: the maximum lands one past the last bin, and the
    # log10/power round-trip can push the first edge slightly above the
    # minimum, which would otherwise yield bin -1.
    binned_data = np.clip(binned_data, 0, num_bins - 1)

    return binned_data, bin_edges

In [None]:
# Function to add the 'bin' column to each dataset
def add_bin_column(data, num_bins=10):
    """
    Add a 'bin' column holding each row's logarithmic frequency-bin index.

    The frame is modified in place and also returned.

    Parameters:
        data (pd.DataFrame): Frame passed through to assign_bins_log.
        num_bins (int): Number of bins.

    Returns:
        tuple: (the same frame with the new 'bin' column, the bin edges).
    """
    bin_indices, edges = assign_bins_log(data, num_bins)
    data['bin'] = bin_indices
    return data, edges

# Apply to all datasets
unigram_freq_deu_childes, bin_edges_deu_childes = add_bin_column(unigram_freq_deu_childes)
unigram_freq_eng_childes, bin_edges_eng_childes = add_bin_column(unigram_freq_eng_childes)
unigram_freq_fra_childes, bin_edges_fra_childes = add_bin_column(unigram_freq_fra_childes)

unigram_freq_deu_wiki, bin_edges_deu_wiki = add_bin_column(unigram_freq_deu_wiki)
unigram_freq_eng_wiki, bin_edges_eng_wiki = add_bin_column(unigram_freq_eng_wiki)
unigram_freq_fra_wiki, bin_edges_fra_wiki = add_bin_column(unigram_freq_fra_wiki)

# Example: Check the new 'bin' column in one of the datasets
print(unigram_freq_deu_childes.head())

In [None]:
width_in = 6.875
height_in = 4.0 
dpi = 300

# Convert to pixels
width_px = int(width_in * dpi)
height_px = int(height_in * dpi)

# Function to count words in each bin for a dataset
def count_words_in_bins(data, num_bins=10):
    """
    Count how many entries of ``data['bin']`` fall into each bin index.

    Parameters:
        data: Mapping/DataFrame with a 'bin' column of integer bin indices.
        num_bins (int): Number of bins.

    Returns:
        list of int: ``num_bins`` counts, where position i is the number of
        entries whose bin index is i.
    """
    occurrences = Counter(data['bin'])
    tally = [0 for _ in range(num_bins)]
    for bin_index, how_many in occurrences.items():
        tally[bin_index] += how_many
    return tally

# Function to add the horizontal CHILDES/Wikipedia bar traces for one language
def create_subplot_for_language(fig, data_childes, data_wiki, language, row, col, num_bins=10):
    """
    Add two horizontal bar traces (CHILDES and Wikipedia word counts per
    frequency bin) to one subplot cell of ``fig``.

    Parameters:
        fig: Figure with a subplot grid; modified in place.
        data_childes: Frame with a 'bin' column for the CHILDES data.
        data_wiki: Frame with a 'bin' column for the Wikipedia data.
        language (str): Language code; not used inside this function.
        row (int): Subplot row.
        col (int): Subplot column.
        num_bins (int): Number of frequency bins.
    """
    # Count words in bins for both datasets before touching the figure
    trace_specs = (
        ('CHILDES', '#e74c3c', count_words_in_bins(data_childes, num_bins)),
        ('Wikipedia', '#f1c40f', count_words_in_bins(data_wiki, num_bins)),
    )
    # Only the first subplot contributes legend entries, avoiding duplicates
    legend_visible = (row == 1 and col == 1)

    for trace_name, trace_color, counts in trace_specs:
        bar = go.Bar(
            y=np.arange(num_bins),  # bin index on the y-axis
            x=counts,  # number of words in that bin on the x-axis
            name=trace_name,
            marker_color=trace_color,
            orientation='h',
            showlegend=legend_visible,
            text=counts,
            textposition='auto',
            textfont=dict(size=26)
        )
        fig.add_trace(bar, row=row, col=col)

# Create the subplots for the three languages
languages = ['eng', 'fra', 'deu']
fig = make_subplots(
    rows=1, cols=3, 
    shared_yaxes=True,
    subplot_titles=["English", "French", "German"],
    horizontal_spacing=0.05,  # Reduce space between subplots
    vertical_spacing=0.2  # Adjust vertical spacing between rows if needed
)

# Loop through each language and create a subplot
for i, language in enumerate(languages):
    if language == 'eng':
        data_childes = unigram_freq_eng_childes
        data_wiki = unigram_freq_eng_wiki
    elif language == 'fra':
        data_childes = unigram_freq_fra_childes
        data_wiki = unigram_freq_fra_wiki
    elif language == 'deu':
        data_childes = unigram_freq_deu_childes
        data_wiki = unigram_freq_deu_wiki
    
    # Create a subplot for this language
    create_subplot_for_language(fig, data_childes, data_wiki, language, row=1, col=i + 1)

fig.update_layout(
    height=800,  # You can make it taller now if needed
    width=1000,
    barmode='group',
    xaxis_title="Word Count",  # Swapped!
    yaxis_title="Log Frequency Bin",
    showlegend=True,
    template='plotly_white',
    font=dict(size=28),
    annotations=[
        dict(
            text="English", font=dict(size=28), showarrow=False
        ),
        dict(
            text="French", font=dict(size=28), showarrow=False
        ),
        dict(
            text="German", font=dict(size=28), showarrow=False
        )
    ],
    legend=dict(
        title="Model",  # Add a title for the legend
        title_font=dict(size=28),  # Set font size for legend title
        font=dict(size=26),
        bordercolor='gray',
        borderwidth=1,
        x=0.9,  # Move legend left (default is 1.0, right-aligned)
        y=0.97   # Keep legend vertically aligned at top
    )
)

for i in range(1, 4):
    fig.update_yaxes(
        tickvals=np.arange(0, 10),
        ticktext=[str(i) for i in range(10)],
        tickfont=dict(size=26),
        row=1, col=i
    )

# Show plot
fig.show()
fig.write_image("log_frequency_bins.png", width=width_px, height=height_px, scale=1)