# Scoring of sentences for BabyLM challenge

In [None]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import os
import unicodedata

Test set:

In [None]:
# Toy test set: ten pre-tokenised utterances (punctuation is separated from
# the words by spaces).
example_sentences = [
    "I like dogs .",
    "I like burgers .",
    "Why did you do that ?",
    "What did you do that for ?",
    "The bee stings .",
    "The bee stings me .",
    "Go to the toilet , please !",
    "The doggie is yellow .",
    "My chair is too big .",
    "Who is a big boy ?",
]

In [None]:
# Load one training file: one list entry per line, trailing whitespace removed.
filename = "/path/to/babylm_data/babylm_10M/simple_wikipedia.train"
# Decode as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding (assumes the corpus files are UTF-8 -- TODO confirm).
with open(filename, encoding="utf-8") as file:
    example_sentences = [line.rstrip() for line in file]

### Clean-up test:

Remove control characters (Unicode general category `C*`) from every line:

In [None]:
def remove_control_characters(s):
    """Return ``s`` without characters whose Unicode category starts with "C".

    Every character whose ``unicodedata.category`` begins with the letter
    "C" is dropped; all other characters are kept in their original order.
    """
    kept = []
    for ch in s:
        if not unicodedata.category(ch).startswith("C"):
            kept.append(ch)
    return "".join(kept)

In [None]:
# Clean every loaded line by stripping its control characters.
example_sentences2 = [
    remove_control_characters(sentence) for sentence in example_sentences
]

In [None]:
# Notebook display: inspect the cleaned lines.
example_sentences2

In [None]:
# Split every cleaned line into sentences at each "." and keep the period
# attached to the sentence it terminates.
example_sentences3 = []
d = "."
for line in example_sentences2:
    pieces = line.split(d)
    last = len(pieces) - 1
    # Only fragments that were actually followed by the delimiter get it back;
    # the final fragment (e.g. "Why did you do that ?") had no trailing "."
    # in the input, so none is appended. Empty fragments are dropped.
    s = [piece + d if idx < last else piece
         for idx, piece in enumerate(pieces) if piece]
    example_sentences3.append(s)

In [None]:
# Notebook display: inspect the per-line lists of split sentences.
example_sentences3

In [None]:
# Flatten the per-line sentence lists into one list of sentences.
flat_list = []
for sublist in example_sentences3:
    flat_list.extend(sublist)

In [None]:
# Notebook display: inspect the flattened sentence list.
flat_list

## Scoring functions

In [None]:
def remove_control_characters(s):
    """Return ``s`` without characters whose Unicode category starts with "C".

    Every character whose ``unicodedata.category`` begins with the letter
    "C" is dropped; all other characters are kept in their original order.
    """
    kept = []
    for ch in s:
        if not unicodedata.category(ch).startswith("C"):
            kept.append(ch)
    return "".join(kept)

### Frame frequency

In [None]:
def get_frames(sentence_list):
    """Count the opening frame of every sentence.

    A frame is the first three whitespace-separated tokens of the lowercased
    sentence (fewer if the sentence is shorter), joined by single spaces.

    Args:
        sentence_list: iterable of sentence strings.

    Returns:
        dict mapping each frame to its number of occurrences, with entries
        ordered from most to least frequent.
    """
    frame_counts = Counter(
        ' '.join(text.lower().split()[:3]) for text in sentence_list
    )
    return dict(frame_counts.most_common())

In [None]:
# Frame -> count table built from the currently loaded sentences.
frames = get_frames(example_sentences)

In [None]:
def frame_freq(sentence, frame_dict):
    """Return how often the sentence's opening frame occurs in ``frame_dict``.

    The frame is the first three whitespace-separated tokens of the
    lowercased sentence, joined by single spaces.

    Args:
        sentence: sentence string to look up.
        frame_dict: mapping from frame string to its count.

    Returns:
        The count stored for the sentence's frame, or 0 if the frame does
        not occur in ``frame_dict``.
    """
    frame = ' '.join(sentence.lower().split()[:3])
    # A frame that was never counted has frequency 0; the lookup no longer
    # raises KeyError for sentences outside the counted corpus.
    return frame_dict.get(frame, 0)

In [None]:
# Example: frame frequency of the sentence at index 5.
frame_freq(example_sentences[5], frames)

### utterance length

In [None]:
def utterance_length(sentence):
    """Return the number of words in ``sentence``.

    Characters in ``string.punctuation`` are deleted first, then the
    remaining text is split on whitespace and the pieces are counted.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    words = sentence.translate(punctuation_table).split()
    return len(words)

In [None]:
# Example: word count of the first sentence.
utterance_length(example_sentences[0])

### word length

In [None]:
def mean_word_length(sentence):
    """Return the mean number of characters per word in ``sentence``.

    Characters in ``string.punctuation`` are deleted before the sentence is
    split on whitespace.

    Returns:
        The mean word length, or NaN when the sentence contains no words
        (empty or punctuation-only input).
    """
    stripped = sentence.translate(str.maketrans('', '', string.punctuation))
    lengths = [len(word) for word in stripped.split()]
    if not lengths:
        # np.mean([]) also evaluates to NaN but emits a RuntimeWarning for
        # every such sentence; return the same value explicitly instead.
        return np.nan
    return np.mean(lengths)

In [None]:
# Example: mean word length of the first sentence.
mean_word_length(example_sentences[0])

### word frequency

In [None]:
def get_freqs(sentence_list):
    """Count how often each word occurs across all sentences.

    Each sentence has the characters in ``string.punctuation`` removed, is
    lowercased and split on whitespace; the resulting tokens are counted.

    Args:
        sentence_list: iterable of sentence strings.

    Returns:
        dict mapping each token to its count, ordered from most to least
        frequent.
    """
    # Build the translation table once instead of once per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    counts = Counter()
    for text in sentence_list:
        # Count sentence by sentence: concatenating the whole corpus into
        # one string first re-copies the growing text on every iteration.
        counts.update(text.translate(strip_punctuation).lower().split())
    return dict(counts.most_common())

In [None]:
def average_freq(sentence, freq_dict):
    """Return the mean corpus frequency of the words in ``sentence``.

    The sentence is lowercased, stripped of the characters in
    ``string.punctuation`` and split on whitespace; each token is then
    looked up in ``freq_dict``.

    Args:
        sentence: sentence string to score.
        freq_dict: mapping from lowercase token to its count.

    Returns:
        The mean of the looked-up counts, or NaN when the sentence contains
        no words.
    """
    cleaned = sentence.lower().translate(
        str.maketrans('', '', string.punctuation))
    # Words missing from the table count as frequency 0 instead of raising
    # KeyError, so sentences outside the counted corpus can be scored too.
    word_frequencies = [freq_dict.get(word, 0) for word in cleaned.split()]
    if not word_frequencies:
        # np.mean([]) also evaluates to NaN but emits a RuntimeWarning.
        return np.nan
    return np.mean(word_frequencies)

In [None]:
# Word -> count table built from the currently loaded sentences.
freq_dict = get_freqs(example_sentences)

In [None]:
# Example: mean word frequency of the sentence at index 4.
average_freq(example_sentences[4],freq_dict)

# Full calculation and ranking:

Load data:

In [None]:
# Collect the path of every "*.train" file below the data directory
# (sub-directories included).
file_list = [
    os.path.join(dirpath, name)
    for dirpath, _dirnames, filenames in os.walk('/path/to/babylm_data/babylm_10M/')
    for name in filenames
    if name.endswith(".train")
]

In [None]:
# Notebook display: inspect the discovered training files.
file_list

Load into flat list:

In [None]:
# Read every training file, clean each line and split it into sentences.
example_sentences = []
for filename in file_list:
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding (assumes UTF-8 corpus files --
    # TODO confirm).
    with open(filename, encoding="utf-8") as file:
        # extend() appends in place; rebuilding the list with "+" for every
        # file copies all previously read lines again.
        example_sentences.extend(line.rstrip() for line in file)

# Strip control characters from every line.
example_sentences2 = [
    remove_control_characters(sentence) for sentence in example_sentences
]

# Split each line at "." and keep the period with the sentence it ends.
example_sentences3 = []
d = "."
for line in example_sentences2:
    pieces = line.split(d)
    last = len(pieces) - 1
    # Only fragments that were actually followed by the delimiter get it
    # back; a final fragment such as "Why did you do that ?" had no trailing
    # "." in the input, so none is appended. Empty fragments are dropped.
    s = [piece + d if idx < last else piece
         for idx, piece in enumerate(pieces) if piece]
    example_sentences3.append(s)

# One flat list of sentences across all files.
flat_list = [item for sublist in example_sentences3 for item in sublist]

In [None]:
# Notebook display: inspect the full flattened sentence list.
flat_list

In [None]:
# One row per sentence; the feature columns are added further down.
df = pd.DataFrame(flat_list, columns = ['sentence'])

Calculate values:

In [None]:
# Corpus-wide lookup tables.
frames = get_frames(flat_list)
freq_dict = get_freqs(flat_list)

# One value per sentence for each of the four features, in flat_list order.
frame_freqs = [frame_freq(sentence, frames) for sentence in flat_list]
utt_lengths = [utterance_length(sentence) for sentence in flat_list]
word_lengths = [mean_word_length(sentence) for sentence in flat_list]
word_freqs = [average_freq(sentence, freq_dict) for sentence in flat_list]

Add to data frame:

In [None]:
# Attach each per-sentence feature list as a column of the data frame.
feature_columns = (
    ('frame freq', frame_freqs),
    ('utterance length', utt_lengths),
    ('mean word length', word_lengths),
    ('mean word frequency', word_freqs),
)
for column_name, column_values in feature_columns:
    df[column_name] = column_values

In [None]:
# Notebook display: inspect the data frame with its feature columns.
df

In [None]:
# Standard deviation of the 'mean word frequency' column.
df['mean word frequency'].std()

In [None]:
# Notebook display: rows sorted by frame frequency (the result is not stored).
df.sort_values('frame freq')

In [None]:
# Rows whose utterance length is non-zero, sorted by frame frequency.
nozero = df[df['utterance length'] != 0].sort_values('frame freq')

In [None]:
# Largest mean word length among the sentences with at least one word.
nozero['mean word length'].max()

Rank the sentences on each feature:

In [None]:
# (rank column, feature column, ascending?) for each feature.
# ascending=False gives rank 1 to the largest value (frame / word frequency);
# ascending=True gives rank 1 to the smallest value (utterance / word length).
rank_spec = (
    ('frame rank', 'frame freq', False),
    ('utterance rank', 'utterance length', True),
    ('mean length rank', 'mean word length', True),
    ('mean freq rank', 'mean word frequency', False),
)
for rank_column, feature_column, is_ascending in rank_spec:
    df[rank_column] = df[feature_column].rank(ascending = is_ascending)

In [None]:
# Combined score: the sum of the four per-feature ranks.
df['final rank'] = df['frame rank'] + df['utterance rank'] + df['mean length rank'] + df['mean freq rank']

In [None]:
# Order the sentences by their combined rank.
ordered_df = df.sort_values(by=['final rank'])

In [None]:
# Plain Python list of the sentences in their final order.
ordered_list = ordered_df['sentence'].to_list()

Export:

In [None]:
# Write the ordered sentences to disk, one per line.
# The encoding is set explicitly so that characters outside the platform's
# default encoding can be written regardless of the locale.
with open('ordered_text.txt', 'w', encoding='utf-8') as fp:
    fp.writelines(f"{item}\n" for item in ordered_list)