In [1]:
# we use textstat on each of the dataset files to get readability scores
import textstat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import glob
import re

In [14]:
# get the readability scores for each of the files using rule-based and statistical methods
def get_readability_scores(text):
    textstat.set_lang('en')
    readability_scores = {}
    # reading ease scores (e.g. 90-100 is very easy, 80-89 is easy, etc.)
    readability_scores['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    # grade level scores (e.g. 9.3 is 9th grade level)
    readability_scores['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    readability_scores['gunning_fog'] = textstat.gunning_fog(text)
    readability_scores['smog_index'] = textstat.smog_index(text)
    readability_scores['automated_readability_index'] = textstat.automated_readability_index(text)
    readability_scores['coleman_liau_index'] = textstat.coleman_liau_index(text)
    readability_scores['linsear_write_formula'] = textstat.linsear_write_formula(text)
    readability_scores['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)

    # consensus score based on the above scores
    readability_scores['text_standard'] = textstat.text_standard(text)

    # counts and other metrics
    readability_scores['difficult_words'] = textstat.difficult_words(text)
    readability_scores['spache_readability'] = textstat.spache_readability(text)
    readability_scores['reading_time'] = textstat.reading_time(text)
    readability_scores['sentence_count'] = textstat.sentence_count(text)
    readability_scores['syllable_count'] = textstat.syllable_count(text)
    readability_scores['lexicon_count'] = textstat.lexicon_count(text)
    readability_scores['polysyllabcount'] = textstat.polysyllabcount(text)

    return readability_scores


In [28]:
test_data = """
aboo .
kee .
hey .
smile ?
hm hm .
smile .
hi .
aguh .
mguh !
mguh ?
abuabuabluabuh .
pff phphphphphph .
hm hm hm hm .
mm .
oh ?
say hello .
hm hm .
say mama .
say mama !
what's the matter ?
alright ?
we'll turn that off for now ?
hello .
ah what's that .
what's that ?
hi ?
bang .
you dropped it .
can you make those wheels spin around ?
that's pretty cool .
look at that .
vroom vroom vroom .
bang bang .
you are so father ?
"""

In [41]:
# scores = {file_name} : get_readability_scores(file_name)
scores = {}
for file in glob.glob('data/babylm_10M/*.train'):
    file_name = re.search('data/babylm_10M/(.*).train', file).group(1)
    with open(file, 'r') as f:
        text = f.read()
    scores[file_name] = get_readability_scores(text)

In [57]:
df = pd.DataFrame.from_records(scores).T
df.head()

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard,difficult_words,spache_readability,reading_time,sentence_count,syllable_count,lexicon_count,polysyllabcount
data/babylm_10M/aochildes.train,106.97,0.0,2.76,4.8,1.1,1.68,4.777778,0.61,4th and 5th grade,2887,1.85,21470.1,53923,395761,359002,4632
data/babylm_10M/bnc_spoken.train,89.99,4.5,6.28,7.7,6.0,5.39,32.0,1.1,5th and 6th grade,12013,3.1,52869.78,56414,1033106,854600,35698
data/babylm_10M/cbt.train,86.54,5.8,7.68,7.3,8.0,6.09,8.0,1.41,7th and 8th grade,9278,3.61,30214.89,25996,567460,480937,13841
data/babylm_10M/children_stories.train,83.59,6.9,8.81,7.4,9.2,6.21,9.0,1.56,8th and 9th grade,6343,4.02,21346.13,16012,399998,342767,8769
data/babylm_10M/gutenberg.train,78.59,6.8,7.67,9.7,9.9,8.58,7.142857,1.56,6th and 7th grade,26622,3.62,68727.56,54792,1305557,988779,71932


In [25]:
# plot the dale-chall readability score for each of the files
def plot_dale_chall_standard_scores(df):
    plt.figure(figsize=(20, 10))
    sns.set_style("whitegrid")
    sns.set_context("paper")
    sns.set_palette("Set2")
    ax = sns.barplot(x="dale_chall_readability_score", y="file", data=df)
    ax.set(xlabel="Dale-Chall Standard", ylabel="File")
    plt.show()


In [26]:
plot_dale_chall_standard_scores()

TypeError: Neither the `x` nor `y` variable appears to be numeric.

<Figure size 2000x1000 with 0 Axes>

In [None]:
#using pylatex to create a table of the readability scores
from pylatex import Document, Section, Subsection, Tabular, Math, TikZ, Axis, \
    Plot, Figure, Matrix, Alignat, NoEscape, MiniPage
from pylatex.utils import italic, bold

def create_readability_table(df):
    # Create a document
    geometry_options = {"tmargin": "1cm", "lmargin": "1cm"}
    doc = Document(geometry_options=geometry_options)
    

