In [175]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from textstat.textstat import textstat

from scipy import stats

In [7]:
# run the sample from the website.
test_data = """Playing games has always been thought to be important to the development of well-balanced and creative children; however, what part, if any, they should play in the lives of adults has never been researched that deeply. I believe that playing games is every bit as important for adults as for children. Not only is taking time out to play games with our children and other adults valuable to building interpersonal relationships but is also a wonderful way to release built up tension."""


In [11]:
print(textstat.flesch_reading_ease(test_data))
print(textstat.smog_index(test_data))
print(textstat.flesch_kincaid_grade(test_data))
print(textstat.coleman_liau_index(test_data))
print(textstat.automated_readability_index(test_data))
print(textstat.dale_chall_readability_score(test_data))
print(textstat.difficult_words(test_data))
print(textstat.linsear_write_formula(test_data))
print(textstat.gunning_fog(test_data))
print(textstat.text_standard(test_data))

52.23
12.5
12.8
11.61
15.5
7.49
13
13.833333333333334
19.26146341463415
12th and 13th grade


In [15]:
# See list of specific books at https://tinyurl.com/yczvzuzk
# I hand-trimmed out front and back matter, and also threw out 1966 and 2000 since my cursory glance identified
# them as collections of short stories, which aren't relevant to the experimental hypothesis.

base_dir = (
    '/mnt/Windows/Downloads/ff6/pulitzer_fiction/'
    'Pulitzer Prize Winners Fiction eBooks Collection - '
    '60 Large ebooks collections From 1920-2013 (ePub, Mobi)/'
    'working/by_year/trimmed/'
)

In [24]:
dat = {}
for year in range(1918, 2013+1):
    # try to grab text of that book
    try:
        with open(base_dir + str(year) + '.txt', 'r') as f:
            file_dat = f.read()
            file_dat = file_dat.replace('\n', '')
            dat[ year ] = file_dat
    except Exception as e:
        pass


In [25]:
len(dat) # did imports work?

53

In [33]:
# function grabbing a specified chunk of a book, but cutting at word boundary
def slice_frac(in_string, start_frac, end_frac):
    start_pos = int(len(in_string) * 1.0 * start_frac)
    while start_pos < len(in_string) and not in_string[start_pos].isspace():
        start_pos += 1
    
    end_pos = int(len(in_string) * 1.0 * end_frac)
    while end_pos < len(in_string) and not in_string[end_pos].isspace():
        end_pos += 1
    
    
    return in_string[ start_pos : end_pos ]

In [153]:
# modify text_standard to return a numeric value for aggregation purposes.
def text_standard_modified(string_in):
    raw = textstat.text_standard(string_in)
    return [
        int(''.join(filter(str.isdigit, x)))
        for x in raw.split()
        if ''.join(filter(str.isdigit, x)) != ''
    ][-1]

readability_measures = {
    "flesch_reading_ease": textstat.flesch_reading_ease,
    "smog_index": textstat.smog_index,
    "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
    "coleman_liau_index": textstat.coleman_liau_index,
    "automated_readability_index": textstat.automated_readability_index,
    "dale_chall_readability_score": textstat.dale_chall_readability_score,
    "difficult_words": textstat.difficult_words,
    "linsear_write_formula": textstat.linsear_write_formula,
    "gunning_fog": textstat.gunning_fog,
    "text_standard": text_standard_modified,
}

In [154]:
def readability_by_chunks(num_chunks):
    results = {}
    for year in dat:
        print(year)
        results[year] = {}

        for chunk in range(num_chunks):
            results[year][chunk] = {}
            chunk_start = chunk / num_chunks
            chunk_end = (chunk+1) / num_chunks

            for measure_name, measure_function in readability_measures.items():
                results[year][chunk][measure_name] = measure_function(
                    slice_frac(
                        dat[year],
                        chunk_start,
                        chunk_end
                    )
                )
                
    return results

In [155]:
results = {}

In [157]:
for nc in [5, 4, 3, 2, 1]:
    results[nc] = readability_by_chunks(nc)

1921
1922
1923
1926
1928
1932
1937
1939
1940
1945
1947
1948
1950
1952
1953
1955
1958
1961
1965
1966
1967
1968
1972
1973
1976
1979
1980
1981
1982
1983
1984
1985
1986
1988
1989
1991
1992
1994
1995
1996
1998
1999
2000
2003
2004
2006
2007
2008
2009
2011
2013
1918
1919
1921
1922
1923
1926
1928
1932
1937
1939
1940
1945
1947
1948
1950
1952
1953
1955
1958
1961
1965
1966
1967
1968
1972
1973
1976
1979
1980
1981
1982
1983
1984
1985
1986
1988
1989
1991
1992
1994
1995
1996
1998
1999
2000
2003
2004
2006
2007
2008
2009
2011
2013
1918
1919
1921
1922
1923
1926
1928
1932
1937
1939
1940
1945
1947
1948
1950
1952
1953
1955
1958
1961
1965
1966
1967
1968
1972
1973
1976
1979
1980
1981
1982
1983
1984
1985
1986
1988
1989
1991
1992
1994
1995
1996
1998
1999
2000
2003
2004
2006
2007
2008
2009
2011
2013
1918
1919
1921
1922
1923
1926
1928
1932
1937
1939
1940
1945
1947
1948
1950
1952
1953
1955
1958
1961
1965
1966
1967
1968
1972
1973
1976
1979
1980
1981
1982
1983
1984
1985
1986
1988
1989
1991
1992
1994
1995
1996
1998


In [112]:
def avg(li):
    return sum(li) / len(li)

In [214]:
def aggregate(num_chunks):
    for measure_name in readability_measures:
        diffs = [
            results[num_chunks][year][num_chunks-1][measure_name] - results[num_chunks][year][0][measure_name]
            for year in results[num_chunks]
        ]
        mean = avg( diffs )
        sample_var = 1.0 / (len(diffs) - 1) * sum([(d - mean)**2 for d in diffs])
        sample_stddev = sample_var ** (.5)

        # what's the probability that the end is >= as complex as the start?
        p_times_two = stats.ttest_1samp(diffs, 0.0).pvalue
        p = p_times_two / 2  # ttest_1samp gives 2-sided p; we want 1-sided.
        print(
            '{measure_name}:\n'
            '\tSample mean of (complexity of end - complexity of beginning): {mean}\n'
            '\tProbability that true mean is NOT on the same side of 0.0: {p}\n'.format(
                measure_name=measure_name,
                mean=mean,
                p=p,
            ),
            end='',
        )
        

In [216]:
aggregate(num_chunks=5)

difficult_words:
	Sample mean of (complexity of end - complexity of beginning): -90.83018867924528
	Probability that true mean is NOT on the same side of 0.0: 0.03181393040556373
gunning_fog:
	Sample mean of (complexity of end - complexity of beginning): -0.7691501709055725
	Probability that true mean is NOT on the same side of 0.0: 0.00012868179169424706
smog_index:
	Sample mean of (complexity of end - complexity of beginning): -0.43396226415094324
	Probability that true mean is NOT on the same side of 0.0: 8.033031247232298e-05
linsear_write_formula:
	Sample mean of (complexity of end - complexity of beginning): -3.8540230340937884
	Probability that true mean is NOT on the same side of 0.0: 0.00046002492603910386
flesch_kincaid_grade:
	Sample mean of (complexity of end - complexity of beginning): -0.6905660377358488
	Probability that true mean is NOT on the same side of 0.0: 0.0010848608430562442
coleman_liau_index:
	Sample mean of (complexity of end - complexity of beginning): -0.35

In [217]:
aggregate(num_chunks=4)

difficult_words:
	Sample mean of (complexity of end - complexity of beginning): -74.56603773584905
	Probability that true mean is NOT on the same side of 0.0: 0.10358071076359542
gunning_fog:
	Sample mean of (complexity of end - complexity of beginning): -0.5993359377715477
	Probability that true mean is NOT on the same side of 0.0: 0.0009281935858784745
smog_index:
	Sample mean of (complexity of end - complexity of beginning): -0.34528301886792456
	Probability that true mean is NOT on the same side of 0.0: 0.00043825402857421765
linsear_write_formula:
	Sample mean of (complexity of end - complexity of beginning): -3.3930718128831336
	Probability that true mean is NOT on the same side of 0.0: 0.0017018207675137589
flesch_kincaid_grade:
	Sample mean of (complexity of end - complexity of beginning): -0.5056603773584907
	Probability that true mean is NOT on the same side of 0.0: 0.005619256590865399
coleman_liau_index:
	Sample mean of (complexity of end - complexity of beginning): -0.2915

In [218]:
aggregate(num_chunks=3)

difficult_words:
	Sample mean of (complexity of end - complexity of beginning): -99.45283018867924
	Probability that true mean is NOT on the same side of 0.0: 0.0696379109056587
gunning_fog:
	Sample mean of (complexity of end - complexity of beginning): -0.49660277978671796
	Probability that true mean is NOT on the same side of 0.0: 0.0020787189925936325
smog_index:
	Sample mean of (complexity of end - complexity of beginning): -0.28490566037735837
	Probability that true mean is NOT on the same side of 0.0: 0.0009226263891264874
linsear_write_formula:
	Sample mean of (complexity of end - complexity of beginning): -3.0085454439505495
	Probability that true mean is NOT on the same side of 0.0: 0.01767221592930925
flesch_kincaid_grade:
	Sample mean of (complexity of end - complexity of beginning): -0.47547169811320755
	Probability that true mean is NOT on the same side of 0.0: 0.0056281006608893975
coleman_liau_index:
	Sample mean of (complexity of end - complexity of beginning): -0.29981

In [219]:
aggregate(num_chunks=2)

difficult_words:
	Sample mean of (complexity of end - complexity of beginning): -101.45283018867924
	Probability that true mean is NOT on the same side of 0.0: 0.046660429009561646
gunning_fog:
	Sample mean of (complexity of end - complexity of beginning): -0.2739327285911698
	Probability that true mean is NOT on the same side of 0.0: 0.0158647110204746
smog_index:
	Sample mean of (complexity of end - complexity of beginning): -0.15849056603773568
	Probability that true mean is NOT on the same side of 0.0: 0.005911514884857048
linsear_write_formula:
	Sample mean of (complexity of end - complexity of beginning): -4.1168009401499965
	Probability that true mean is NOT on the same side of 0.0: 9.790424617775001e-05
flesch_kincaid_grade:
	Sample mean of (complexity of end - complexity of beginning): -0.2490566037735851
	Probability that true mean is NOT on the same side of 0.0: 0.049671547475266246
coleman_liau_index:
	Sample mean of (complexity of end - complexity of beginning): -0.1779245