In [None]:
import os
import sys
import json

import re
import statistics as stats
from collections import Counter

directory="../review_json/"

def load_reviews(directory):
    """Load every review file in ``directory`` and return the reviews as dicts.

    Each entry of ``directory`` is parsed as one JSON document. The parsed
    reviews are ordered by the length of their ``'text'`` field, the five
    shortest are discarded, and the ``'by'`` field is removed from each of
    the remaining reviews.

    Args:
        directory: Path of the folder that holds the review files. A trailing
            path separator is optional.

    Returns:
        list: The remaining review dicts, shortest ``'text'`` first.

    Raises:
        OSError: If the directory or one of its entries cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a review has no ``'text'`` field.
    """
    reviews = []
    # Iterate in sorted order so the result (including which of several
    # equally short reviews get dropped) does not depend on the arbitrary
    # order in which the OS lists the directory.
    for file_name in sorted(os.listdir(directory)):
        # os.path.join works with or without a trailing separator, and the
        # context manager closes each file handle as soon as it is parsed.
        with open(os.path.join(directory, file_name), 'r') as review_file:
            reviews.append(json.load(review_file))
    reviews.sort(key=lambda r: len(r['text'])) # sort by length of review
    reviews = reviews[5:] # get rid of 5 shortest reviews
    for r in reviews:
        r.pop('by', None) # remove 'by' field if present; tolerate reviews without it
    return reviews

def display(r):
    """Print a formatted, human-readable summary of one review.

    Args:
        r: A review mapping with the keys 'text', 'title', 'release_year',
            'running_time', 'mpaa_rating', 'genres', 'star_rating',
            'date_published' and 'url'. 'genres' is joined with ', ';
            an empty or missing-value 'genres' is shown as 'N/A'.

    Returns:
        None. The summary is written to standard output.
    """
    layout = """
    Review: %s
    ------------------------------
    Title: %s
    Release year: %s
    Running time: %s
    MPAA rating: %s
    Genres: %s
    Star rating: %s
    Date Published: %s
    Review URL: %s
    """
    # Collect the values in the same order as the placeholders above.
    values = [r[field] for field in
              ('text', 'title', 'release_year', 'running_time', 'mpaa_rating')]
    genre_names = r['genres']
    values.append(', '.join(genre_names) if genre_names else 'N/A')
    values.extend(r[field] for field in ('star_rating', 'date_published', 'url'))
    print(layout % tuple(values))

# Build the working collections: all reviews in release order, plus a lookup by title.
reviews = load_reviews(directory)
reviews.sort(key=lambda review: review['release_year']) # order by release_year, ascending
titles = dict((review['title'], review) for review in reviews) # title -> review; a repeated title keeps the later review

####

In [None]:
mr = titles['Minority Report (2002)'] # the Minority Report review
display(mr) # print the formatted review

In [None]:
list(mr.keys()) # see review fields

In [None]:
list(mr.items()) # view key-value pairs

In [None]:
# Inspecting the text with built-in Python functions
mr_text = mr['text']
print(mr_text)

In [None]:
len(mr_text) # character count

In [None]:
mr_text.count('Pre-Cogs')

In [None]:
mr_text.count('Anderton')

In [None]:
mr_text.count('the') # 78 occurrences

In [None]:
print(mr_text.lower()) # normalize letter casing

In [None]:
mr_text.lower().count('the') # but 88 occurrences in lowercased text

In [None]:
print(mr_text.upper())

In [None]:
mr_text.upper().count('THE') # and 88 occurrences in uppercased text

In [None]:
# Inspect paragraphs
print(mr_text)

In [None]:
mr_paras = mr_text.split('\n\n')
len(mr_paras)

In [None]:
mr_paras[0]

In [None]:
mr_paras[-1]

In [None]:
mr_paras.sort(key=lambda p:len(p)) # sort by character length of paragraph
mr_paras[0]

In [None]:
mr_paras[-1]

In [None]:
len(mr_paras[0])

In [None]:
len(mr_paras[-1])

In [None]:
# Inspect sentences
mr_sents = mr_text.split('. ')
mr_sents[:5] # uh oh--newlines are not properly split

In [None]:
mr_sents = mr_text.split('.')
mr_sents[:5] # looks good

In [None]:
mr_sents[-5:] # looks good

In [None]:
mr_sents[18:22] # ellipsis results in error!

In [None]:
mr_text[3000:3200] # the original text

In [None]:
# import re
# raw string: backslashes go to the regex engine untouched (no invalid-escape warning)
mr_sents = re.split(r'[\.\!\?]\s', mr_text) # but we lose that punctuation information
mr_sents

In [None]:
# raw string: backslashes go to the regex engine untouched (no invalid-escape warning)
mr_sents = re.split(r'([^\.\!\?]+[\.\!\?]\s+)', mr_text) # group sentences
mr_sents

In [None]:
mr_sents[44:46] # but what about quotation marks and parentheticals?

In [None]:
mr_sents = [sent for sent in mr_sents if sent] # clear empty strings
len(mr_sents)

In [None]:
mr_sents.sort(key=lambda s:len(s)) # sort by sentence character length
mr_sents[:10] # shortest sentences

In [None]:
mr_sents[-3:] # longest sentences

In [None]:
len(mr_sents[0]) # character length of shortest sentence

In [None]:
len(mr_sents[-1]) # character length of longest sentence

In [None]:
sent_lengths = [len(sent) for sent in mr_sents]
# import statistics as stats
stats.mean(sent_lengths) # 132 characters

In [None]:
stats.median(sent_lengths) # 131 characters

In [None]:
stats.stdev(sent_lengths) # 68 characters
# this is setup for computing reading level

In [None]:
# Inspect tokens (words and punctuation)
mr_text.split() # simplest possible approach to English tokenization

In [None]:
mr_text.split(' ') # error is that punctuation and whitespaces glom to words

In [None]:
# Splitting with regular expressions
# import re
re.split(' ', mr_text)

In [None]:
re.split(r'\s', mr_text) # splits on whitespace characters (raw string keeps '\s' intact for the regex)

In [None]:
# raw string keeps '\W' intact for the regex; the capture group keeps the separators in the result
tokens = re.split(r'(\W)', mr_text) # groups on non-word characters
tokens

In [None]:
tokens = [tok for tok in tokens if tok.strip()] # filters spaces
tokens

In [None]:
len(tokens) # 1391

In [None]:
tokens.sort(key=lambda tok:len(tok)) # sort by token length
tokens[:10] # shortest tokens

In [None]:
tokens[-10:] # longest tokens

In [None]:
# from collections import Counter
token_counts = Counter(tokens) # get counts of each token
token_counts.most_common(50) # see most common tokens

In [None]:
token_counts['film']

In [None]:
token_counts['movie']

In [None]:
token_counts['scene']

In [None]:
token_counts['sequence']

In [None]:
token_counts['character']

In [None]:
sorted(tokens) # alphabetical list of all tokens

In [None]:
token_set = set(tokens) # unique tokens
sorted(token_set) # alphabetical list of unique tokens

In [None]:
len(tokens) # count of all tokens

In [None]:
len(token_set) # count of unique tokens

In [None]:
len(token_set) / len(tokens) # lexical diversity 0.3989935298346513

In [None]:
len(tokens) / len(token_set) # average token occurrences (2.5) misleading

In [None]:
# import statistics as stats
stats.median(token_counts.values()) # 1 (most tokens appear once)

In [None]:
max(token_counts.values()) # 69 (but some tokens occur very frequently)

In [None]:
token_lengths = [len(token) for token in tokens] # token character lengths
min(token_lengths) # 1

In [None]:
max(token_lengths) # 15

In [None]:
stats.mean(token_lengths) # 4.020848310567937

In [None]:
stats.median(token_lengths) # 3

In [None]:
stats.mode(token_lengths) # 1

In [None]:
stats.stdev(token_lengths) # 2.6701649504450606

In [None]:
sorted(token_lengths)[-20:] # character lengths of 20 longest words

In [None]:
# Finale: compute reading level / text complexity
# https://en.wikipedia.org/wiki/Readability
# Most algorithms use word lists and syllable counts
# Ours: "The Code4Lib Readability Index" uses no extra data
# (patterns are raw strings so '\W', '\.' and '\s' reach the regex engine
# unchanged instead of being invalid Python escape sequences)
words = [tok for tok in re.split(r'(\W)', mr_text) if tok.isalpha()] # alphabetic tokens only
sents = re.split(r'[\.\!\?]\s', mr_text) # split at ., ! or ? followed by whitespace
tokenized_sents = []
# let's create a list of tokenized sentences
for sent in sents:
    sent_tokens = [tok for tok in re.split(r'(\W)', sent) if tok.isalpha()]
    tokenized_sents.append(sent_tokens)

In [None]:
tokenized_sents[0]

In [None]:
tokenized_sents[1]

In [None]:
tokenized_sents[-1]

In [None]:
avg_sent_length = stats.mean([len(sent) for sent in tokenized_sents]) # 24 words
avg_sent_length

In [None]:
avg_word_length = stats.mean([len(word) for word in words]) # 4.6 characters
avg_word_length

In [None]:
def leveler(avg_word_length, avg_sent_length):
    """Estimate a reading level from two averages.

    The score is linear: a base of -5, plus 2 for every unit of average
    word length, plus 0.25 for every unit of average sentence length.

    Args:
        avg_word_length: Average word length (this notebook measures it in
            characters per word).
        avg_sent_length: Average sentence length (this notebook measures it
            in words per sentence).

    Returns:
        The numeric reading-level score.
    """
    word_component = 2*avg_word_length
    sentence_component = 0.25*avg_sent_length
    return -5 + word_component + sentence_component

In [None]:
# let's test our leveler before applying it to our Minority Report text
# big words, long sentences
leveler(5, 30) # 12.5

In [None]:
# smaller words, long sentences
leveler(4, 30) # 10.5

In [None]:
# medium words, medium sentences
leveler(4,20) # 8.0

In [None]:
# short words, medium sentences
leveler(3,20) # 6.0

In [None]:
# short words, short sentences
leveler(3,10) # 3.5

In [None]:
# Minority Report: (4.6, 24)
leveler(avg_word_length, avg_sent_length) # 10.147311027415258

In [None]:
# let's turn this into a function that accepts a text input
def leveler(text):
    """Estimate the reading level of ``text``.

    The text is split into sentences wherever '.', '!' or '?' is followed by
    a whitespace character, and each sentence is reduced to its purely
    alphabetic tokens. The score is
    ``-5 + 2 * (average word length in characters)
         + 0.25 * (average sentence length in words)``.

    Note: this definition rebinds the name ``leveler``; the earlier
    two-argument version in this notebook is no longer callable afterwards.

    Args:
        text: The string to score.

    Returns:
        The numeric reading-level score.

    Raises:
        statistics.StatisticsError: If ``text`` contains no alphabetic
            words, because the average word length is then undefined.
    """
    # Raw strings keep the regex backslashes literal; in ordinary strings
    # '\s', '\W', '\.' etc. are invalid escape sequences that newer Python
    # versions warn about.
    sents = re.split(r'[\.\!\?]\s', text)
    tokenized_sents = [
        [tok for tok in re.split(r'(\W)', sent) if tok.isalpha()]
        for sent in sents
    ]
    # The sentence delimiter consists only of non-word characters, so no word
    # is ever cut in two: flattening the per-sentence tokens yields the same
    # words as tokenizing the whole text again.
    words = [word for sent_tokens in tokenized_sents for word in sent_tokens]
    avg_sent_length = stats.mean([len(sent_tokens) for sent_tokens in tokenized_sents]) # words per sentence
    avg_word_length = stats.mean([len(word) for word in words]) # characters per word
    reading_level = -5 + 2*avg_word_length + 0.25*avg_sent_length
    return reading_level

In [None]:
leveler(mr_text) # yes, 10.147311027415258

In [None]:
# let's sort our reviews by reading level
reviews.sort(key=lambda r:leveler(r['text']))

In [None]:
leveler(reviews[0]['text']) # 4.966795091324202

In [None]:
display(reviews[0]) # short sentences with simple language

In [None]:
display(reviews[1]) # repartee dialogue

In [None]:
leveler(reviews[-1]['text']) # 28.858155204460967

In [None]:
display(reviews[-1]) # erroneous commas at the end of each paragraph

In [None]:
reviews[-1]['url'] # visit http://www.rogerebert.com/reviews/switch-1991

In [None]:
leveler(reviews[-2]['text']) # 18.722480395004354

In [None]:
display(reviews[-2]) # long, complex sentences