### Ideas worked upon: 


1. Length of story: the longer the story, the harder it is to understand.
2. Length of sentence: the longer the sentence, the harder it is to understand.
3. Length of words: the longer the words, the harder they are to understand.
4. Frequency of words: the lower a word's frequency, the less familiar the word is likely to be.

Based on these data points, we can compute existing readability indexes such as the Gunning fog index or the Flesch–Kincaid indexes, or create our own index based on value ranges, to decide how difficult a story is.

In [1]:
import pandas as pd
import numpy as np
# Show (almost) the full story text in DataFrame output instead of truncating it.
pd.set_option('display.max_colwidth', 10000)

In [2]:
import glob   
from statistics import mean

In [3]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [4]:
import re

In [5]:
from nltk.stem import WordNetLemmatizer 
# Single shared lemmatizer instance, reused by the word-frequency cells further down.
lemmatizer = WordNetLemmatizer() 

In [6]:
# English stop words held in a set so the membership tests in later cells are cheap.
stop_words = {*stopwords.words('english')}
len(stop_words)

179

#### Path to the location where the story files are stored

In [7]:
# Glob pattern matching every story text file.
path = './Story text files/*.txt'
# glob.glob returns matches in a filesystem-dependent order; sorting makes
# positional lookups (e.g. files[359]) and the DataFrame row order reproducible
# across machines and re-runs.
files = sorted(glob.glob(path))

In [8]:
files[359]

'./Story text files/541.txt'

#### Reading the files

In [9]:
# Read every story into memory, in the same order as `files`, so that
# data[i] is the content of files[i].
data = []
for fileToRead in files:
    # The stories contain non-ASCII punctuation (curly quotes, em dashes), so
    # the encoding is stated explicitly instead of relying on the platform
    # default. NOTE(review): assumes the files are UTF-8 encoded - confirm.
    with open(fileToRead, 'r', encoding='utf-8') as file:
        data.append(file.read())

In [10]:
data[105]

''

In [11]:
# One row per story; the single text column is auto-named 0 until the columns are renamed.
dataDf = pd.DataFrame(data)

In [12]:
# `data` was filled by iterating over `files`, so the two line up row for row.
dataDf['fileName'] = files

In [13]:
# Positional rename: first column is the story text, second is its source path.
dataDf.columns = ['Text', 'fileName']

In [14]:
# Matches a single digit or a hyphen; used to strip both from the story text.
# Note that removing hyphens fuses hyphenated words (e.g. "ten-year-old" -> "tenyearold").
regexp = re.compile('[0-9]|-')

In [15]:
# Remove every digit and hyphen matched by `regexp` from each story.
dataDf.loc[:, 'Text'] = dataDf['Text'].map(lambda story: regexp.sub('', story))

In [16]:
# Story length measured in characters of the cleaned text.
dataDf.loc[:, 'StoryLength'] = dataDf['Text'].map(len)

##### Some files were empty, so the corresponding rows are removed

In [17]:
# Drop, in place, the rows whose story has no characters at all.
dataDf.drop(index=dataDf.loc[dataDf['StoryLength'] == 0].index, inplace=True)

In [18]:
min(dataDf.StoryLength)

13

In [19]:
dataDf[dataDf.StoryLength==13]

Unnamed: 0,Text,fileName,StoryLength
359,BOW!MEOW!WOW!,./Story text files/541.txt,13


In [20]:
dataDf.Text.head().apply(lambda x: [j for j in x.split() if j.lower() not in stop_words])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  [Four, friends, want, race, toys.Veen, green, car., Meena, yellow, auto.Sanju, red, bus., Manju, blue, lorry.Ready, ,, GO!, comes, Lucky., Haha,, Lucky, joined, race.Look,, look!, Lucky, fastest, all., Lucky, wins, race.]
1                 [Rabbit, sleeping, apple, tree., apple, fell, branch.A, voice, said,, "Run, Rabbit, run!, ", woke, quickly, ran, away, great, speed.She, met, Chicken., "Why, running?, ", asked, Chicken., Rabbit, replied,, "I, kno

In [21]:
# Dropping the empty stories left gaps in the index; renumber the rows 0..n-1
# and discard the old index instead of keeping it as a column.
dataDf.reset_index(drop=True, inplace = True)

#### Word Length Features

In [48]:
# Character length of every non-stop-word token in each story.
# Tokens are split on '.', ' ', '!' and '?'. Adjacent delimiters (". ", or a
# trailing ".") make re.split emit empty strings; those are skipped so that
# they are not counted as zero-length "words", which would drag down the
# average and inflate the small-word count.
# The pattern is a raw string so the backslash escapes reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
# Falls back to [0] when no token survives, so downstream max()/mean() stay defined.
dataDf['wordLength'] = dataDf.Text.apply(
    lambda x: [len(j) for j in re.split(r'\.|\ |\!|\?', x) if j and j.lower() not in stop_words] or [0]
)

In [49]:
# Length of the longest word in each story.
dataDf['maxWordLength'] = dataDf['wordLength'].map(max)

In [50]:
# Mean word length in each story.
dataDf['avgWordLength'] = dataDf['wordLength'].map(mean)

In [51]:
# Split words at 8 characters: shorter ones count as "small", the rest as
# "large", on the premise that longer words may be harder to read.
dataDf['smallWords'] = dataDf['wordLength'].map(lambda lengths: sum(1 for n in lengths if n < 8))
dataDf['largeWords'] = dataDf['wordLength'].map(lambda lengths: sum(1 for n in lengths if n >= 8))

#### Sentence Length Character Features

In [52]:
# Character length of every sentence in each story, where a sentence is a
# segment between '!', '.' and '?'.
# Segments that are empty or whitespace-only (after the final terminator, or
# between consecutive terminators such as "?!") are skipped so they are not
# counted as zero-length sentences.
# The pattern is a raw string to avoid Python's invalid-escape-sequence warning.
# Falls back to [0] when no sentence remains, so downstream max()/mean() stay defined.
dataDf['sentenceLengthChar'] = dataDf.Text.apply(
    lambda x: [len(j) for j in re.split(r"!|\.|\?", x) if j.strip()] or [0]
)

In [53]:
# Longest sentence of each story, in characters.
dataDf['maxSentenceLengthChar'] = dataDf['sentenceLengthChar'].map(max)

In [54]:
# Mean sentence length of each story, in characters.
dataDf['avgSentenceLengthChar'] = dataDf['sentenceLengthChar'].map(mean)

In [55]:
# Split sentences at 50 characters: shorter ones count as "small", the rest as
# "large", on the premise that longer sentences may be harder to read.
dataDf['smallSentenceChar'] = dataDf['sentenceLengthChar'].map(lambda lengths: sum(1 for n in lengths if n < 50))
dataDf['largeSentenceChar'] = dataDf['sentenceLengthChar'].map(lambda lengths: sum(1 for n in lengths if n >= 50))

#### Sentence Length Word Features

In [56]:
# Number of words in every sentence of each story, where a sentence is a
# segment between '!', '.' and '?'.
# Empty or whitespace-only segments are skipped; previously each of them was
# reported as a one-word sentence. Words are counted with str.split() rather
# than "spaces + 1", which overcounted any sentence starting with a space
# (i.e. every sentence that follows ". ").
# The pattern is a raw string to avoid Python's invalid-escape-sequence warning.
# Falls back to [0] when no sentence remains, so downstream max()/mean() stay defined.
dataDf['sentenceLengthWord'] = dataDf.Text.apply(
    lambda x: [len(j.split()) for j in re.split(r"!|\.|\?", x) if j.strip()] or [0]
)

In [57]:
# Longest sentence of each story, in words.
dataDf['maxSentenceLengthWord'] = dataDf['sentenceLengthWord'].map(max)

In [58]:
# Mean sentence length of each story, in words.
dataDf['avgSentenceLengthWord'] = dataDf['sentenceLengthWord'].map(mean)

In [59]:
# Split sentences at 9 words: shorter ones count as "small", the rest as
# "large", on the premise that longer sentences may be harder to read.
dataDf['smallSentenceWord'] = dataDf['sentenceLengthWord'].map(lambda counts: sum(1 for n in counts if n < 9))
dataDf['largeSentenceWord'] = dataDf['sentenceLengthWord'].map(lambda counts: sum(1 for n in counts if n >= 9))

#### Finding rare words in the text documents can help us decide which words are likely to be less familiar, i.e. occur fewer times across the corpus

In [74]:
# Corpus-wide frequency table keyed by lemmatized token.
# Each story contributes at most once per distinct surface token (tokens are
# de-duplicated with set() before lemmatizing), so the counts approximate the
# number of stories a word appears in rather than its total occurrences.
# Tokens keep their original casing and stop words are NOT excluded here.
wordFreq = {}
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for _, storyText in dataDf.Text.items():
    for word in set(re.split(r"\.|\!|\?|\ |\'|\\|\"|\,", storyText)):
        lemmatizedWord = lemmatizer.lemmatize(word)
        # re.split yields '' between adjacent delimiters; skip those.
        if lemmatizedWord:
            wordFreq[lemmatizedWord] = wordFreq.get(lemmatizedWord, 0) + 1

In [75]:
lemmatizer.lemmatize('being').lower() in stop_words

True

In [76]:
wordFreq['being']

93

In [77]:
sorted(wordFreq.items(), key = lambda item: item[1]) 

[('Meena', 1),
 ('Veen', 1),
 ('lorry', 1),
 ('Lucky', 1),
 ('starte', 1),
 ('Horse', 1),
 ('grandaunt', 1),
 ('“Punch', 1),
 ('gnashing', 1),
 ('snivelling', 1),
 ('behaving', 1),
 ('Sniffles', 1),
 ('Sniffles’', 1),
 ('sniffling', 1),
 ('punch', 1),
 ('men’s', 1),
 ('acted', 1),
 ('“Men', 1),
 ('Punch', 1),
 ('whipping', 1),
 ('feeler', 1),
 ('menacingly', 1),
 ('next—can', 1),
 ('numbers—let’s', 1),
 ('days*', 1),
 ('pacing', 1),
 ('“Wha', 1),
 ('cured', 1),
 ('bloomed', 1),
 ('minimum', 1),
 ('factors**', 1),
 ('Koya', 1),
 ('LCM', 1),
 ('b—is', 1),
 ('Mathematics', 1),
 ('Midu', 1),
 ('factorization', 1),
 ('hmmm', 1),
 ('lunar', 1),
 ('Pakshipur', 1),
 ('”Baku’s', 1),
 ('Midu’s', 1),
 ('one—the', 1),
 ('tenyearold', 1),
 ('cycle—that', 1),
 ('Baku', 1),
 ('vadapav', 1),
 ('(LCM)', 1),
 ('“Baku', 1),
 ('Lowest', 1),
 ('average', 1),
 ('**If', 1),
 ('pavs', 1),
 ('Multiple', 1),
 ('(Hint:', 1),
 ('coincide', 1),
 ('Neelumbera', 1),
 ('Jauna', 1),
 ('Joseph', 1),
 ('yarn', 1),
 ('Ge

In [78]:
text = "I am being shy"
[wordFreq[lemmatizer.lemmatize(j)] for j in text.split()]

[389, 148, 93, 10]

In [81]:
def _corpus_frequencies(story):
    """Return the corpus frequency of each token of `story` that is longer
    than one character and is not a stop word, in order of appearance."""
    tokens = re.split(r"\.|\!|\?|\ |\'|\\|\"|\,", story)
    return [
        wordFreq[lemmatizer.lemmatize(token)]
        for token in tokens
        if len(token) > 1 and token.lower() not in stop_words
    ]


dataDf['wordFreqCorpus'] = dataDf['Text'].map(_corpus_frequencies)

In [82]:
# Words with a corpus frequency below 10 are treated as less familiar;
# everything at or above 10 counts as familiar.

dataDf['familiarWords'] = dataDf['wordFreqCorpus'].map(lambda freqs: sum(1 for f in freqs if f >= 10))
dataDf['nonFamiliarWords'] = dataDf['wordFreqCorpus'].map(lambda freqs: sum(1 for f in freqs if f < 10))



#### Syllable Count

In [85]:
# Syllable count helps us in deciding if the word is difficult to understand

def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of vowels (a, e, i, o, u, y, in either case) counts as one
    syllable. A trailing "e" that forms its own vowel group (i.e. directly
    follows a consonant, as in "home") is treated as silent and not counted.

    Args:
        word: The token to analyse. Punctuation is not stripped.

    Returns:
        0 for an empty string, otherwise an estimate of at least 1.
    """
    if len(word) == 0:
        return 0
    count = 0
    vowels = "aeiouyAEIOUY"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        # Count only the first vowel of each run of consecutive vowels.
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    # Silent final "e": checked case-insensitively so "HOME" and "home" agree,
    # and only when the "e" follows a consonant. If it follows a vowel
    # ("agree", "canoe") it was never counted separately, so subtracting it
    # would remove a real syllable.
    if len(word) > 1 and word[-1] in "eE" and word[-2] not in vowels:
        count -= 1
    # Every non-empty word has at least one syllable (e.g. "the").
    if count == 0:
        count += 1
    return count



In [86]:
# Estimated syllable count of every non-stop-word token in each story.
# Tokens are split on '!', '.', '?' and ' '. Empty strings produced by adjacent
# delimiters are skipped so they are not recorded as zero-syllable "words",
# which would drag down the average and inflate the small-syllable count.
# The pattern is a raw string to avoid Python's invalid-escape-sequence warning.
# Falls back to [0] when no token survives, so downstream max()/mean() stay defined.
dataDf['syllableCount'] = dataDf.Text.apply(
    lambda x: [syllable_count(j) for j in re.split(r"!|\.|\?|\ ", x) if j and j.lower() not in stop_words] or [0]
)

In [87]:
# Highest per-word syllable count in each story.
dataDf['maxSyllableCount'] = dataDf['syllableCount'].map(max)

In [88]:
# Mean per-word syllable count in each story.
dataDf['avgSyllableCount'] = dataDf['syllableCount'].map(mean)

In [89]:
# Split words at 3 syllables: fewer counts as "small", 3 or more as "large",
# as another indicator of how complex the text is.
dataDf['smallSyllableCount'] = dataDf['syllableCount'].map(lambda counts: sum(1 for n in counts if n < 3))
dataDf['largeSyllableCount'] = dataDf['syllableCount'].map(lambda counts: sum(1 for n in counts if n >= 3))

In [90]:
dataDf.columns

Index(['Text', 'fileName', 'StoryLength', 'wordLength', 'maxWordLength',
       'avgWordLength', 'smallWords', 'largeWords', 'sentenceLengthChar',
       'maxSentenceLengthChar', 'avgSentenceLengthChar', 'smallSentenceChar',
       'largeSentenceChar', 'sentenceLengthWord', 'maxSentenceLengthWord',
       'avgSentenceLengthWord', 'smallSentenceWord', 'largeSentenceWord',
       'wordFreqCorpus', 'familiarWords', 'nonFamiliarWords', 'syllableCount',
       'maxSyllableCount', 'avgSyllableCount', 'smallSyllableCount',
       'largeSyllableCount'],
      dtype='object')

In [91]:
# Renumber the rows 0..n-1 before export, discarding the old index.
# NOTE(review): no rows are dropped in the visible cells since the earlier
# reset, so this may be redundant - confirm before removing.
dataDf.reset_index(inplace=True, drop=True)

### Saving to a file

In [295]:
# (Leftover interactive help lookup for DataFrame.to_csv removed: the trailing
# "?" is IPython-only syntax and has no effect on the analysis.)

In [297]:
# Export the story text, its source path and the scalar features only; the
# list-valued helper columns (wordLength, sentenceLengthChar,
# sentenceLengthWord, wordFreqCorpus, syllableCount) are left out.
outputColumns = [
    'Text', 'fileName', 'StoryLength',
    'maxWordLength', 'avgWordLength', 'smallWords', 'largeWords',
    'maxSentenceLengthChar', 'avgSentenceLengthChar',
    'smallSentenceChar', 'largeSentenceChar',
    'maxSentenceLengthWord', 'avgSentenceLengthWord',
    'smallSentenceWord', 'largeSentenceWord',
    'familiarWords', 'nonFamiliarWords',
    'maxSyllableCount', 'avgSyllableCount',
    'smallSyllableCount', 'largeSyllableCount',
]
# `lineterminator` is the current keyword name: `line_terminator` was
# deprecated in pandas 1.5 and removed in pandas 2.0.
dataDf[outputColumns].to_csv('processedOutput.tsv', sep='\t', index=False, lineterminator='\n')