Potential useful link
http://streamhacker.com/2009/02/23/chunk-extraction-with-nltk/
Does things like pronoun, word type, etc

In [2]:
import os
import pandas as pd
import numpy as np

from nltk.stem.snowball import SnowballStemmer
import string
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
import nltk
import nltk.tag

In [10]:
class Speech():
    pronoun_groups = {
        'self':('i','me','my','myself',),
        'we':('we','us','ours','ourselves',),
        'you':('you','your','yours','yourself'),
        'another':('he','his','her','hers','she','him','himself','herself',),
        'them':('they','them','their','theirs','themselves','themself',),
    }
    stemmer = SnowballStemmer('english')
    stop = stopwords.words('english')
    stemmer.stopwords = stop
    
    def __init__(self, string):
        self.string = string.lower()
        self.text = nltk.Text(word_tokenize(self.string))
        self.tokens = self.text.tokens
        self.stemmed_tokens = [Speech.stemmer.stem(token) for token in self.tokens]
        self.sentences = self.make_sentence_lists()
        
    def make_sentence_lists(self):
        sent_list = sent_tokenize(self.string)
        tokenized_sent = [word_tokenize(sentence) for sentence in sent_list]
        return tokenized_sent
    
    def get_freq_dist(self, stemmed=False):
        if tokens:
            return nltk.FreqDist(self.stemmed_tokens)
        return nltk.FreqDist(self.tokens)
    
    
        
f = open('data/plato_apology.txt')
data = f.read().lower()

speech = Speech(data)
        
        

In [9]:
speech.sentences

[['how',
  'you',
  'have',
  'felt',
  ',',
  'o',
  'men',
  'of',
  'athens',
  ',',
  'at',
  'hearing',
  'the',
  'speeches',
  'of',
  'my',
  'accusers',
  ',',
  'i',
  'can',
  'not',
  'tell',
  ';',
  'but',
  'i',
  'know',
  'that',
  'their',
  'persuasive',
  'words',
  'almost',
  'made',
  'me',
  'forget',
  'who',
  'i',
  'was',
  '-',
  'such',
  'was',
  'the',
  'effect',
  'of',
  'them',
  ';',
  'and',
  'yet',
  'they',
  'have',
  'hardly',
  'spoken',
  'a',
  'word',
  'of',
  'truth',
  '.'],
 ['but',
  'many',
  'as',
  'their',
  'falsehoods',
  'were',
  ',',
  'there',
  'was',
  'one',
  'of',
  'them',
  'which',
  'quite',
  'amazed',
  'me',
  ';',
  '-',
  'i',
  'mean',
  'when',
  'they',
  'told',
  'you',
  'to',
  'be',
  'upon',
  'your',
  'guard',
  ',',
  'and',
  'not',
  'to',
  'let',
  'yourselves',
  'be',
  'deceived',
  'by',
  'the',
  'force',
  'of',
  'my',
  'eloquence',
  '.'],
 ['they',
  'ought',
  'to',
  'have',
  'been

* Number of words
* Number of sentences
* Average word size
* Average sentence size
* Ratio of unique words to num words (exclude stops)
* Punctuation distribution
* Pronoun distribution

In [None]:
stemmer = SnowballStemmer('english')
stop = stopwords.words('english')
stemmer.stopwords = stop

In [8]:
word_count = 0
sentence_count = 0
word_lengths_chars = Counter()
sentence_lengths_words = Counter()
words = Counter()
pronouns = Counter()

In [83]:
pronoun_groups = {
    'self':('i','me','my','myself',),
    'we':('we','us','ours','ourselves',),
    'you':('you','your','yours','yourself'),
    'another':('he','his','her','hers','she','him','himself','herself',),
    'them':('they','them','their','theirs','themselves','themself',),
}

def get_pronoun_group(pronoun):
    for key in pronoun_groups:
        if pronoun in pronoun_groups[key]:
            return key
    return 'other'

In [90]:
class DocAnalyzer():
    def __init__(self, fp):
        self.fp = fp
        self.doc = open(fp)
        self.text = self.doc.read().lower()
        self.stemmer = SnowballStemmer('english')
        stop = stopwords.words('english')
        self.stemmer.stopwords = stop
        
        self.word_count = 0
        self.sentence_count = 0
        self.word_lengths = Counter()
        self.sentence_lengths = Counter()
        self.words = Counter()
        self.word_tags = {}
        
    def ingestSentence(self, s):
        self.sentence_count += 1
        sent_words = word_tokenize(s)
        
        num_words = len(sent_words)
        self.sentence_lengths.update([num_words])
        self.word_count += num_words
        
        tagged_text = nltk.pos_tag(sent_words, tagset='universal')
        for word,tag in tagged_text:
            self.word_lengths.update([len(word)])
            word_stem = self.stemmer.stem(word)
            self.words.update([word_stem])
            if tag in self.word_tags: 
                self.word_tags[tag].update([word])
            else:
                self.word_tags[tag] = Counter()
        
    def ingestDoc(self):
        sentences = sent_tokenize(self.text)
        for sentence in sentences:
            self.ingestSentence(sentence)

    def report(self):
        report = {}
        report['Word Count'] = self.word_count
        report['Sentence Count'] = self.sentence_count
        report['Avg Word Length'] = sum(self.word_lengths.values()) / len(self.word_lengths.values())
        report['Avg Sentence Length'] = sum(self.sentence_lengths.values()) / len(self.sentence_lengths.values())
        report['Unique Word Count'] = len(self.words.keys())
        report['Unique Word to Word Count'] = len(self.words.keys()) / self.word_count
        
        pronouns = self.word_tags['PRON']
        pronoun_counts = {}
        for word , count in pronouns.items():
            group = get_pronoun_group(word)
            pronoun_counts[group] = pronoun_counts.get(group,0) + count
        report['Pronoun Groups'] = pronoun_counts
        
        return report
        
"""
def ingestSentence(s, a_stemmer):
    unique_words = set()
    for sent in sent_tokenize(s):
        for word in word_tokenize(sent):
            if word not in string.punctuation+string.whitespace and word not in a_stemmer.stopwords:
                stem = a_stemmer.stem(word)
                unique_words.update( [stem] )
    return unique_words
"""    


'\ndef ingestSentence(s, a_stemmer):\n    unique_words = set()\n    for sent in sent_tokenize(s):\n        for word in word_tokenize(sent):\n            if word not in string.punctuation+string.whitespace and word not in a_stemmer.stopwords:\n                stem = a_stemmer.stem(word)\n                unique_words.update( [stem] )\n    return unique_words\n'

In [91]:
analyzer = DocAnalyzer('data/plato_apology.txt')
analyzer.ingestDoc()

In [92]:
analyzer.report()

{'Avg Sentence Length': 4.126315789473685,
 'Avg Word Length': 814.6875,
 'Pronoun Groups': {'another': 192,
  'other': 171,
  'self': 729,
  'them': 228,
  'we': 20,
  'you': 280},
 'Sentence Count': 392,
 'Unique Word Count': 1328,
 'Unique Word to Word Count': 0.101879555044112,
 'Word Count': 13035}

In [80]:
pronouns = analyzer.word_tags['PRON']

def get_pronoun_group(pronoun):
    for key in pronoun_groups:
        if pronoun in pronoun_groups[key]:
            return key
    return 'other'

In [82]:
pronouns = analyzer.word_tags['PRON']

pronoun_counts = {}
for word , count in pronouns.items():
    group = get_pronoun_group(word)
    pronoun_counts[group] = pronoun_counts.get(group,0) + count
print(pronoun_counts)

{'them': 228, 'self': 729, 'other': 171, 'you': 280, 'we': 20, 'another': 192}


In [73]:
help(Counter.update)

Help on function update in module collections:

update(*args, **kwds)
    Like dict.update() but add counts instead of replacing them.
    
    Source can be an iterable, a dictionary, or another Counter instance.
    
    >>> c = Counter('which')
    >>> c.update('witch')           # add elements from another iterable
    >>> d = Counter('watch')
    >>> c.update(d)                 # add elements from another counter
    >>> c['h']                      # four 'h' in which, witch, and watch
    4

