In [231]:
l = ['i', 'me', 'you', 'she', 'her', 'he', 'him', 'it', 'we', 'us', 'they', 
                'them', 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 
                'yourselves', 'themselves', 'my', 'your', 'his', 'her', 'its', 'mine', 'yours', 
                'his', 'hers', 'our', 'your', 'their', 'ours', 'yours', 'theirs']
s = list(set(l))
print(s)
print(len(l))
print(len(s))

['ourselves', 'she', 'themselves', 'you', 'ours', 'it', 'yourself', 'yourselves', 'itself', 'my', 'hers', 'her', 'theirs', 'we', 'i', 'me', 'myself', 'your', 'yours', 'them', 'our', 'himself', 'his', 'their', 'they', 'mine', 'herself', 'us', 'its', 'he', 'him']
35
31


In [246]:
from nltk import sent_tokenize, word_tokenize, pos_tag, ngrams
from nltk.corpus import stopwords
import string
import pandas as pd
import re
from collections import Counter

def get_tokens_as_list(df, args):
    values = df[args].tolist()
    return values

def concat_vals(ser):
    vals = ser.tolist()
    return ' '.join(vals)

def is_stopword(token):
    stops = stopwords.words('english').copy()
    return token in stops

def is_punctuation(token):
    return token in string.punctuation

def is_vowel(char):
    if char in ('a','e','i','o','u','y'):
        return True
    return False

def has_alliteration(word1, word2):
    gram00 = word1[0]
    gram10 = word2[0]
    
    if gram00==gram10:
        if is_vowel(gram00):
            return True
        else:
            if len(word1) > 1 and len(word2) > 1:
                gram01 = word1[1]
                gram11 = word2[1]
                if is_vowel(gram01) and is_vowel(gram11):
                    return True
                elif gram01 == gram11:
                    return True
    return False

def is_question(tokens):
    if tokens[-1] == '?':
        return 1
    return 0
        

class Speech():
    parallel_p = re.compile('(\w+\s,\s){2,}((\w+\sand\s\w+)|(and\s\w+))')
    
    intensifiers = ['amazingly', 'astoundingly', 'dreadfully', 'colossally', 'especially', 'exceptionally',
                'excessively', 'extremely', 'extraordinarily', 'fantastically', 'frightfully', 'fully', 
                'incredibly', 'literally', 'mightily', 'moderately', 'most', 'outrageously', 
                'phenomenally', 'quite', 'radically', 'rather', 'real', 'really', 'remarkably', 'right', 
                'somewhat', 'strikingly', 'super', 'supremely', 'surpassingly', 'terribly', 
                'terrifically', 'too', 'totally', 'uncommonly', 'unusually', 'veritable', 'very']
    
    pronouns = ['ourselves', 'she', 'themselves', 'you', 'ours', 'it', 'yourself', 'yourselves', 
                'itself', 'my', 'hers', 'her', 'theirs', 'we', 'i', 'me', 'myself', 'your', 'yours', 
                'them', 'our', 'himself', 'his', 'their', 'they', 'mine', 'herself', 'us', 'its', 'he', 'him']
    
    similarity_clauses = ['in the first place', 'not only', 'as a matter of fact', 'in like manner', 'in addition',
                      'coupled with', 'in the same fashion', 'in the same way', 'first, second, third', 
                      'in the light of', 'not to mention', 'to say nothing of', 'equally important', 
                      'by the same token', 'again', 'equally', 'identically', 
                      'uniquely', 'like', 'too', 'moreover', 'as well as', 'together with', 'of course', 
                      'likewise', 'comparatively', 'correspondingly', 'similarly', 'furthermore', 'additionally']

    opposition_clauses = ['although this may be true', 'in contrast', 'different from', 'on the other hand', 
                          'on the contrary', 'at the same time', 'in spite of', 'even so', 'even though', 
                          'be that as it may', 'then again', 'above all', 'in reality', 'after all', 'but', 
                          'and still', 'unlike', 'and yet', 'while', 'albeit', 'besides', 'as much as', 
                          'even though', 'although', 'instead', 'whereas', 'despite', 'conversely', 'otherwise', 
                          'however', 'rather', 'nevertheless', 'nonetheless', 'regardless', 'notwithstanding']

    conditional_clauses = ['in the event that', 'granted that', 'as long as', 'so long as', 'for the purpose of', 
                           'with this intention', 'with this in mind', 'in the hope that', 'to the end that', 
                           'for fear that', 'in order to', 'seeing that', 'being that', 'in view of', 'unless', 
                           'when', 'whenever', 'while', 'because of', 'while', 'lest', 'in case', 
                           'provided that', 'given that', 'only if', 'even if', 'so that', 'so as to', 'owing to', 
                           'inasmuch as', 'due to']

    example_clauses = ['in other words', 'to put it differently', 'for one thing', 'as an illustration', 'in this case', 
                'for this reason', 'to put it another way', 'that is to say', 'with attention to', 'by all means', 
                'important to realize', 'another key point', 'first thing to remember', 'most compelling evidence', 
                'must be remembered', 'point often overlooked', 'to point out', 'on the positive side', 
                'on the negative side', 'with this in mind', 'notably', 'including', 'like', 'to be sure', 'namely', 
                'chiefly', 'truly', 'indeed', 'certainly', 'surely', 'markedly', 'such as', 'especially', 'explicitly', 
                'specifically', 'expressly', 'surprisingly', 'frequently', 'significantly', 'particularly', 'in fact', 
                'in general', 'in particular', 'in detail', 'for example', 'for instance', 'to demonstrate', 
                'to emphasize', 'to repeat', 'to clarify', 'to explain', 'to enumerate']

    result_clauses = ['as a result', 'under those circumstances', 'in that case', 'for this reason', 'in effect', 
                      'thus', 'because the', 'hence', 'consequently', 'therefore', 'thereupon', 
                      'forthwith', 'accordingly', 'henceforth']

    conclusion_clauses = ['as a result', 'under those circumstances', 'in that case', 'for this reason', 
                          'in effect', ', for', 'thus', 'because the', 'then', 'hence', 'consequently', 'therefore', 
                          'thereupon', 'forthwith', 'accordingly', 'henceforth']

    sequence_clauses = ['at the present time', 'from time to time', 'sooner or later', 'at the same time',
                        'up to the present time', 'to begin with', 'in due time', 'as soon as', 'as long as',
                        'in the meantime', 'in a moment', 'without delay', 'in the first place', 'all of a sudden',
                        'at this instant', 'first', 'second ', 'immediately', 'quickly', 'finally', 'after', 'later',
                        'last', 'until', 'till', 'since', 'then', 'before', 'hence', 'since', 'when', 'once', 'about',
                        'next', 'now', 'formerly', 'suddenly', 'shortly', 'henceforth', 'whenever', 'eventually',
                        'meanwhile', 'further', 'during', 'in time', 'prior to', 'forthwith', 'straightaway ',
                        'by the time', 'whenever ', 'until now', 'now that ', 'instantly', 'presently', 'occasionally']
    
    def __init__(self, low_string):
        self.string = low_string
        self.tokens = self.make_token_df()
        self.sentences = self.make_sent_df()
        self.metrics = self.make_metrics()
        
    def make_token_df(self):
        rows = []
        sent_id = 0
        for sentence in sent_tokenize(self.string):
            for word in word_tokenize(sentence):
                info = {'token':word, 'sent_id':sent_id}
                rows.append(info)
            sent_id += 1
        parsed_speech = pd.DataFrame(rows)
        parsed_speech['is_stop'] = parsed_speech.token.apply(is_stopword)
        parsed_speech['is_punct'] = parsed_speech['token'].apply(is_punctuation)
        
        pos_tags = pos_tag(parsed_speech.token.tolist())
        just_tags = [x[1] for x in pos_tags]
        parsed_speech['pos'] = pd.Series( just_tags )

        return parsed_speech
        
    def make_sent_df(self):
        speech_sentences = self.tokens.groupby('sent_id').apply(get_tokens_as_list, args=('token'))
        speech_tags = self.tokens.groupby('sent_id').apply(get_tokens_as_list, args=('pos'))
        speech_sentences = pd.concat([speech_sentences,speech_tags], axis=1).reset_index()
        speech_sentences.columns = ['sent_id','tokens','pos_pattern']
        speech_sentences['sentence'] = speech_sentences['tokens'].apply(' '.join)
        speech_sentences['num_tokens'] = speech_sentences['tokens'].apply(len)
        speech_sentences = pd.concat([speech_sentences, speech_sentences.sentence.apply(self.identify_clauses)], axis=1)
        speech_sentences['has_parallel'] = speech_sentences.sentence.apply(self.contains_parallel)
        speech_sentences['alliteration_count'] = speech_sentences.tokens.apply(self.count_alliteration)
        speech_sentences['intensifier_count'] = speech_sentences.tokens.apply(self.count_intensifiers)
        speech_sentences['is_question'] = speech_sentences.tokens.apply(is_question)
        pronouns = speech_sentences.apply(self.count_pronouns, axis=1)
        speech_sentences = pd.concat([speech_sentences,pronouns], axis=1)
        return speech_sentences
    
    def make_metrics(self):
        metric_columns = [x for x in self.sentences.columns if x not in ['sent_id', 'tokens', 'pos_pattern', 'sentence', 'num_tokens']]
        return self.sentences[metric_columns].sum()/len(self.sentences)
    
    def __contains_clause__(self, s, args):
        patterns = args
        for pattern in patterns:
            pattern = '(^%s)|(\s%s\s)' % (pattern,pattern,)
            m = re.search(pattern, s)
            if m:
                return 1
        return 0
    
    def identify_clauses(self, s):
        similarity = self.__contains_clause__(s, Speech.similarity_clauses)#contains_similarity_clause(s)
        opposition = self.__contains_clause__(s, Speech.opposition_clauses)
        conditional = self.__contains_clause__(s, Speech.conditional_clauses)
        example = self.__contains_clause__(s, Speech.example_clauses)
        result = self.__contains_clause__(s, Speech.result_clauses)
        conclusion = self.__contains_clause__(s, Speech.conclusion_clauses)
        sequence = self.__contains_clause__(s, Speech.sequence_clauses)

        d = dict(has_similarity=similarity, has_opposition=opposition, has_conditional=conditional,
                has_example=example, has_result=result, has_conclusion=conclusion, has_sequence=sequence)

        return pd.Series(d)  
    
    def contains_parallel(self, s):
        m = re.search(Speech.parallel_p,s)
        if m:
            return 1
        return 0
    
    def count_pronouns(self, ser):
        tokens = ser['tokens']
        pronoun_dict = {}
        for pronoun in Speech.pronouns:
            pronoun_dict.setdefault(pronoun,0)
        for token in tokens:
            if token in Speech.pronouns:
                pronoun_dict[token] = 1
        return pd.Series(pronoun_dict)
    
    def count_alliteration(self, tokens):
        allit_count = 0
        bigrams = ngrams(tokens, 2)
        for bigram in bigrams:
            if has_alliteration(bigram[0],bigram[1]):
                allit_count += 1      
        trigrams = ngrams(tokens, 3)
        for trigram in trigrams:
            if has_alliteration(trigram[0],trigram[2]):
                allit_count += 1
                
        if allit_count > 0:
            return 1
        return 0
    
    def count_intensifiers(self, tokens):
        count = 0
        for token in tokens:
            if token in Speech.intensifiers:
                count = 1
        return count

fp = 'speeches/FDR-PearlHarbor.txt'
text = open(fp).read().lower()       
test = Speech(text)

print(test.metrics.index.tolist())

['has_conclusion', 'has_conditional', 'has_example', 'has_opposition', 'has_result', 'has_sequence', 'has_similarity', 'has_parallel', 'alliteration_count', 'intensifier_count', 'is_question', 'he', 'her', 'hers', 'herself', 'him', 'himself', 'his', 'i', 'it', 'its', 'itself', 'me', 'mine', 'my', 'myself', 'our', 'ours', 'ourselves', 'she', 'their', 'theirs', 'them', 'themselves', 'they', 'us', 'we', 'you', 'your', 'yours', 'yourself', 'yourselves']


In [247]:
import os
os.listdir('speeches')

['.ipynb_checkpoints',
 'Churchill-Beaches.txt',
 'Churchill-Blood.txt',
 'Churchill-EveryMan.txt',
 'FDR-FourFreedoms.txt',
 'FDR-Inaugural.txt',
 'FDR-PearlHarbor.txt',
 'JFK-CityOnHill.txt',
 'JFK-Houston.txt',
 'JFK-Inaugural.txt',
 'LBJ-WeShallOvercome.txt',
 'MalcolmX-BallotBullet.txt',
 'MLKJ-IHaveADream.txt',
 'MLKJ-Mountaintop.txt',
 'Nixon-Checkers.txt',
 'Reagan-Challenger.txt']

In [248]:
print(len(Speech.pronouns))

31


In [250]:
df = pd.DataFrame(columns=['speaker','name','has_conclusion', 'has_conditional', 'has_example', 'has_opposition', 
                           'has_result', 'has_sequence', 'has_similarity', 'has_parallel', 'intensifier_count', 
                           'is_question', 'alliteration_count']+Speech.pronouns)
for fn in os.listdir('speeches'):
    if fn.endswith('.txt'):
        speaker,name=fn.rstrip('.txt').split('-')
        if speaker in ['JFK','FDR','Churchill']:
            fp = 'speeches/' + fn
            text = open(fp).read().lower()
            speech = Speech(text)
            metrics = speech.metrics
            metrics['name'] = name
            metrics['speaker'] = speaker
            df.loc[len(df)] = metrics

df.to_csv('data/speech_metrics.csv')
df

Unnamed: 0,speaker,name,has_conclusion,has_conditional,has_example,has_opposition,has_result,has_sequence,has_similarity,has_parallel,...,himself,his,their,they,mine,herself,us,its,he,him
0,Churchill,Beaches,0.142857,0.142857,0.142857,0.142857,0.0,0.285714,0.285714,0.0,...,0.0,0.142857,0.285714,0.142857,0.0,0.0,0.0,0.142857,0.0,0.0
1,Churchill,Blood,0.055556,0.027778,0.0,0.083333,0.0,0.25,0.055556,0.083333,...,0.0,0.055556,0.0,0.0,0.0,0.0,0.111111,0.027778,0.0,0.0
2,Churchill,EveryMan,0.105263,0.157895,0.105263,0.342105,0.078947,0.368421,0.105263,0.026316,...,0.052632,0.105263,0.105263,0.157895,0.0,0.0,0.052632,0.052632,0.131579,0.026316
3,FDR,FourFreedoms,0.040541,0.087838,0.033784,0.081081,0.033784,0.209459,0.033784,0.006757,...,0.0,0.006757,0.108108,0.081081,0.0,0.0,0.067568,0.040541,0.0,0.0
4,FDR,Inaugural,0.043956,0.021978,0.010989,0.131868,0.021978,0.175824,0.032967,0.0,...,0.010989,0.010989,0.098901,0.120879,0.0,0.0,0.054945,0.010989,0.032967,0.0
5,FDR,PearlHarbor,0.038462,0.076923,0.038462,0.115385,0.038462,0.346154,0.076923,0.0,...,0.0,0.038462,0.076923,0.0,0.0,0.0,0.153846,0.038462,0.0,0.0
6,JFK,CityOnHill,0.029412,0.058824,0.205882,0.147059,0.0,0.264706,0.147059,0.029412,...,0.0,0.058824,0.058824,0.117647,0.0,0.0,0.117647,0.088235,0.029412,0.0
7,JFK,Houston,0.133333,0.111111,0.088889,0.355556,0.0,0.244444,0.133333,0.044444,...,0.0,0.088889,0.088889,0.111111,0.0,0.0,0.0,0.044444,0.022222,0.088889
8,JFK,Inaugural,0.038462,0.019231,0.038462,0.288462,0.019231,0.230769,0.038462,0.019231,...,0.0,0.038462,0.038462,0.019231,0.019231,0.0,0.192308,0.096154,0.0,0.0


In [251]:
df.columns

Index(['speaker', 'name', 'has_conclusion', 'has_conditional', 'has_example',
       'has_opposition', 'has_result', 'has_sequence', 'has_similarity',
       'has_parallel', 'intensifier_count', 'is_question',
       'alliteration_count', 'ourselves', 'she', 'themselves', 'you', 'ours',
       'it', 'yourself', 'yourselves', 'itself', 'my', 'hers', 'her', 'theirs',
       'we', 'i', 'me', 'myself', 'your', 'yours', 'them', 'our', 'himself',
       'his', 'their', 'they', 'mine', 'herself', 'us', 'its', 'he', 'him'],
      dtype='object')

In [258]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets

pronoun_cols = Speech.pronouns
clause_cols = ['has_conclusion', 'has_conditional', 'has_example', 'has_opposition', 
                           'has_result', 'has_sequence', 'has_similarity']
technique_cols = ['has_parallel', 'alliteration_count', 'intensifier_count', 'is_question']

In [259]:
y_cols = ['speaker','name']
X = df[pronoun_cols].fillna(0).as_matrix()
est = KMeans(n_clusters=3)
est.fit(X)
labels = est.labels_
members = tuple(zip(df[y_cols].as_matrix(),labels))
centers = pd.DataFrame(est.cluster_centers_, columns=pronoun_cols)
for member, group in members:
    print(str(member), group, sep='\t')
centers

['Churchill' 'Beaches']	1
['Churchill' 'Blood']	0
['Churchill' 'EveryMan']	2
['FDR' 'FourFreedoms']	2
['FDR' 'Inaugural']	2
['FDR' 'PearlHarbor']	2
['JFK' 'CityOnHill']	0
['JFK' 'Houston']	0
['JFK' 'Inaugural']	2


Unnamed: 0,ourselves,she,themselves,you,ours,it,yourself,yourselves,itself,my,...,himself,his,their,they,mine,herself,us,its,he,him
0,0.009804,0.0,0.0,0.057952,0.0,0.171242,0.0,0.0,0.009259,0.148475,...,0.0,0.067756,0.049237,0.076253,0.0,0.0,0.076253,0.053486,0.017211,0.02963
1,0.142857,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.142857,0.285714,0.142857,0.0,0.0,0.0,0.142857,0.0,0.0
2,0.020054,0.0,0.017582,0.041141,0.001351,0.188121,0.0,0.0,0.022459,0.037087,...,0.012724,0.039986,0.085531,0.075817,0.003846,0.0,0.10426,0.047755,0.032909,0.005263


In [260]:
y_cols = ['speaker','name']
X = df[clause_cols].fillna(0).as_matrix()
est = KMeans(n_clusters=3)
est.fit(X)
labels = est.labels_
members = tuple(zip(df[y_cols].as_matrix(),labels))
centers = pd.DataFrame(est.cluster_centers_, columns=clause_cols)
for member, group in members:
    print(str(member), group, sep='\t')
centers

['Churchill' 'Beaches']	2
['Churchill' 'Blood']	1
['Churchill' 'EveryMan']	0
['FDR' 'FourFreedoms']	1
['FDR' 'Inaugural']	1
['FDR' 'PearlHarbor']	1
['JFK' 'CityOnHill']	2
['JFK' 'Houston']	0
['JFK' 'Inaugural']	0


Unnamed: 0,has_conclusion,has_conditional,has_example,has_opposition,has_result,has_sequence,has_similarity
0,0.092353,0.096079,0.077538,0.328707,0.032726,0.281212,0.092353
1,0.044628,0.053629,0.020809,0.102917,0.023556,0.245359,0.049807
2,0.086134,0.10084,0.17437,0.144958,0.0,0.27521,0.216387


In [261]:
y_cols = ['speaker','name']
X = df[technique_cols].fillna(0).as_matrix()
est = KMeans(n_clusters=3)
est.fit(X)
labels = est.labels_
members = tuple(zip(df[y_cols].as_matrix(),labels))
centers = pd.DataFrame(est.cluster_centers_, columns=technique_cols)
for member, group in members:
    print(str(member), group, sep='\t')
centers

['Churchill' 'Beaches']	2
['Churchill' 'Blood']	0
['Churchill' 'EveryMan']	1
['FDR' 'FourFreedoms']	0
['FDR' 'Inaugural']	0
['FDR' 'PearlHarbor']	0
['JFK' 'CityOnHill']	0
['JFK' 'Houston']	0
['JFK' 'Inaugural']	0


Unnamed: 0,has_parallel,alliteration_count,intensifier_count,is_question
0,0.026168,0.749701,0.068491,0.029211
1,0.026316,0.789474,0.315789,0.0
2,0.0,1.0,0.0,0.0


In [262]:
members

((array(['Churchill', 'Beaches'], dtype=object), 2),
 (array(['Churchill', 'Blood'], dtype=object), 0),
 (array(['Churchill', 'EveryMan'], dtype=object), 1),
 (array(['FDR', 'FourFreedoms'], dtype=object), 0),
 (array(['FDR', 'Inaugural'], dtype=object), 0),
 (array(['FDR', 'PearlHarbor'], dtype=object), 0),
 (array(['JFK', 'CityOnHill'], dtype=object), 0),
 (array(['JFK', 'Houston'], dtype=object), 0),
 (array(['JFK', 'Inaugural'], dtype=object), 0))

In [257]:
centers

Unnamed: 0,has_parallel,alliteration_count,intensifier_count
0,0.026168,0.749701,0.068491
1,0.0,1.0,0.0
2,0.026316,0.789474,0.315789


In [135]:
len(x_cols)

39

In [144]:
est.cluster_centers_

array([[  2.94871795e-01,   2.56410256e-02,   5.47008547e-01,
          5.98290598e-02,   1.41025641e-01,   0.00000000e+00,
          4.27350427e-03,   2.56410256e-02,   2.56410256e-02,
          4.27350427e-03,   2.47863248e-01,   0.00000000e+00,
          0.00000000e+00,   5.12820513e-02,   4.27350427e-03,
          8.54700855e-02,   8.54700855e-03,   2.99145299e-02,
          4.27350427e-03,   1.66666667e-01,   0.00000000e+00,
          0.00000000e+00,   5.12820513e-02,   5.12820513e-02,
          1.41025641e-01,   2.56410256e-02,   5.55555556e-02,
          5.98290598e-02,   0.00000000e+00,   2.99145299e-02,
          4.27350427e-03,   4.27350427e-03,   0.00000000e+00,
          4.27350427e-03,   2.56410256e-02,   2.56410256e-02,
          6.41025641e-02,   0.00000000e+00,   0.00000000e+00,
          1.53846154e-01,   2.26495726e-01,   5.98290598e-02,
          1.70940171e-02],
       [  2.80955856e-01,   1.27226463e-03,   1.86547504e-01,
          6.18360048e-02,   1.43960628e-01,