This notebook explores the degree to which different characters have measurably different *registers* by training a multiclass classifier on character dialogue to predict the speaker.  This notebook works with the output of [BookNLP](https://github.com/dbamman/book-nlp), which recognizes quotations and carries out speaker attribution on them.

In [None]:
import operator
from collections import Counter
import math
from os import path
import json
import random
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import sparse
import nltk
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import accuracy_score
import gzip

np.random.seed(1)

In [None]:
!pip install textstat

In [None]:
import textstat

In [None]:
def print_top_speakers(filename, top_n=10):
    """Print the characters with the most attributed quotations.

    Reads a gzip-compressed JSON file with a top-level "characters" list and
    prints a tab-separated table (quote count, character id, names) for the
    ``top_n`` characters that have the most entries under "speaking".

    Args:
        filename: Path to the gzip-compressed JSON file.
        top_n: Maximum number of characters to print.
    """
    with gzip.open(filename) as file:
        data = json.load(file)

    # Keyed by (id, joined names): every name variant of a character is shown
    # on one row, separated by "; ".
    counts = {}
    for character in data["characters"]:
        names = '; '.join(x["n"] for x in character["names"])
        counts[(character["id"], names)] = len(character["speaking"])

    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    print("#quotes\tchar_id\tname")
    for (char_id, names), num_quotes in ranked[:top_n]:
        print("%s\t%s\t%s" % (num_quotes, char_id, names))


Let's first just examine the characters who have the most dialogue.

In [None]:
# List the speakers with the most quotes in the Harry Potter file
# (the default top_n of 10 applies).
print_top_speakers("../data/harry_potter.book.gz")

In [None]:
# Same listing for the LOTR file (the default top_n of 10 applies).
print_top_speakers("../data/lotr.book.gz")

In [None]:
def get_quote_data(data, targets, max_num):
    """Collect a same-sized random sample of tokenized quotes per target character.

    Args:
        data: Parsed book JSON with a "characters" list; each character has
            an "id" and a "speaking" list whose items hold the quote text
            under "w".
        targets: Mapping from character id to the label to use for it.
            Characters whose id is not in this mapping are ignored.
        max_num: Number of quotes to sample for each target character.

    Returns:
        A pair ``(X, Y)`` of parallel lists: ``X`` holds the lowercased,
        NLTK-tokenized quotes and ``Y`` the label of each quote.

    Raises:
        ValueError: If a target character has fewer than ``max_num`` quotes.
    """
    X = []
    Y = []

    for character in data["characters"]:
        char_id = character["id"]
        if char_id not in targets:
            continue

        name = targets[char_id]
        quotes = [nltk.word_tokenize(q["w"].lower()) for q in character["speaking"]]

        # An explicit check (rather than an assert) so that running with -O
        # cannot produce X and Y of different lengths.
        if len(quotes) < max_num:
            raise ValueError(
                "character %s (%s) has only %d quotes; %d requested"
                % (char_id, name, len(quotes), max_num)
            )

        # Shuffle so the sample is not biased towards the start of the book.
        random.shuffle(quotes)

        X.extend(quotes[:max_num])
        Y.extend([name] * max_num)

    return X, Y

In [None]:
def read_data(filename, targets, max_num):
    """Load a gzip-compressed JSON book file and sample quotes from it.

    The parsed JSON is handed to get_quote_data together with ``targets`` and
    ``max_num``; its ``(X, Y)`` result is returned unchanged.
    """
    with gzip.open(filename) as handle:
        book = json.load(handle)
    return get_quote_data(book, targets, max_num)

In [None]:
def build_features(dataX, feature_functions):

    """ Featurize every item of dataX with each function in feature_functions.

    Each function is called on the item's tokens and its result is merged
    into one dict per item; when two functions emit the same feature name,
    the one later in the list wins.  Returns the list of merged dicts.
    """

    def featurize(tokens):
        merged = {}
        for extract in feature_functions:
            merged.update(extract(tokens))
        return merged

    return [featurize(tokens) for tokens in dataX]

In [None]:
def features_to_ids(data, feature_vocab):

    """
    Turn a list of feature dicts into a sparse (documents x features) matrix
    that can be fit by a scikit-learn model.

    Row i holds the values of data[i]; the column of each feature is its id
    in feature_vocab.  Features missing from feature_vocab are dropped.
    A sparse matrix is used because almost all feature values will be 0 for
    most documents, and we don't want to save them all in memory.

    """
    matrix = sparse.lil_matrix((len(data), len(feature_vocab)))
    for row, feats in enumerate(data):
        for name, value in feats.items():
            if name not in feature_vocab:
                continue
            matrix[row, feature_vocab[name]] = value
    return matrix

In [None]:
def create_vocab(data, top_n=None):

    """
    Assign a unique numerical id to each feature name found in data.

    Features are ranked by the number of documents (dicts in data) that
    contain them; ids follow that ranking, starting at 0.  top_n keeps only
    the n most frequent features (all of them when top_n is None).

    """

    doc_freq = Counter(feat for doc in data for feat in doc)
    ranked = doc_freq.most_common(top_n)
    return {feat: idx for idx, (feat, _) in enumerate(ranked)}

In [None]:
def pipeline(trainX, devX, trainY, devY, feature_functions):

    """ Featurize the train/dev data, fit a logistic regression on the
    training part and predict labels for the dev part.

    devY is accepted but not used here.  Returns the fitted classifier, the
    feature vocabulary and the dev predictions.
    """

    train_feats, dev_feats = (
        build_features(split, feature_functions) for split in (trainX, devX)
    )

    # The vocabulary comes from the *training* features only, so features
    # seen only in dev data get no column.
    feature_vocab = create_vocab(train_feats, top_n=100000)

    train_matrix = features_to_ids(train_feats, feature_vocab)
    dev_matrix = features_to_ids(dev_feats, feature_vocab)

    clf = linear_model.LogisticRegression(
        penalty='l2', C=1, solver='lbfgs', max_iter=10000
    )
    clf.fit(train_matrix, trainY)

    return clf, feature_vocab, clf.predict(dev_matrix)

In [None]:
def majority_class(trainY, devY):
    """Baseline: predict the most frequent training label for every dev item.

    Returns a list as long as devY in which every entry is the label that
    occurs most often in trainY.
    """
    (top_label, _), = Counter(trainY).most_common(1)
    return [top_label for _ in range(len(devY))]

In [None]:
def print_weights(clf, vocab, n=10):
    """Print the ``n`` highest-weighted features for every class of ``clf``.

    Args:
        clf: Fitted linear classifier exposing ``coef_`` and ``classes_``.
        vocab: Mapping from feature name to column index in ``coef_``.
        n: Number of features to print per class.

    One "class<TAB>weight<TAB>feature" line is printed per feature, followed
    by a blank line after each class.
    """
    reverse_vocab = [None] * len(clf.coef_[0])
    for feature, idx in vocab.items():
        reverse_vocab[idx] = feature

    # A binary scikit-learn model stores a single coefficient row that belongs
    # to classes_[1]; the weights for classes_[0] are its negation.  Indexing
    # coef_ by class position would mislabel the row and then run out of rows.
    is_binary = len(clf.coef_) == 1 and len(clf.classes_) == 2

    for i, cat in enumerate(clf.classes_):
        if is_binary:
            row = clf.coef_[0]
            weights = row if i == 1 else [-w for w in row]
        else:
            weights = clf.coef_[i]

        # Ascending sort followed by a reversal keeps the original ordering
        # of features with equal weights.
        ranked = sorted(zip(reverse_vocab, weights), key=operator.itemgetter(1))
        ranked.reverse()

        for feature, weight in ranked[:n]:
            print("%s\t%.3f\t%s" % (cat, weight, feature))
        print()

In [None]:
def unigram_feature(tokens):
    """Binary bag-of-words: one "UNIGRAM_<word>" feature (value 1) per distinct token."""
    return {"UNIGRAM_%s" % word: 1 for word in tokens}

In [None]:
# Lookup table of prepositions: one entry per line of the list file, with
# lines starting with "#" skipped.  Values are always 1; only key membership
# is used below.
preps = {}
with open("../data/preposition_list.txt") as file:
    preps.update(
        (line.rstrip(), 1) for line in file if not line.startswith("#")
    )
            
def preposition_feature(tokens):
    """Binary indicator: one "PREP_<word>" feature (value 1) per distinct token found in preps."""
    return {"PREP_%s" % word: 1 for word in tokens if word in preps}

In [None]:
def length_feature(tokens):
    """Length-based style features for one utterance.

    Args:
        tokens: List of token strings.

    Returns:
        A dict with "utterance_length" (number of tokens) and
        "avg_word_length" (mean token length in characters; 0.0 for an
        empty utterance).
    """
    num_tokens = len(tokens)
    total_chars = sum(len(word) for word in tokens)

    # An empty utterance would otherwise divide by zero.
    avg_word_length = total_chars / num_tokens if num_tokens else 0.0

    return {
        "utterance_length": num_tokens,
        "avg_word_length": avg_word_length,
    }

In [None]:
def readability_feature(tokens):
    """Return textstat's Flesch reading-ease score of the space-joined tokens.

    The score is stored under the single key "flesch_reading_ease".
    """
    text = ' '.join(tokens)
    return {"flesch_reading_ease": textstat.flesch_reading_ease(text)}

In [None]:
def punctuation_feature(tokens):
    """Binary indicator: one "PUNCT_<mark>" feature (value 1) per distinct tracked punctuation token.

    The tracked marks are ? , . ! ; and :.
    """
    marks = {"?", ",", ".", "!", ";", ":"}
    return {"PUNCT_%s" % tok: 1 for tok in tokens if tok in marks}

In [None]:
def process(targets, features, filename, max_num):
    """Cross-validate a speaker classifier and print its results.

    Samples ``max_num`` quotes per character in ``targets`` from ``filename``,
    runs 10-fold stratified cross-validation with the given feature
    functions, and prints the majority-class baseline accuracy, the
    cross-validated accuracy, and the top feature weights of the last fold's
    classifier.
    """

    # Fixed seed so the quote sample drawn in read_data is reproducible.
    random.seed(1)

    quotes, labels = read_data(filename, targets, max_num=max_num)
    X = np.array(quotes, dtype=object)
    Y = np.array(labels, dtype=object)
    folds = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

    preds, golds, baseline = [], [], []

    for train_index, test_index in folds.split(X, Y):
        y_train = Y[train_index]
        y_test = Y[test_index]
        clf, vocab, predictions = pipeline(
            X[train_index], X[test_index], y_train, y_test, features
        )
        preds.extend(predictions)
        golds.extend(y_test)
        baseline.extend(majority_class(y_train, y_test))

    total = len(golds)
    print("Majority class: %.3f (%s)\n" % (accuracy_score(baseline, golds), total))
    print("Cross-validated accuracy: %.3f (%s)\n" % (accuracy_score(preds, golds), total))

    # clf and vocab still refer to the model of the final fold here.
    print_weights(clf, vocab, n=10)

First, let's train a classifier to predict the character identity based on the *content* of their dialogue (effectively learning if different characters talk about different kinds of things).  How do the most characteristic features for each character accord with your own understanding of their language?

In [None]:
# Content experiment on the LOTR file: keys are character ids, values are the
# labels to predict; unigram features only, 230 quotes sampled per character.
targets={216: "Frodo", 106: "Sam", 49: "Gandalf", 317: "Gimli", 259: "Legolas"}
process(targets, [unigram_feature], "../data/lotr.book.gz", 230)

In [None]:
# Content experiment on the Harry Potter file: keys are character ids, values
# are the labels to predict; unigram features only, 500 quotes per character.
targets={343: "Harry", 247: "Ron", 302: "Hermione", 352: "Dumbledore", 298: "Hagrid"}
process(targets, [unigram_feature], "../data/harry_potter.book.gz", 500)

Next, let's train a classifier on primarily *stylistic* features (average word length, average utterance length, frequency of specific punctuation, reading difficulty).  Can we still see measurable differences between characters?

In [None]:
# Style experiment on the LOTR file: same characters and sample size (230),
# but only length, readability and punctuation features.
targets={216: "Frodo", 106: "Sam", 49: "Gandalf", 317: "Gimli", 259: "Legolas"}
process(targets, [length_feature, readability_feature, punctuation_feature], "../data/lotr.book.gz", 230)

In [None]:
# Style experiment on the Harry Potter file: same characters and sample size
# (500), but only length, readability and punctuation features.
targets={343: "Harry", 247: "Ron", 302: "Hermione", 352: "Dumbledore", 298: "Hagrid"}
process(targets, [length_feature, readability_feature, punctuation_feature], "../data/harry_potter.book.gz", 500)

Given these results, let's brainstorm two ideas:

* How could we use these distinctive voices to build a better system for speaker attribution?
* How could we use these results to build a model for *free indirect discourse*? (i.e., where a character's voice influences the narration.)