# Chapter 4 - Epigraphs 

This is the notebook analyzing how different paratextual headers correlate with stylistic differences between British novels of the 1790s. As in chapter two, I compare the object in question (here, novels with chapter epigraphs) to other contemporary forms (epistolary novels and ones with chapter summaries) using a cluster analysis experiment and a classification one. 


### Load required libraries. 

In [None]:
%matplotlib inline 
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from collections import Counter
import time
import numpy as np 
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import pylab as pl
import bookFunctions
import csv

### Load novel corpora

In [None]:
# Load the pickled novel corpus.
# NOTE(security): unpickling can execute arbitrary code, so only load corpus
# files that you created yourself or otherwise trust.
# The context manager closes the file handle as soon as the corpus is read.
with open('works-fullNovelCorpus.pickle', 'rb') as corpusFile:
    fullCorpus = pickle.load(corpusFile)

In [None]:
### Observe structure of the corpus: show the metadata fields of the entry at index 1.
### Later cells read the 'text', 'paratext', 'title' and 'author' fields of these entries.
fullCorpus[1].keys()

In [None]:
### Collect the text, paratext label and title of every novel in the corpus.
### Set includeBillOfFare to False if you don't want to include the
### bill-of-fare novels. (The earlier "uncomment three lines" toggle left an
### `else:` without an indented body, which is a syntax error.)
includeBillOfFare = True

texts = []
paratexts = []
authors = []  # not filled in this cell; the classification section rebuilds it
titles = []
for novel in fullCorpus:
    if not includeBillOfFare and novel['paratext'] == 'bill-of-fare':
        continue
    texts.append(novel['text'])
    paratexts.append(novel['paratext'])
    titles.append(novel['title'])

# Sanity check: the three lists must stay aligned, so the counts should match.
print(len(texts))
print(len(paratexts))
print(len(titles))

In [None]:
# Tally how many novels carry each paratext label.
paratextCounts = Counter(paratexts)
print(paratextCounts)

### View rest of the metadata for the novels

In [None]:
# Read the semicolon-delimited metadata file into a list of dicts, one per row,
# with surrounding whitespace stripped from every header and value.
metadataPath = "/Users/collinjennings/Dropbox/eccoDriveWork/metadata/novelCorpusESTCMeta.txt"
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open(metadataPath, newline='') as f:
    a = [
        {
            k.strip(): v.strip()
            for k, v in row.items()
            # DictReader fills missing fields with None and stores surplus
            # fields under a None key; skip both instead of calling .strip() on them.
            if k is not None and v is not None
        }
        for row in csv.DictReader(f, delimiter=';')
    ]

In [None]:
### Show the first parsed metadata record as an example
firstRow = a[0]
print(firstRow)

## Analyze the terms of the novels and visualize them

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize the novels as TF-IDF weighted word trigrams, keeping at most 200
# features that occur in at least 20% of the documents, then derive a pairwise
# cosine-distance matrix between the novels.
t0 = time.process_time()
vectorizer = TfidfVectorizer(input='content', decode_error='replace', ngram_range=(3,3), stop_words=None,
                             max_features=200, min_df=.20, norm='l2', use_idf=True)
dtm = vectorizer.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()
# Going through list() gives the same string array from either API.
vocab = np.array(list(featureNames))

# Dense copy of the document-term matrix, used later by the cluster analysis.
dtm2 = dtm.toarray()

# Cosine distance = 1 - cosine similarity, computed on the sparse matrix.
dist = 1 - cosine_similarity(dtm)

print(len(vocab))
print(time.process_time() - t0)

#### Reduce the number of dimensions with PCA

In [None]:
# Fit a two-component PCA on the distance matrix and project the same matrix
# onto it; the two resulting columns become the plot coordinates.
pca = PCA(n_components=2)
pca.fit(dist)
pos = pca.transform(dist)
xs = pos[:, 0]
ys = pos[:, 1]

#### Create a dataframe and visualize the PCA results

In [None]:
# One row per novel: PCA coordinates, title and paratext label.
df = pd.DataFrame({'x': xs, 'y': ys, 'title': titles, 'label': paratexts})
# Group the rows by paratext label so each label can be drawn as its own series.
groups = df.groupby('label')

#### Visualization with just the epistolary and epigraph novels

In [None]:
# Row positions (in df) of the novels whose titles are written onto the plot.
idList = [38, 43, 31, 78, 95, 140, 178, 203, 214, 222, 223, 229, 219, 231,
          42, 32, 84, 92]
# Grey shade used for each paratext label.
cluster_colors = {'epigraphs': '#EBEBEB', 'bill-of-fare': '#D9D9D9', 'epistolary': '#BBBABA'}
fig, ax = plt.subplots(figsize=(12, 12))  # set size
ax.margins(0.05)

# Draw one marker series per paratext label.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=14,
            label=name, color=cluster_colors[name])
# Build the legend once, after every series has been added.
ax.legend(numpoints=1)

# Add the novel's title at the (x, y) position of each selected row.
for row in df[df.index.isin(idList)].itertuples():
    ax.text(row.x, row.y, row.title,
            horizontalalignment='center', weight='bold', size=10)
# The title names the settings actually used above: max_features=200 trigrams, reduced with PCA.
plt.title('1790s Epigraph, Bill-of-Fare, and Epistolary Novels Clustered by Top 200 Trigrams with PCA', fontsize=16)
plt.show()

#### Visualization with bill-of-fare novels included

In [None]:
# Set up the plot; this version is also written to disk as a JPEG.
cluster_colors = {'epigraphs': '#F6F6F6', 'bill-of-fare': '#D9D9D9', 'epistolary': '#BBBABA'}
fig, ax = plt.subplots(figsize=(8, 8))  # set size
ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

# Iterate through the groups to layer the plot; the group name is used both as
# the legend label and as the key into cluster_colors.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=14,
            label=name, color=cluster_colors[name])
# Build the legend once, after every series has been added; show only 1 point per entry.
ax.legend(numpoints=1)

# Add the novel's title at the (x, y) position of each row listed in idList
# (idList is defined in the previous plotting cell).
for row in df[df.index.isin(idList)].itertuples():
    ax.text(row.x, row.y, row.title,
            horizontalalignment='center', weight='bold', size=10)
#plt.title('1790s Epigraph, Bill-of-Fare, and Epistolary Novels Clustered by Top 200 Trigrams with PCA', fontsize=16)
# The target directory must already exist; savefig does not create it.
plt.savefig('epigraphViz/newLabelBillEpiEpis.jpg', bbox_inches='tight', dpi=400)
plt.show()  # show the plot


### Identify shared features of different clusters in the graph
Set the limits for each axis below and then indicate which side of the limit you want to examine--to the left or right of the x-value and above or below the y-value. 

In [None]:
# Cut-off values on the x and y axes of the PCA plot; replace None with the
# values you read off the graph before running this cell.
# NOTE(review): both are None as committed -- confirm how bookFunctions.separator handles None.
xLimit = None
yLimit = None
# Which side of each cut-off to examine.
# presumably True means "above yLimit" / "to the left of xLimit", per the instructions above -- confirm in bookFunctions
aboveVal = True
ToTheLeftVal = True

# pos is the two-column PCA projection computed earlier in the notebook.
# The helper returns two values, unpacked here as the selected group and the remainder.
posGroup, notGroup = bookFunctions.separator(pos, xLimit, yLimit, aboveVal,  ToTheLeftVal)

In [None]:
# Summarise the dense term matrix separately for the selected group and for the rest.
avgArray, notAvgArray = bookFunctions.analyzer(dtm2, posGroup, notGroup)
# NOTE(review): this call used to be graph_analyzer.differencer, but no module
# named graph_analyzer is imported in this notebook, so the cell raised a
# NameError. bookFunctions is the only helper module imported here; it is
# assumed to provide differencer -- confirm.
featureDifference = bookFunctions.differencer(avgArray, notAvgArray)
# Rank the vocabulary by that difference, requesting 50 entries for each side.
gothFull, notGothFull = bookFunctions.valueRanker(featureDifference, vocab, 50)

In [None]:
# Print each ranking ordered by its second element, largest first,
# with a blank line between the two lists.
gothRanked = sorted(gothFull, key=lambda pair: pair[1], reverse=True)
print(gothRanked)
print()
notGothRanked = sorted(notGothFull, key=lambda pair: pair[1], reverse=True)
print(notGothRanked)

## Classification

Organize the corpus for classifying.

In [None]:
# Paratext labels that take part in the classification experiment.
paraCheck = ['epigraphs', 'epistolary', 'bill-of-fare']

# Keep only the novels that carry one of those labels and actually have a text.
selectedNovels = [
    item for item in fullCorpus
    if item['paratext'] in paraCheck and 'text' in item
]

# Parallel lists, one entry per selected novel (these replace the lists built
# for the cluster analysis).
paratexts = [item['paratext'] for item in selectedNovels]
texts = [item['text'] for item in selectedNovels]
authors = [item['author'] for item in selectedNovels]
titles = [item['title'] for item in selectedNovels]

# Titles truncated to their first 15 characters.
titles2 = [title[:15] for title in titles]

### Set up a dataframe for the classifier

In [None]:
# Columns of the classifier dataframe: label, shortened title and full text.
d = dict(paratext=paratexts, title=titles2, text=texts)
df = pd.DataFrame(d)

In [None]:
# Encode each paratext label as an integer; factorize numbers the labels in
# order of first appearance.
category_codes, category_names = df['paratext'].factorize()
df['category_id'] = category_codes
df.head()

In [None]:
# One row per distinct (paratext, category_id) pair, ordered by id.
category_id_df = (
    df[['paratext', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
# Lookup tables in both directions between label name and integer id.
category_to_id = {name: cat_id for name, cat_id in category_id_df.values}
id_to_category = {cat_id: name for name, cat_id in category_id_df.values}

In [None]:
# Build the classifier's feature matrix: TF-IDF over word bigrams and trigrams
# that occur in at least half of the documents.
t0 = time.process_time()
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(2,3),
    stop_words=None,
    min_df=.50,
    norm='l2',
    use_idf=True,
)  ### CountVectorizer
sparse_features = tfidf.fit_transform(df['text'])
features = sparse_features.toarray()
labels = df['category_id']
elapsed = time.process_time() - t0
print(elapsed)

In [None]:
# Candidate classifiers to compare.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs'),
]
# Number of cross-validation folds.
CV = 5

# Collect one (model name, fold index, accuracy) record per model and fold.
# (A placeholder cv_df used to be created before this loop; it was never read
# before being overwritten, so it has been dropped.)
entries = []
for model in models:
    model_name = type(model).__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Both layers plot accuracy per model from the cross-validation results.
accuracy_axes = dict(x='model_name', y='accuracy', data=cv_df)
# Box plot of the per-fold accuracies ...
sns.boxplot(**accuracy_axes)
# ... overlaid with the individual fold scores as jittered points.
sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2, **accuracy_axes)
plt.show()

### Choose a classifier and print the results

In [None]:
# Hold out a third of the novels for testing; the dataframe index is split
# alongside so each test row can be traced back to its novel.
split = train_test_split(
    features, labels, df.index,
    test_size=0.33, random_state=0,
)
X_train, X_test, y_train, y_test, indices_train, indices_test = split

# Train the chosen classifier and predict the held-out labels.
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Label names in order of first appearance, which is the same order factorize
# used when assigning category ids.
class_names = df['paratext'].unique()
report = metrics.classification_report(y_test, y_pred, target_names=class_names)
print(report)

### Visualize the classifier decision boundaries

In [None]:
# Project the training features onto two principal components and fit a second
# LinearSVC on that 2-D projection so its decision regions can be drawn.
pca = PCA(n_components=2).fit(X_train)
pca_2d = pca.transform(X_train)
plt.figure(figsize=(10,10))
svmClassifier_2d = LinearSVC().fit(pca_2d, y_train)

# Colour and marker for each category id.
class_styles = {0: ('r', '+'), 1: ('g', 'o'), 2: ('b', '*')}
train_labels = np.asarray(y_train)
legend_handles = []
legend_names = []
for class_id, (colour, marker_shape) in class_styles.items():
    class_mask = train_labels == class_id
    # Skip a class that has no training points so the legend stays consistent.
    if not class_mask.any():
        continue
    # One scatter call per class instead of one per point.
    handle = plt.scatter(pca_2d[class_mask, 0], pca_2d[class_mask, 1],
                         c=colour, s=50, marker=marker_shape)
    legend_handles.append(handle)
    # Category ids are assigned in order of first appearance in the corpus, so
    # take the legend name from the id -> paratext lookup rather than a
    # hard-coded list that may not match that order.
    legend_names.append(id_to_category[class_id])
plt.legend(legend_handles, legend_names)

# Evaluate the 2-D classifier on a fine grid that extends 0.5 beyond the data
# in every direction, and draw the resulting class boundaries.
x_min, x_max = pca_2d[:, 0].min() - .5, pca_2d[:, 0].max() + .5
y_min, y_max = pca_2d[:, 1].min() - .5, pca_2d[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .001), np.arange(y_min, y_max, .001))
Z = svmClassifier_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, alpha=0.8)
plt.title('Support Vector Machine Decision Surface')
plt.axis('off')
plt.show()