# Read in data

In [1]:
import pandas as p
train=p.read_csv("/Users/Elizabeth's/Documents/Folger_Digital_Texts_Complete/richardIII_gender_lite.csv", delimiter=',')


In [2]:
train.dropna(axis=0, inplace=True)
train

Unnamed: 0,speaker,sex,lines
77,RICHARD,male,Now is the winter of our discontent
78,RICHARD,male,"Made glorious summer by this son of York,"
79,RICHARD,male,And all the clouds that loured upon our house
80,RICHARD,male,In the deep bosom of the ocean buried.
81,RICHARD,male,"Now are our brows bound with victorious wreaths,"
...,...,...,...
4485,RICHMOND,male,And make poor England weep in streams of blood.
4486,RICHMOND,male,"Let them not live to taste this land’s increase,"
4487,RICHMOND,male,That would with treason wound this fair land’s...
4488,RICHMOND,male,"Now civil wounds are stopped, peace lives again."


In [3]:
y=train['sex'].values
X=train['lines'].values

### Split train/test data for hold-out test

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[0])
print(y_train[0])
print(X_test[0])
print(y_test[0])

(2635,) (2635,) (1130,) (1130,)
That in the sty of the most deadly boar
male
And give him from me this most needful note.
male


### Data Checking

In [5]:
# Check how many training examples in each category

training_labels = set(y_train)
print(training_labels)
from scipy.stats import itemfreq
training_category_dist = itemfreq(y_train)
print(training_category_dist)

{'female', 'male'}
[['female' 571]
 ['male' 2064]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  training_category_dist = itemfreq(y_train)


In [6]:
testing_labels = set(y_test)
print(testing_labels)
testing_category_dist = itemfreq(y_test)
print(testing_category_dist)

{'female', 'male'}
[['female' 259]
 ['male' 871]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  testing_category_dist = itemfreq(y_test)


## Vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

snowball_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([snowball_stemmer.stem(w) for w in analyzer(doc)])

In [8]:
# unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
#unigram_count_vectorizer = CountVectorizer( binary=False, min_df=3 , max_df = 0.9, stop_words= 'english')
unigram_count_vectorizer_stem = StemmedCountVectorizer(binary=False, min_df=3 ,max_df = 0.9, stop_words= 'english')

#  bigrams
#gram12_count_vectorizer = CountVectorizer(ngram_range=(1,2), min_df=3, max_df = 0.9, stop_words= 'english')
gram12_count_vectorizer_stem = StemmedCountVectorizer(ngram_range=(1,2), min_df=3, max_df = 0.9, stop_words= 'english')

## Vectorize the training data

In [9]:
# fit vocabulary in training documents and transform the training documents into vectors
X_train_vec_stem = unigram_count_vectorizer_stem.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec_stem.shape)
X_train_vec_stem[0].toarray()

# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer_stem.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer_stem.vocabulary_.items())[:15])

(2635, 797)
797
[('dead', 158), ('boar', 69), ('hath', 320), ('turn', 719), ('prayer', 516), ('head', 323), ('better', 56), ('came', 94), ('father', 245), ('just', 368), ('small', 628), ('power', 514), ('tut', 720), ('thou', 696), ('art', 26)]


In [10]:
# fit vocabulary in training documents and transform the training documents into vectors
X_train_gram_vec_stem = gram12_count_vectorizer_stem.fit_transform(X_train)

# check the content of a document vector
print(X_train_gram_vec_stem.shape)
X_train_gram_vec_stem[0].toarray()

# check the size of the constructed vocabulary
print(len(gram12_count_vectorizer_stem.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(gram12_count_vectorizer_stem.vocabulary_.items())[:15])

(2635, 883)
883
[('dead', 167), ('boar', 71), ('hath', 349), ('turn', 805), ('prayer', 566), ('head', 352), ('better', 58), ('came', 98), ('father', 262), ('just', 398), ('small', 689), ('power', 564), ('tut', 806), ('thou', 766), ('art', 26)]


## Vectorize the test data

In [11]:
X_test_vec_stem = unigram_count_vectorizer_stem.transform(X_test)

print(X_test_vec_stem.shape)

(1130, 797)


In [13]:
X_test_gram_vec_stem = gram12_count_vectorizer_stem.transform(X_test)

print(X_test_gram_vec_stem.shape)

(1130, 883)


# Train a LinearSVC classifier

In [14]:
# import the LinearSVC module
from sklearn.svm import LinearSVC

# initialize the LinearSVC model
# default is 1; play with different costs
svm_clf = LinearSVC(C=1)
svm_clf_gram = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec_stem,y_train)
svm_clf_gram.fit(X_train_gram_vec_stem,y_train)


LinearSVC(C=1)

## explore and interpret trained LinearSVC model

In [15]:
unigram_count_vectorizer_stem.vocabulary_.get('worthless')
for i in range(0,5):
  print(svm_clf.coef_[i][unigram_count_vectorizer_stem.vocabulary_.get('worthless')])

[[ 5.09883643e-01  4.52189025e-01  5.54415945e-01 -3.68749837e-01
   2.91327217e-01  5.82057462e-01 -1.03547250e-01  3.17941855e-01
  -7.84214314e-03 -6.45474043e-01  5.14596953e-01 -6.00369188e-01
   5.05951285e-01 -9.64536718e-01 -1.87088576e-01  8.87824359e-01
  -1.57388957e-01  3.84743060e-01 -1.62834503e-01 -6.01817771e-02
  -1.77056206e-01  1.36961153e-01 -7.92898248e-02  2.55916144e-01
   4.06468094e-01  3.87013618e-01 -3.56400138e-02  6.28020532e-02
   2.32003618e-02 -1.72245228e-01  2.55827352e-01  9.17515853e-02
  -1.18821772e-01 -4.81257292e-01  1.50698758e-01 -1.14641087e+00
  -6.67280944e-01  3.20859509e-01 -1.81586331e-01  3.90131221e-01
   1.29037908e+00 -5.70059260e-03  4.45261032e-01 -7.79705981e-01
  -4.09583531e-01 -5.65617574e-01 -3.45479006e-01 -1.19889898e-01
   5.71661767e-01  7.98442155e-01  1.38777878e-17 -4.84311233e-01
   5.33661006e-01 -4.34028551e-01 -2.76093519e-01 -6.02797847e-01
   2.06433200e-01  5.03204660e-01  3.66289357e-01 -5.74612438e-02
  -5.85971

IndexError: index 1 is out of bounds for axis 0 with size 1

In [16]:
gram12_count_vectorizer_stem.vocabulary_.get('worthless')
for i in range(0,5):
  print(svm_clf_gram.coef_[i][gram12_count_vectorizer_stem.vocabulary_.get('worthless')])

[[ 4.00148139e-01  6.04622754e-01  5.61857062e-01 -1.93284402e-01
   2.80892667e-01  5.85469252e-01 -2.06325459e-02  2.35263563e-01
  -5.10047499e-02 -8.70847064e-01  5.08621951e-01 -5.87477900e-01
   4.93790169e-01 -9.50574600e-01 -2.96128613e-01  9.16431432e-01
   2.41976775e-01  4.26381679e-01 -2.72313299e-01  1.64493657e-02
  -1.52376408e-01  2.64149788e-01 -5.88630538e-02  2.70596389e-01
   4.21301046e-01  4.08956108e-01 -1.07126032e-01  8.24893657e-02
   5.84624528e-02  1.49732506e-02 -8.25662736e-02  2.61403080e-01
   8.14675100e-02 -5.30383520e-02 -5.43345026e-01  1.26740418e-01
  -1.14865506e+00 -6.79967667e-01  9.63156316e-02 -1.78256204e-01
   4.40562250e-01  1.20158200e+00  1.22089068e-01 -6.70849335e-01
   4.70342295e-01 -7.66034014e-01 -4.25685962e-01 -5.76499777e-01
  -3.70986076e-01 -1.37225737e-01  5.54657120e-01  9.61814358e-01
   0.00000000e+00 -4.63280741e-01  5.16811237e-01 -4.03708980e-01
  -2.75054918e-01 -6.75557212e-01  2.17596947e-01  3.90555158e-01
   3.94280

IndexError: index 1 is out of bounds for axis 0 with size 1

In [27]:
def show_most_and_least_informative_features(vectorizer, clf, class_idx=0, n=10):
    feature_names = vectorizer.get_feature_names_out()
    coefs_with_fns = sorted(zip(clf.coef_[class_idx], feature_names)) 
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))

In [28]:
show_most_and_least_informative_features(unigram_count_vectorizer_stem, svm_clf, class_idx=0,  n=10)

	-1.3353	teach          		1.0708	pardon         
	-1.3190	subject        		1.0891	twas           
	-1.3188	scene          		1.1490	hors           
	-1.2919	lose           		1.1561	light          
	-1.2223	hell           		1.2103	woman          
	-1.1889	lament         		1.2504	hither         
	-1.1871	trembl         		1.2904	bastard        
	-1.1787	oath           		1.3606	late           
	-1.1464	babe           		1.3644	ladi           
	-1.1412	miseri         		1.4464	guess          


In [30]:
show_most_and_least_informative_features(gram12_count_vectorizer_stem, svm_clf_gram, class_idx=0, n=10)

	-1.5242	thou hast      		1.1224	hors           
	-1.3578	scene          		1.1228	pardon         
	-1.3297	subject        		1.2016	bastard        
	-1.3142	lose           		1.2022	hither         
	-1.2814	teach          		1.2507	light          
	-1.2636	hell           		1.2598	poor clar      
	-1.2397	oath           		1.3385	despair di     
	-1.2138	infect         		1.3391	ladi           
	-1.1709	trembl         		1.4482	late           
	-1.1663	lament         		1.5293	guess          


## Test the LinearSVC classifier

In [32]:
# test the classifier on the test data set, print accuracy score

svm_clf.score(X_test_vec_stem,y_test)

0.7707964601769911

In [33]:
svm_clf_gram.score(X_test_gram_vec_stem,y_test)


0.7707964601769911

In [34]:
from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec_stem)
cm = confusion_matrix(y_test, y_pred, labels=['male','female'])
print(cm)
print()

from sklearn.metrics import classification_report
target_names = ['male','female']
print(classification_report(y_test, y_pred, target_names=target_names))

[[784  87]
 [172  87]]

              precision    recall  f1-score   support

        male       0.50      0.34      0.40       259
      female       0.82      0.90      0.86       871

    accuracy                           0.77      1130
   macro avg       0.66      0.62      0.63      1130
weighted avg       0.75      0.77      0.75      1130



In [35]:
y_pred2 = svm_clf_gram.predict(X_test_gram_vec_stem)
cm2 = confusion_matrix(y_test, y_pred2, labels=['male','female'])
print(cm2)
print()

target_names = ['male','female']
print(classification_report(y_test, y_pred2, target_names=target_names))

[[782  89]
 [170  89]]

              precision    recall  f1-score   support

        male       0.50      0.34      0.41       259
      female       0.82      0.90      0.86       871

    accuracy                           0.77      1130
   macro avg       0.66      0.62      0.63      1130
weighted avg       0.75      0.77      0.75      1130



## Interpret the prediction result

In [36]:
## get the confidence scores 
svm_confidence_scores = svm_clf.decision_function(X_test_vec_stem)
## get the confidence score for the first test example
print(svm_confidence_scores[0])

## Confirm by printing out the actual prediction
print(y_test[0])

1.11364943778176
male


## Error Analysis

In [37]:
# print out specific type of error for further analysis

err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='male' and y_pred[i]=='female'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

Put in her tender heart th’ aspiring flame
Thy voice is thunder, but thy looks are humble.
How canst thou urge God’s dreadful law to us
Say I, her sovereign, am her subject low.
The bleeding witness of my hatred by,
Well, hie thee to thy lord. I kiss his hand.
One raised in blood, and one in blood established;
In that you brook it ill, it makes him worse.
Remember Margaret was a prophetess.—
Lest to thy harm thou move our patience.
Familiarly shall call thy Dorset brother.
Could not believe but that I was in hell,
Shall lose the royalty of England’s throne.
And in their summer beauty kissed each other.
Doth comfort thee in thy sleep. Live and flourish.
What sayst thou now? Speak suddenly. Be brief.
Amen. And make me die a good old man!
Your beauty was the cause of that effect—
O thus, quoth Dighton, lay the gentle babes.
O, make them joyful. Grant their lawful suit.
No other harm but loss of such a lord.
And then, as we have ta’en the sacrament,
Your country’s fat shall pay your pains 

In [38]:
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='female' and y_pred2[i]=='male'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

To revel in the entrails of my lambs.
Yet that, by you deposed, you quake like rebels.—
I fear me both are false.
That in your outward action shows itself
Of these known evils but to give me leave
I never did incense his Majesty
What stay had I but Edward? And he’s gone.
Thou elvish-marked, abortive, rooting hog,
True, when avoided grace makes destiny.
Queen?
Whose deadly web ensnareth thee about?
O no, my reasons are too deep and dead—
And so will I.
The King, on his own royal disposition,
Stay, I will go with you.
Saw you the King today, my lord of Derby?
But with his timorous dreams was still awaked.
Now, for my life, she’s wandering to the Tower,
And not be Richard, that hath done all this.
It is a quarrel just and reasonable
My damnèd son that thy two sweet sons smothered.
If ever he have wife, let her be made
Blush, blush, thou lump of foul deformity,
I fear our happiness is at the height.
God grant him health. Did you confer with him?
And often up and down my sons were tossed
It

## 5-fold cross validation

In [48]:
from sklearn import svm

In [49]:
from sklearn.svm import *

In [50]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
svm_clf_pipe = Pipeline([('vect', StemmedCountVectorizer(binary=False, min_df= 5, stop_words = 'english')),('svm', SVC(C= 1, kernel= 'rbf', random_state=42))])
scores = cross_val_score(svm_clf_pipe, X, y, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.7713147410358566


In [52]:
svm_clf_pipe3 = Pipeline([('vect', StemmedCountVectorizer(binary=False, min_df= 5, stop_words = 'english')),('svm', LinearSVC(C=1))])
scores = cross_val_score(svm_clf_pipe3, X, y, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.7593625498007969


In [53]:
svm_clf_pipe4 = Pipeline([('vect', StemmedCountVectorizer(binary=False, min_df= 5, ngram_range= (1,2), stop_words = 'english')),('svm', LinearSVC(C=1))])
scores = cross_val_score(svm_clf_pipe4, X, y, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.7556440903054449


## Explore other SVM models

In [54]:
from sklearn.model_selection import GridSearchCV

In [59]:
params = {
    'kernel': ['linear','poly', 'rbf'],
    'C':[1, 3, 5, 8],  # [1, 10, 20, 45]
    #'gamma': ['scale','auto'],
    #'shrinking' : [True,False],
    #'probability': [True,False],
    'verbose': [True],
   # 'decision_function_shape': ['ovo','ovr'],
    #'break_ties': [True,False],
}
svm_model = SVC(random_state=42)

clf_try = GridSearchCV(svm_model, params,  cv=5)
clf_try.fit(X_train_vec_stem, y_train)


[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(cv=5, estimator=SVC(random_state=42),
             param_grid={'C': [1, 3, 5, 8], 'kernel': ['linear', 'poly', 'rbf'],
                         'verbose': [True]})

In [60]:
clf_try.cv_results_

{'mean_fit_time': array([0.09617782, 0.1366765 , 0.12932086, 0.13454404, 0.14962392,
        0.14562049, 0.19047232, 0.13747416, 0.14719748, 0.28580761,
        0.15043859, 0.14846063]),
 'std_fit_time': array([0.00456158, 0.007054  , 0.00270677, 0.00681367, 0.00162559,
        0.00257753, 0.01853426, 0.00916464, 0.00353858, 0.03555165,
        0.00490497, 0.00222727]),
 'mean_score_time': array([0.01100755, 0.01861234, 0.02201285, 0.01040673, 0.01801367,
        0.02241726, 0.01021061, 0.01740842, 0.02180572, 0.01001229,
        0.01680875, 0.02180681]),
 'std_score_time': array([6.27919115e-04, 7.96267205e-04, 6.34086044e-04, 7.97245377e-04,
        6.32269663e-04, 4.90018392e-04, 3.98727388e-04, 4.88984321e-04,
        3.99162188e-04, 8.18939188e-06, 4.02963174e-04, 7.47723399e-04]),
 'param_C': masked_array(data=[1, 1, 1, 3, 3, 3, 5, 5, 5, 8, 8, 8],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        f

In [61]:
clf_try.best_params_

{'C': 1, 'kernel': 'poly', 'verbose': True}

In [62]:
clf_poly = SVC(C= 1, kernel= 'poly', random_state=42)

In [63]:
clf_poly.fit(X_train_vec_stem, y_train)

SVC(C=1, kernel='poly', random_state=42)

In [64]:
clf_poly.score(X_test_vec_stem,y_test)

0.7831858407079646

In [65]:
clf_poly.predict(X_test_vec_stem)

array(['male', 'male', 'male', ..., 'male', 'male', 'male'], dtype=object)

In [66]:
y_pred = clf_poly.predict(X_test_vec_stem)
cm=confusion_matrix(y_test, y_pred2, labels=['male','female'])
print(cm)
print()

target_names = ['male','female']
print(classification_report(y_test, y_pred, target_names=target_names))

[[782  89]
 [170  89]]

              precision    recall  f1-score   support

        male       0.68      0.10      0.18       259
      female       0.79      0.99      0.88       871

    accuracy                           0.78      1130
   macro avg       0.73      0.54      0.53      1130
weighted avg       0.76      0.78      0.72      1130



## Bigrams, stemmed

In [67]:
params = {
    'kernel': [ 'poly', 'rbf', 'linear'],
    'C':[1, 2, 3, 5, 10],
    #'gamma': ['scale','auto'],
    #'shrinking' : [True,False],
    #'probability': [True,False],
    'verbose': [True],
   # 'decision_function_shape': ['ovo','ovr'],
    #'break_ties': [True,False],
}
svm_model = SVC(random_state=42)

clf_bi = GridSearchCV(svm_model, params,  cv=3)
clf_bi.fit(X_train_gram_vec_stem, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(cv=3, estimator=SVC(random_state=42),
             param_grid={'C': [1, 2, 3, 5, 10],
                         'kernel': ['poly', 'rbf', 'linear'],
                         'verbose': [True]})

In [68]:
clf_bi.cv_results_

{'mean_fit_time': array([0.09062529, 0.09392269, 0.06972226, 0.11100856, 0.10259477,
        0.08215332, 0.10927463, 0.10869408, 0.08927782, 0.09503023,
        0.11836473, 0.12502027, 0.1120313 , 0.10389892, 0.20796514]),
 'std_fit_time': array([0.00196376, 0.00122795, 0.00255812, 0.01653199, 0.00044582,
        0.00700676, 0.00634434, 0.00410773, 0.00416231, 0.00082328,
        0.01065436, 0.01468348, 0.01414173, 0.00066876, 0.02104227]),
 'mean_score_time': array([0.02700408, 0.0330108 , 0.01667428, 0.02800568, 0.03467361,
        0.01600512, 0.02767555, 0.03467568, 0.01467808, 0.02800481,
        0.03440086, 0.01567197, 0.02800059, 0.03201111, 0.0140113 ]),
 'std_score_time': array([8.24613579e-04, 3.10453176e-06, 4.64964400e-04, 2.15884604e-03,
        1.69788002e-03, 8.17218864e-04, 2.06153809e-03, 2.49398312e-03,
        4.73351627e-04, 8.15074920e-04, 1.98012264e-03, 4.68167203e-04,
        8.43161723e-06, 8.07058696e-04, 7.67479080e-06]),
 'param_C': masked_array(data=[1, 1, 1

In [69]:
clf_bi.best_params_

{'C': 1, 'kernel': 'poly', 'verbose': True}

In [70]:
clf_poly_gram = SVC(C= 1, kernel= 'rbf', random_state=42)
clf_poly_gram.fit(X_train_gram_vec_stem, y_train)

SVC(C=1, random_state=42)

In [71]:
clf_poly_gram.score(X_test_gram_vec_stem,y_test)

0.7778761061946903

In [72]:
y_pred2 = clf_poly.predict(X_test_vec_stem)
cm=confusion_matrix(y_test, y_pred2, labels=['male','female'])
print(cm)
print()

target_names = ['male','female']
print(classification_report(y_test, y_pred2, target_names=target_names))

[[858  13]
 [232  27]]

              precision    recall  f1-score   support

        male       0.68      0.10      0.18       259
      female       0.79      0.99      0.88       871

    accuracy                           0.78      1130
   macro avg       0.73      0.54      0.53      1130
weighted avg       0.76      0.78      0.72      1130

