## Imports and Setup

In [16]:
%matplotlib inline
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from   sklearn.decomposition import TruncatedSVD
from   sklearn.feature_extraction.text import TfidfVectorizer
from   sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from   sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [17]:
# class corpus metadata
metadata = pd.read_csv("class_corpus_metadata.csv")

metadata.shape

(160, 34)

In [18]:
metadata.head()

Unnamed: 0.1,Unnamed: 0,check_1,check_2,title,year,author1_surname,author1_givenname,author2_surname,author2_givenname,gender_author1,...,feminist fiction,mystery,adventure,tragedy,children,regency,manners,philosophical,coming-of-age,filename
0,nsg57,scw222,lcc82,"Writings in the United Amateur, 1915 - 1922",1922,Lovecraft,Howard,,,Male,...,False,True,False,False,False,False,False,True,False,Lovecraft_WritingsintheUnitedAmateur1915-1922.txt
1,fhh26,gs542,tj256,Whose Body?,1923,Sayers,Dorothy L.,,,Female,...,False,True,False,False,False,False,False,False,False,Sayres_WhoseBody.txt
2,cl2264,,,Voodoo Planet,1959,Norton,Andre,,,Female,...,False,False,True,False,False,False,False,False,False,Norton_VoodooPlanet.txt
3,ehh52,sjr255,kg428,"Varney the Vampire; Or, the Feast of Blood by ...",1845,Rymer,James Malcolm,Prest,Thomas Peckett,Male,...,False,False,False,False,False,False,False,False,False,Prest_Rhymer_VarneyTheVampire.txt
4,dgr73,jlp367,kg428,Uncle Tom's Cabin,1852,Stowe,Harriet Beecher,,,Female,...,False,False,False,False,False,False,False,False,False,Stowe_UncleTom_sCabin.txt


In [19]:
# training data are books that are either horror or detective
training_data = metadata[(metadata['horror']==True) | (metadata['detective']==True)]

# drop books that are both horror and detective
drop = metadata[(metadata['horror']==True) & (metadata['detective']==True)]
training_data = training_data.drop(drop.index)

# testing data are books are neither horror or detective
testing_data = metadata[(metadata['horror']==False) & (metadata['detective']==False)]

# sort titles alphabetically 
training_data = training_data.sort_values('title')
testing_data = testing_data.sort_values('title')
# note: training+testing+dropped row = 159 rows, class corpus = 160 rows, "An Unkindness of Ghosts" has no input for horror and detective column

In [20]:
training_data

Unnamed: 0.1,Unnamed: 0,check_1,check_2,title,year,author1_surname,author1_givenname,author2_surname,author2_givenname,gender_author1,...,feminist fiction,mystery,adventure,tragedy,children,regency,manners,philosophical,coming-of-age,filename
159,tl566,hz542,ja532,813,1910,Leblanc,Maurice,,,Male,...,False,True,False,False,False,False,False,False,False,Leblanc_813.txt
156,gc386,,,A Strange Disappearance,1998,Green,Anna Katharine,,,Female,...,False,True,False,False,False,False,False,False,False,GreenAnnaKatharine_AStrangeDisappearance.txt
155,nca28,tl566,stw43,A Study in Scarlet,1887,Conan Doyle,Arthur,,,Male,...,False,True,False,False,False,False,False,False,False,ConanDoyle_AStudyInScarlet.txt
153,jc2739,,,Agatha Webb,1899,Green,Anna Katharine,,,Female,...,False,True,False,False,False,False,False,False,False,Green_AgathaWebb.txt
146,lcc82,yk499,,Carmilla,1872,Le_Fanu,Joseph Sheridan,,,Male,...,False,False,False,False,False,False,False,False,False,Carmilla.txt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,tr333,sjs457,sl2324,The Valley of Fear,1915,Doyle,Arthur Conan,,,Male,...,False,False,False,False,False,False,False,False,False,Doyle_TheValleyOfFear.txt
10,lrs263,sh785,hz542,The Wisdom of Father Brown,1914,Chesterton,Gilbert Keith,,,Male,...,False,False,False,False,False,False,False,False,False,Chesterton_TheWisdomOfFatherBrown.txt
3,ehh52,sjr255,kg428,"Varney the Vampire; Or, the Feast of Blood by ...",1845,Rymer,James Malcolm,Prest,Thomas Peckett,Male,...,False,False,False,False,False,False,False,False,False,Prest_Rhymer_VarneyTheVampire.txt
1,fhh26,gs542,tj256,Whose Body?,1923,Sayers,Dorothy L.,,,Female,...,False,True,False,False,False,False,False,False,False,Sayres_WhoseBody.txt


### There are 80 combined horror and detective novels in the corpus that we will use to train the classifier.

In [21]:
testing_data

Unnamed: 0.1,Unnamed: 0,check_1,check_2,title,year,author1_surname,author1_givenname,author2_surname,author2_givenname,gender_author1,...,feminist fiction,mystery,adventure,tragedy,children,regency,manners,philosophical,coming-of-age,filename
158,tr333,sjs457,sl2324,A Round Dozen,1883,Coolidge,Susan,,,Female,...,False,False,False,False,False,False,False,False,False,Coolidge_ARoundDozen.txt
157,kwy3,cl922,hk627,A Sicillian Romance,1790,Radcliffe,Ann Ward,,,Female,...,False,False,False,False,False,False,False,False,False,radcliffeann_a_sicillian_romance.txt
154,lqz4,gt294,lcc82,Adele Doring at Boarding-School,1921,North,Grace May,,,Female,...,False,False,False,False,True,False,False,False,False,adele_doring_boarding_school.txt
152,yc2669,xf89,wms87,Agnes Grey,1847,Bronte,Anne,,,Female,...,True,False,False,False,False,False,True,False,False,Bronte_AgnesGrey.txt
151,mn454,ar2465,jlp367,An Old-Fashioned Girl,1869,Alcott,Louisa May,,,Female,...,False,False,False,False,True,False,True,False,True,Alcott_AnOld-FashionedGirl.txt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,jc2739,,,This Side of Paradise,1920,Fitzgerald,F. Scott,,,Male,...,False,False,False,False,False,False,False,False,True,Fitzgerald_ThisSideOfParadise.txt
6,vs339,thh55,,To Kill A Mockingbird,1960,Lee,Harper,,,Female,...,False,False,False,False,False,False,False,False,False,Lee_ToKillAMockingbird.txt
5,fhh26,gs542,tj256,Twenty Thousand Leagues Under the Sea,1870,Verne,Jules,,,Male,...,False,False,True,False,False,False,False,False,False,Verne_TwentyThousandLeagues.txt
4,dgr73,jlp367,kg428,Uncle Tom's Cabin,1852,Stowe,Harriet Beecher,,,Female,...,False,False,False,False,False,False,False,False,False,Stowe_UncleTom_sCabin.txt


### There are 78 combined novels from a variety of genres that are not horror or detective in the corpus.

In [22]:
# get book file names to open
training_names = training_data.filename.values
testing_names = testing_data.filename.values

In [23]:
print('First book in the training dataset:',training_names[0])
print('First book in the testing dataset:',testing_names[0])

First book in the training dataset: Leblanc_813.txt
First book in the testing dataset: Coolidge_ARoundDozen.txt


In [24]:
# 1=detective, 0=horror， gold labels
y_train=(training_data.detective.values*1).astype('int')

In [25]:
y_train

array([1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0])

In [26]:
# open and append training books together

training_books=[]
for book in training_names:
    with open(book, 'r',encoding='utf-8') as f:
        file = f.read().replace("\n", " ") 
        training_books.append(file)
        

In [27]:
# open and append testing books together

testing_books=[]
for book in testing_names:
    with open(book, 'r',encoding='utf-8') as f:
        file = f.read().replace("\n", " ") 
        testing_books.append(file)
        

### Vectorization, stopwords, normalization, standardization
an explanation. we really did try to not do arbitrary things

In [28]:
punct=[]
for x in string.punctuation:
    punct.append(x)
punct.append('--')
punct.append('`')
punct.append("“")
punct.append("”")

In [29]:
# Custom preprocessing to remove escaped characters in input, taken from MP02
def pre_proc(x):
    '''
    Takes a unicode string.
    Lowercases, strips accents, and removes some escapes.
    Returns a standardized version of the string.
    '''
    import unicodedata
    return unicodedata.normalize('NFKD', x.replace("\'", "'").replace("\ in\ form", " inform").lower().strip())

# Set up vectorizer

vectorizer = TfidfVectorizer(
    encoding='utf-8',
    preprocessor=pre_proc,
   # stop_words=punct,
    min_df=2, # Note this
    max_df=0.8, # This, too
    binary=False,
    norm='l2',
    use_idf=True, # And this,
    #max_features=10000
)

# Your code here
X_train = vectorizer.fit_transform(training_books)
print("Matrix shape:", X_train.shape)

Matrix shape: (80, 30376)


In [30]:
#standardization
X_train_Z = StandardScaler(with_mean=False).fit_transform(X_train)
display(X_train_Z)
print('z-scored l2 mean:', round(np.mean(X_train_Z),3))
np.std(X_train)

<80x30376 sparse matrix of type '<class 'numpy.float64'>'
	with 376031 stored elements in Compressed Sparse Row format>

z-scored l2 mean: 0.292


ValueError: setting an array element with a sequence.

### Classifier

In [41]:
# Examine the performance of our simple classifiers
# Freebie function to summarize and display classifier scores
def compare_scores(scores_dict):
    '''
    Takes a dictionary of cross_validate scores.
    Returns a color-coded Pandas dataframe that summarizes those scores.
    '''
    df = pd.DataFrame(scores_dict).T.applymap(np.mean).style.background_gradient(cmap='RdYlGn')
    return df

We are choosing Multinomial NB, logistic, for reasons x and y. our thought process cuz we shouldnt use the multinomial because you should appreciate that we are actually thinking instead of copying and pasting mp 02 thank you
we are smart and realize that class_prior is not good for reasons

In [59]:
vect_n=[]
mm=[]
matrix_n= {}
feat_n = [100, 5000,10000,15000,17500,20000,22500,25000,30000,35000]
for x in feat_n:
    vectorizer = TfidfVectorizer(
        encoding='utf-8',
        preprocessor=pre_proc,
        min_df=2, # Note this
        max_df=0.8, # This, too
        binary=False,
        norm='l2',
        use_idf=True, # And this
        max_features=x)
    vect_n.append(vectorizer)
    matrix = vectorizer.fit_transform(training_books)
    #print('z-scored l2 mean:', round(np.mean(X_train_Z),3))
    dict_key=str(x)
    matrix_n[dict_key] = matrix
    mm.append(matrix)
    #matrix="matrix"+str(x)
  #  matrix = vectorizer.fit_transform(training_books)


In [32]:
matrix_n

{'5000': <80x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 187794 stored elements in Compressed Sparse Row format>,
 '10000': <80x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 274957 stored elements in Compressed Sparse Row format>,
 '15000': <80x15000 sparse matrix of type '<class 'numpy.float64'>'
 	with 320917 stored elements in Compressed Sparse Row format>,
 '17500': <80x17500 sparse matrix of type '<class 'numpy.float64'>'
 	with 335934 stored elements in Compressed Sparse Row format>,
 '20000': <80x20000 sparse matrix of type '<class 'numpy.float64'>'
 	with 347813 stored elements in Compressed Sparse Row format>,
 '22500': <80x22500 sparse matrix of type '<class 'numpy.float64'>'
 	with 357000 stored elements in Compressed Sparse Row format>,
 '25000': <80x25000 sparse matrix of type '<class 'numpy.float64'>'
 	with 364318 stored elements in Compressed Sparse Row format>,
 '30000': <80x30000 sparse matrix of type '<class 'numpy.float64'>'
 	with 375

##### Multinomial Naive Bayes

In [33]:
nb_classifiers = {
    'M NB Default, Alpha=1':MultinomialNB(alpha = 1),
   # 'M NB fit_prior=False':MultinomialNB(fit_prior = False),
}

scores = {} # Store cross-validation results in a dictionary
for matrix in matrix_n: 
    scores[matrix] = cross_validate( # perform cross-validation
        MultinomialNB(alpha = 1), # classifier object
        matrix_n[matrix], # feature matrix
        y_train, # gold labels
        cv=10, #number of folds
        scoring=['accuracy','precision', 'recall', 'f1', 'f1_macro', 'f1_micro'] # scoring methods
    )

In [42]:
compare_scores(scores)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_f1_macro,test_f1_micro
5000,0.002203,0.002948,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
10000,0.001718,0.002605,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
15000,0.002003,0.002655,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
17500,0.001992,0.002686,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
20000,0.00207,0.002727,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
22500,0.002213,0.002776,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
25000,0.00223,0.002754,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
30000,0.002334,0.002817,0.6625,0.655357,1.0,0.790842,0.445421,0.6625
35000,0.002329,0.002801,0.6625,0.655357,1.0,0.790842,0.445421,0.6625


In [70]:
vect1 = TfidfVectorizer(
        encoding='utf-8',
        preprocessor=pre_proc,
        min_df=2, # Note this
        max_df=0.8, # This, too
        binary=False,
        norm='l2',
        use_idf=True)# And this
        #max_features=0000)

In [71]:
X_train = vect1.fit_transform(training_books)
X_train_Z = StandardScaler(with_mean=False).fit_transform(X_train)

In [72]:
classifiers = {
    'lg1': LogisticRegression()  ,
    'mnb': MultinomialNB()
}



scores = {} # Store cross-validation results in a dictionary
for classifier in classifiers: 
    scores[classifier] = cross_validate( # perform cross-validation
        classifiers[classifier], # classifier object
        X_train_Z, # feature matrix
        y_train, # gold labels
        cv=10, #number of folds
        scoring=['accuracy','precision', 'recall', 'f1', 'f1_macro', 'f1_micro'] # scoring methods
    )
    

In [73]:
compare_scores(scores)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_f1_macro,test_f1_micro
lg1,0.421886,0.00254,0.775,0.756548,1.0,0.855012,0.667506,0.775
mnb,0.002376,0.002778,0.8875,0.868095,0.98,0.917879,0.867273,0.8875


In [141]:
vectorizer.fit_transform(training_books)

<80x30376 sparse matrix of type '<class 'numpy.float64'>'
	with 376031 stored elements in Compressed Sparse Row format>

[<80x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 187794 stored elements in Compressed Sparse Row format>,
 <80x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 274957 stored elements in Compressed Sparse Row format>,
 <80x15000 sparse matrix of type '<class 'numpy.float64'>'
 	with 320917 stored elements in Compressed Sparse Row format>,
 <80x17500 sparse matrix of type '<class 'numpy.float64'>'
 	with 335934 stored elements in Compressed Sparse Row format>,
 <80x20000 sparse matrix of type '<class 'numpy.float64'>'
 	with 347813 stored elements in Compressed Sparse Row format>,
 <80x22500 sparse matrix of type '<class 'numpy.float64'>'
 	with 357000 stored elements in Compressed Sparse Row format>,
 <80x25000 sparse matrix of type '<class 'numpy.float64'>'
 	with 364318 stored elements in Compressed Sparse Row format>,
 <80x30000 sparse matrix of type '<class 'numpy.float64'>'
 	with 375279 stored elements in Compressed Sparse Row format>,
 <80x30376 sparse