## Imports and Setup

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from   sklearn.decomposition import TruncatedSVD
from   sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# class corpus metadata
metadata = pd.read_csv("class_corpus_metadata.csv")

metadata.shape

(160, 34)

In [8]:
metadata.head()

Unnamed: 0.1,Unnamed: 0,check_1,check_2,title,year,author1_surname,author1_givenname,author2_surname,author2_givenname,gender_author1,...,feminist fiction,mystery,adventure,tragedy,children,regency,manners,philosophical,coming-of-age,filename
0,nsg57,scw222,lcc82,"Writings in the United Amateur, 1915 - 1922",1922,Lovecraft,Howard,,,Male,...,False,True,False,False,False,False,False,True,False,Lovecraft_WritingsintheUnitedAmateur1915-1922.txt
1,fhh26,gs542,tj256,Whose Body?,1923,Sayers,Dorothy L.,,,Female,...,False,True,False,False,False,False,False,False,False,Sayres_WhoseBody.txt
2,cl2264,,,Voodoo Planet,1959,Norton,Andre,,,Female,...,False,False,True,False,False,False,False,False,False,Norton_VoodooPlanet.txt
3,ehh52,sjr255,kg428,"Varney the Vampire; Or, the Feast of Blood by ...",1845,Rymer,James Malcolm,Prest,Thomas Peckett,Male,...,False,False,False,False,False,False,False,False,False,Prest_Rhymer_VarneyTheVampire.txt
4,dgr73,jlp367,kg428,Uncle Tom's Cabin,1852,Stowe,Harriet Beecher,,,Female,...,False,False,False,False,False,False,False,False,False,Stowe_UncleTom_sCabin.txt


In [3]:
# training data are books that are either horror or detective
training_data = metadata[(metadata['horror']==True) | (metadata['detective']==True)]

# drop books that are both horror and detective
drop = metadata[(metadata['horror']==True) & (metadata['detective']==True)]
training_data = training_data.drop(drop.index)

# testing data are books are neither horror or detective
testing_data = metadata[(metadata['horror']==False) & (metadata['detective']==False)]

# sort titles alphabetically 
training_data = training_data.sort_values('title')
testing_data = testing_data.sort_values('title')
# note: training+testing+dropped row = 159 rows, class corpus = 160 rows, "An Unkindness of Ghosts" has no input for horror and detective column

In [9]:
training_data.head()

Unnamed: 0.1,Unnamed: 0,check_1,check_2,title,year,author1_surname,author1_givenname,author2_surname,author2_givenname,gender_author1,...,feminist fiction,mystery,adventure,tragedy,children,regency,manners,philosophical,coming-of-age,filename
159,tl566,hz542,ja532,813,1910,Leblanc,Maurice,,,Male,...,False,True,False,False,False,False,False,False,False,Leblanc_813.txt
156,gc386,,,A Strange Disappearance,1998,Green,Anna Katharine,,,Female,...,False,True,False,False,False,False,False,False,False,GreenAnnaKatharine_AStrangeDisappearance.txt
155,nca28,tl566,stw43,A Study in Scarlet,1887,Conan Doyle,Arthur,,,Male,...,False,True,False,False,False,False,False,False,False,ConanDoyle_AStudyInScarlet.txt
153,jc2739,,,Agatha Webb,1899,Green,Anna Katharine,,,Female,...,False,True,False,False,False,False,False,False,False,Green_AgathaWebb.txt
146,lcc82,yk499,,Carmilla,1872,Le_Fanu,Joseph Sheridan,,,Male,...,False,False,False,False,False,False,False,False,False,Carmilla.txt


In [4]:
# get book file names to open
training_names = training_data.filename.values
testing_names = testing_data.filename.values

array(['Leblanc_813.txt', 'GreenAnnaKatharine_AStrangeDisappearance.txt',
       'ConanDoyle_AStudyInScarlet.txt', 'Green_AgathaWebb.txt',
       'Carmilla.txt', 'Gaboriau_CaughtInTheNet.txt',
       'Hornung_DeadMenTellNoTales.txt', 'dracula.txt',
       'Wharton_EthanFrome.txt', 'BlackwoodAlgernon_FourWeirdTales.txt',
       'Wollstonecraft_Shelley_Frankenstein.txt',
       'WollstonecraftShelley_Frankenstein.txt',
       'Ghost_Stories_of_an_Antiquary.txt',
       'WirtMildred_GuiltoftheBrassThieves.txt', 'his_last_bow.txt',
       'Hoover_LifeBlood.txt', 'bramah_maxcarrados.txt',
       'KafkaFranz_Metamorphosis.txt', 'Austen_NorthangerAbbey.txt',
       'Poirot_Investigates.txt',
       'Doyle_The Adventures of Sherlock Holmes.txt',
       'Bryce_Ashiel_Mystery.txt', 'The_Champdoce_Mystery_EG.txt',
       'Rinehart_Mary_TheCircularStaircase.txt',
       'Green_CircularStudy.txt', 'the_clique_gold.txt',
       'Wallace_TheClueOfTheTwistedCandle.txt', 'blackwood_thedamned.txt',
    

In [10]:
testing_data.head()

Unnamed: 0.1,Unnamed: 0,check_1,check_2,title,year,author1_surname,author1_givenname,author2_surname,author2_givenname,gender_author1,...,feminist fiction,mystery,adventure,tragedy,children,regency,manners,philosophical,coming-of-age,filename
158,tr333,sjs457,sl2324,A Round Dozen,1883,Coolidge,Susan,,,Female,...,False,False,False,False,False,False,False,False,False,Coolidge_ARoundDozen.txt
157,kwy3,cl922,hk627,A Sicillian Romance,1790,Radcliffe,Ann Ward,,,Female,...,False,False,False,False,False,False,False,False,False,radcliffeann_a_sicillian_romance.txt
154,lqz4,gt294,lcc82,Adele Doring at Boarding-School,1921,North,Grace May,,,Female,...,False,False,False,False,True,False,False,False,False,adele_doring_boarding_school.txt
152,yc2669,xf89,wms87,Agnes Grey,1847,Bronte,Anne,,,Female,...,True,False,False,False,False,False,True,False,False,Bronte_AgnesGrey.txt
151,mn454,ar2465,jlp367,An Old-Fashioned Girl,1869,Alcott,Louisa May,,,Female,...,False,False,False,False,True,False,True,False,True,Alcott_AnOld-FashionedGirl.txt


In [5]:
# 1=detective, 0=horror， gold labels
y_train=training_data.detective.values*1

In [12]:
y_train

array([1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0], dtype=object)

In [6]:
# open and append training books together

training_books=[]
for book in training_names:
    with open(book, 'r',encoding='utf-8') as f:
        file = f.read().replace("\n", " ") 
        training_books.append(file)
        

In [7]:
# open and append testing books together

testing_books=[]
for book in testing_names:
    with open(book, 'r',encoding='utf-8') as f:
        file = f.read().replace("\n", " ") 
        testing_books.append(file)
        