In [1]:
import numpy as np
import os
from htrc_features import FeatureReader

# Genre classification with HTRC data

In this example, we'll be classifying texts into 2 different genres: poetry and science-fiction. JSON files containing the metadata for 100 texts in each genre need to be downloaded:

In [2]:
poetry_output = !htid2rsync --f data/poetry.txt | rsync -azv --files-from=- data.sharc.hathitrust.org::features/ data/poetry/
scifi_output = !htid2rsync --f data/scifi.txt | rsync -azv --files-from=- data.sharc.hathitrust.org::features/ data/scifi/

outputs = list([poetry_output, scifi_output])
subjects = ['poetry', 'scifi']

paths = {}
suffix = '.json.bz2'
for subject, output in zip(subjects, outputs):
    folder = subject
    filePaths = [path for path in output if path.endswith(suffix)]
    paths[subject] = [os.path.join(folder, path) for path in filePaths]
    fn = 'data/' + subject + '_paths.txt'
    with open(fn, 'w') as f:
        for path in paths[subject]:
            p = str(path) + '\n'
            f.write(p)

As in the previous notebooks, we'll construct `FeatureReader` objects for each corpus. The line below reads in path files we created to the downloaded data:

In [3]:
paths = {}
subjects = ['poetry', 'scifi']
for subject in subjects:
    with open('data/' + subject + '_paths.txt', 'r') as f:
        paths[subject] = ['data/' + line[:len(line)-1] for line in f.readlines()]
        
poetry = FeatureReader(paths['poetry'])
scifi = FeatureReader(paths['scifi'])

To create our bag of words matrix, we need to keep a global dictionary of all words seen in each of our texts. We initialize "wordDict", which tracks all the words seen and records its index in the bag of words matrix. We also keep a list of volumes so that we can parse them later.

In [4]:
def createWordDict(HTRC_FeatureReader_List):

    wordDict = {}
    i = 0 
    volumes = []

    for f in HTRC_FeatureReader_List:

        for vol in f.volumes():
            
            volumes.append(vol)

            tok_list = vol.tokenlist(pages=False)
            tokens = tok_list.index.get_level_values('token')

            for token in tokens:
                if token not in wordDict.keys():
                    wordDict[token] = i
                    i += 1
    
    return wordDict, volumes

In [5]:
wordDict, volumes = createWordDict([scifi, poetry])

  df.sortlevel(inplace=True)


Once we construct the global dictionary, we can fill the bag of words matrix with the word counts for each volume. Once we have this, we will use it to format the training data for our model.

In [6]:
dtm = np.zeros((200, len(wordDict.keys())))

for i, vol in enumerate(volumes):
    tok_list = vol.tokenlist(pages=False)
    counts = list(tok_list['count'])
    tokens = tok_list.index.get_level_values('token')
    
    for token, count in zip(tokens, counts):
        try:
            index = wordDict[token]
            dtm[i, index] = count
        except:
            pass
        
X = dtm
y = np.zeros((200))
y[100:200] = 1

We can then use the `TfidfTransformer` to format the bag of words matrix, so that we can fit it to our LinearSVC model. Let's see how our model does.

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn import cross_validation

tfidf = TfidfTransformer()
out = tfidf.fit_transform(X, y)

model = LinearSVC()

score = cross_validation.cross_val_score(model, X, y, cv=10)
print(np.mean(score))



0.915


We can also get the most helpful features, or words, for each class. First we'll `fit` the model:

In [8]:
model.fit(X, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [9]:
feats = np.argsort(model.coef_[0])[:50]
top_scifi = [(list(feats).index(wordDict[w]) + 1, w) for w in wordDict.keys() if wordDict[w] in feats]
sorted(top_scifi)

[(1, '"'),
 (2, 'science'),
 (3, 'fiction'),
 (4, "n't"),
 (5, '('),
 (6, 'it'),
 (7, 'do'),
 (8, 'und'),
 (9, 'is'),
 (10, 'w'),
 (11, ')'),
 (12, 'they'),
 (13, 'I'),
 (14, 'she'),
 (15, 'may'),
 (16, 'into'),
 (17, 'was'),
 (18, ','),
 (19, 'would'),
 (20, 'could'),
 (21, 'between'),
 (22, 'story'),
 (23, 'space'),
 (24, "'s"),
 (25, 'them'),
 (26, 'which'),
 (27, 'W'),
 (28, 'Fiction'),
 (29, 'what'),
 (30, 'Wells'),
 (31, 'You'),
 (32, 'nature'),
 (33, 'her'),
 (34, 'Obsada'),
 (35, 'Produkcja'),
 (36, 'Zdjęcia'),
 (37, 'Montaż'),
 (38, 'o'),
 (39, 'Muzyka'),
 (40, 'specjalne'),
 (41, 'He'),
 (42, 'or'),
 (43, 'p'),
 (44, 'novel'),
 (45, "'ll"),
 (46, 'other'),
 (47, 'at'),
 (48, 'world'),
 (49, 'now'),
 (50, 'himself')]

In [10]:
feats = np.argsort(model.coef_[0])[-50:]
top_poetry = [(list(feats).index(wordDict[w]) + 1, w) for w in wordDict.keys() if wordDict[w] in feats]
sorted(top_poetry, key=lambda tup: tup[0])

[(1, 'While'),
 (2, 'r'),
 (3, '“'),
 (4, 'V'),
 (5, 'P'),
 (6, 'fond'),
 (7, 'ev'),
 (8, 'yearning'),
 (9, 'camp-fires'),
 (10, '¿'),
 (11, 'lhc'),
 (12, 'Г'),
 (13, 'praying'),
 (14, 'soldier'),
 (15, 'n'),
 (16, 'burning'),
 (17, 's'),
 (18, 'His'),
 (19, 'life'),
 (20, 'America'),
 (21, 'By'),
 (22, 'your'),
 (23, 'New'),
 (24, 'my'),
 (25, 'and'),
 (26, 'And'),
 (27, 'for'),
 (28, 'f'),
 (29, 'J'),
 (30, 'dear'),
 (31, 'er'),
 (32, 'with'),
 (33, 'his'),
 (34, 'on'),
 (35, ':'),
 (36, '1'),
 (37, 'l'),
 (38, 'hearts'),
 (39, '—'),
 (40, "'"),
 (41, 'our'),
 (42, 'will'),
 (43, 'by'),
 (44, 'The'),
 (45, 'v'),
 (46, 'Brother'),
 (47, 'in'),
 (48, ';'),
 (49, '`'),
 (50, '-')]