In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import tensorflow as tf


In [2]:
#this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# enter the foldername in the Shared Google Drive
FOLDERNAME = 'Shared drives/CS 230 Project'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/{}'.format(FOLDERNAME))

%cd /content/drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/Shared drives/CS 230 Project


In [3]:
datapath = "datasets/Gutenberg/"
train_df = pd.read_csv(datapath + "training10.csv").head(10000)
validation_df = pd.read_csv(datapath + "validation10.csv").head(1000)

In [4]:
def map_authors(data):
    authors = []
    author_mappings = {}
    for index, row in data.iterrows():
        name = row['Author']
        if name in authors:
            continue
        authors.append(name)
        
    authors.sort()
    for i in range(len(authors)):
        author_mappings[authors[i]] = i
    return author_mappings 

In [5]:
def reverse_mapping(author_mapping):
    reverse = {}
    for key in author_mapping.keys():
        reverse[author_mapping[key]] = key
    return reverse

In [6]:
author_mapping = map_authors(validation_df)
print(author_mapping)
reverse_mappings = reverse_mapping(author_mapping)

{'Andrew Lang': 0, 'Anthony Trollope': 1, 'Bret Harte': 2, 'Charles Dickens': 3, 'Charles Kingsley': 4, 'Charlotte Mary Yonge': 5, 'Daniel Defoe': 6, 'Edgar Rice Burroughs': 7, 'Edward Phillips Oppenheim': 8, 'Edward Stratemeyer': 9, 'Frank Richard Stockton': 10, 'G K Chesterton': 11, 'George Alfred Henty': 12, 'George Bernard Shaw': 13, 'Hamlin Garland': 14, 'Harold Bindloss': 15, 'Henry James': 16, 'Henry Rider Haggard': 17, 'Herbert George Wells': 18, 'Jack London': 19, 'Jacob Abbott': 20, 'James Fenimore Cooper': 21, 'James Matthew Barrie': 22, 'James Otis': 23, 'Jerome Klapka Jerome': 24, 'John Galsworthy': 25, 'John Morley': 26, 'John Ruskin': 27, 'Joseph Conrad': 28, 'Louisa May Alcott': 29, 'Lyman Frank Baum': 30, 'Mark Twain': 31, 'Nathaniel Hawthorne': 32, 'Oscar Wilde': 33, 'P G Wodehouse': 34, 'R M Ballantyne': 35, 'Robert Louis Stevenson': 36, 'Rudyard Kipling': 37, 'Sir Arthur Conan Doyle': 38, 'Sir Walter Scott': 39, 'Thomas Carlyle': 40, 'Thomas Hardy': 41, 'Thomas Henr

In [7]:
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train_df['Text'])
validation_vectors = count_vectorizer.transform(validation_df["Text"])

In [8]:
#This function was used to extract features for 
#the manual feature baseline.
def extract_features(data):
  features = np.zeros((len(data.index), 2))
  for index, row in data.iterrows():
      text = row['Text']
      features[index, 0] = len(text) / 100
      words = text.split()
      features[index, 1] = len(text) / len(words)
  return features


In [9]:
def get_mappings(data, author_mapping):
    authors = data['Author']
    mappings = [author_mapping[author] for author in authors]
    data['mappings'] = mappings
    return data['mappings']

In [10]:
#The same file was uesd for manual feature baseline.
#test_targets = extract_features(test_df)
#train_vectors = extract_features(train_df)
#validation_vectors = extract_features(validation_df)
train_targets = get_mappings(train_df, author_mapping)
validation_targets = get_mappings(validation_df, author_mapping)

In [11]:
from keras.utils.np_utils import to_categorical

label_train = train_targets
label_v = validation_targets
# one hot encode
encoded_label_train = to_categorical(label_train)
encoded_label_v = to_categorical(label_v)



In [12]:
ann = tf.keras.models.Sequential()

In [13]:
ann.add(tf.keras.layers.Dense(units=100, activation='relu'))

In [14]:
#ann.add(tf.keras.layers.Dense(units=100, activation='relu'))

In [15]:
#ann.add(tf.keras.layers.Dense(units=100, activation='relu'))

In [16]:
ann.add(tf.keras.layers.Dense(50, activation='softmax'))

In [17]:
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [18]:
ann.fit(train_vectors.todense(), encoded_label_train, batch_size = 32, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f07f03796d0>

In [19]:
y_pred = ann.predict(validation_vectors.todense())

In [20]:
y_output = np.zeros_like(y_pred)
y_output[np.arange(len(y_pred)), y_pred.argmax(1)] = 1

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [22]:
accuracy_score(encoded_label_v, y_output)

0.962

In [23]:
def output_probs(reverse_mappings, output):
    probs = {}
    for i in range(len(output[0])):
        probs[reverse_mappings[i]] = round(output[0][i], 3)
    return probs


In [24]:
i = 0
success = {}
total = {}
for output in y_output:
    actual_index = encoded_label_v[i].argmax(axis=0)
    index = output.argmax(axis=0)
    actual = reverse_mappings[actual_index]
    prediction = reverse_mappings[index]
    spacing = " " * (30 - len(actual))
    spacing2 = " " * (30 - len(prediction))
    correct = actual == prediction

    if correct:
        if actual not in success:
            success[actual] = 0
        success[actual] += 1
    if actual not in total:
        total[actual] = 0
    total[actual] += 1
    print(f"{actual}   {spacing }{prediction} {spacing2}{correct}")
    i += 1
    

Sir Walter Scott                 Sir Walter Scott               True
Joseph Conrad                    Joseph Conrad                  True
Frank Richard Stockton           Frank Richard Stockton         True
Joseph Conrad                    Joseph Conrad                  True
Daniel Defoe                     Daniel Defoe                   True
John Ruskin                      John Ruskin                    True
Sir Arthur Conan Doyle           Sir Arthur Conan Doyle         True
Edgar Rice Burroughs             Edgar Rice Burroughs           True
James Fenimore Cooper            James Fenimore Cooper          True
Thomas Henry Huxley              Thomas Henry Huxley            True
Jack London                      Jack London                    True
Thomas Hardy                     Thomas Hardy                   True
Louisa May Alcott                Louisa May Alcott              True
Jack London                      Jack London                    True
William Wymark Jacobs            W

In [25]:
for key in sorted(total.keys()):
    spacing = " " * (30 - len(key))
    print(f"{key}:{spacing} {round(success[key] / total[key], 2)}")

Andrew Lang:                    1.0
Anthony Trollope:               1.0
Bret Harte:                     1.0
Charles Dickens:                1.0
Charles Kingsley:               0.88
Charlotte Mary Yonge:           1.0
Daniel Defoe:                   1.0
Edgar Rice Burroughs:           1.0
Edward Phillips Oppenheim:      1.0
Edward Stratemeyer:             0.95
Frank Richard Stockton:         0.94
G K Chesterton:                 1.0
George Alfred Henty:            0.95
George Bernard Shaw:            1.0
Hamlin Garland:                 1.0
Harold Bindloss:                1.0
Henry James:                    0.95
Henry Rider Haggard:            1.0
Herbert George Wells:           0.95
Jack London:                    0.96
Jacob Abbott:                   0.93
James Fenimore Cooper:          1.0
James Matthew Barrie:           0.94
James Otis:                     1.0
Jerome Klapka Jerome:           0.92
John Galsworthy:                1.0
John Morley:                    1.0
John Ruskin:      

In [26]:
#Sample text by Edward Stratemeyer
str = """
To those who have read the former volumes in this "Dave Porter Series"
the boys already mentioned need no special introduction. They were all
pupils of Oak Hall, a first-class boarding school located in the heart
of one of our New England States. At the academy Dave Porter seemed to
be a natural leader, although that place had been at times disputed by
Nat Poole, Gus Plum, and others. It was wonderful what a hold Dave had
on his friends, considering his natural modesty. Physically he was well
built and his muscles were those of a youth used to hard work and a life
in the open air. Yet, though he loved to run, row, swim, and play games,
Dave did not neglect his studies, and only a short time before this
story opens had won the Oak Hall medal of honor, of which he was justly
proud.
"""

In [27]:
str_vectorized = count_vectorizer.transform({str})

In [30]:
str_pred = ann.predict(str_vectorized)

In [31]:
str_output = np.zeros_like(str_pred)
str_output[np.arange(len(str_pred)), str_pred.argmax(1)] = 1

In [32]:
probs = output_probs(reverse_mappings, str_pred)
probs

{'Andrew Lang': 0.0,
 'Anthony Trollope': 0.0,
 'Bret Harte': 0.0,
 'Charles Dickens': 0.0,
 'Charles Kingsley': 0.0,
 'Charlotte Mary Yonge': 0.0,
 'Daniel Defoe': 0.0,
 'Edgar Rice Burroughs': 0.0,
 'Edward Phillips Oppenheim': 0.0,
 'Edward Stratemeyer': 1.0,
 'Frank Richard Stockton': 0.0,
 'G K Chesterton': 0.0,
 'George Alfred Henty': 0.0,
 'George Bernard Shaw': 0.0,
 'Hamlin Garland': 0.0,
 'Harold Bindloss': 0.0,
 'Henry James': 0.0,
 'Henry Rider Haggard': 0.0,
 'Herbert George Wells': 0.0,
 'Jack London': 0.0,
 'Jacob Abbott': 0.0,
 'James Fenimore Cooper': 0.0,
 'James Matthew Barrie': 0.0,
 'James Otis': 0.0,
 'Jerome Klapka Jerome': 0.0,
 'John Galsworthy': 0.0,
 'John Morley': 0.0,
 'John Ruskin': 0.0,
 'Joseph Conrad': 0.0,
 'Louisa May Alcott': 0.0,
 'Lyman Frank Baum': 0.0,
 'Mark Twain': 0.0,
 'Nathaniel Hawthorne': 0.0,
 'Oscar Wilde': 0.0,
 'P G Wodehouse': 0.0,
 'R M Ballantyne': 0.0,
 'Robert Louis Stevenson': 0.0,
 'Rudyard Kipling': 0.0,
 'Sir Arthur Conan Doyl

In [33]:
prediction = output_probs(reverse_mappings, str_output)
max(prediction, key=prediction.get)

'Edward Stratemeyer'