#### STEPS TO GET DATA
1.  Download the data by running the following command in the project directory:
    `wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=html&langs[]=de"`
    
2.  Clean extraneous files by running the following commands (applies for Windows):
    `del /S *-8.zip`
    `del /S *-0.zip`
    `del /S robots.txt`
    `del /S harvest*`
    
`pip install bsddb3-6.2.6-cp37-cp37m-win_amd64.whl`
`pip install gutenberg`

In [31]:
# imports
from string import ascii_lowercase # for checking if letters
import numpy as np                 # numpy, duh...
import zipfile                     # zipped file reading
import os                          # recursive navigation of file tree
import fnmatch                     # matching file name patterns
import tensorflow as tf            # tensorflow, duh...
from tensorflow import keras
from tensorflow.keras import layers




'''
# from gutenberg.acquire import load_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

from gutenberg.query import list_supported_metadatas
'''


'\n# from gutenberg.acquire import load_metadata\nfrom gutenberg.acquire import load_etext\nfrom gutenberg.cleanup import strip_headers\n\nfrom gutenberg.query import get_etexts\nfrom gutenberg.query import get_metadata\n\nfrom gutenberg.query import list_supported_metadatas\n'

#### STEPS TO TAKE IN SETTING UP MODEL

1.  Read in all of the dataset from the files
2.  Classify each work
    *  If author's first name is only in male.txt, then male (ADD TO MALE LIST, WILL COMBINE LATER)
    *  Else if author's first name is only in female.txt, then female (ADD TO FEMALE LIST, WILL COMBINE LATER)
    *  Else, ambiguous (ADD TO AMBIGUOUS LIST, CAN BE USED FOR MANUAL TESTING IF YOU WANT)
3.  Clean each text
    *  Remove headers and footers
    *  Remove characters that are not spaces or newlines or numbers or characters or hyphens (or question marks or exclamation marks?) or apostrophes (in contractions or possessive forms)
    *  Replace newlines with spaces
    *  (if using ? and !, replace with a space plus the mark, so it will count as a new word in tokenization)
    *  Remove double spaces
    *  Convert all capital letters to lowercase
4.  Tokenize each cleaned text
5.  Build a vocabulary???
5.  Vectorize each tokenized text

In [2]:
# Step 1: read in all of the .txt files

# holds all the strings of the etexts, and that's all
etexts = []

# recursively navigate the directory containing all the zipped etexts
for path, dirs, files in os.walk('./aleph.gutenberg.org'):
    
    # find all the zip folders
    for zip_name in fnmatch.filter(files,'*.zip'):
        zip_path = os.path.abspath(os.path.join(path, zip_name))
        #print(zip_path)
        
        # unzip and read the etext
        archive = zipfile.ZipFile(zip_path, 'r')
        for txt_name in archive.namelist():
            #print(txt_name)
            
            # some of them have accented characters (which are non-ASCII), which throws off the decoding
            try:
                etext = archive.read(txt_name).decode('ascii')
                etexts.append(etext)
            except UnicodeDecodeError:
                pass
            
        

In [3]:
# Step 2: classify each etext

# keep these separate at first so we can minimize bias in the training set
# we ultimately want to have a decent gender balance in the training set to minimize bias
# it doesn't matter for testing set, as testing won't directly influence the model
male_etexts = []
female_etexts = []
ambiguous_etexts = []

# read in the lists of male and female names
female_names_txt = open('female.txt', 'r').read()
male_names_txt = open('male.txt', 'r').read()

In [4]:
# function to get a list of names from the name files
def get_names(names):
    index1 = names.rfind('#')
    index2 = names.find('\n', index1)
    
    # slice out the header
    no_header = names[index2:]
   
    # tokenized along whitespace to create a list of names
    return no_header.split()

In [5]:
female_names = get_names(female_names_txt)
male_names = get_names(male_names_txt)

In [6]:
# function to classify a given uncleaned etext by the author's first name
def classify_etext(etext):
    index1 = etext.find('Author: ')
    index2 = index1 + 8
    index3 = etext.find(' ', index2)
    name = etext[index2: index3]
    
    if name in female_names and not name in male_names or name == 'Mrs.':
        # classify as female
        female_etexts.append([etext, 0])
    elif name in male_names and not name in female_names:
        # classify as male
        male_etexts.append([etext, 1])
    else:
        # classify as ambiguous
        ambiguous_etexts.append(etext)
        
    return


# function to reset male_etexts, female_etexts, and ambiguous_etexts
# in case run classify_etext(...) more than once with a given etext
# basically, this is just for development purposes
def reset_classifications():
    male_etexts.clear()
    female_etexts.clear()
    ambiguous_etexts.clear()
    
    return

In [7]:
# classify all the etexts
reset_classifications()
for etext in etexts:
    classify_etext(etext)

In [8]:
print(len(male_etexts), len(female_etexts), len(ambiguous_etexts))

6403 1892 5472


In [9]:
print(len(etexts))

13767


In [10]:
# Step 3: clean the etexts

# function to cut out the publishing and legal info at the beginning and end of the etexts
def strip_headers(etext):
    
    # formatting isn't always identical, and I don't want to have to deal with all the possible formatting edge cases
    try:
        index1 = etext.index('*** START OF THIS PROJECT GUTENBERG EBOOK') + 3
    except:
        return None
    
    # index2 is the index just past the junk at the beginning of the etext
    try:
        index2 = etext.index('***', index1) + 3
    except:
        return None
    
    # index3 is the index that marks the junk at the end of the etext
    try:
        index3 = etext.index("*** END OF THIS PROJECT GUTENBERG EBOOK")
    except:
        return None
    
    return etext[index2:index3]


# used to cut down a passage to just lowercase letters, hyphens (for compound words), and spaces
def clean(etext):
    # strip the header and footer from the etext
    stripped_etext = strip_headers(etext)
    
    if stripped_etext == None:
        return None
    
    valids = ''

    for character in stripped_etext:
        # might not be necessary
        if character == '\n' or character == '-':
            character = ' '
    
        # definitely necessary
        # TODO: might need to handle cases of accented characters, as str.isalpha() doesn't handle them
        # TODO: might need to include apostrophes (for possessive nouns and contractions),
        # however, some texts likely use single quotes for quotes, so would likely need to include double quotes
        # TODO: maybe handle ! and ?
        if character.isalpha() or character == ' ':
            # check if preceding character is a space
            # if it is a space, no double or triple or n-tuple spaces
            # and, if the preceding character is a space, the length of valids will necessarily be greater than 0
            if len(valids) > 0 and character == ' ':
                if valids[len(valids) - 1] != ' ':
                    valids += character
            else:
                valids += character
      
    return valids.lower()

# used to reset clean_xxxx_etexts in case you clean all of them several times
def reset_cleanings():
    cleaned_male_etexts.clear()
    cleaned_female_etexts.clear()
    cleaned_ambiguous_etexts.clear()
    
    return

In [11]:
cleaned_male_etexts = []
cleaned_female_etexts = []
cleaned_ambiguous_etexts = []

In [12]:
# so we don't keep on appending to the same list when testing several times through
reset_cleanings()

i = 0
for etext, gender in male_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_male_etexts.append([cleaned_etext, gender])
        

print('\nMALE AUTHORS COMPLETE\n\n')

i = 0
for etext, gender in female_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(female_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_female_etexts.append([cleaned_etext, gender])

print('\nFEMALE AUTHORS COMPLETE\n\n')

'''
i = 0
for etext in ambiguous_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(ambiguous_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_ambiguous_etexts.append(cleaned_etext)
'''


0.15617679212868968 % complete
0.31235358425737936 % complete
0.46853037638606904 % complete
0.6247071685147587 % complete
0.7808839606434483 % complete
0.9370607527721381 % complete
1.0932375449008278 % complete
1.2494143370295174 % complete
1.405591129158207 % complete
1.5617679212868967 % complete
1.7179447134155865 % complete
1.8741215055442761 % complete
2.030298297672966 % complete
2.1864750898016556 % complete
2.3426518819303452 % complete
2.498828674059035 % complete
2.6550054661877245 % complete
2.811182258316414 % complete
2.9673590504451037 % complete
3.1235358425737934 % complete
3.2797126347024834 % complete
3.435889426831173 % complete
3.5920662189598627 % complete
3.7482430110885523 % complete
3.904419803217242 % complete
4.060596595345932 % complete
4.216773387474621 % complete
4.372950179603311 % complete
4.529126971732 % complete
4.6853037638606905 % complete
4.84148055598938 % complete
4.99765734811807 % complete
5.153834140246759 % complete
5.310010932375449 % compl

43.72950179603311 % complete
43.8856785881618 % complete
44.041855380290485 % complete
44.198032172419175 % complete
44.354208964547865 % complete
44.510385756676556 % complete
44.666562548805246 % complete
44.822739340933936 % complete
44.978916133062626 % complete
45.135092925191316 % complete
45.291269717320006 % complete
45.447446509448696 % complete
45.603623301577386 % complete
45.759800093706076 % complete
45.915976885834766 % complete
46.072153677963456 % complete
46.228330470092146 % complete
46.384507262220836 % complete
46.540684054349526 % complete
46.696860846478216 % complete
46.85303763860691 % complete
47.00921443073559 % complete
47.16539122286428 % complete
47.32156801499297 % complete
47.47774480712166 % complete
47.63392159925035 % complete
47.79009839137904 % complete
47.94627518350773 % complete
48.10245197563642 % complete
48.25862876776511 % complete
48.4148055598938 % complete
48.57098235202249 % complete
48.72715914415118 % complete
48.88333593627987 % complet

87.9275339684523 % complete
88.08371076058097 % complete
88.23988755270966 % complete
88.39606434483835 % complete
88.55224113696704 % complete
88.70841792909573 % complete
88.86459472122442 % complete
89.02077151335311 % complete
89.1769483054818 % complete
89.33312509761049 % complete
89.48930188973918 % complete
89.64547868186787 % complete
89.80165547399656 % complete
89.95783226612525 % complete
90.11400905825394 % complete
90.27018585038263 % complete
90.42636264251132 % complete
90.58253943464001 % complete
90.7387162267687 % complete
90.89489301889739 % complete
91.05106981102608 % complete
91.20724660315477 % complete
91.36342339528346 % complete
91.51960018741215 % complete
91.67577697954084 % complete
91.83195377166953 % complete
91.98813056379822 % complete
92.14430735592691 % complete
92.3004841480556 % complete
92.45666094018429 % complete
92.61283773231298 % complete
92.76901452444167 % complete
92.92519131657036 % complete
93.08136810869905 % complete
93.23754490082774 

"\ni = 0\nfor etext in ambiguous_etexts:\n    i += 1\n    if i % 10 == 0:\n        print(100 * i / len(ambiguous_etexts), '% complete')\n    \n    cleaned_etext = clean(etext)\n    if cleaned_etext != None:\n        cleaned_ambiguous_etexts.append(cleaned_etext)\n"

In [13]:
print(len(cleaned_male_etexts), len(cleaned_female_etexts))

4974 1333


In [14]:
# Step 4: tokenize the texts

tokenized_male_etexts = []
tokenized_female_etexts = []
tokenized_ambiguous_etexts = []

In [15]:
# used to clear all the contents of the tokenized_xxxx_etexts
def reset_tokenizations():
    tokenized_male_etexts.clear()
    tokenized_female_etexts.clear()
    tokenized_ambiguous_etexts.clear()

In [16]:
# tokenize

reset_tokenizations()

i = 0
for etext, gender in cleaned_male_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
        
    tokenized_male_etexts.append([etext.split(), gender])
    
i = 0
for etext, gender in cleaned_female_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
        
    tokenized_female_etexts.append([etext.split(), gender])

0.15617679212868968 % complete
0.31235358425737936 % complete
0.46853037638606904 % complete
0.6247071685147587 % complete
0.7808839606434483 % complete
0.9370607527721381 % complete
1.0932375449008278 % complete
1.2494143370295174 % complete
1.405591129158207 % complete
1.5617679212868967 % complete
1.7179447134155865 % complete
1.8741215055442761 % complete
2.030298297672966 % complete
2.1864750898016556 % complete
2.3426518819303452 % complete
2.498828674059035 % complete
2.6550054661877245 % complete
2.811182258316414 % complete
2.9673590504451037 % complete
3.1235358425737934 % complete
3.2797126347024834 % complete
3.435889426831173 % complete
3.5920662189598627 % complete
3.7482430110885523 % complete
3.904419803217242 % complete
4.060596595345932 % complete
4.216773387474621 % complete
4.372950179603311 % complete
4.529126971732 % complete
4.6853037638606905 % complete
4.84148055598938 % complete
4.99765734811807 % complete
5.153834140246759 % complete
5.310010932375449 % compl

44.198032172419175 % complete
44.354208964547865 % complete
44.510385756676556 % complete
44.666562548805246 % complete
44.822739340933936 % complete
44.978916133062626 % complete
45.135092925191316 % complete
45.291269717320006 % complete
45.447446509448696 % complete
45.603623301577386 % complete
45.759800093706076 % complete
45.915976885834766 % complete
46.072153677963456 % complete
46.228330470092146 % complete
46.384507262220836 % complete
46.540684054349526 % complete
46.696860846478216 % complete
46.85303763860691 % complete
47.00921443073559 % complete
47.16539122286428 % complete
47.32156801499297 % complete
47.47774480712166 % complete
47.63392159925035 % complete
47.79009839137904 % complete
47.94627518350773 % complete
48.10245197563642 % complete
48.25862876776511 % complete
48.4148055598938 % complete
48.57098235202249 % complete
48.72715914415118 % complete
48.88333593627987 % complete
49.03951272840856 % complete
49.19568952053725 % complete
49.35186631266594 % complet

10.463845072622208 % complete
10.620021864750898 % complete
10.776198656879588 % complete
10.932375449008278 % complete
11.088552241136966 % complete
11.244729033265656 % complete
11.400905825394346 % complete
11.557082617523037 % complete
11.713259409651727 % complete
11.869436201780415 % complete
12.025612993909105 % complete
12.181789786037795 % complete
12.337966578166485 % complete
12.494143370295173 % complete
12.650320162423863 % complete
12.806496954552554 % complete
12.962673746681244 % complete
13.118850538809934 % complete
13.275027330938622 % complete
13.431204123067312 % complete
13.587380915196002 % complete
13.743557707324692 % complete
13.89973449945338 % complete
14.05591129158207 % complete
14.21208808371076 % complete
14.36826487583945 % complete
14.52444166796814 % complete
14.680618460096829 % complete
14.836795252225519 % complete
14.99297204435421 % complete
15.1491488364829 % complete
15.305325628611588 % complete
15.461502420740278 % complete
15.617679212868968

In [17]:
# at this point, it's using like 12 or 13 GB of RAM, so gotta make some space
def clear_etexts():
    etexts.clear()

In [18]:
clear_etexts()

In [19]:
shortest = len(tokenized_male_etexts[0][0])
longest = len(tokenized_male_etexts[0][0])
sum = 0
for etext, gender in tokenized_male_etexts:
    sum += len(etext)
    if len(etext) < shortest:
        shortest = len(etext)
    elif len(etext) > longest:
        longest = len(etext)
        
for etext, gender in tokenized_female_etexts:
    sum += len(etext)
    if len(etext) < shortest:
        shortest = len(etext)
    elif len(etext) > longest:
        longest = len(etext)

average_length = int(round(sum / (len(tokenized_male_etexts) + len(tokenized_female_etexts))))
print('Shortest etext: ', shortest, ' words\nLongest etext: ', longest, ' words\nAverage etext: ', average_length, ' words')

Shortest etext:  11  words
Longest etext:  2594329  words
Average etext:  62779  words


In [20]:
# TODO: cut out shortest etexts
# TODO: trim remaining to a manageable and uniform size
# TODO: construct a dictionary
# TODO: VECTORIZE

In [21]:
# the length of texts we want
# below this, add padding tokens <PAD>
# above this, trim
TEXT_LENGTH = average_length // 6

CUTOFF_LENGTH = 1000

# tokenized etexts of uniform
trimmed_male_etexts = []
trimmed_female_etexts = []

In [22]:
# if an etext is shorter than the designated length, adds padding to make it long enough
def pad_etext(etext):
    diff = TEXT_LENGTH - len(etext)
    padding = ['<PAD>'] * diff
    
    return etext + padding

# if an etext is longer than the designated length, removes words to make it short enough
def trim_etext(etext):
    return etext[:TEXT_LENGTH - 1]

# fits all the etexts in trimmed_xxxx_etexts to be of uniform length
def fit_etexts_to_size():
    for etext, gender in tokenized_male_etexts:
        if len(etext) < CUTOFF_LENGTH:
            pass
        elif len(etext) > TEXT_LENGTH:
            trimmed_male_etexts.append([trim_etext(etext), gender])
        elif len(etext) < TEXT_LENGTH:
            trimmed_male_etexts.append([pad_etext(etext), gender])
        else:
            trimmed_male_etexts.append([etext, gender])
    
    for etext, gender in tokenized_female_etexts:
        if len(etext) > TEXT_LENGTH:
            trimmed_female_etexts.append([trim_etext(etext), gender])
        elif len(etext) < TEXT_LENGTH:
            trimmed_female_etexts.append([pad_etext(etext), gender])
        else:
            trimmed_female_etexts.append([etext, gender])
            
    return

# used to clear all the contents of the trimmed_xxxx_etexts
def reset_fittings():
    trimmed_male_etexts.clear()
    trimmed_female_etexts.clear()

In [23]:
# make all the etexts to a uniform size
reset_fittings()
fit_etexts_to_size()

In [24]:
# clear up RAM
reset_classifications()
reset_cleanings()
reset_tokenizations()

In [25]:
# vocab
dictionary = {}

In [26]:
dictionary.clear()

total_length = len(trimmed_male_etexts) + len(trimmed_female_etexts)

# i is used for progress
# j is used for 
i = 0
j = 0
for etext, gender in trimmed_male_etexts:
    # TODO: handle <PAD>
    # TODO: progress percentage
    i += 1
    # if it finds the word, don't add
    # else, add to vocab
    for word in etext:
        if word not in dictionary.keys():
            # the value of j will tell us the index for the one-hot encoding of each word
            # in the vectorized etexts
            dictionary[word] = j
            j += 1
    
    if i % 10 == 0:
        print(100 * i / total_length, '% complete')
            
for etext, gender in trimmed_female_etexts:
    i += 1
    
    # if it finds the word, don't add
    # else, add to vocab
    for word in etext:
        if word not in dictionary.keys():
            # the value of j will tell us the index for the one-hot encoding of each word
            # in the vectorized etexts
            dictionary[word] = j
            j += 1
    
    if i % 10 == 0:
        print(100 * i / total_length, '% complete')

0.16005121638924455 % complete
0.3201024327784891 % complete
0.4801536491677337 % complete
0.6402048655569782 % complete
0.8002560819462228 % complete
0.9603072983354674 % complete
1.120358514724712 % complete
1.2804097311139564 % complete
1.440460947503201 % complete
1.6005121638924455 % complete
1.7605633802816902 % complete
1.9206145966709347 % complete
2.0806658130601794 % complete
2.240717029449424 % complete
2.4007682458386683 % complete
2.560819462227913 % complete
2.7208706786171577 % complete
2.880921895006402 % complete
3.0409731113956466 % complete
3.201024327784891 % complete
3.3610755441741356 % complete
3.5211267605633805 % complete
3.681177976952625 % complete
3.8412291933418694 % complete
4.001280409731114 % complete
4.161331626120359 % complete
4.321382842509603 % complete
4.481434058898848 % complete
4.641485275288092 % complete
4.801536491677337 % complete
4.961587708066581 % complete
5.121638924455826 % complete
5.28169014084507 % complete
5.441741357234315 % comple

46.414852752880925 % complete
46.57490396927017 % complete
46.73495518565941 % complete
46.89500640204866 % complete
47.0550576184379 % complete
47.21510883482714 % complete
47.37516005121639 % complete
47.53521126760563 % complete
47.69526248399488 % complete
47.85531370038412 % complete
48.015364916773365 % complete
48.175416133162614 % complete
48.335467349551855 % complete
48.495518565941104 % complete
48.655569782330346 % complete
48.81562099871959 % complete
48.97567221510884 % complete
49.13572343149808 % complete
49.29577464788732 % complete
49.45582586427657 % complete
49.61587708066581 % complete
49.77592829705506 % complete
49.9359795134443 % complete
50.096030729833544 % complete
50.25608194622279 % complete
50.416133162612034 % complete
50.57618437900128 % complete
50.736235595390525 % complete
50.89628681177977 % complete
51.056338028169016 % complete
51.21638924455826 % complete
51.37644046094751 % complete
51.53649167733675 % complete
51.69654289372599 % complete
51.856

91.70934699103714 % complete
91.86939820742637 % complete
92.02944942381562 % complete
92.18950064020487 % complete
92.3495518565941 % complete
92.50960307298335 % complete
92.6696542893726 % complete
92.82970550576185 % complete
92.98975672215109 % complete
93.14980793854033 % complete
93.30985915492958 % complete
93.46991037131882 % complete
93.62996158770807 % complete
93.79001280409732 % complete
93.95006402048655 % complete
94.1101152368758 % complete
94.27016645326505 % complete
94.43021766965428 % complete
94.59026888604353 % complete
94.75032010243278 % complete
94.91037131882203 % complete
95.07042253521126 % complete
95.23047375160051 % complete
95.39052496798976 % complete
95.550576184379 % complete
95.71062740076825 % complete
95.8706786171575 % complete
96.03072983354673 % complete
96.19078104993598 % complete
96.35083226632523 % complete
96.51088348271446 % complete
96.67093469910371 % complete
96.83098591549296 % complete
96.99103713188221 % complete
97.15108834827144 % 

In [27]:
# TODO: create a reverse dictionary so that, if you know the index, you can find the word

In [32]:
vocab_size = len(dictionary.keys())
print(vocab_size)

313865


In [33]:
# vectorizes an entire etext with a simple integer encoding
# this feeds into the model, where the embedding layer will perform the word embeddings
def vectorize_etext(etext):
    return np.array([dictionary[word] for word in etext])

In [36]:
print(vectorize_etext(trimmed_male_etexts[0][0]))

[   0    1    2 ... 1632 1632 1632]


In [43]:
# vectorize the whole dataset
vectorized_male_etexts = np.array([[vectorize_etext(etext), gender] for etext, gender in trimmed_male_etexts])
vectorized_female_etexts = np.array([[vectorize_etext(etext), gender] for etext, gender in trimmed_female_etexts])

In [44]:
print(vectorized_male_etexts.shape)
print(vectorized_male_etexts[0][0])
print(vectorized_female_etexts.shape)
print(vectorized_female_etexts[0][0])

(4915, 2)
[   0    1    2 ... 1632 1632 1632]
(1333, 2)
[   23 23399   130 ...    36   311    47]


In [45]:
# TODO: train-test split
test_train_split = 0.9

In [None]:
# FINALLY, THE MODEL

num_filters = 8
filter_size = 3
pool_size = 2

sequence_length = TEXT_LENGTH
num_classes = 2
embedding_size = 64


model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=10))
# the model will take as input an integer matrix of size (batch,
# input_length).
# the largest integer (i.e. word index) in the input should be no larger
# than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch
# dimension.

# model.add(Conv2D(num_filters, filter_size, input_shape=(28, 28, 1)))
# model.add(MaxPooling2D(pool_size=pool_size))


In [None]:
# compile the model
model.compile(
  'adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# train the model
model.fit(
  train_images,
  train_labels,
    epochs=50
)