#### STEPS TO GET DATA
1.  Download the data by running the following command in the project directory:
    `wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=html&langs[]=de"`
    
2.  Clean extraneous files by running the following commands (applies for Windows):
    `del /S *-8.zip`
    `del /S *-0.zip`
    `del /S robots.txt`
    `del /S harvest*`
    
`pip install bsddb3-6.2.6-cp37-cp37m-win_amd64.whl`
`pip install gutenberg`

In [None]:
# imports
from string import ascii_lowercase # for checking if letters
import numpy as np                 # numpy, duh...
#import glob                        # file reading
import zipfile                     # zipped file reading
import os                          # recursive navigation of file tree
import fnmatch                     # matching file name patterns






'''
# from gutenberg.acquire import load_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

from gutenberg.query import list_supported_metadatas
'''


#### STEPS TO TAKE IN SETTING UP MODEL

1.  Read in all of the dataset from the files
2.  Classify each work
    *  If author's first name is only in male.txt, then male (ADD TO MALE LIST, WILL COMBINE LATER)
    *  Else if author's first name is only in female.txt, then female (ADD TO FEMALE LIST, WILL COMBINE LATER)
    *  Else, ambiguous (ADD TO AMBIGUOUS LIST, CAN BE USED FOR MANUAL TESTING IF YOU WANT)
3.  Clean each text
    *  Remove headers and footers
    *  Remove characters that are not spaces or newlines or numbers or characters or hyphens (or question marks or exclamation marks?) or apostrophes (in contractions or possessive forms)
    *  Replace newlines with spaces
    *  (if using ? and !, replace with a space plus the mark, so it will count as a new word in tokenization)
    *  Remove double spaces
    *  Convert all capital letters to lowercase
4.  Tokenize each cleaned text
5.  Build a vocabulary???
5.  Vectorize each tokenized text

In [None]:
# Step 1: read in all of the .txt files

# holds all the strings of the etexts, and that's all
etexts = []

# recursively navigate the directory containing all the zipped etexts
for path, dirs, files in os.walk('./aleph.gutenberg.org'):
    
    # find all the zip folders
    for zip_name in fnmatch.filter(files,'*.zip'):
        zip_path = os.path.abspath(os.path.join(path, zip_name))
        #print(zip_path)
        
        # unzip and read the etext
        archive = zipfile.ZipFile(zip_path, 'r')
        for txt_name in archive.namelist():
            print(txt_name)
            #etext = str(archive.read(txt_name))
            
            # some of them have accented characters (which are non-ASCII), which throws off the decoding
            try:
                etext = archive.read(txt_name).decode('ascii')
                etexts.append(etext)
            except UnicodeDecodeError:
                pass
            
        

In [3]:
# Step 2: classify each etext

# keep these separate at first so we can minimize bias in the training set
# we ultimately want to have a decent gender balance in the training set to minimize bias
# it doesn't matter for testing set, as testing won't directly influence the model
male_etexts = []
female_etexts = []
ambiguous_etexts = []

# read in the lists of male and female names
female_names_txt = open('female.txt', 'r').read()
male_names_txt = open('male.txt', 'r').read()

In [4]:
# function to get a list of names from the name files
def get_names(names):
    index1 = names.rfind('#')
    index2 = names.find('\n', index1)
    
    # slice out the header
    no_header = names[index2:]
   
    # tokenized along whitespace to create a list of names
    return no_header.split()

In [5]:
female_names = get_names(female_names_txt)
male_names = get_names(male_names_txt)

In [6]:
# function to classify a given uncleaned etext by the author's first name
def classify_etext(etext):
    index1 = etext.find('Author: ')
    index2 = index1 + 8
    index3 = etext.find(' ', index2)
    name = etext[index2: index3]
    print(name)
    
    if name in female_names and not name in male_names or name == 'Mrs.':
        # classify as female
        female_etexts.append([etext, 0])
    elif name in male_names and not name in female_names:
        # classify as male
        male_etexts.append([etext, 1])
    else:
        # classify as ambiguous
        ambiguous_etexts.append(etext)
        
    return


# function to reset male_etexts, female_etexts, and ambiguous_etexts
# in case run classify_etext(...) more than once with a given etext
# basically, this is just for development purposes
def reset_classifications():
    male_etexts[:] = []
    female_etexts[:] = []
    ambiguous_etexts[:] = []
    
    return

In [7]:
# classify all the etexts
reset_classifications()
for etext in etexts:
    classify_etext(etext)

Lucius
William
Mary
Lindsay,
George
Giovanni
Stewart
S.
William
Unknown

Release
John
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Dante
Dante
Dante
Edited
Ada
A.
Charles
F.
Fannie

Harold
ject
Magnay,
William
Edgar
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Edith
Elizabeth
Aphra
Charles
James
E.R.
Joseph
William
H.
John
Various

Release
Janet
Margaret
Margaret
Margaret
Margaret
Anonymous

Release
Thomas
Henry
Jeffery
James
Max
Burton
Grace
Joseph
Edward
S.
Elizabeth
Various

Release
Venture
Harold
Various

Release
Douglas
Various

Release
Harriot
Robert
John
Anna
James
Anonymous

Release
James
Louise
An
Donald
Francesco
Various

Release
Various

Release
George
Burton
Richard
Euripides

Release
Edward
W.T.
Mrs.
John
Hudson,
William
Editor-in-Chief:
Various

Release
Various

Release
Various

Release
Hugh
Anonymous

Releas

Claude
Various

Release
Various

Release
Frank
Various

Release
Arthur
H.
Thornton
Emily
Stanley
Various

Release
Henry
William
Louis
Henry
Honore
Work
Various

Release
Ledyard
John
W.
Richard
Various

Release
George
H.
Various

Release
Augustus
Henry
Caroline
Charlotte
Albert
Cecil
David
Theodore
Theodore
Theodore
Theodore
Arthur
Le
Mildred
Richard
Collected
The
Francois
Francois
Francois
Francois
Francois
Francois
J.
Various

Release
Eliza
Eugene
Joseph
Clifford
Various

Release
A.
John
Seumas
Charlotte
W.W.
W.W.
W.W.
W.W.
W.W.
W.
Edward
William
Stephen
Various

Elizabeth
Thomas
William
Translated
Edited
Arthur
Algernon
Nicholas
Franklin
Mary
An
An
An
An
An
Gordon
Lucy
Charles
Lewis
Charles
Lewis
James
Anonymous

Translator:
Henry
Elihu
John
George
John
E.
Wm.
Cora
Frederick
Theophilus
Randall
L
Various

Release
Various

Release
L.
Francois
Joseph
Alice
Ella
Kate
C.
Various

Release
James
Frederick
John
Clarence
R.
Moncure
Walter
Walter
Ruth
R.
Thomas
Benja

Release
Various

Release
George
Edward
Edward
Edward
Edward
Edward
Various

Release
James
Various

Release
Various

Release
Yogi
Thomas
Lord
Kermit
Laura
S.R.
Anonymous

Release
Bertha
Coventry
Captain
Jean
Anonymous

Release
Henry
Israel
Various

Release
Eva
E.
Christopher
Emile
Nathaniel
Robert
Alfred
Various

Release
Various

Release
Various

Release
Edward
John
Edward
Edwin
Various

Release
Washington
Washington
Honore
Honore
Herman
Herman
Mary
Arnold
Charles
H.
Plato

Release
Mrs.
Various

Release
Richard
Zona
Various

Release
Frederick
Christopher
Laura
Various

Release
Emma
Robert
Various

Release
Calvin
Thomas
John
Various

Release
Charles
Amy
Edmund
W.
Nephi
F.
Etta
David
George
Various

Release
W.
Ethel
Robert
Arachne

Release
Elizabeth
Various

Release
Ethel
Grace
Anonymous

Release
T.
Elbert
Thomas
Mrs.
Richard
Finley
Stephen
Conrad
Various

Release
Ephraim
Lewis
Warren
Various

Release
Various

Release
A.
Anonymous

Release

C.
Ludovic
Rudyard
Mary
John
Horatio
Augusta
Various

Editor:
Lascelles
Mary
ject
ject
Unknown

Translator:
ject
Various

Release
Various

Release
Various

Release
Various

Editor:
Ruth
William
L.
Nehemiah
Frances
Jane
Carl
Sigmund
A
Robert
Henrik
Various

Release
Maurice
Mary
George
Jackson
James
ject
Robert
The
Allen
J.
William
Mary
Various

Release
William
John
Various

Release
Oliver
Frances
Kate
Honore
Honore
Honore
Honore
Howard
Honore
Ernest
Thornton
Thomas
Ian
C.
Mark
John
M.
Arthur
Rudyard
Various

Editor:
Pedro
David
Edgar
Frederick
Various

Release
James
Rosa
Various

Release
Oliver
Charlotte
William
Barney
Annie
Lucretia
Jacqueline
Various

Editor:
Mabell
Beatrice
Coningsby
Various

Release
Various

Editor:
Arthur
Caroline
Henry
Various

Editor:
Prosper
Antoine
Thomas
Mrs.
George
Kate
A.V.
Marie
Beatrix
Anonymous

Release
Henry
Frank
Zane
Benjamin
Prentiss
Various

Release
Sir
William
Jonas
T.
Marie
Juliana
Various

Release
Marguerite
J

G.
Laurence
John
Samuel
Richmal
Margaret
Francis
Edward
John
N.
Annie
James
Clement
Various

Editor:
Edward
Cory
Charles
George
Alice
Louise-Clarke
G.
Edwin
Hammurabi,
Frank
Martin
Frederick
Nelson
Jonathan
W.
Hugo
Anonymous

Release
Anonymous

Release
James
Dorothy
George
Bret
Frank
John
Amelia
A.
Mrs.
Arnold
Will
Mary
Thomas
Belle
Caroline
Thomas
Zora
Frank
Various

Editor:
Lacy
Sue
Edgar
Frederick,
Elbert
John
E.
William
Various

Editor:
Susanna
Christopher
Sax
Oscar
Gaston
Henry
Henry
Henry
William
Various

Editor:
John
William
R.
Various

Editor:
Charles
Timothy
Albert
Estelle
Wilfred
Annie
Mortimer
Various

Release
John
G.K.
H.
O.
Honore
Lydia
Daniel
James
Frances
H.
Petroleum
Various

Editor:
Irving
John
Gerhart
Ike
W.
Various

Editor:
William
Anonymous

Editor:
Nephi
Thornton
Edward
Peter
Laurence
Various

Editor:
Ernest
Mary
James
Sir
E.
George
Ontario
Charles
E.
John
Newell
Washington
Laura
Laura
Henry
Lily
Robert
Walter
Walter
Edward
John
Charlotte
Ge

Zoe
Arthur
E.
H.
Janet
Leonora
Thornton
Elbert
Various

Editor:
William
Fremont
Various

Editor:
Hugh
Norman
Charles
Andre
Thornton
Various

Release
G.K.
Mary
Anonymous



Release
Clement
Various

Release
Various

Editor:
Clyde
Henry
Nesta
Various

Editor:
Opie
C.
Hattie
Various

Editor:
Rick
Geraldine
Arthur
Lilli
Sam
Various

Editor:
Maurice
Martin
Honore
Nathaniel
James
George
Douglass
William
Louis
Various

Editor:
Arthur
Frank
Arthur
Edwin
Charles
W.
Thomas
Emma
Thomas
Sarah
Garrett
Sax
Estelle
William
Andre
George
Anna
Mark
Joseph
Various

Editor:
Jacob
G.
Clay
Various

Release
Gertrude
H.
Bernard
J.
John
William
B.M.
Frances
Bernard
McLoughlin
Stephen
Laura
George
Robert
Arthur
Margaret
Randolph
Kelly
Various

Release
Various

Editor:
Alexander
Christina
Elwyn
Edith
Charles
J.
H.
Jacob
George
Charlotte
Howard
William
Annie
Various

Editor:
Meredith
Henrik
G.
Henry
Bernarr
James
David
David
David
David
David
David
Tobias
E.
Honore
Honore
Anonymous

Talbot
Talbot
Talbot
J.
E.C.
Unknown

Release
Jean
Howard
Talbot
Edward
Elliott
Unknown



Release
Ottwell
Walter
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Harry
Lawrence
Harry
Arthur
James
C.
Various

Release
Anonymous

Release
J.
John
John
John
J.C.
Henry
John
Carey
Various

Release
Raymond
John
John
John
Mrs.
Mrs
Mrs.
Mrs.
Mrs.
John
J.
John
John
John
Mrs.
Mrs.
Elizabeth
John
Ned
Alice
Eugene
Mrs.
Mrs.
Mrs.
Mrs.
Mrs.
Mrs.
Leslie
Edward
Various

Editor:
Katherine
James
Mrs.
Thomas
Thomas
Thomas
Thomas
Thomas
Thomas
Thomas
Thomas
Thomas
Thomas
W.
T.P.
T.P.
Theodore
Theodore
Theodore
Lewis
Talbot
John
Lewis
Victor
Marmaduke
Alexander
Various

Editor:
Various

Editor:
Shearjashub
Marcus
R.
Arthur
George
Harold
George
R.
John
Giorgio
George
Frank
M.
P.
Geraldine
Grace
Charlotte
Charlotte
Franklin
Various

Release
Margaret
Margaret
Amy
Amy
Thomas
Thomas
Thomas
Fa-Hsien

Translator:
Bernardin
Maria
Amy
Amy
Amy
Emily

John.
William
Karen
William
Edward
Anonymous

Release
Fannie
Maurus
Various

Release
Jules
Henry
Gordon
Charles
John
J.
Gordon
Various

Release
Robert
E.
Hugo
Lieut.-Col.
E.
Anonymous

Release
William
Donald
Paul
Various

Release
Richard
Robert
William
Charlotte-Adelaide
George
Walter
A.E.
Tyrone
John
James
Elizabeth
Maurice
Eleanor
Various

Editor:
Anna
Anna
Anna
Anna
Anna
Anna
Nathan
Melvin
John
Thornton
Oliver
Joslyn
Brandon
Newell
Howard
Oliver
Anna
Rupert
Bret
Anonymous

Editor:
Hartley
Archibald
Edward
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
Mary
C.
Dhan
John
John
Alan
Alan
Alan
Charles
Anne
Various

Editor:
Burt
Alan
Alan
George
George
F.
Thomas
Alan
Alan
Rosa
Mary
Lionel
Arthur
Anonymous



Release
Kate
Stanley
Anonymous

Release
Everett
Stanley
Stanley
Anonymous

Release
Stanley
Johan
Harry
Henry
Margaret
Various

Editor:
Charles
Sir
Emilie
Dorothy
Charles
Elma
Mary
Anonymous

Release
Various

Editor:
Alan
Wayne
Various

Release
V

Mary
John
Mrs.
Isaac
Harry
Henry
Joseph
Richard
Rabindranath
Arthur
T.
Honor
Jean
J.
George
Edwin
Harry
George
R.M.
D.
Anonymous

Editor:
Various

Editor:
Robert
Emmett
Arthur
Various

Editor:
Henry
B.
Robert
Niccolo
Prosper


Oliver
Talbot
Constance
Charles
Sidney
Louis
Louis
Louis
Louis
Giovanni-Andrea
Anonymous

Release
Angela
Francis
James
Charlotte
H.
Milo
Joseph
Ministry
Edgerton
John
Oliver
W.H.G.
Eleanor
Anna
Edwin
W.H.G.
William
Woodrow
Anonymous

Release
William
W.
Justin
Carolyn
Alexander
A.F.
W.H.G.
Ada
Anonymous

Release
Charles
Marie
Annie
Bertram
Thomas
Friedrich
Mary
John
Sargent
Unknown

Release
John
Unknown

Release
Edison
Frank
Horace
Oliver
James
Edward
Paul
Unknown

Release
Young
Oscar
Edgar
Gordon
Winfield
Ira
Sophie
George
Frank
Elsie
W.H.G.
Paul
William
Pierre
Charles
Various

Editor:
Frank
Various

Editor:
Everett
Ralph
Hilda
Clara
Maria
Joel
Victor
Bret
Charles
Joseph
Katharine
Arthur
Charles
Unknown

Release
Charles
Unknown

Release


Eleanor
Joseph
Joseph
Ralph
Ralph
Albert
Mrs.
George
Walter
Theodore
Theodore
Edward
Eugene
Mary
Thomas
Walter
Richard
Rebecca
[AKA
[AKA
Zitkala-Sa

Release
[AKA
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
The
The
The
The
The
The
The
The
The
The
The
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
Sophocles

Translator:
Charlotte
H.
William
William
Robert
Anthony
[AKA
Robert
Robert
Bram
Unknown

Release
Homer
Samuel
Albert
Mary
Ouida
Ralph
William
Thomas
E.
James
Richard
Mary
Watkin
Watkin
James
Edna
W.
Edna
John
Arthur
Arthur
William
Richard
Count
Constant

Release
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Michel
Anthony
Ella
Count
Cou

In [8]:
print(len(male_etexts), len(female_etexts), len(ambiguous_etexts))

6403 1892 5472


In [9]:
print(len(etexts))

13767


In [13]:
# Step 3: clean the etexts

# function to cut out the publishing and legal info at the beginning and end of the etexts
def strip_headers(etext):
    # TODO: cut off fron and back portions that have extraneous publishing info
    
    # formatting isn't always identical, and I don't want to have to deal with all the possible formatting edge cases
    try:
        index1 = etext.index('*** START OF THIS PROJECT GUTENBERG EBOOK') + 3
    except:
        return None
    
    # index2 is the index just past the junk at the beginning of the etext
    try:
        index2 = etext.index('***', index1) + 3
    except:
        return None
    
    # index3 is the index that marks the junk at the end of the etext
    try:
        index3 = etext.index("*** END OF THIS PROJECT GUTENBERG EBOOK")
    except:
        return None
    
    return etext[index2:index3]


# used to cut down a passage to just lowercase letters, hyphens (for compound words), and spaces
def clean(etext):
    # strip the header and footer from the etext
    stripped_etext = strip_headers(etext)
    
    if stripped_etext == None:
        return None
    
    valids = ''

    for character in stripped_etext:
        # might not be necessary
        if character == '\n':
            character = ' '
    
        # definitely necessary
        # TODO: might need to handle cases of accented characters, as str.isalpha() doesn't handle them
        # TODO: might need to include apostrophes (for possessive nouns and contractions),
        # however, some texts likely use single quotes for quotes, so would likely need to include double quotes
        # TODO: maybe handle ! and ?
        if character.isalpha() or character == ' ' or character == '-':
            # check if preceding character is a space
            # if it is a space, no double or triple or n-tuple spaces
            # and, if the preceding character is a space, the length of valids will necessarily be greater than 0
            if len(valids) > 0 and character == ' ':
                if valids[len(valids) - 1] != ' ':
                    valids += character
            else:
                valids += character
      
    return valids.lower()

# used to reset clean_xxxx_etexts in case you clean all of them several times
def reset_cleanings():
    cleaned_male_etexts[:] = []
    cleaned_female_etexts[:] = []
    cleaned_ambiguous_etexts[:] = []
    
    return

In [17]:
print(clean(etexts[0]))

 produced by ted garvin ben courtney and pg distributed proofreaders seneca apocolocyntosis with an english translation by whd rouse ma litt d mcmxx introduction this piece is ascribed to seneca by ancient tradition it is impossible to prove that it is his and impossible to prove that it is not the matter will probably continue to be decided by every one according to his view of senecas character and abilities in the matters of style and of sentiment much may be said on both sides dion cassius lx says that seneca composed an greek apokolokuntosis or pumpkinification of claudius after his death the title being a parody of the usual greek apotheosis but this title is not given in the mss of the ludus de morte claudii nor is there anything in the piece which suits the title very well as a literary form the piece belongs to the class called satura menippea a satiric medley in prose and verse this text is that of buecheler with a few trifling changes which are indicated in the notes we have

In [None]:
cleaned_male_etexts = []
cleaned_female_etexts = []
cleaned_ambiguous_etexts = []

In [16]:
reset_cleanings()

i = 0
for etext, gender in male_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_male_etexts.append(cleaned_etext)
        

i = 0
for etext, gender in female_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(female_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_female_etexts.append(cleaned_etext)



for etext in ambiguous_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(ambiguous_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_ambiguous_etexts.append(cleaned_etext)

KeyboardInterrupt: 

In [None]:
print(cleaned_female_etexts[0])

In [None]:
print(len(etexts))