#### STEPS TO GET DATA
1.  Download the data by running the following command in the project directory:
    `wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=html&langs[]=de"`
    
2.  Clean extraneous files by running the following commands (applies for Windows):
    `del /S *-8.zip`
    `del /S *-0.zip`
    `del /S robots.txt`
    `del /S harvest*`
    
`pip install bsddb3-6.2.6-cp37-cp37m-win_amd64.whl`
`pip install gutenberg`

In [40]:
# imports
from string import ascii_lowercase # for checking if letters
import numpy as np                 # numpy, duh...
import zipfile                     # zipped file reading
import os                          # recursive navigation of file tree
import fnmatch                     # matching file name patterns
import tensorflow as tf            # tensorflow, duh...





'''
# from gutenberg.acquire import load_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

from gutenberg.query import list_supported_metadatas
'''


'\n# from gutenberg.acquire import load_metadata\nfrom gutenberg.acquire import load_etext\nfrom gutenberg.cleanup import strip_headers\n\nfrom gutenberg.query import get_etexts\nfrom gutenberg.query import get_metadata\n\nfrom gutenberg.query import list_supported_metadatas\n'

#### STEPS TO TAKE IN SETTING UP MODEL

1.  Read in all of the dataset from the files
2.  Classify each work
    *  If author's first name is only in male.txt, then male (ADD TO MALE LIST, WILL COMBINE LATER)
    *  Else if author's first name is only in female.txt, then female (ADD TO FEMALE LIST, WILL COMBINE LATER)
    *  Else, ambiguous (ADD TO AMBIGUOUS LIST, CAN BE USED FOR MANUAL TESTING IF YOU WANT)
3.  Clean each text
    *  Remove headers and footers
    *  Remove characters that are not spaces or newlines or numbers or characters or hyphens (or question marks or exclamation marks?) or apostrophes (in contractions or possessive forms)
    *  Replace newlines with spaces
    *  (if using ? and !, replace with a space plus the mark, so it will count as a new word in tokenization)
    *  Remove double spaces
    *  Convert all capital letters to lowercase
4.  Tokenize each cleaned text
5.  Build a vocabulary???
5.  Vectorize each tokenized text

In [26]:
# Step 1: read in all of the .txt files

# holds all the strings of the etexts, and that's all
etexts = []

# recursively navigate the directory containing all the zipped etexts
for path, dirs, files in os.walk('./aleph.gutenberg.org'):
    
    # find all the zip folders
    for zip_name in fnmatch.filter(files,'*.zip'):
        zip_path = os.path.abspath(os.path.join(path, zip_name))
        #print(zip_path)
        
        # unzip and read the etext
        archive = zipfile.ZipFile(zip_path, 'r')
        for txt_name in archive.namelist():
            print(txt_name)
            #etext = str(archive.read(txt_name))
            
            # some of them have accented characters (which are non-ASCII), which throws off the decoding
            try:
                etext = archive.read(txt_name).decode('ascii')
                etexts.append(etext)
            except UnicodeDecodeError:
                pass
            
        

10001.txt
10002.txt
10003.txt
10004.txt
10005.txt
10006.txt
10008.txt
10009.txt
10010.txt
10011.txt
10012.txt
10013.txt
10014.txt
10015.txt
10016.txt
10017.txt
10018.txt
10019.txt
1005.txt
1006.txt
1007.txt
10020.txt
10021.txt
10022/10022.txt
10023.txt
10024.txt
10025.txt
10026.txt
10027.txt
10028.txt
10029.txt
10030.txt
10031.txt
10032.txt
10033.txt
10034.txt
10035.txt
10036.txt
10037.txt
10038.txt
10039.txt
10040.txt
10041.txt
10042.txt
10043.txt
10044/10044.txt
10045.txt
10046.txt
10047.txt
10048.txt
10049.txt
10050.txt
10051.txt
10052.txt
10056.txt
10057/10057.txt
10058/10058.txt
10059.txt
10060.txt
10062.txt
10063.txt
10064.txt
10065.txt
10066/10066.txt
10067.txt
10068.txt
10069.txt
10070.txt
10071.txt
10072.txt
10074.txt
10075.txt
10076.txt
10077.txt
10078.txt
10079.txt
10080.txt
10081.txt
10082.txt
10083.txt
10084.txt
10085/10085.txt
10086.txt
10087.txt
10088.txt
10089.txt
10090.txt
10091/10091.txt
10092/10092.txt
10093.txt
10094.txt
10095.txt
10096/10096.txt
10097/10097.txt
100

10867.txt
10868/10868.txt
10869.txt
10870/10870.txt
10871/10871.txt
10872/10872.txt
10873/10873.txt
10874/10874.txt
10875/10875.txt
10876.txt
10877.txt
10878/10878.txt
10879/10879.txt
10880/10880.txt
10881/10881.txt
10882/10882.txt
10883/10883.txt
10884/10884.txt
10885/10885.txt
10886.txt
10887/10887.txt
10888/10888.txt
10889/10889.txt
10890/10890.txt
10891.txt
10892/10892.txt
10893/10893.txt
10894/10894.txt
10895/10895.txt
10896/10896.txt
10897.txt
10898.txt
10899.txt
10901.txt
10902.txt
10903.txt
10904.txt
10905.txt
10907.txt
10908.txt
10910.txt
10911.txt
10912.txt
10913.txt
10915.txt
10916.txt
10918.txt
10919.txt
1093.txt
1096.txt
1097.txt
10920.txt
10921.txt
10922.txt
10923.txt
10924.txt
10925.txt
10926.txt
10928.txt
10929.txt
10930.txt
10931.txt
10932.txt
10933.txt
10934.txt
10935.txt
10936.txt
10937.txt
10938.txt
10939.txt
10940.txt
10942.txt
10943.txt
10944.txt
10945.txt
10946.txt
10947.txt
10948.txt
10949.txt
10950.txt
10951.txt
10952.txt
10954.txt
10955.txt
10956.txt
10957.txt

11859.txt
11860.txt
11861.txt
11862.txt
11863.txt
11864.txt
11865.txt
11867.txt
11868.txt
11869.txt
11870.txt
11871.txt
11872.txt
11873.txt
11874.txt
11875.txt
11876.txt
11877.txt
11878.txt
11880.txt
11881.txt
11882.txt
11883.txt
11885.txt
11886.txt
11887.txt
11888.txt
11889.txt
11890.txt
11892.txt
11894.txt
11895.txt
11896.txt
11897.txt
11898.txt
11900.txt
11901.txt
11902.txt
11903.txt
11904.txt
11906.txt
11907.txt
11908.txt
11909.txt
11910.txt
11911.txt
11912.txt
11913.txt
11915.txt
11917.txt
11918.txt
11919.txt
1190.txt
1191.txt
1194.txt
1195.txt
1196.txt
11920.txt
11921.txt
11922.txt
11923.txt
11924.txt
11926.txt
11929.txt
11930.txt
11931.txt
11932.txt
11933.txt
11934.txt
11935.txt
11936.txt
11937.txt
11938.txt
11939.txt
11941.txt
11942.txt
11943.txt
11944.txt
11945.txt
11946.txt
11947.txt
11948.txt
11949.txt
11950.txt
11951.txt
11952.txt
11953.txt
11954.txt
11955.txt
11956.txt
11957.txt
11958.txt
11959.txt
11960.txt
11961.txt
11962.txt
11963.txt
11965.txt
11966.txt
11969.txt
11970

12820.txt
12821.txt
12823.txt
12825.txt
12826.txt
12827.txt
12828.txt
12830.txt
12832.txt
12833.txt
12834.txt
12835.txt
12836.txt
12839.txt
12841.txt
12842.txt
12843.txt
12845.txt
12846.txt
12847.txt
12849.txt
12850.txt
12851.txt
12852.txt
12853.txt
12854.txt
12855.txt
12856.txt
12857.txt
12858.txt
12859.txt
12860.txt
12861.txt
12863.txt
12864.txt
12866.txt
12867.txt
12868.txt
12870.txt
12871.txt
12872.txt
12873.txt
12874.txt
12875.txt
12876.txt
12878.txt
12879.txt
12880.txt
12881.txt
12882.txt
12883.txt
12884.txt
12886.txt
12887.txt
12888.txt
12890.txt
12891.txt
12892.txt
12894.txt
12895.txt
12896.txt
12897.txt
12898.txt
12899.txt
12900.txt
12901.txt
12902.txt
12903.txt
12904.txt
12905.txt
12908.txt
12909.txt
12910.txt
12911.txt
12912.txt
12913.txt
12914.txt
12915.txt
12916.txt
12917.txt
12918.txt
12919.txt
1291.txt
1294.txt
1295.txt
1296.txt
12920.txt
12922.txt
12923.txt
12924.txt
12925.txt
12926.txt
12928.txt
12929.txt
12930.txt
12931.txt
12932.txt
12933.txt
12934.txt
12935.txt
1293

13790.txt
13791.txt
13796.txt
13797.txt
13799.txt
13800.txt
13801.txt
13803.txt
13806.txt
13809.txt
13811.txt
13812.txt
13813.txt
13814.txt
13815.txt
13816.txt
13817.txt
1380.txt
1389.txt
13820.txt
13821.txt
13822.txt
13823.txt
13824.txt
13826.txt
13827.txt
13828.txt
13829.txt
13830.txt
13831.txt
13832.txt
13833.txt
13835.txt
13836.txt
13840.txt
13841.txt
13842.txt
13843.txt
13844.txt
13847.txt
13851.txt
13852.txt
13853.txt
13854.txt
13858.txt
13859.txt
13860.txt
13864.txt
13865.txt
13870.txt
13871.txt
13872.txt
13876.txt
13877.txt
13878.txt
13879.txt
13880.txt
13881.txt
13882.txt
13883.txt
13884.txt
13885.txt
13886.txt
13887.txt
13888.txt
13889.txt
13890.txt
13891.txt
13893.txt
13894.txt
13895.txt
13896.txt
13897.txt
13898.txt
13899.txt
13900.txt
13903.txt
13905.txt
13906.txt
13907.txt
13908.txt
13909.txt
13910.txt
13911.txt
13912.txt
13913.txt
13916.txt
13918.txt
1392.txt
1394.txt
1395.txt
1396.txt
1397.txt
1399.txt
13922.txt
13923.txt
13924.txt
13925.txt
13926.txt
13927.txt
13928.tx

14809.txt
14811.txt
14812.txt
14813.txt
14814.txt
14815.txt
14817.txt
14818.txt
1480.txt
1481.txt
1482.txt
1484.txt
1488.txt
14821.txt
14823.txt
14824.txt
14825.txt
14829.txt
14831.txt
14832.txt
14833.txt
14834.txt
14835.txt
14836.txt
14837.txt
14838.txt
14841.txt
14842.txt
14843.txt
14844.txt
14845.txt
14846.txt
14848.txt
14849.txt
14851.txt
14852.txt
14853.txt
14854.txt
14855.txt
14856.txt
14857.txt
14858.txt
14859.txt
14860.txt
14863.txt
14865.txt
14866.txt
14867.txt
14868.txt
14869.txt
14870.txt
14871.txt
14872.txt
14873.txt
14874.txt
14875.txt
14876.txt
14877.txt
14879.txt
14880.txt
14881.txt
14882.txt
14883.txt
14884.txt
14885.txt
14886.txt
14887.txt
14888.txt
14889.txt
14890.txt
14891.txt
14892.txt
14893.txt
14895.txt
14896.txt
14897.txt
14898.txt
14899.txt
14900.txt
14901.txt
14902.txt
14903.txt
14906.txt
14907.txt
14908.txt
14909.txt
14910.txt
14914.txt
14916.txt
14917.txt
14919.txt
1492.txt
14920.txt
14921.txt
14922.txt
14923.txt
14924.txt
14925.txt
14926.txt
14927.txt
14928.

15827.txt
15828.txt
15829.txt
15830.txt
15831.txt
15833.txt
15834.txt
15835.txt
15836.txt
15837.txt
15838.txt
15839.txt
15840.txt
15841.txt
15843.txt
15850.txt
15851.txt
15852.txt
15853.txt
15854.txt
15855.txt
15856.txt
15857.txt
15858.txt
15859.txt
15860.txt
15861.txt
15862.txt
15863.txt
15864.txt
15865.txt
15866.txt
15867.txt
15868.txt
15869.txt
15870.txt
15872.txt
15873.txt
15874.txt
15875.txt
15876.txt
15877.txt
15878.txt
15879.txt
15880.txt
15881.txt
15883.txt
15884.txt
15886.txt
15887.txt
15888.txt
15889.txt
15892.txt
15893.txt
15894.txt
15895.txt
15896.txt
15899.txt
15900.txt
15901.txt
15902.txt
15903.txt
15904.txt
15905.txt
15906.txt
15909.txt
15910.txt
15911.txt
15912.txt
15913.txt
15914.txt
15916.txt
15917.txt
15918.txt
15919.txt
1590.txt
1594.txt
15920.txt
15921.txt
15922.txt
15923.txt
15924.txt
15925.txt
15926.txt
15927.txt
15928.txt
15929.txt
15930.txt
15931.txt
15932.txt
15934.txt
15935.txt
15936.txt
15937.txt
15938.txt
15940.txt
15941.txt
15944.txt
15945.txt
15946.txt
15

16845.txt
16847.txt
16853.txt
16855.txt
16856.txt
16857.txt
16858.txt
16859.txt
16860.txt
16861.txt
16863.txt
16864.txt
16865.txt
16866.txt
16867.txt
16868.txt
16869.txt
16870.txt
16871.txt
16872.txt
16873.txt
16877.txt
16878.txt
16879.txt
16889.txt
16890.txt
16891.txt
16892.txt
16894.txt
16895.txt
16896.txt
16897.txt
16898.txt
16900.txt
16902.txt
16903.txt
16904.txt
16905.txt
16906.txt
16907.txt
16908.txt
16909.txt
16910.txt
16911.txt
16912.txt
16913.txt
16914.txt
16915.txt
16916.txt
16917.txt
16918.txt
16919.txt
1697.txt
16921.txt
16923.txt
16924.txt
16925.txt
16926.txt
16927.txt
16928.txt
16929.txt
16930.txt
16931.txt
16932.txt
16933.txt
16935.txt
16936.txt
16937.txt
16938.txt
16939.txt
16940.txt
16941.txt
16942.txt
16943.txt
16945.txt
16946.txt
16947.txt
16948.txt
16949.txt
16950.txt
16951.txt
16953.txt
16954.txt
16955.txt
16956.txt
16957.txt
16958.txt
16959.txt
16960.txt
16962.txt
16963.txt
16964.txt
16965.txt
16966.txt
16967.txt
16968.txt
16969.txt
16971.txt
16972.txt
16973.txt
1

17956.txt
17957.txt
17958.txt
17959.txt
17960.txt
17961.txt
17964.txt
17965.txt
17966.txt
17967.txt
17968.txt
17969.txt
17970.txt
17971.txt
17972.txt
17973.txt
17976.txt
17977.txt
17978.txt
17979.txt
17981.txt
17982.txt
17985.txt
17987.txt
17988.txt
17993.txt
17994.txt
17997.txt
17998.txt
17999.txt
18000.txt
18001.txt
18002.txt
18004.txt
18007.txt
18009.txt
18010.txt
18011.txt
18012.txt
18013.txt
18018.txt
18019.txt
1804.txt
1805.txt
18020.txt
18021.txt
18022.txt
18025.txt
18031.txt
18032.txt
18033.txt
18035.txt
18036.txt
18037.txt
18038.txt
18039.txt
18040.txt
18041.txt
18042.txt
18044.txt
18045.txt
18047.txt
18048.txt
18049.txt
18050.txt
18051.txt
18052.txt
18053.txt
18054.txt
18056.txt
18057.txt
18058.txt
18060.txt
18062.txt
18063.txt
18065.txt
18068.txt
18070.txt
18071.txt
18076.txt
18077.txt
18078.txt
18079.txt
18080.txt
18086.txt
18087.txt
18091.txt
18093.txt
18094.txt
18095.txt
18096.txt
18097.txt
18099.txt
18100.txt
18102.txt
18103.txt
18104.txt
18105.txt
18107.txt
18109.txt
18

19067.txt
19068.txt
19069.txt
19070.txt
19071.txt
19073.txt
19074.txt
19076.txt
19077.txt
19078.txt
19079.txt
19080.txt
19081.txt
19082.txt
19083.txt
19084.txt
19085.txt
19087.txt
19089.txt
19090.txt
19092.txt
19093.txt
19094.txt
19096.txt
19097.txt
19098.txt
19099.txt
19100.txt
19101.txt
19102.txt
19103.txt
19104.txt
19105.txt
19107.txt
19108.txt
19109.txt
19110.txt
19111.txt
19113.txt
19114.txt
19115.txt
19116.txt
19117.txt
19118.txt
19119.txt
1911.txt
1912.txt
1916.txt
19120.txt
19121.txt
19122.txt
19123.txt
19126.txt
19127.txt
19129.txt
19130.txt
19131.txt
19132.txt
19133.txt
19134.txt
19135.txt
19136.txt
19138.txt
19139.txt
19140.txt
19141.txt
19142.txt
19143.txt
19144.txt
19145.txt
19146.txt
19147.txt
19148.txt
19150.txt
19151.txt
19153.txt
19154.txt
19155.txt
19156.txt
19157.txt
19158.txt
19160.txt
19162.txt
19164.txt
19165.txt
19166.txt
19167.txt
19168.txt
19169.txt
19170.txt
19171.txt
19172.txt
19173.txt
19174.txt
19175.txt
19177.txt
19179.txt
19180.txt
19181.txt
19185.txt
191

20192.txt
20193.txt
20194.txt
20195.txt
20196.txt
20197.txt
20198.txt
20200.txt
20201.txt
20202.txt
20203.txt
20204.txt
20205.txt
20206.txt
20207.txt
20208.txt
20209.txt
20210.txt
20212.txt
20213.txt
20214.txt
20215.txt
20216.txt
20217.txt
20218.txt
20219.txt
20220.txt
20221.txt
20222.txt
20223.txt
20224.txt
20225.txt
20226.txt
20229.txt
2021.txt
2022.txt
2024.txt
2026.txt
2028.txt
20230.txt
20231.txt
20232.txt
20233.txt
20235.txt
20236.txt
20237.txt
20238.txt
20239.txt
20241.txt
20242.txt
20243.txt
20245.txt
20247.txt
20248.txt
20249.txt
20250.txt
20251.txt
20255.txt
20256.txt
20257.txt
20258.txt
20259.txt
20260.txt
20261.txt
20263.txt
20264.txt
20279.txt
20280.txt
20281.txt
20282.txt
20283.txt
20286.txt
20287.txt
20288.txt
20289.txt
20290.txt
20291.txt
20292.txt
20293.txt
20294.txt
20295.txt
20296.txt
20297.txt
20298.txt
20299.txt
201.txt
202.txt
204.txt
206.txt
208.txt
209.txt
20300.txt
20301.txt
20303.txt
20304.txt
20305.txt
20306.txt
20307.txt
20308.txt
20309.txt
20310.txt
20311.t

21248.txt
21249.txt
21250.txt
21251.txt
21252.txt
21253.txt
21254.txt
21255.txt
21256.txt
21258.txt
21259.txt
21260.txt
21261.txt
21262.txt
21263.txt
21264.txt
21266.txt
21267.txt
21268.txt
21270.txt
21271.txt
21272.txt
21273.txt
21274.txt
21275.txt
21276.txt
21278.txt
21279.txt
21280.txt
21281.txt
21284.txt
21285.txt
21286.txt
21288.txt
21291.txt
21292.txt
21293.txt
21294.txt
21295.txt
21296.txt
21297.txt
21298.txt
21299.txt
210.txt
211.txt
214.txt
216.txt
217.txt
219.txt
21300.txt
21301.txt
21302.txt
21303.txt
21304.txt
21305.txt
21306.txt
21307.txt
21308.txt
21309.txt
21310.txt
21311.txt
21312.txt
21313.txt
21314.txt
21315.txt
21316.txt
21317.txt
21318.txt
21319.txt
21320.txt
21321.txt
21322.txt
21323.txt
21324.txt
21325.txt
21326.txt
21327.txt
21328.txt
21329.txt
2130.txt
2131.txt
2132.txt
2133.txt
2135.txt
21330.txt
21331.txt
21332.txt
21333.txt
21334.txt
21335.txt
21337.txt
21338.txt
21339.txt
21340.txt
21341.txt
21342.txt
21344.txt
21345.txt
21346.txt
21348.txt
21351.txt
21352.t

22207.txt
22208.txt
22210.txt
22211.txt
22212.txt
22213.txt
22214.txt
22215.txt
22216.txt
22217.txt
22218.txt
22219.txt
22220.txt
22221.txt
22222.txt
22223.txt
22224.txt
22225.txt
22226.txt
22227.txt
22228.txt
22229.txt
22230.txt
22231.txt
22232.txt
22233.txt
22234.txt
22236.txt
22237.txt
22238.txt
22239.txt
22240.txt
22241.txt
22242.txt
22243.txt
22244.txt
22245.txt
22246.txt
22247.txt
22248.txt
22249.txt
22250.txt
22251.txt
22252.txt
22254.txt
22255.txt
22256.txt
22257.txt
22258.txt
22260.txt
22261.txt
22263.txt
22264.txt
22265.txt
22267.txt
22269.txt
22270.txt
22271.txt
22272.txt
22273.txt
22274.txt
22275.txt
22276.txt
22277.txt
22278.txt
22279.txt
22280.txt
22281.txt
22282.txt
22283.txt
22284.txt
22285.txt
22286.txt
22287.txt
22288.txt
22289.txt
22290.txt
22291.txt
22292.txt
22293.txt
22294.txt
22295.txt
22297.txt
22298.txt
220.txt
222.txt
223.txt
224.txt
228.txt
22300.txt
22301.txt
22302.txt
22303.txt
22304.txt
22305.txt
22306.txt
22307.txt
22308.txt
22309.txt
22310.txt
22311.txt


23278.txt
23280.txt
23281.txt
23282.txt
23283.txt
23284.txt
23286.txt
23287.txt
23288.txt
23290.txt
23291.txt
23292.txt
23293.txt
23295.txt
23296.txt
23299.txt
230.txt
232.txt
234.txt
236.txt
238.txt
23300.txt
23301.txt
23302.txt
23303.txt
23304.txt
23305.txt
23307.txt
23308.txt
23310.txt
23311.txt
23312.txt
23314.txt
23315.txt
23316.txt
23317.txt
23318.txt
23319.txt
23320.txt
23321.txt
23322.txt
23323.txt
23324.txt
23325.txt
23326.txt
23327.txt
23328.txt
23329.txt
2330.txt
2331.txt
23330.txt
23331.txt
23332.txt
23334.txt
23335.txt
23336.txt
23337.txt
23338.txt
23339.txt
23340.txt
23343.txt
23344.txt
23347.txt
23348.txt
23349.txt
23350.txt
23351.txt
23352.txt
23353.txt
23354.txt
23355.txt
23356.txt
23357.txt
23358.txt
23359.txt
23360.txt
23361.txt
23362.txt
23363.txt
23364.txt
23365.txt
23366.txt
23367.txt
23368.txt
23369.txt
23370.txt
23371.txt
23372.txt
23373.txt
23374.txt
23375.txt
23376.txt
23377.txt
23378.txt
23379.txt
23380.txt
23381.txt
23382.txt
23383.txt
23384.txt
23385.txt
23

24432.txt
24433.txt
24434.txt
24435.txt
24436.txt
24437.txt
24438.txt
24439.txt
24440.txt
24441.txt
24442.txt
24443.txt
24444.txt
24446.txt
24449.txt
24450.txt
24451.txt
24452.txt
24453.txt
24454.txt
24456.txt
24457.txt
24458.txt
24459.txt
24460.txt
24461.txt
24465.txt
24469.txt
24470.txt
24471.txt
24472.txt
24473.txt
24474.txt
24475.txt
24476.txt
24477.txt
24478.txt
24479.txt
24482.txt
24483.txt
24484.txt
24485.txt
24487.txt
24488.txt
24489.txt
24491.txt
24492.txt
24493.txt
24494.txt
24495.txt
24496.txt
24497.txt
24498.txt
24499.txt
24500.txt
24502.txt
24503.txt
24504.txt
24505.txt
24506.txt
24507.txt
24509.txt
24510.txt
24513.txt
24516.txt
24517.txt
24518.txt
24519.txt
24520.txt
24521.txt
24522.txt
24524.txt
24526.txt
24528.txt
24529.txt
2450.txt
2451.txt
2452.txt
2457.txt
2458.txt
2459.txt
24530.txt
24531.txt
24532.txt
24534.txt
24535.txt
24537.txt
24538.txt
24539.txt
24540.txt
24541.txt
24542.txt
24543.txt
24544.txt
24545.txt
24547.txt
24548.txt
24550.txt
24551.txt
24552.txt
24553.

2555.txt
2556.txt
25531.txt
25532.txt
25533.txt
25534.txt
25535.txt
25536.txt
25538.txt
25540.txt
25541.txt
25542.txt
25544.txt
25545.txt
25546.txt
25547.txt
25548.txt
25549.txt
25550.txt
25551.txt
25552.txt
25553.txt
25555.txt
25556.txt
25560.txt
25562.txt
25563.txt
25564.txt
25565.txt
25566.txt
25567.txt
25568.txt
25569.txt
25570.txt
25571.txt
25572.txt
25573.txt
25574.txt
25577.txt
25578.txt
25579.txt
25581.txt
25582.txt
25583.txt
25584.txt
25585.txt
25586.txt
25588.txt
25589.txt
25590.txt
25591.txt
25592.txt
25595.txt
25596.txt
25597.txt
25598.txt
25599.txt
25600.txt
25601.txt
25603.txt
25607.txt
25608.txt
25609.txt
25610.txt
25611.txt
25617.txt
25618.txt
25619.txt
25620.txt
25621.txt
25622.txt
25623.txt
25624.txt
25625.txt
25626.txt
25627.txt
25628.txt
25629.txt
2560.txt
2565.txt
2566.txt
25630.txt
25631.txt
25632.txt
25633.txt
25634.txt
25637.txt
25638.txt
25639.txt
25642.txt
25643.txt
25644.txt
25645.txt
25646.txt
25647.txt
25648.txt
25650.txt
25651.txt
25652.txt
25653.txt
25654

3756.txt
3762.txt
3763.txt
3764.txt
3765.txt
3766.txt
3787.txt
3797.txt
3802.txt
3803.txt
3805.txt
3813.txt
3815.txt
3816.txt
3821.txt
3822.txt
3823.txt
3824.txt
3828.txt
3838.txt
3839.txt
384.txt
3840.txt
3841.txt
3842.txt
3843.txt
3844.txt
3845.txt
3846.txt
3847.txt
3848.txt
3849.txt
3850.txt
3851.txt
3852.txt
3853.txt
3854.txt
3855.txt
3856.txt
3857.txt
3858.txt
3859.txt
3860.txt
3861.txt
3862.txt
3863.txt
3864.txt
3865.txt
3866.txt
3867.txt
3868.txt
3869.txt
3870.txt
3871.txt
3872.txt
3873.txt
3874.txt
3875.txt
3876.txt
3877.txt
3878.txt
3879.txt
3880.txt
3881.txt
3882.txt
3883.txt
3884.txt
3885.txt
3886.txt
3887.txt
3888.txt
3889.txt
3890.txt
3891.txt
3892.txt
3893.txt
3894.txt
3895.txt
3896.txt
3897.txt
3898.txt
3899.txt
3900.txt
3901.txt
3902.txt
3903.txt
3904.txt
3905.txt
3906.txt
3907.txt
3908.txt
3909.txt
3910.txt
3911.txt
3912.txt
3913.txt
3918.txt
3922.txt
3926.txt
3930.txt
3934.txt
3938.txt
398.txt
399.txt
3942.txt
3946.txt
3953.txt
3957.txt
3962.txt
3967.txt
3971.txt
3975

8582.txt
8583.txt
8584.txt
8585.txt
8586.txt
8587.txt
8588.txt
8589.txt
8593.txt
8594.txt
8595.txt
8598.txt
8600.txt
8667.txt
8668.txt
8674.txt
8681.txt
8701.txt
8702.txt
8703.txt
8704.txt
8705.txt
8706.txt
8707.txt
8708.txt
8709.txt
8710.txt
8779.txt
8780.txt
8781.txt
8782.txt
8783.txt
8784.txt
8785.txt
8786.txt
8787.txt
8788.txt
8789.txt
871.txt
877.txt
8790.txt
8791.txt
8792.txt
8793.txt
8794.txt
8795.txt
8796.txt
8797.txt
8798.txt
8799.txt
8897.txt
81.txt
82.txt
85.txt
88.txt
8911.txt
8941.txt
890.txt
891.txt
892.txt
893.txt
894.txt
895.txt
897.txt
9097.txt
908.txt
909.txt
9197.txt
9198.txt
910.txt
917.txt
921.txt
9301.txt
9312.txt
9315.txt
9316.txt
9317.txt
9318.txt
9319.txt
9320.txt
9382.txt
9443.txt
9444.txt
9445.txt
9446.txt
9447.txt
9448.txt
9449.txt
9450.txt
9456.txt
9459.txt
9465.txt
9479.txt
9499.txt
940.txt
947.txt
9500.txt
9548.txt
9625.txt
9626.txt
9627.txt
9628.txt
9658.txt
964.txt
967.txt
968.txt
9755.txt
9761.txt
9762.txt
9774.txt
973.txt
974.txt
980.txt
985.txt
986.t

In [27]:
# Step 2: classify each etext

# keep these separate at first so we can minimize bias in the training set
# we ultimately want to have a decent gender balance in the training set to minimize bias
# it doesn't matter for testing set, as testing won't directly influence the model
male_etexts = []
female_etexts = []
ambiguous_etexts = []

# read in the lists of male and female names
female_names_txt = open('female.txt', 'r').read()
male_names_txt = open('male.txt', 'r').read()

In [28]:
# function to get a list of names from the name files
def get_names(names):
    index1 = names.rfind('#')
    index2 = names.find('\n', index1)
    
    # slice out the header
    no_header = names[index2:]
   
    # tokenized along whitespace to create a list of names
    return no_header.split()

In [29]:
female_names = get_names(female_names_txt)
male_names = get_names(male_names_txt)

In [30]:
# function to classify a given uncleaned etext by the author's first name
def classify_etext(etext):
    index1 = etext.find('Author: ')
    index2 = index1 + 8
    index3 = etext.find(' ', index2)
    name = etext[index2: index3]
    print(name)
    
    if name in female_names and not name in male_names or name == 'Mrs.':
        # classify as female
        female_etexts.append([etext, 0])
    elif name in male_names and not name in female_names:
        # classify as male
        male_etexts.append([etext, 1])
    else:
        # classify as ambiguous
        ambiguous_etexts.append(etext)
        
    return


# function to reset male_etexts, female_etexts, and ambiguous_etexts
# in case run classify_etext(...) more than once with a given etext
# basically, this is just for development purposes
def reset_classifications():
    male_etexts[:] = []
    female_etexts[:] = []
    ambiguous_etexts[:] = []
    
    return

In [31]:
# classify all the etexts
reset_classifications()
for etext in etexts:
    classify_etext(etext)

Lucius
William
Mary
Lindsay,
George
Giovanni
Stewart
S.
William
Unknown

Release
John
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Dante
Dante
Dante
Edited
Ada
A.
Charles
F.
Fannie

Harold
ject
Magnay,
William
Edgar
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Edith
Elizabeth
Aphra
Charles
James
E.R.
Joseph
William
H.
John
Various

Release
Janet
Margaret
Margaret
Margaret
Margaret
Anonymous

Release
Thomas
Henry
Jeffery
James
Max
Burton
Grace
Joseph
Edward
S.
Elizabeth
Various

Release
Venture
Harold
Various

Release
Douglas
Various

Release
Harriot
Robert
John
Anna
James
Anonymous

Release
James
Louise
An
Donald
Francesco
Various

Release
Various

Release
George
Burton
Richard
Euripides

Release
Edward
W.T.
Mrs.
John
Hudson,
William
Editor-in-Chief:
Various

Release
Various

Release
Various

Release
Hugh
Anonymous

Releas

John
John
G.
Mabel
Francis
Frances
Nathaniel
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Punch

Release
George
James
John
Various

Release
Charles
Various

Release
John
Captain
George
Mrs.
James
Susan
Charles
ject
Emerson
A
Abigail
Julia
Julia
Robert
George
Various

Release
G.K.
Various

Release
Cassius
Lilian
Joseph
Various

Release
Various

Release
Various

Release
David
Goold
Various

Release
Various

Release
Henry
Various

Release
Jack
Jack
Jack
Jack
Jack
Miles
Sir
Elinor
Harry
Robert
ject
Edmund
Various

Release
Various

Release
Various

Release
Retold
Graham
Joseph
Various

Release
James
H.
Arnold
Gordon
Anthony
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
Various

Release
George
Will
Arthur
Arthur
Mabell
Josephine
Various

Release
Hildegard
George
Jacob
James
Various

Release
Frank
Eugene
Vicente
Richard
Jacob
Kathleen
George
Jacob
Philip
Will


Arthur
The
George
Sam
J.
William
Patrick
Alfred
Guglielmo
Grace
Sophie
Various

Release
A.
Work
F.
Charles
Charles
Jean
Jean
Robert
Frank
Evelyn
F.
Pansy
Erskine
Henry
Victoria
Arthur
John
Various

Release
Various

Release
F.
Various

Release
J.
Gertrude
John
William
Donald
Donald
Various

Release
Various

Release
Emma
James
Honore
Justus
Anonymous

Release
Louisiana
Epiphanius
Various

Release
Various

Release
Samuel
Lady
Ethel
John
Booth
Frederick
Clara
Gertrude
R.
Various

Release
Margaret
Various

Release
Marion
Robert
Mrs
Harriette
May
Arnold
Charles
L.
James
Various

Release
Various

Release
A.
Nicholai
Auguste
Various

Release
A.
Various

Release
Clarence
James
Percy
Various

Release
Susannah
Charles
Anicius
David
Robert
Charles
Francis
Oscar
Charles
William
Various

Release
Mary
J.
Various

Release
Maurice
F.
Hubert
Luther
Alexis
Ambrose
T.
Mrs.
F.
Isaac
G.
Ada
Anonymous

Release
Margaret
Arthur
Various

Release
Annie
Samuel
Thomas
Variou

Jacob
Samuel
John
Edward
Nannie
Richard
Henry
C.
Mary
Natalie
Spenser
Amy
J.
Mrs.
Lunsford
Various

Release
Edward
Various

Release
Annie
John
Robert
W.A.P.
Joseph
C.
Jacob
James
L.
William
Stella
William
Joseph
Beatrix
Meredith
Various

Release
George
Various

Release
Various

Release
Various

Release
Edric
Isabella
Jackson
L.
Dutton
W.B.
Hugo
Walter
Elizabeth
Department
Emerson
Dietlof
James
George
Various

Release
Edmund
Various

Release
H.
Thornton
Laura
Hetty
William
Charles
Hannah
Kate
F.
Henry
Frank
Douglass
Alice
Charlotte
Charlotte
Charlotte
Ed.
S.
Various

Release
F.
Captain
George
Benjamin
P.
Bret
Various

Release
Coningsby
Maria
Various

Release
Various

Release
Edmund
John
Plato

Release
Samuel
Ruth
Thomas
William
Jean
Jane
H.
John
Various

Release
Helen
H.
Lucien
Clara
W.
Clara
J.
Various

Release
Mrs.
Francis
Ida
Jane
Walter
F.
Philip
Unknown

Release
Mrs.
A.
Ithamar
Alice
Various

Release
William
Beatrix
Mary
Marcus
Carolyn
Joan
Maria
C

Anthony
Edward
Grant
G.
Dillon
Timothy
William
Honore
Honore
Jack
Margaret
R.
Various

Release
Caradoc
Ibn
John-Stuart
Charles
W.
Susanna
William
H.
Anonymous

Release
ject
Charles
Hesba
Edward
John
Martin
George
August
F.
C.
Arabella
John
C.
Henry
Padriac
W.
Edward
William
B.
Charles
Alpheus
Various

Release
Isaac
James
Sir
Samuel
Alexander
Frank
Frank
Mme
William
Arthur
Charles
Anna
Edward
Edward
Mosnar
Hugh
Anna
Richard
Bettina
Mary
James
James
James
A.
A.
Henry
Clive
Susan
Francis
Charles
Various

Release
E.
Skookum
Caius
James
Ring
Mary
Charles
(Mrs.)
Ronald
P.
Richard
John
J.
Baha'u'llah

Release
Baha'u'llah

Release
Baha'u'llah

Release
George
David
Kathleen
Kate
John
W.
Christina
Richard
Various

Editor:
Mary
ject
Laura
R.
Hugh
Joseph
Charles
H.
Maria
R.
E.
A.
Charles
Mabel
E.
Various

Release
Algernon
Enos
Walter
James
Anonymous



Release
John
John
Arthur
Mary
Baha'u'llah

Release
Baha'u'llah

Release
Baha'u'llah

Release
Baha'u'llah

Release
Wil

ject
Various

Editor:
G.
Anthony
Henry
John
Irving
Ethel
Anthony
Elizabeth
R.
George
Edmund
Joseph
F.
Arthur
Various

Commentator:
Frank
David
Arthur
Henrik
Charles
S.
Charles
Various

Editor:
Arthur
Various

Editor:
Dallas
Eleanor
L.
Thomas
Robert
Arthur
Algernon
Norman
Joseph
Lawrence
W.
Oliver
Andrew
Edward
Florence
Anonymous



Release
William
William
Baron
Henry
George
Laura
Anonymous



Release
Various



Release
James
Maurus
Maisie
Various

Editor:
Henry
Alice
Montague
Murray
Honore
Honore
Various

Editor:
Thomas
James
Charles
Algernon
Anatole
Robert
Maurice
Aesop

Illustrator:
Various

Editor:
Florence
Maria
Theodore
Alta
Wilfrid
Various

Editor:
Various

Editor:
Beverly
A.
Sir
Friedrich
Carey
Bruce
Thomas
Maria
C.
Allen
T.
Ernest
Anna
Budgett
Various

Editor:
John
Francis
Lester
Bronson
Samuel
Various

Editor:
Joseph
Various

Editor:
Various

Editor:
Lucy
Allen
Francis
Sir
Christopher
Algernon
Various

Release
Carey
Paul
Grant
Max
Marg

Edward
Hesba
Logan
Frederick
Leslie
John
Ernest
Caroline
Edmund
Lorin
William
Frank
Agnes
Jessie
Jessie
Jessie
Jessie
J.
J.G.
William
Charles
Cheiro

Release
Various

Release
Albert
Pierce
J.
Alice
John
J.
William
Rosa
Harriet
Arthur
Wilhelm
Allan
Alan
John
United
Lydia
New
H.
William
George
Anonymous

Illustrator:
Elinor
Orison
Cyrus
Zona
George
Henry
New
James
Ernest
Glance
Clotilda
W.C.C.
Richard
Hawley
Daniel
Daniel
Various

Editor:
P.
P.
William
Hurlo
Eliza
Various

Editor:
Edward
J.
Samuel
James
George
Arthur
Lenore
Grace
Charles
Raymond
Various

Compiler:
Robert
Edmond
Ministry
Samuel
John
Forbes
Fanny
Charles
Various

Release
Virginia
Henry
Arthur
Marguerite
Unknown

Illustrator:
Francis
Clovis
Charles
Victor
R.V.
George
Thomas
Various

Editor:
Morris
Roger
Frank
Edith
Josef
Major
William
Martin
E.
Arthur
Roger
Ellis
John
An
Ralph
J.
Josephine
Edward
Various

Editor:
Various

Editor:
Sewell
Sewell
Sewell
Sewell
William
Samuel
Ferdinand
Joseph
Joseph
Israel
Nell
Stephen
Ellis
Ab

William
William
William
William
William
William
W.H.G.
W.H.G.
W.H.G.
William
W.H.G.
George
Erckmann-Chatrian

Translator:
Anton
Ivan
Prosper
Alexander
A.
Honore
Arthur
Johann
Francois
Various

Editor:
Randolph
W.H.G.
W.H.G.
W.H.G.
W.H.G.
W.H.G.
W.H.G.
W.H.G.
W.H.G.
Milton
Various

Editor:
Julia
Robert
Charles
Allan
Various

Release
J.
Shelby
Henry
Augustus
George
Horace
Roger
Evelyn
Caroline
Margaret
Horatio
James
Amy
Handley
Amy
Harriet
Alice
Richard
Harry
Emily
Emily
Emily
Emily
Thomas
Mrs.
Mrs
Frederic
Dean
George
Mayne
Bret
Honore
Amy
Harriet
Grace
Peter
Frederick
Frederick
Mayne
John
Harry
Mayne
Carroll
E.
Frank
Will
Louis
John
Walter
Horace
J.
Harriet
Larry
William
Raphael
Fritz
Rudyard
Fritz
J.
Bayard
Charles
W.
Fitz-James
Fitz
Edwin
Ambrose
Alfred
Hjalmar
Frank
Elia
W.
Lorimer
Max
A.
James
J.
Edmund
Edmund
Helen
Emily
William
W.H.G.
W.H.G.
W.H.G.
Edwin
Mayne
Guy
YAM

Release
Christopher
Dallas
Gordon
Various

Release
Various

Editor:
Various

Editor:
Helen
Thoma

Wilmarth
Frances
Oliver
Various

Release
Unknown

Release
Everett
Queen
Henry
Dion
Mrs.
Stephen
Various

Editor:
Baroness
Charles
Gerry
Holman
Lady
Unknown

Release
William
Various

Editor:
Irvin
George
Nat
Louis
Louis
Louis
William
Harold
Jennie
W.H.G.
George
Mayne
Edward
Caroline
Edward
Charles
George
Sarah
Anonymous

Release
Laura
Laura
Laura
Joseph
Jacob
Wilfred
Anonymous

Release
Louis
Louis
Louis
Louis
Carolyn
Mary
Ernest
Unknown

Release
Various

Release
Prince
W.
Charles
Sara
Harry
Samuel
George
Allen
Richard
Oliver
National
ject
Edmond
George
Arthur
Lucy
Lucy
Lucy
Lucy
Lucy
Lucy
Elizabeth
Morgan
Arthur
Various

Editor:
Henry
Jacques
Various

Release
William
George
Sherard
Various

Editor:
Martin
Unknown

Release
Louis
Louis
Paul
Mrs.
Various

Contributor:
John
Various

Release
Nell
Various

Editor:
John
Emerson
George
Walter
Emilie
S.
Charles
John
George
Edgar
Emily
Jeffery
Charles
George
Al
Therese
Edgar
Bret
Charles
Horace
Thomas
W.
Edgar
Rufu

Ralph
Ralph
Ralph
Ralph
Various

Release
Ralph
Ralph
Ralph
Oliver
Abraham
John
William
Joseph
Robert
Harold
Eleanor
Joseph
Joseph
Ralph
Ralph
Albert
Mrs.
George
Walter
Theodore
Theodore
Edward
Eugene
Mary
Thomas
Walter
Richard
Rebecca
[AKA
[AKA
Zitkala-Sa

Release
[AKA
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
Eugene
The
The
The
The
The
The
The
The
The
The
The
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
William
Sophocles

Translator:
Charlotte
H.
William
William
Robert
Anthony
[AKA
Robert
Robert
Bram
Unknown

Release
Homer
Samuel
Albert
Mary
Ouida
Ralph
William
Thomas
E.
James
Richard
Mary
Watkin
Watkin
James
Edna
W.
Edna
John
Arthur
Arthur
William
Richard
Count
Constant

Release
Michel
Michel
Michel
Michel
Michel
Michel
Mic

Josiah
Josiah
Josiah
Josiah
Josiah
Josiah
Florence
Alfred
Arthur
Eugenie
Emile
James
Robert
Will
Mary
Jennie
Jennie
Jennie
Jennie
Howard
Charles
Charles
Edward
Edward
Edward
Edward
Howard
Joseph
Booth
Leo
Leo
Jessie
Thomas
John
William
Gordon
Edgar
Edgar
Willa
Edgar
Edwin
Charles
Frederick


In [32]:
print(len(male_etexts), len(female_etexts), len(ambiguous_etexts))

6403 1892 5472


In [33]:
print(len(etexts))

13767


In [34]:
# Step 3: clean the etexts

# function to cut out the publishing and legal info at the beginning and end of the etexts
def strip_headers(etext):
    
    # formatting isn't always identical, and I don't want to have to deal with all the possible formatting edge cases
    try:
        index1 = etext.index('*** START OF THIS PROJECT GUTENBERG EBOOK') + 3
    except:
        return None
    
    # index2 is the index just past the junk at the beginning of the etext
    try:
        index2 = etext.index('***', index1) + 3
    except:
        return None
    
    # index3 is the index that marks the junk at the end of the etext
    try:
        index3 = etext.index("*** END OF THIS PROJECT GUTENBERG EBOOK")
    except:
        return None
    
    return etext[index2:index3]


# used to cut down a passage to just lowercase letters, hyphens (for compound words), and spaces
def clean(etext):
    # strip the header and footer from the etext
    stripped_etext = strip_headers(etext)
    
    if stripped_etext == None:
        return None
    
    valids = ''

    for character in stripped_etext:
        # might not be necessary
        if character == '\n':
            character = ' '
    
        # definitely necessary
        # TODO: might need to handle cases of accented characters, as str.isalpha() doesn't handle them
        # TODO: might need to include apostrophes (for possessive nouns and contractions),
        # however, some texts likely use single quotes for quotes, so would likely need to include double quotes
        # TODO: maybe handle ! and ?
        if character.isalpha() or character == ' ' or character == '-':
            # check if preceding character is a space
            # if it is a space, no double or triple or n-tuple spaces
            # and, if the preceding character is a space, the length of valids will necessarily be greater than 0
            if len(valids) > 0 and character == ' ':
                if valids[len(valids) - 1] != ' ':
                    valids += character
            else:
                valids += character
      
    return valids.lower()

# used to reset clean_xxxx_etexts in case you clean all of them several times
def reset_cleanings():
    cleaned_male_etexts[:] = []
    cleaned_female_etexts[:] = []
    cleaned_ambiguous_etexts[:] = []
    
    return

In [35]:
print(clean(etexts[0]))

 produced by ted garvin ben courtney and pg distributed proofreaders seneca apocolocyntosis with an english translation by whd rouse ma litt d mcmxx introduction this piece is ascribed to seneca by ancient tradition it is impossible to prove that it is his and impossible to prove that it is not the matter will probably continue to be decided by every one according to his view of senecas character and abilities in the matters of style and of sentiment much may be said on both sides dion cassius lx says that seneca composed an greek apokolokuntosis or pumpkinification of claudius after his death the title being a parody of the usual greek apotheosis but this title is not given in the mss of the ludus de morte claudii nor is there anything in the piece which suits the title very well as a literary form the piece belongs to the class called satura menippea a satiric medley in prose and verse this text is that of buecheler with a few trifling changes which are indicated in the notes we have

In [36]:
cleaned_male_etexts = []
cleaned_female_etexts = []
cleaned_ambiguous_etexts = []

In [37]:
# so we don't keep on appending to the same list when testing several times through
reset_cleanings()

i = 0
for etext, gender in male_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_male_etexts.append([cleaned_etext, gender])
        

print('\nMALE AUTHORS COMPLETE\n\n')

i = 0
for etext, gender in female_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(female_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_female_etexts.append([cleaned_etext, gender])

print('\nFEMALE AUTHORS COMPLETE\n\n')

'''
i = 0
for etext in ambiguous_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(ambiguous_etexts), '% complete')
    
    cleaned_etext = clean(etext)
    if cleaned_etext != None:
        cleaned_ambiguous_etexts.append(cleaned_etext)
'''


0.15617679212868968 % complete
0.31235358425737936 % complete
0.46853037638606904 % complete
0.6247071685147587 % complete
0.7808839606434483 % complete
0.9370607527721381 % complete
1.0932375449008278 % complete
1.2494143370295174 % complete
1.405591129158207 % complete
1.5617679212868967 % complete
1.7179447134155865 % complete
1.8741215055442761 % complete
2.030298297672966 % complete
2.1864750898016556 % complete
2.3426518819303452 % complete
2.498828674059035 % complete
2.6550054661877245 % complete
2.811182258316414 % complete
2.9673590504451037 % complete
3.1235358425737934 % complete
3.2797126347024834 % complete
3.435889426831173 % complete
3.5920662189598627 % complete
3.7482430110885523 % complete
3.904419803217242 % complete
4.060596595345932 % complete
4.216773387474621 % complete
4.372950179603311 % complete
4.529126971732 % complete
4.6853037638606905 % complete
4.84148055598938 % complete
4.99765734811807 % complete
5.153834140246759 % complete
5.310010932375449 % compl

43.72950179603311 % complete
43.8856785881618 % complete
44.041855380290485 % complete
44.198032172419175 % complete
44.354208964547865 % complete
44.510385756676556 % complete
44.666562548805246 % complete
44.822739340933936 % complete
44.978916133062626 % complete
45.135092925191316 % complete
45.291269717320006 % complete
45.447446509448696 % complete
45.603623301577386 % complete
45.759800093706076 % complete
45.915976885834766 % complete
46.072153677963456 % complete
46.228330470092146 % complete
46.384507262220836 % complete
46.540684054349526 % complete
46.696860846478216 % complete
46.85303763860691 % complete
47.00921443073559 % complete
47.16539122286428 % complete
47.32156801499297 % complete
47.47774480712166 % complete
47.63392159925035 % complete
47.79009839137904 % complete
47.94627518350773 % complete
48.10245197563642 % complete
48.25862876776511 % complete
48.4148055598938 % complete
48.57098235202249 % complete
48.72715914415118 % complete
48.88333593627987 % complet

87.9275339684523 % complete
88.08371076058097 % complete
88.23988755270966 % complete
88.39606434483835 % complete
88.55224113696704 % complete
88.70841792909573 % complete
88.86459472122442 % complete
89.02077151335311 % complete
89.1769483054818 % complete
89.33312509761049 % complete
89.48930188973918 % complete
89.64547868186787 % complete
89.80165547399656 % complete
89.95783226612525 % complete
90.11400905825394 % complete
90.27018585038263 % complete
90.42636264251132 % complete
90.58253943464001 % complete
90.7387162267687 % complete
90.89489301889739 % complete
91.05106981102608 % complete
91.20724660315477 % complete
91.36342339528346 % complete
91.51960018741215 % complete
91.67577697954084 % complete
91.83195377166953 % complete
91.98813056379822 % complete
92.14430735592691 % complete
92.3004841480556 % complete
92.45666094018429 % complete
92.61283773231298 % complete
92.76901452444167 % complete
92.92519131657036 % complete
93.08136810869905 % complete
93.23754490082774 

37.46345029239766 % complete
37.646198830409354 % complete
37.828947368421055 % complete
38.01169590643275 % complete
38.19444444444444 % complete
38.37719298245614 % complete
38.55994152046784 % complete
38.74269005847953 % complete
38.925438596491226 % complete
39.10818713450293 % complete
39.29093567251462 % complete
39.473684210526315 % complete
39.65643274853801 % complete
39.83918128654971 % complete
40.021929824561404 % complete
40.2046783625731 % complete
40.38742690058479 % complete
40.57017543859649 % complete
40.75292397660819 % complete
40.93567251461988 % complete
41.11842105263158 % complete
41.301169590643276 % complete
41.48391812865497 % complete
41.666666666666664 % complete
41.849415204678365 % complete
42.03216374269006 % complete
42.21491228070175 % complete
42.39766081871345 % complete
42.58040935672515 % complete
42.76315789473684 % complete
42.94590643274854 % complete
43.12865497076023 % complete
43.31140350877193 % complete
43.494152046783626 % complete
43.676

89.18128654970761 % complete
89.3640350877193 % complete
89.546783625731 % complete
89.72953216374269 % complete
89.91228070175438 % complete
90.09502923976608 % complete
90.27777777777777 % complete
90.46052631578948 % complete
90.64327485380117 % complete
90.82602339181287 % complete
91.00877192982456 % complete
91.19152046783626 % complete
91.37426900584795 % complete
91.55701754385964 % complete
91.73976608187135 % complete
91.92251461988305 % complete
92.10526315789474 % complete
92.28801169590643 % complete
92.47076023391813 % complete
92.65350877192982 % complete
92.83625730994152 % complete
93.01900584795321 % complete
93.20175438596492 % complete
93.38450292397661 % complete
93.5672514619883 % complete
93.75 % complete
93.9327485380117 % complete
94.11549707602339 % complete
94.29824561403508 % complete
94.48099415204679 % complete
94.66374269005848 % complete
94.84649122807018 % complete
95.02923976608187 % complete
95.21198830409357 % complete
95.39473684210526 % complete
95

In [38]:
print(cleaned_female_etexts[0])



In [39]:
print(len(cleaned_male_etexts), len(cleaned_female_etexts), len(cleaned_ambiguous_etexts))

4974 1333 4382


In [41]:
# Step 4: tokenize the texts

tokenized_male_etexts = []
tokenized_female_etexts = []
tokenized_ambiguous_etexts = []

In [42]:
# used to clear all the contents of the tokenized_xxxx_etexts
def reset_tokenizations():
    tokenized_male_etexts[:] = []
    tokenized_female_etexts[:] = []
    tokenized_ambiguous_etexts[:] = []

In [43]:
# tokenize

i = 0
for etext, gender in cleaned_male_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
        
    tokenized_male_etexts.append([etext.split(), gender])
    
i = 0
for etext, gender in cleaned_female_etexts:
    i += 1
    if i % 10 == 0:
        print(100 * i / len(male_etexts), '% complete')
        
    tokenized_female_etexts.append([etext.split(), gender])

ValueError: too many values to unpack (expected 2)