#### STEPS TO GET DATA
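1.  Download the data by running the following command in the project directory:
    `wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=html&langs[]=de"`
    (Set `filetypes[]` and `langs[]` to the format and language you need; the loading code below expects zip archives that contain `.txt` files.)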
    
2.  Remove the extraneous files by running the following commands (Windows command prompt):
    `del /S *-8.zip`
    `del /S *-0.zip`
    `del /S robots.txt`
    `del /S harvest*`
    
3.  Install the Python dependencies:
    `pip install bsddb3-6.2.6-cp37-cp37m-win_amd64.whl`
    `pip install gutenberg`

In [1]:
# imports
from string import ascii_lowercase # for checking if letters
import numpy as np                 # numpy, duh...
#import glob                        # file reading
import zipfile                     # zipped file reading
import os                          # recursive navigation of file tree
import fnmatch                     # matching file name patterns






# Disabled imports for the `gutenberg` package, kept inside a string literal so
# that they are not executed. Because the literal is the last expression of the
# cell, the notebook echoes it back as the cell's output.
'''
# from gutenberg.acquire import load_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

from gutenberg.query import list_supported_metadatas
'''


'\n# from gutenberg.acquire import load_metadata\nfrom gutenberg.acquire import load_etext\nfrom gutenberg.cleanup import strip_headers\n\nfrom gutenberg.query import get_etexts\nfrom gutenberg.query import get_metadata\n\nfrom gutenberg.query import list_supported_metadatas\n'

#### STEPS TO TAKE IN SETTING UP MODEL

1.  Read in all of the dataset from the files
2.  Classify each work
    *  If author's first name is only in male.txt, then male (ADD TO MALE LIST, WILL COMBINE LATER)
    *  Else if author's first name is only in female.txt, then female (ADD TO FEMALE LIST, WILL COMBINE LATER)
    *  Else, ambiguous (ADD TO AMBIGUOUS LIST, CAN BE USED FOR MANUAL TESTING IF YOU WANT)
3.  Clean each text
    *  Remove headers and footers
    *  Remove every character that is not a letter, digit, space, newline, hyphen, or apostrophe (as used in contractions and possessive forms); optionally keep question marks and exclamation marks as well
    *  Replace newlines with spaces
    *  (if using ? and !, replace with a space plus the mark, so it will count as a new word in tokenization)
    *  Remove double spaces
    *  Convert all capital letters to lowercase
4.  Tokenize each cleaned text
5.  Build a vocabulary (open question: is this step needed?)
6.  Vectorize each tokenized text

In [18]:
# Step 1: read in all of the .txt files

def read_zipped_etexts(root_dir, verbose=True):
    """Collect the text of every file stored in the zip archives under root_dir.

    The directory tree is walked recursively; every ``*.zip`` file found is
    opened and each member file is read and decoded to ``str``.

    Args:
        root_dir: directory to search recursively for zip archives.
        verbose: when true, print the name of each archive member as it is
            read (this is what the original cell did unconditionally).

    Returns:
        A list with one decoded string per archive member, in the order the
        members were encountered. Empty if ``root_dir`` does not exist or
        contains no zip archives.
    """
    texts = []

    # recursively navigate the directory containing all the zipped etexts
    for path, dirs, files in os.walk(root_dir):

        # find all the zip folders
        for zip_name in fnmatch.filter(files, '*.zip'):
            zip_path = os.path.abspath(os.path.join(path, zip_name))

            # the context manager closes the archive handle even on error
            with zipfile.ZipFile(zip_path, 'r') as archive:
                for txt_name in archive.namelist():
                    # directory entries carry no text of their own
                    if txt_name.endswith('/'):
                        continue
                    if verbose:
                        print(txt_name)

                    raw = archive.read(txt_name)
                    # archive.read() returns bytes; str(raw) would store the
                    # repr "b'...'" with escaped newlines instead of the text.
                    # NOTE(review): the archives' encoding is not recorded
                    # anywhere visible; UTF-8 (a superset of ASCII) is tried
                    # first and Latin-1, which can decode any byte sequence,
                    # is the fallback -- confirm against the actual corpus.
                    try:
                        text = raw.decode('utf-8')
                    except UnicodeDecodeError:
                        text = raw.decode('latin-1')
                    texts.append(text)

    return texts


# holds all the strings of the etexts, and that's all
etexts = read_zipped_etexts('./aleph.gutenberg.org')
            
        

10001.txt
10002.txt
10003.txt
10004.txt
10005.txt
10006.txt
10008.txt
10009.txt
10010.txt
10011.txt
10012.txt
10013.txt
10014.txt
10015.txt
10016.txt
10017.txt
10018.txt
10019.txt
1005.txt
1006.txt
1007.txt
10020.txt
10021.txt
10022/10022.txt
10023.txt
10024.txt
10025.txt
10026.txt
10027.txt
10028.txt
10029.txt
10030.txt
10031.txt
10032.txt
10033.txt
10034.txt
10035.txt
10036.txt
10037.txt
10038.txt
10039.txt
10040.txt
10041.txt
10042.txt
10043.txt
10044/10044.txt
10045.txt
10046.txt
10047.txt
10048.txt
10049.txt
10050.txt
10051.txt
10052.txt
10056.txt
10057/10057.txt
10058/10058.txt
10059.txt
10060.txt
10062.txt
10063.txt
10064.txt
10065.txt
10066/10066.txt
10067.txt
10068.txt
10069.txt
10070.txt
10071.txt
10072.txt
10074.txt
10075.txt
10076.txt
10077.txt
10078.txt
10079.txt
10080.txt
10081.txt
10082.txt
10083.txt
10084.txt
10085/10085.txt
10086.txt
10087.txt
10088.txt
10089.txt
10090.txt
10091/10091.txt
10092/10092.txt
10093.txt
10094.txt
10095.txt
10096/10096.txt
10097/10097.txt
100

10868/10868.txt
10869.txt
10870/10870.txt
10871/10871.txt
10872/10872.txt
10873/10873.txt
10874/10874.txt
10875/10875.txt
10876.txt
10877.txt
10878/10878.txt
10879/10879.txt
10880/10880.txt
10881/10881.txt
10882/10882.txt
10883/10883.txt
10884/10884.txt
10885/10885.txt
10886.txt
10887/10887.txt
10888/10888.txt
10889/10889.txt
10890/10890.txt
10891.txt
10892/10892.txt
10893/10893.txt
10894/10894.txt
10895/10895.txt
10896/10896.txt
10897.txt
10898.txt
10899.txt
10901.txt
10902.txt
10903.txt
10904.txt
10905.txt
10907.txt
10908.txt
10910.txt
10911.txt
10912.txt
10913.txt
10915.txt
10916.txt
10918.txt
10919.txt
1093.txt
1096.txt
1097.txt
10920.txt
10921.txt
10922.txt
10923.txt
10924.txt
10925.txt
10926.txt
10928.txt
10929.txt
10930.txt
10931.txt
10932.txt
10933.txt
10934.txt
10935.txt
10936.txt
10937.txt
10938.txt
10939.txt
10940.txt
10942.txt
10943.txt
10944.txt
10945.txt
10946.txt
10947.txt
10948.txt
10949.txt
10950.txt
10951.txt
10952.txt
10954.txt
10955.txt
10956.txt
10957.txt
10958.txt

11900.txt
11901.txt
11902.txt
11903.txt
11904.txt
11906.txt
11907.txt
11908.txt
11909.txt
11910.txt
11911.txt
11912.txt
11913.txt
11915.txt
11917.txt
11918.txt
11919.txt
1190.txt
1191.txt
1194.txt
1195.txt
1196.txt
11920.txt
11921.txt
11922.txt
11923.txt
11924.txt
11926.txt
11929.txt
11930.txt
11931.txt
11932.txt
11933.txt
11934.txt
11935.txt
11936.txt
11937.txt
11938.txt
11939.txt
11941.txt
11942.txt
11943.txt
11944.txt
11945.txt
11946.txt
11947.txt
11948.txt
11949.txt
11950.txt
11951.txt
11952.txt
11953.txt
11954.txt
11955.txt
11956.txt
11957.txt
11958.txt
11959.txt
11960.txt
11961.txt
11962.txt
11963.txt
11965.txt
11966.txt
11969.txt
11970.txt
11971.txt
11972.txt
11973.txt
11974.txt
11975.txt
11977.txt
11978.txt
11979.txt
11980.txt
11981.txt
11982.txt
11983.txt
11984.txt
11985.txt
11986.txt
11987.txt
11988.txt
11989.txt
11990.txt
11991.txt
11992.txt
11993.txt
11994.txt
11995.txt
11996.txt
11997.txt
11998.txt
11999.txt
11.txt
12.txt
13.txt
16.txt
17.txt
19.txt
12000.txt
12001.txt
120

12879.txt
12880.txt
12881.txt
12882.txt
12883.txt
12884.txt
12886.txt
12887.txt
12888.txt
12890.txt
12891.txt
12892.txt
12894.txt
12895.txt
12896.txt
12897.txt
12898.txt
12899.txt
12900.txt
12901.txt
12902.txt
12903.txt
12904.txt
12905.txt
12908.txt
12909.txt
12910.txt
12911.txt
12912.txt
12913.txt
12914.txt
12915.txt
12916.txt
12917.txt
12918.txt
12919.txt
1291.txt
1294.txt
1295.txt
1296.txt
12920.txt
12922.txt
12923.txt
12924.txt
12925.txt
12926.txt
12928.txt
12929.txt
12930.txt
12931.txt
12932.txt
12933.txt
12934.txt
12935.txt
12936.txt
12937.txt
12938.txt
12939.txt
12940.txt
12941.txt
12942.txt
12943.txt
12944.txt
12945.txt
12946.txt
12947.txt
12948.txt
12951.txt
12952.txt
12953.txt
12954.txt
12955.txt
12956.txt
12957.txt
12958.txt
12959.txt
12960.txt
12961.txt
12963.txt
12964.txt
12965.txt
12966.txt
12967.txt
12968.txt
12970.txt
12971.txt
12972.txt
12973.txt
12974.txt
12975.txt
12976.txt
12977.txt
12978.txt
12980.txt
12981.txt
12982.txt
12983.txt
12984.txt
12985.txt
12986.txt
1298

13903.txt
13905.txt
13906.txt
13907.txt
13908.txt
13909.txt
13910.txt
13911.txt
13912.txt
13913.txt
13916.txt
13918.txt
1392.txt
1394.txt
1395.txt
1396.txt
1397.txt
1399.txt
13922.txt
13923.txt
13924.txt
13925.txt
13926.txt
13927.txt
13928.txt
13929.txt
13930.txt
13931.txt
13932.txt
13933.txt
13934.txt
13935.txt
13936.txt
13937.txt
13939.txt
13940.txt
13941.txt
13942.txt
13943.txt
13944.txt
13945.txt
13946.txt
13954.txt
13955.txt
13956.txt
13957.txt
13958.txt
13960.txt
13961.txt
13962.txt
13963.txt
13964.txt
13966.txt
13967.txt
13968.txt
13969.txt
13970.txt
13971.txt
13972.txt
13973.txt
13974.txt
13977.txt
13978.txt
13979.txt
13980.txt
13982.txt
13983.txt
13984.txt
13985.txt
13986.txt
13987.txt
13988.txt
13989.txt
13990.txt
13992.txt
13993.txt
13994.txt
13995.txt
13996.txt
13997.txt
13998.txt
14000.txt
14002.txt
14003.txt
14004.txt
14005.txt
14006.txt
14007.txt
14008.txt
14009.txt
14010.txt
14011.txt
14012.txt
14013.txt
14014.txt
14015.txt
14016.txt
14017.txt
14018.txt
14019.txt
1403.t

14954.txt
14955.txt
14957.txt
14958.txt
14959.txt
14960.txt
14961.txt
14963.txt
14964.txt
14965.txt
14966.txt
14967.txt
14968.txt
14969.txt
14970.txt
14971.txt
14972.txt
14973.txt
14974.txt
14975.txt
14976.txt
14977.txt
14978.txt
14979.txt
14980.txt
14981.txt
14984.txt
14985.txt
14986.txt
14987.txt
14988.txt
14989.txt
14990.txt
14991.txt
14992.txt
14993.txt
14994.txt
14996.txt
14998.txt
14999.txt
15000.txt
15001.txt
15002.txt
15003.txt
15004.txt
15005.txt
15006.txt
15007.txt
15011.txt
15012.txt
15013.txt
15014.txt
15015.txt
15016.txt
15017.txt
15018.txt
15019.txt
15020.txt
15021.txt
15022.txt
15025.txt
15026.txt
15029.txt
15030.txt
15031.txt
15033.txt
15034.txt
15035.txt
15036.txt
15040.txt
15041.txt
15042.txt
15043.txt
15044.txt
15045.txt
15049.txt
15050.txt
15051.txt
15052.txt
15053.txt
15055.txt
15063.txt
15064.txt
15065.txt
15067.txt
15069.txt
15072.txt
15073.txt
15074.txt
15076.txt
15077.txt
15078.txt
15079.txt
15080.txt
15081.txt
15082.txt
15084.txt
15086.txt
15087.txt
15088.txt


15999.txt
16000.txt
16001.txt
16002.txt
16003.txt
16004.txt
16005.txt
16006.txt
16007.txt
16008.txt
16009.txt
16010.txt
16011.txt
16012.txt
16013.txt
16014.txt
16015.txt
16016.txt
16017.txt
16018.txt
16019.txt
1606.txt
16024.txt
16025.txt
16026.txt
16027.txt
16028.txt
16029.txt
16030.txt
16031.txt
16032.txt
16033.txt
16035.txt
16036.txt
16037.txt
16038.txt
16039.txt
16040.txt
16041.txt
16042.txt
16044.txt
16046.txt
16047.txt
16048.txt
16049.txt
16050.txt
16051.txt
16052.txt
16053.txt
16054.txt
16055.txt
16056.txt
16057.txt
16058.txt
16059.txt
16060.txt
16064.txt
16065.txt
16070.txt
16073.txt
16074.txt
16076.txt
16077.txt
16078.txt
16079.txt
16080.txt
16081.txt
16083.txt
16084.txt
16085.txt
16086.txt
16087.txt
16088.txt
16089.txt
16090.txt
16091.txt
16092.txt
16093.txt
16094.txt
16095.txt
16096.txt
16097.txt
16098.txt
16099.txt
16100.txt
16101.txt
16103.txt
16104.txt
16106.txt
16107.txt
16108.txt
16112.txt
16113.txt
16114.txt
16115.txt
16116.txt
16118.txt
16119.txt
1611.txt
16121.txt
16

17060.txt
17061.txt
17062.txt
17063.txt
17064.txt
17065.txt
17066.txt
17067.txt
17068.txt
17069.txt
17071.txt
17074.txt
17075.txt
17081.txt
17083.txt
17084.txt
17085.txt
17086.txt
17087.txt
17088.txt
17089.txt
17090.txt
17091.txt
17093.txt
17094.txt
17095.txt
17096.txt
17097.txt
17099.txt
17100.txt
17102.txt
17103.txt
17104.txt
17107.txt
17108.txt
17109.txt
17110.txt
17111.txt
17112.txt
17113.txt
17115.txt
17117.txt
17118.txt
17119.txt
1710.txt
1712.txt
1713.txt
1714.txt
1715.txt
1718.txt
17120.txt
17122.txt
17124.txt
17125.txt
17126.txt
17128.txt
17129.txt
17131.txt
17132.txt
17133.txt
17134.txt
17135.txt
17136.txt
17137.txt
17138/17138.txt
17141.txt
17144.txt
17145.txt
17146.txt
17147.txt
17148.txt
17149.txt
17150.txt
17151.txt
17152.txt
17154.txt
17155.txt
17156.txt
17157.txt
17158.txt
17159.txt
17160.txt
17162.txt
17163.txt
17164.txt
17165.txt
17166.txt
17167.txt
17168.txt
17170.txt
17171.txt
17172.txt
17173.txt
17174.txt
17175.txt
17176.txt
17178.txt
17180.txt
17181.txt
17182.txt


18277.txt
18278.txt
18279.txt
18280.txt
18281.txt
18283.txt
18284.txt
18285.txt
18286.txt
18287.txt
18288.txt
18290.txt
18292.txt
18293.txt
18297.txt
18298.txt
18299.txt
18300.txt
18304.txt
18307.txt
18309.txt
18310.txt
18314.txt
18315.txt
18316.txt
18318.txt
18320.txt
18323.txt
18324.txt
18325.txt
18327.txt
18328.txt
18329.txt
18332.txt
18333.txt
18334.txt
18335.txt
18336.txt
18337.txt
18338.txt
18341.txt
18342.txt
18343.txt
18344.txt
18345.txt
18346.txt
18347.txt
18349.txt
18350.txt
18351.txt
18352.txt
18355.txt
18356.txt
18357.txt
18359.txt
18360.txt
18361.txt
18362.txt
18364.txt
18366.txt
18369.txt
18370.txt
18371.txt
18372.txt
18373.txt
18374.txt
18375.txt
18376.txt
18377.txt
18378.txt
18379.txt
18380.txt
18382.txt
18383.txt
18384.txt
18385.txt
18387.txt
18388.txt
18390.txt
18391.txt
18392.txt
18394.txt
18395.txt
18396.txt
18397.txt
18398.txt
18399.txt
18400.txt
18405.txt
18408.txt
18409.txt
18410.txt
18413.txt
18414.txt
18417.txt
18418.txt
18419.txt
1840.txt
1841.txt
1848.txt
184

19343.txt
19346.txt
19347.txt
19348.txt
19349.txt
19350.txt
19351.txt
19352.txt
19353.txt
19354.txt
19355.txt
19356.txt
19357.txt
19358.txt
19359.txt
19360.txt
19361.txt
19363.txt
19364.txt
19365.txt
19366.txt
19367.txt
19368.txt
19369.txt
19370.txt
19371.txt
19373.txt
19376.txt
19377.txt
19378.txt
19379.txt
19381.txt
19382.txt
19383.txt
19384.txt
19385.txt
19386.txt
19387.txt
19388.txt
19389.txt
19390.txt
19391.txt
19392.txt
19393.txt
19394.txt
19395.txt
19396.txt
19397.txt
19398.txt
19399.txt
19400.txt
19401.txt
19402.txt
19403.txt
19404.txt
19405.txt
19406.txt
19407.txt
19408.txt
19409.txt
19410.txt
19411.txt
19412.txt
19413.txt
19414.txt
19415.txt
19416.txt
19417.txt
19418.txt
19419.txt
1940.txt
1941.txt
1942.txt
1943.txt
1944.txt
1948.txt
19420.txt
19421.txt
19422.txt
19423.txt
19424.txt
19425.txt
19432.txt
19434.txt
19435.txt
19436.txt
19438.txt
19441.txt
19443.txt
19444.txt
19445.txt
19446.txt
19448.txt
19449.txt
19450.txt
19451.txt
19452.txt
19453.txt
19457.txt
19458.txt
19459.

20452.txt
20453.txt
20455.txt
20456.txt
20458.txt
20459.txt
20460.txt
20461.txt
20462.txt
20463.txt
20464.txt
20467.txt
20468.txt
20469.txt
20470.txt
20471.txt
20472.txt
20473.txt
20474.txt
20475.txt
20476.txt
20477.txt
20478.txt
20480.txt
20481.txt
20483.txt
20484.txt
20485.txt
20486.txt
20487.txt
20488.txt
20489.txt
20491.txt
20492.txt
20493.txt
20494.txt
20496.txt
20497.txt
20499.txt
20500.txt
20503.txt
20504.txt
20505.txt
20506.txt
20509.txt
20510.txt
20511.txt
20512.txt
20513.txt
20515.txt
20516.txt
20519.txt
20520.txt
20521.txt
20522.txt
20523.txt
20524.txt
20525.txt
20526.txt
20527.txt
20528.txt
20529.txt
2051.txt
2052.txt
20531.txt
20532.txt
20533.txt
20534.txt
20535.txt
20537.txt
20538.txt
20539.txt
20540.txt
20541.txt
20543.txt
20544.txt
20546.txt
20547.txt
20548.txt
20549.txt
20551.txt
20552.txt
20553.txt
20555.txt
20556.txt
20557.txt
20558.txt
20559.txt
20560.txt
20561.txt
20563.txt
20565.txt
20566.txt
20567.txt
20569.txt
20571.txt
20572.txt
20575.txt
20576.txt
20578.txt
20

21501.txt
21502.txt
21503.txt
21504.txt
21505.txt
21506.txt
21507.txt
21508.txt
21509.txt
21510.txt
21511.txt
21512.txt
21513.txt
21515.txt
21516.txt
21528.txt
21529.txt
2151.txt
2155.txt
2156.txt
2158.txt
21530.txt
21531.txt
21532.txt
21533.txt
21534.txt
21536.txt
21537.txt
21538.txt
21539.txt
21540.txt
21541.txt
21542.txt
21543.txt
21546.txt
21547.txt
21549.txt
21550.txt
21551.txt
21552.txt
21553.txt
21554.txt
21555.txt
21556.txt
21557.txt
21558.txt
21559.txt
21560.txt
21561.txt
21562.txt
21566.txt
21568.txt
21569.txt
21570.txt
21571.txt
21572.txt
21573.txt
21574.txt
21575.txt
21576.txt
21577.txt
21578.txt
21579.txt
21580.txt
21582.txt
21583.txt
21590.txt
21591.txt
21592.txt
21594.txt
21595.txt
21596.txt
21597.txt
21598.txt
21599.txt
21600.txt
21607.txt
21608.txt
21609.txt
21610.txt
21611.txt
21612.txt
21613.txt
21614.txt
21615.txt
21616.txt
21617.txt
21618.txt
21619.txt
21620.txt
21621.txt
21622.txt
21623.txt
21624.txt
21625.txt
21626.txt
21627.txt
21628.txt
21629.txt
2163.txt
2165.

22544.txt
22545.txt
22546.txt
22547.txt
22549.txt
22550.txt
22553.txt
22554.txt
22557.txt
22559.txt
22560.txt
22561.txt
22563.txt
22564.txt
22565.txt
22566.txt
22567.txt
22568.txt
22569.txt
22571.txt
22573.txt
22574.txt
22576.txt
22577.txt
22578.txt
22579.txt
22581.txt
22582.txt
22583.txt
22584.txt
22585.txt
22586.txt
22587.txt
22588.txt
22589.txt
22590.txt
22591.txt
22593.txt
22594.txt
22595.txt
22596.txt
22597.txt
22598.txt
22599.txt
22600.txt
22601.txt
22602.txt
22603.txt
22604.txt
22605.txt
22606.txt
22607.txt
22608.txt
22609.txt
22610.txt
22611.txt
22612.txt
22617.txt
22619.txt
22620.txt
22621.txt
22623.txt
22624.txt
22625.txt
22626.txt
22629.txt
22631.txt
22636.txt
22637.txt
22638.txt
22639.txt
22644.txt
22645.txt
22646.txt
22650.txt
22651.txt
22652.txt
22653.txt
22654.txt
22655.txt
22656.txt
22657.txt
22660.txt
22661.txt
22662.txt
22663.txt
22664.txt
22665.txt
22666.txt
22667.txt
22668.txt
22669.txt
22670.txt
22671.txt
22672.txt
22673.txt
22674.txt
22675.txt
22676.txt
22677.txt


23611.txt
23612.txt
23613.txt
23614.txt
23619.txt
23622.txt
23623.txt
23624.txt
23625.txt
23626.txt
23627.txt
23629.txt
2363.txt
2366.txt
2368.txt
2369.txt
23630.txt
23631.txt
23632.txt
23633.txt
23634.txt
23635.txt
23636.txt
23637.txt
23638.txt
23639.txt
23640.txt
23641.txt
23642.txt
23643.txt
23644.txt
23645.txt
23646.txt
23647.txt
23648.txt
23649.txt
23650.txt
23651.txt
23652.txt
23653.txt
23655.txt
23656.txt
23657.txt
23658.txt
23659.txt
23660.txt
23661.txt
23662.txt
23663.txt
23664.txt
23665.txt
23666.txt
23667.txt
23668.txt
23669.txt
23671.txt
23673.txt
23674.txt
23675.txt
23677.txt
23678.txt
23680.txt
23681.txt
23682.txt
23683.txt
23684.txt
23686.txt
23688.txt
23689.txt
23690.txt
23691.txt
23692.txt
23693.txt
23694.txt
23695.txt
23696.txt
23697.txt
23698.txt
23699.txt
23700.txt
23701.txt
23702.txt
23703.txt
23725.txt
23726.txt
23727.txt
23728.txt
23729.txt
2372.txt
2373.txt
2375.txt
23730.txt
23731.txt
23732.txt
23733.txt
23734.txt
23735.txt
23736.txt
23737.txt
23738.txt
23739.t

24769.txt
24770.txt
24771.txt
24772.txt
24775.txt
24776.txt
24777.txt
24778.txt
24779.txt
24780.txt
24781.txt
24783.txt
24784.txt
24785.txt
24786.txt
24788.txt
24789.txt
24791.txt
24792.txt
24793.txt
24794.txt
24795.txt
24797.txt
24798.txt
24799.txt
24800.txt
24803.txt
24804.txt
24805.txt
24806.txt
24807.txt
24808.txt
24810.txt
24811.txt
24812.txt
24813.txt
24814.txt
24815.txt
24816.txt
24818.txt
24819.txt
24821.txt
24822.txt
24825.txt
24826.txt
24827.txt
24828.txt
2480.txt
24831.txt
24832.txt
24834.txt
24835.txt
24836.txt
24837.txt
24838.txt
24839.txt
24840.txt
24841.txt
24842.txt
24849.txt
24851.txt
24852.txt
24854.txt
24856.txt
24857.txt
24858.txt
24859.txt
24860.txt
24862.txt
24864.txt
24865.txt
24866.txt
24868.txt
24869.txt
24870.txt
24871.txt
24872.txt
24873.txt
24874.txt
24875.txt
24876.txt
24877.txt
24878.txt
24879.txt
24880.txt
24881.txt
24882.txt
24883.txt
24884.txt
24885.txt
24886.txt
24889.txt
24891.txt
24892.txt
24893.txt
24894.txt
24895.txt
24896.txt
24897.txt
24898.txt
2

25902.txt
25903.txt
25904.txt
25905.txt
25906.txt
25907.txt
25908.txt
25909.txt
25910.txt
25911.txt
25912.txt
25913.txt
25914.txt
25915.txt
25916.txt
25917.txt
25918.txt
25919.txt
25920.txt
25921.txt
25922.txt
25923.txt
25926.txt
25927.txt
25928.txt
25929.txt
2595.txt
2597.txt
2598.txt
2599.txt
25930.txt
25931.txt
25932.txt
25933.txt
25935.txt
25937.txt
25938.txt
25939.txt
25940.txt
25941.txt
25943.txt
25944.txt
25947.txt
25948.txt
25950.txt
25951.txt
25952.txt
25953.txt
25954.txt
25955.txt
25957.txt
25958.txt
25959.txt
25960.txt
25961.txt
25962.txt
25963.txt
25965.txt
25966.txt
25967.txt
25968.txt
25969.txt
25970.txt
25971.txt
25972.txt
25973.txt
25974.txt
25975.txt
25976.txt
25977.txt
25978.txt
25979.txt
25980.txt
25982.txt
25983.txt
25984.txt
25985.txt
25986.txt
25989.txt
25990.txt
25991.txt
25992.txt
25993.txt
25994.txt
25995.txt
25997.txt
25998.txt
26000.txt
26001.txt
26002.txt
26003.txt
26005.txt
26007.txt
26008.txt
26009.txt
26010.txt
26011.txt
26012.txt
26013.txt
26014.txt
2601

5221.txt
5222.txt
5223.txt
5224.txt
5225.txt
5228.txt
5230.txt
5231.txt
5234.txt
5235.txt
5236.txt
5237.txt
5238.txt
5239.txt
5240.txt
524.txt
525.txt
526.txt
527.txt
528.txt
5300.txt
5344.txt
5345.txt
5348.txt
5356.txt
5357.txt
5358.txt
5359.txt
530.txt
535.txt
536.txt
5360.txt
5361.txt
5362.txt
5363.txt
5364.txt
5365.txt
5366.txt
5367.txt
5368.txt
5369.txt
5370.txt
5371.txt
5372.txt
5373.txt
5374.txt
5375.txt
5376.txt
5377.txt
5378.txt
5379.txt
5380.txt
5381.txt
5382.txt
5383.txt
5384.txt
5385.txt
5386.txt
5387.txt
5388.txt
5389.txt
5390.txt
5391.txt
5392.txt
5393.txt
5394.txt
5395.txt
5396.txt
5397.txt
5398.txt
5399.txt
5400.txt
5409.txt
5410.txt
5411.txt
5412.txt
5413.txt
5414.txt
5415.txt
5416.txt
5449.txt
543.txt
547.txt
5460.txt
5466.txt
5472.txt
5493.txt
5499.txt
5507.txt
5516.txt
5529.txt
5542.txt
5551.txt
552.txt
5560.txt
5571.txt
5577.txt
5583.txt
5592.txt
5599.txt
50.txt
51.txt
55.txt
57.txt
58.txt
59.txt
5600.txt
5625.txt
5645.txt
5646.txt
5647.txt
5648.txt
5649.txt
5650.t

In [25]:
# Step 2: classify each etext

# keep these separate at first so we can minimize bias in the training set
# we ultimately want to have a decent gender balance in the training set to minimize bias
# it doesn't matter for testing set, as testing won't directly influence the model
male_etexts = []
female_etexts = []
ambiguous_etexts = []

# read in the lists of male and female names; the context managers close each
# file handle as soon as its contents have been read instead of leaking it
with open('female.txt', 'r') as female_file:
    female_names_txt = female_file.read()
with open('male.txt', 'r') as male_file:
    male_names_txt = male_file.read()



In [26]:
# Sanity check: dump the raw contents read from female.txt
print(female_names_txt)

# List of common female names.
# Copyright (c) January 1991 by Mark Kantrowitz.
# 4987 names
# Thanks to Bill.Ross for about 1000 additional names.
# Version 1.3 (29-MAR-94)

Abagael
Abagail
Abbe
Abbey
Abbi
Abbie
Abby
Abigael
Abigail
Abigale
Abra
Acacia
Ada
Adah
Adaline
Adara
Addie
Addis
Adel
Adela
Adelaide
Adele
Adelice
Adelina
Adelind
Adeline
Adella
Adelle
Adena
Adey
Adi
Adiana
Adina
Adora
Adore
Adoree
Adorne
Adrea
Adria
Adriaens
Adrian
Adriana
Adriane
Adrianna
Adrianne
Adrien
Adriena
Adrienne
Aeriel
Aeriela
Aeriell
Ag
Agace
Agata
Agatha
Agathe
Aggi
Aggie
Aggy
Agna
Agnella
Agnes
Agnese
Agnesse
Agneta
Agnola
Agretha
Aida
Aidan
Aigneis
Aila
Aile
Ailee
Aileen
Ailene
Ailey
Aili
Ailina
Ailyn
Aime
Aimee
Aimil
Aina
Aindrea
Ainslee
Ainsley
Ainslie
Ajay
Alaine
Alameda
Alana
Alanah
Alane
Alanna
Alayne
Alberta
Albertina
Albertine
Albina
Alecia
Aleda
Aleece
Aleecia
Aleen
Alejandra
Alejandrina
Alena
Alene
Alessandra
Aleta
Alethea
Alex
Alexa
Alexandra
Alexandrina
Alexi
Alexia
Alexina
Alexine
Alexi

In [None]:
def clean_names(names):
    