# Fuse metadata to permit a first test run

We've got reviewed books, prizewinners, and bestsellers. We need to merge the three frames and assign unique author IDs.

We also need to get some unreviewed books as a control set.

In [77]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from tqdm import tqdm

In [2]:
# Reviewed books, read from a tab-separated file.
review = pd.read_csv("cohortmatches.tsv", delimiter="\t")
review.head(5)

Unnamed: 0,volume_author,review_author,aRatio,volume_title,trunc_revtitle,review_title,tRatio,volume_year,review_year,docid,rownum
0,"shaw, irwin","shaw, irwin.",96,"welcome to the city, and other","welcome to the city, and other","welcome to the city, and other stories.",106,1942,1942,uc1.b3347126,3289
1,"bennett, arnold","bennett, arnold.",97,woman who stole everything,"woman who stole everything, an","woman who stole everything, and other stories.",106,1927,1927,uc1.$b322688,4140
2,"lippincott, joseph wharton","lippincott, joseph wharton.",98,wilderness champion; the story,wilderness champion; the story,wilderness champion; the story of a great houn...,106,1944,1944,uc1.b3407035,3481
3,"becker, may lamberton","becker, mrs may (lamberton), ed.",79,golden tales of the southwest;,golden tales of the southwest;,golden tales of the southwest; selected with a...,106,1939,1939,uc1.$b116192,4283
4,"miller, alice duer, mrs","miller, mrs alice (duer).",75,instruments,"instruments of darkness, and o","instruments of darkness, and other stories.",106,1926,1926,mdp.39015059415383,2405


In [4]:
# Bestsellers, read from a tab-separated file.
best = pd.read_csv("bestsellersincohortmodel.tsv", delimiter="\t")
best.head(5)

Unnamed: 0,titlemeta_ID,titlemeta_author,titlemeta_title,bestseller_author,bestseller_title,bestseller_year,ratio_author,ratio_title,docid,year
0,uc2.ark+=13960=t05x25z6x,"lewis, sinclair",Main street,"lewis, sinclair",Main Street,1921,100,100,uc2.ark+=13960=t05x25z6x,1920
1,mdp.39015000636111,"lewis, sinclair",Arrowsmith,"lewis, sinclair",Arrowsmith,1925,100,100,mdp.39015000636111,1925
2,mdp.39015008361134,"lewis, sinclair",Elmer Gantry,"lewis, sinclair",Elmer Gantry,1927,100,100,mdp.39015008361134,1927
3,mdp.39015048880846,"lewis, sinclair",Dodsworth,"lewis, sinclair",Dodsworth,1929,100,100,mdp.39015048880846,1929
4,mdp.39015000636095,"lewis, sinclair",Ann Vickers,"lewis, sinclair",Ann Vickers,1933,100,100,mdp.39015000636095,1933


In [5]:
# Books by prizewinning authors, read from a tab-separated file.
prize = pd.read_csv("prizewinnersincohortmodel.tsv", delimiter="\t")
prize.head(5)

Unnamed: 0,titlemeta_ID,titlemeta_author,titlemeta_title,prize_author,prize_title,prize_date,ratio_author,ratio_title,docid,year
0,mdp.39015005332708,"Barnes, Margaret Ayer",Years of grace,"Barnes, Margaret Ayer",Years of Grace,1931,100,100,mdp.39015005332708,1930
1,mdp.39015005305936,"Barnes, Margaret Ayer",Westward passage,"Barnes, Margaret Ayer",Years of Grace,1931,100,40,mdp.39015005305936,1931
2,uc1.$b56317,"Barnes, Margaret Ayer",Within this present,"Barnes, Margaret Ayer",Years of Grace,1931,100,24,uc1.$b56317,1933
3,mdp.39015063551983,"Barnes, Margaret Ayer","Edna, his wife, an American idyll","Barnes, Margaret Ayer",Years of Grace,1931,100,34,mdp.39015063551983,1935
4,mdp.39015051324153,"Barnes, Margaret Ayer",Wisdom's gate,"Barnes, Margaret Ayer",Years of Grace,1931,100,44,mdp.39015051324153,1938


### goal

what we want is a dataframe with

    tm_author, tm_title, rev_title (if exists), docid, rownum (if exists), 
    review_year (if exists), bestseller_year (if exists), tm_year,
    lowest_year, prizewinning_author (T/F), bestseller (T/F)
    

In [6]:
# Put the review frame on the shared tm_* column names, then keep only the
# columns that are carried into the fused table.
review = review.rename(columns={
    'volume_author': 'tm_author',
    'volume_title': 'tm_title',
    'volume_year': 'tm_year',
})
review = review[['docid', 'tm_author', 'tm_title', 'tm_year', 'review_year', 'rownum', 'review_title']]
review.head(5)

Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942,1942,3289,"welcome to the city, and other stories."
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927,1927,4140,"woman who stole everything, and other stories."
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944,1944,3481,wilderness champion; the story of a great houn...
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939,1939,4283,golden tales of the southwest; selected with a...
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926,1926,2405,"instruments of darkness, and other stories."


In [7]:
# Same tm_* naming for the bestseller frame; titles are lowercased here
# because this file stores them in mixed case.
best = best.rename(columns={
    'titlemeta_author': 'tm_author',
    'titlemeta_title': 'tm_title',
    'year': 'tm_year',
})
best = best[['docid', 'tm_author', 'tm_title', 'tm_year', 'bestseller_year']]
best = best.assign(tm_title=best['tm_title'].str.lower())
best.head(5)

Unnamed: 0,docid,tm_author,tm_title,tm_year,bestseller_year
0,uc2.ark+=13960=t05x25z6x,"lewis, sinclair",main street,1920,1921
1,mdp.39015000636111,"lewis, sinclair",arrowsmith,1925,1925
2,mdp.39015008361134,"lewis, sinclair",elmer gantry,1927,1927
3,mdp.39015048880846,"lewis, sinclair",dodsworth,1929,1929
4,mdp.39015000636095,"lewis, sinclair",ann vickers,1933,1933


In [8]:
# Every row of this frame is a bestseller; the flag survives the later merge.
best = best.assign(is_bestseller=True)

In [9]:
# Same tm_* naming for the prize frame, reduced to the shared columns and
# tagged so the rows stay identifiable after merging.
prize = prize.rename(columns={
    'titlemeta_author': 'tm_author',
    'titlemeta_title': 'tm_title',
    'year': 'tm_year',
})
prize = prize[['docid', 'tm_author', 'tm_title', 'tm_year']]
prize = prize.assign(prize_author=True)
prize.head(5)

Unnamed: 0,docid,tm_author,tm_title,tm_year,prize_author
0,mdp.39015005332708,"Barnes, Margaret Ayer",Years of grace,1930,True
1,mdp.39015005305936,"Barnes, Margaret Ayer",Westward passage,1931,True
2,uc1.$b56317,"Barnes, Margaret Ayer",Within this present,1933,True
3,mdp.39015063551983,"Barnes, Margaret Ayer","Edna, his wife, an American idyll",1935,True
4,mdp.39015051324153,"Barnes, Margaret Ayer",Wisdom's gate,1938,True


In [10]:
# Lowercase author and title so they compare equal to the other two frames,
# which use lowercase text.
prize = prize.assign(
    tm_author=prize['tm_author'].str.lower(),
    tm_title=prize['tm_title'].str.lower(),
)
prize.head(5)

Unnamed: 0,docid,tm_author,tm_title,tm_year,prize_author
0,mdp.39015005332708,"barnes, margaret ayer",years of grace,1930,True
1,mdp.39015005305936,"barnes, margaret ayer",westward passage,1931,True
2,uc1.$b56317,"barnes, margaret ayer",within this present,1933,True
3,mdp.39015063551983,"barnes, margaret ayer","edna, his wife, an american idyll",1935,True
4,mdp.39015051324153,"barnes, margaret ayer",wisdom's gate,1938,True


In [25]:
# Outer join keeps books that appear in only one of the two frames. Columns
# present in both (tm_title, tm_year) come back with _x / _y suffixes.
complete = pd.merge(review, prize, how='outer', on=['docid', 'tm_author'])
print(review.shape, prize.shape)
complete.shape

(4412, 7) (365, 5)


(4549, 10)

In [26]:
# Inspect the suffixed columns produced by the merge.
complete.head(5)

Unnamed: 0,docid,tm_author,tm_title_x,tm_year_x,review_year,rownum,review_title,tm_title_y,tm_year_y,prize_author
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",,,
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",,,
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,,,
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,,,
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",,,


In [27]:
# Collapse every <name>_x / <name>_y pair left by the merge into one <name>
# column. Rule per row:
#   * if one side is null, take the other;
#   * if both are strings, take the strictly longer one (ties go to _y);
#   * otherwise (e.g. two numeric years) take the _x value.
# Columns without a merge suffix are copied through unchanged.
listofdicts = []
merge_suffixes = ('_x', '_y')

for idx, row in complete.iterrows():
    newdict = dict()
    for col in complete.columns:
        if not col.endswith(merge_suffixes):
            newdict[col] = row[col]
            continue

        prefix = col[0: -2]
        if prefix in newdict:
            # The pair was already resolved when its sibling column was visited.
            continue

        left = row[prefix + '_x']
        right = row[prefix + '_y']
        if pd.isnull(left):
            value = right
        elif pd.isnull(right):
            value = left
        elif isinstance(left, str) and isinstance(right, str):
            # Only strings are compared by length; testing for str explicitly
            # (rather than "not float") avoids calling len() on other scalars.
            value = left if len(left) > len(right) else right
        else:
            value = left

        newdict[prefix] = value

    listofdicts.append(newdict)

complete = pd.DataFrame(listofdicts)
print(complete.shape)
complete.head()

(4549, 8)


Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",


In [28]:
# Add the bestsellers with another outer join on the same key pair; tm_title
# and tm_year are suffixed with _x / _y again.
print(complete.shape)
complete = pd.merge(complete, best, how='outer', on=['docid', 'tm_author'])
print(best.shape)
complete.shape

(4549, 8)
(187, 6)


(4620, 12)

In [29]:
# Inspect the suffixed columns produced by the bestseller merge.
complete.head(5)

Unnamed: 0,docid,tm_author,tm_title_x,tm_year_x,review_year,rownum,review_title,prize_author,tm_title_y,tm_year_y,bestseller_year,is_bestseller
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",,,,,
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",,,,,
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,,,,,
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,,,,,
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",,,,,


In [30]:
# Collapse every <name>_x / <name>_y pair left by the bestseller merge into
# one <name> column. Rule per row:
#   * if one side is null, take the other;
#   * if both are strings, take the strictly longer one (ties go to _y);
#   * otherwise (e.g. two numeric years) take the _x value.
# Columns without a merge suffix are copied through unchanged.
listofdicts = []
merge_suffixes = ('_x', '_y')

for idx, row in complete.iterrows():
    newdict = dict()
    for col in complete.columns:
        if not col.endswith(merge_suffixes):
            newdict[col] = row[col]
            continue

        prefix = col[0: -2]
        if prefix in newdict:
            # The pair was already resolved when its sibling column was visited.
            continue

        left = row[prefix + '_x']
        right = row[prefix + '_y']
        if pd.isnull(left):
            value = right
        elif pd.isnull(right):
            value = left
        elif isinstance(left, str) and isinstance(right, str):
            # Only strings are compared by length; testing for str explicitly
            # (rather than "not float") avoids calling len() on other scalars.
            value = left if len(left) > len(right) else right
        else:
            value = left

        newdict[prefix] = value

    listofdicts.append(newdict)

complete = pd.DataFrame(listofdicts)
print(complete.shape)
complete.head()

(4620, 10)


Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author,bestseller_year,is_bestseller
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",,,
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",,,
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,,,
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,,,
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",,,


In [31]:
# Distinct volumes; a value below the row count means some docids repeat.
len(set(complete['docid']))

4554

In [34]:
# Remove rows that are identical in every column, keeping the first of each.
complete = complete.drop_duplicates(keep='first')
complete.shape

(4576, 10)

In [36]:
# Rows flagged as written by a prizewinning author.
(complete['prize_author'] == True).sum()

327

In [37]:
# Rows flagged as bestsellers.
(complete['is_bestseller'] == True).sum()

187

In [40]:
# Rows that did not come from the bestseller frame have a null flag; make it False.
complete['is_bestseller'] = complete['is_bestseller'].fillna(False)

In [42]:
# Rows that did not come from the prize frame have a null flag; make it False.
complete['prize_author'] = complete['prize_author'].fillna(False)

In [43]:
# Spot-check a positional slice of the fused table.
complete.iloc[4390:4440]

Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author,bestseller_year,is_bestseller
4426,nyp.33433076049414,"perry, lawrence",for the ga,1920.0,1920.0,2480.0,for the e's sake.,False,,False
4427,uc1.$b80847,"duhamel, georges",salavin,1935.0,1936.0,826.0,salavln; tr.,False,,False
4428,nyp.33433074807599,"brent, john",a man's game,1882.0,1921.0,164.0,man's game.,False,,False
4429,inu.30000035052244,"maugham, w. somerset, (william somerset)",casuarina tree : six stories,1926.0,1926.0,1237.0,ciisuarina tree.,False,,False
4430,mdp.39015030946902,"bodenheim, maxwell",georgie may,1928.0,1928.0,10.0,qeorgle may.,False,,False
4431,mdp.49015000570912,"clark, walter van tilburg","watchful gods, and other st",1950.0,1950.0,1024.0,watchful and other stories.,False,,False
4432,uc1.b3688341,"frye, ralph bridges",uncle 'lish,1945.0,1945.0,268.0,uncle 'ush.,False,,False
4433,uc1.b3148807,"kaus, gina",tomorrow we,1933.0,1934.0,4996.0,tomorrow we part [tr.,False,,False
4434,mdp.39015015186334,"green, henry","loving,",1945.0,1949.0,3901.0,loving.,False,,False
4435,uc1.b4104150,"bacon, leonard",furioso,1932.0,1932.0,2061.0,the furloso.,False,,False


In [44]:
# Book-level metadata table (tab-separated); low_memory=False reads it in one pass.
meta = pd.read_csv(
    "../../cohort/topicdata/comprehensivebooktopicdata.tsv",
    delimiter="\t",
    low_memory=False,
)

In [46]:
# Keep only the metadata columns used below and rename author/title to the
# shared tm_* names.
meta = meta[['docid', 'birthyear', 'firstpub', 'hathi_author', 'hathi_title', 'us_national', 'authof3ormore', 'age']]
meta = meta.rename(columns={'hathi_author': 'tm_author', 'hathi_title': 'tm_title'})
meta.head(5)

Unnamed: 0,docid,birthyear,firstpub,tm_author,tm_title,us_national,authof3ormore,age
0,uc1.b4975632,,1989,"Aakhus, Patricia",The voyage of Mael Duin's curragh,True,False,
1,inu.30000112046630,1911.0,1957,"Abbe, George",The winter house,False,True,46.0
2,uc1.$b799882,1911.0,1967,"Abbe, George",The funeral,False,True,56.0
3,uc1.$b149331,1911.0,1968,"Abbe, George",Yonderville,False,True,57.0
4,uc1.32106007981415,1927.0,1956,"Abbey, Edward",The brave cowboy : an old tale in a new time,True,True,29.0


In [None]:
# Lowercase author and title in the metadata table.
# NOTE(review): this cell carries no execution count, and the sample shown
# further down still has capitalised names, so it appears not to have been
# run when those outputs were produced.
meta = meta.assign(
    tm_author=meta['tm_author'].str.lower(),
    tm_title=meta['tm_title'].str.lower(),
)

In [47]:
# Build the control set: for each first-publication year from 1916 through
# 1950, draw 30 volumes from `meta` whose docid is not already in `complete`.
wehave = set(complete.docid)
toconcat = []

for year in range(1916, 1951):
    df = meta.loc[(meta.firstpub == year) & ~meta.docid.isin(wehave), :]
    # Seeding each draw with its year makes the control set reproducible
    # across kernel restarts (an unseeded sample differs on every run).
    # sample() raises ValueError if a year has fewer than 30 candidates.
    toconcat.append(df.sample(30, random_state=year))

missing = pd.concat(toconcat)
missing.shape

(1050, 8)

In [48]:
# Peek at the sampled control volumes.
missing.head(5)

Unnamed: 0,docid,birthyear,firstpub,tm_author,tm_title,us_national,authof3ormore,age
137,uc2.ark+=13960=fk3ws8hq14,1849.0,1916,"Allen, James Lane",A cathedral singer,True,True,67.0
19943,nyp.33433076044449,1875.0,1916,"Scott, Leroy",Partners of the night,False,False,41.0
19984,uc2.ark+=13960=t87h1h26q,1890.0,1916,"Sutherland, Joan",The edge of empire,False,False,26.0
4883,uc2.ark+=13960=t3804z70b,1865.0,1916,"Hill, Grace Livingston",A voice in the wilderness,True,True,51.0
19917,uc2.ark+=13960=t9p26rm6h,1880.0,1916,"Carlsen, C. L. (Carl Laurence)",The taming of Calinga,False,False,36.0


In [55]:
# Sanity check: the control sample must share no docid with `complete` (expect 0).
len(set(missing.docid) & set(complete.docid))

0

In [56]:
# Only the identifying columns are kept; the other metadata is re-attached later by docid.
missing = missing[['docid', 'tm_author', 'tm_title']]

In [57]:
# Tag the control rows so they can be told apart after concatenation.
missing = missing.assign(obscure=True)

In [58]:
# Stack the control rows under the fused table; columns absent from one side become null.
newcomplete = pd.concat([complete, missing], axis=0)

In [59]:
# Row count should equal the rows of `complete` plus the rows of `missing`.
newcomplete.shape

(5626, 11)

In [61]:
# The control rows sit at the bottom; only their identifying columns and `obscure` are filled.
newcomplete.tail(5)

Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author,bestseller_year,is_bestseller,obscure
1161,mdp.39015008633334,"Braine, John",Room at the top,,,,,,,,True
11199,uc1.$b104483,"Yates, Elizabeth",Guardian heart,,,,,,,,True
1283,uc1.$b322859,"Brooke, Jocelyn",The image of a drawn sword,,,,,,,,True
9145,uc1.32106014579582,"Shute, Nevil",A town like Alice,,,,,,,,True
8751,osu.32435017549254,"Ruck, Berta",Joyful journey,,,,,,,,True


In [62]:
# Metadata columns to attach to every row, keyed by docid.
tomerge = meta[['docid', 'firstpub', 'birthyear', 'us_national']]

In [63]:
# Inner join: a row is kept only if its docid exists in the metadata table.
newcomplete = pd.merge(newcomplete, tomerge, on='docid', how='inner')
newcomplete.shape

(5626, 14)

In [64]:
# Confirm that firstpub, birthyear and us_national were attached.
newcomplete.head(5)

Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author,bestseller_year,is_bestseller,obscure,firstpub,birthyear,us_national
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",False,,False,,1942,1913.0,True
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",False,,False,,1927,1867.0,False
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,False,,False,,1944,1887.0,True
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,False,,False,,1939,1873.0,True
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",False,,False,,1926,1874.0,True


In [65]:
# After concatenation each flag is null for rows from the other source;
# turn those nulls into False.
newcomplete = newcomplete.assign(
    is_bestseller=newcomplete['is_bestseller'].fillna(False),
    prize_author=newcomplete['prize_author'].fillna(False),
    obscure=newcomplete['obscure'].fillna(False),
)

In [69]:
# For every row take the earliest of the available year fields. A row is
# marked invalid (0) and counted as an error when that earliest year falls
# outside 1916-1950, or when the review year is more than five years later
# than it; the second kind is also printed.
lowest = []
errors = 0
year_fields = ('tm_year', 'review_year', 'firstpub', 'bestseller_year')

for _, record in newcomplete.iterrows():
    present_years = [
        int(record[field]) for field in year_fields if pd.notnull(record[field])
    ]
    earliest = min(present_years)

    if not (1916 <= earliest <= 1950):
        earliest = 0
        errors += 1

    reviewed = record.review_year
    if earliest > 0 and pd.notnull(reviewed) and reviewed > earliest + 5:
        print(reviewed, earliest)
        earliest = 0
        errors += 1

    lowest.append(earliest)

print(errors)

1927.0 1920
1927.0 1916
1948.0 1941
1935.0 1928
1933.0 1923
1941.0 1925
1947.0 1924
1949.0 1930
1950.0 1932
1950.0 1932
1949.0 1937
1940.0 1934
1942.0 1934
1950.0 1944
1942.0 1926
1943.0 1932
1938.0 1928
1941.0 1921
1950.0 1936
1944.0 1937
1948.0 1922
1932.0 1922
1929.0 1922
1931.0 1924
1946.0 1938
1927.0 1916
1936.0 1924
1945.0 1933
1942.0 1931
1937.0 1930
1934.0 1922
1929.0 1921
1927.0 1919
1932.0 1920
1949.0 1935
1927.0 1920
1931.0 1921
1938.0 1926
1928.0 1920
1936.0 1919
1949.0 1916
107


In [70]:
# Attach the per-row earliest year; 0 marks rows rejected by the checks above.
newcomplete = newcomplete.assign(lowest_date=lowest)

In [72]:
# Drop the rows whose date was rejected (lowest_date set to 0).
newcomplete = newcomplete[newcomplete['lowest_date'] > 0]
newcomplete.shape

(5519, 15)

In [81]:
# Lowercase author and title across the whole table before matching names.
newcomplete = newcomplete.assign(
    tm_author=newcomplete['tm_author'].str.lower(),
    tm_title=newcomplete['tm_title'].str.lower(),
)

In [85]:
# Give every row a standardised author name so that spelling variants of the
# same author share one identifier. `standards` ends with exactly one entry
# per row of `newcomplete`, in row order.
#
# A name is cleaned (null/empty -> 'anonymous', square brackets removed, a
# trailing 'mrs' removed, cut to 20 characters). It then either matches a
# known standard exactly, or takes the best known standard with a fuzzy
# ratio above 90, or becomes a new standard itself.
standards = []
standard_lookup = set()   # distinct standards, for constant-time exact lookup
standard_unique = []      # the same names once each, for fuzzy comparison

for auth1 in tqdm(newcomplete.tm_author):
    if pd.isnull(auth1) or len(auth1) < 1:
        auth1 = 'anonymous'
    auth1 = auth1.replace('[', '').replace(']', '')
    if auth1.endswith('mrs'):
        auth1 = auth1[0: -3]
    auth1 = auth1[0: 20]

    if auth1 in standard_lookup:
        standards.append(auth1)
        continue

    # Compare against each distinct standard once. Scanning `standards`
    # itself would repeat the same comparison for every earlier row carrying
    # that name without changing which candidate wins.
    thetuples = []
    for auth2 in standard_unique:
        match = fuzz.ratio(auth1, auth2)
        if match > 90:
            thetuples.append((match, auth2))

    if len(thetuples) < 1:
        standard_lookup.add(auth1)
        standard_unique.append(auth1)
        standards.append(auth1)
    else:
        # Highest ratio wins; ties go to the name that sorts last.
        chosen = max(thetuples)[1]
        standards.append(chosen)
        print(auth1, chosen)
                

 26%|██▌       | 1441/5519 [00:02<00:09, 440.92it/s]

howard, elizabeth howard, elizabeth me
howard, elizabeth howard, elizabeth me


 30%|██▉       | 1636/5519 [00:02<00:09, 429.33it/s]

fielding, archibald fielding, archibald 
fielding, archibald fielding, archibald 
fielding, archibald fielding, archibald 


 34%|███▍      | 1864/5519 [00:03<00:12, 300.97it/s]

kendrick, baynard ha kendrick, baynard h


 59%|█████▉    | 3246/5519 [00:08<00:06, 351.43it/s]

bellamannn, henry bellamann, henry


 66%|██████▌   | 3616/5519 [00:09<00:06, 303.86it/s]

van loan, charles em van loan, charles e


 74%|███████▍  | 4096/5519 [00:11<00:05, 249.87it/s]

fielding, archibald fielding, archibald 


 82%|████████▏ | 4531/5519 [00:13<00:08, 117.22it/s]

davis, robert h davis, robert
sheldon, charles m sheldon, charles mon


100%|██████████| 5519/5519 [00:21<00:00, 252.33it/s]


In [86]:
# Number of standardised names produced; it has to equal the number of rows
# in `newcomplete` for the list to be assigned as a column.
len(standards)

5519

In [87]:
# Attach the standardised names positionally, one per row.
newcomplete = newcomplete.assign(standard_auth=standards)

In [88]:
# Compare tm_author with standard_auth on the first fifteen rows.
newcomplete.iloc[:15]

Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author,bestseller_year,is_bestseller,obscure,firstpub,birthyear,us_national,lowest_date,standard_auth
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",False,,False,False,1942,1913.0,True,1942,"shaw, irwin"
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",False,,False,False,1927,1867.0,False,1927,"bennett, arnold"
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,False,,False,False,1944,1887.0,True,1944,"lippincott, joseph w"
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,False,,False,False,1939,1873.0,True,1939,"becker, may lamberto"
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",False,,False,False,1926,1874.0,True,1926,"miller, alice duer,"
5,mdp.39015002756842,"saroyan, william",daring young man on the flying,1934.0,1934.0,821.0,"the daring young man on the flying trapeze, an...",False,,False,False,1934,1908.0,True,1934,"saroyan, william"
6,umn.31951000955121j,"chase, mary ellen",girl from,1916.0,1916.0,2149.0,"girl from the big horn country, il",False,,False,False,1916,1887.0,False,1916,"chase, mary ellen"
7,mdp.39015063945169,"miln, louise (jordan), mrs","ann zu-zan, a chine",1932.0,1932.0,61.0,"ann zu-zan, a chinese love story.",False,,False,False,1932,1864.0,True,1932,"miln, louise (jordan"
8,uc1.b3835561,"gogarty, oliver st. john",as i was goin,1937.0,1937.0,2770.0,as i was going down sackville street.,False,,False,False,1937,1878.0,False,1937,"gogarty, oliver st."
9,mdp.39015028779778,"ferguson, j. de lancey (john de lancey)","pride and passion, robert burn",1939.0,1939.0,3524.0,"pride and passion, robert burns, 1759-1796.",False,,False,False,1939,1888.0,False,1939,"ferguson, j. de lanc"


In [89]:
# Distinct author strings before standardisation.
len(set(newcomplete['tm_author']))

1927

In [91]:
# Distinct author identifiers after standardisation.
len(set(newcomplete['standard_auth']))

1918

In [92]:
# Reload the full metadata table: `meta` was narrowed and renamed earlier,
# and the original hathi_author column is needed next.
meta = pd.read_csv(
    "../../cohort/topicdata/comprehensivebooktopicdata.tsv",
    delimiter="\t",
    low_memory=False,
)

In [94]:
# Map every author in the full metadata table onto the existing standards,
# using the same cleaning and matching rules as for `newcomplete`.
# `newstandards` receives one entry per row of `meta`, in row order.
#
# NOTE(review): several of the matches printed by this cell look like
# different people rather than spelling variants (e.g. "harris, john" ->
# "farris, john", "rice, luanne" -> "rice, anne"); a ratio above 90 is easy
# to reach on short names, so the threshold deserves a second look.
newstandards = []

# `standards` repeats each name once per earlier row. De-duplicate it once so
# that exact lookups are constant-time and each fuzzy comparison runs once
# per distinct name.
standard_lookup = set(standards)
standard_unique = list(dict.fromkeys(standards))

for auth1 in tqdm(meta.hathi_author):
    if pd.isnull(auth1) or len(auth1) < 1:
        auth1 = 'anonymous'
    else:
        auth1 = auth1.lower()

    auth1 = auth1.replace('[', '').replace(']', '')
    if auth1.endswith('mrs'):
        auth1 = auth1[0: -3]
    auth1 = auth1[0: 20]

    if auth1 in standard_lookup:
        newstandards.append(auth1)
        continue

    thetuples = []
    for auth2 in standard_unique:
        match = fuzz.ratio(auth1, auth2)
        if match > 90:
            thetuples.append((match, auth2))

    if len(thetuples) < 1:
        chosen = auth1
        standard_lookup.add(auth1)
        standard_unique.append(auth1)
    else:
        # Highest ratio wins; ties go to the name that sorts last.
        chosen = max(thetuples)[1]

    # `standards` is still extended exactly as before, so any later cell
    # that reads it sees the same contents.
    standards.append(chosen)
    newstandards.append(chosen)
    if chosen != auth1:
        print(auth1, chosen)

 11%|█         | 3095/29341 [00:12<02:54, 150.36it/s]

ellin, stanley elkin, stanley
ellin, stanley elkin, stanley


 12%|█▏        | 3391/29341 [00:13<01:38, 263.43it/s]

fielding, archibald fielding, archibald 
fielding, archibald fielding, archibald 
fielding, archibald fielding, archibald 
fielding, archibald fielding, archibald 


 13%|█▎        | 3848/29341 [00:14<01:27, 291.32it/s]

garrett, george p garrett, george
garrett, george p garrett, george
garrett, george p garrett, george
garrett, george p garrett, george
garrett, george p garrett, george


 16%|█▌        | 4554/29341 [00:18<02:17, 179.64it/s]

harris, john farris, john
harris, john farris, john
harris, john farris, john


 17%|█▋        | 5056/29341 [00:20<01:48, 223.17it/s]

howard, elizabeth howard, elizabeth me
howard, elizabeth howard, elizabeth me
howard, elizabeth howard, elizabeth me
howard, elizabeth howard, elizabeth me
howard, elizabeth howard, elizabeth me
howard, elizabeth howard, elizabeth me


 19%|█▉        | 5655/29341 [00:23<01:44, 226.26it/s]

kendrick, baynard ha kendrick, baynard h


 25%|██▍       | 7320/29341 [00:31<01:35, 229.62it/s]

moore, christopher morley, christopher


 28%|██▊       | 8336/29341 [00:35<01:31, 228.48it/s]

price, richard pryce, richard
price, richard pryce, richard
price, richard pryce, richard


 29%|██▉       | 8551/29341 [00:37<01:45, 196.29it/s]

rice, luanne rice, anne


 31%|███       | 9097/29341 [00:39<01:34, 214.82it/s]

sheldon, charles m sheldon, charles mon


 34%|███▎      | 9873/29341 [00:44<01:46, 183.28it/s]

swarthout, glendon f swarthout, glendon
swarthout, glendon f swarthout, glendon
swarthout, glendon f swarthout, glendon


 34%|███▍      | 9959/29341 [00:44<01:14, 259.18it/s]

tarr, judith farr, judith


 36%|███▌      | 10604/29341 [00:47<01:44, 179.59it/s]

weidman, jereme weidman, jerome


 36%|███▋      | 10701/29341 [00:48<01:29, 208.93it/s]

west, anthony c west, anthony
west, anthony c west, anthony
west, anthony c west, anthony


 37%|███▋      | 10932/29341 [00:49<01:14, 246.42it/s]

williams, joy williams, jay


 39%|███▉      | 11429/29341 [00:51<00:39, 455.43it/s]

price, richard pryce, richard


 39%|███▉      | 11574/29341 [00:54<05:14, 56.44it/s] 

fleming, may agnes e fleming, may agnes


 40%|████      | 11848/29341 [01:00<04:43, 61.66it/s]

hoffmann, franz hoffman, franz


 41%|████      | 12032/29341 [01:03<04:50, 59.52it/s]

fraser-tytler, m. e. fraser-tytler, m. e


 42%|████▏     | 12211/29341 [01:06<04:34, 62.51it/s]

lugard, flora louisa lugard, flora louise


 43%|████▎     | 12487/29341 [01:11<06:23, 43.90it/s]

reid, christian reid, christine


 44%|████▍     | 12837/29341 [01:17<04:15, 64.72it/s]

edwardes, annie edwards, annie


 44%|████▍     | 12851/29341 [01:17<06:00, 45.72it/s]

hardinge, william m hardinge, william mo


 44%|████▍     | 12941/29341 [01:18<03:59, 68.52it/s]

spenser, mary clare  spencer, mary clare 


 45%|████▍     | 13064/29341 [01:21<05:17, 51.35it/s]

majendie, margaret majendie, margaret, 


 45%|████▌     | 13336/29341 [01:25<05:38, 47.32it/s]

troubetzkoy, amelie  troubetzkoy, amélie 


 46%|████▌     | 13507/29341 [01:28<03:56, 67.05it/s]

smith, constance smith, constance i


 47%|████▋     | 13715/29341 [01:31<04:43, 55.17it/s]

garrigues, adele m garrigues, adéle m


 47%|████▋     | 13801/29341 [01:33<03:17, 78.87it/s]

kernahan, coulson kernahan, coulson, 


 48%|████▊     | 14018/29341 [01:37<06:39, 38.33it/s]

piatt, donn piatt, don


 48%|████▊     | 14044/29341 [01:37<05:04, 50.22it/s]

laffan, de courcy laffan, de courcy, 


 48%|████▊     | 14116/29341 [01:39<04:21, 58.12it/s]

grey, edward greey, edward
ball, thomas bell, thomas
of samosata lucian of samosata. lucian


 48%|████▊     | 14206/29341 [01:40<04:43, 53.44it/s]

molloy, j. fitzgerzl molloy, j. fitzgeral
dering, edward henea dering, edward henag


 49%|████▉     | 14466/29341 [01:45<04:57, 49.95it/s]

fergusson, dugald furguson, dugald


 50%|█████     | 14761/29341 [01:49<04:15, 57.07it/s] 

mackie, john macnie, john


 51%|█████     | 14857/29341 [01:50<02:53, 83.54it/s]

smith, constance isa smith, constance i


 51%|█████     | 14961/29341 [01:52<03:27, 69.30it/s]

du bois, constance g dubois, constance go


 52%|█████▏    | 15207/29341 [01:57<04:32, 51.93it/s]

white, william hale white, william allen


 52%|█████▏    | 15330/29341 [01:59<02:27, 95.29it/s]

kennard, edward,  kennard, edward


 53%|█████▎    | 15697/29341 [02:05<04:24, 51.50it/s]

sheldon, charles m sheldon, charles mon


 54%|█████▎    | 15741/29341 [02:06<04:19, 52.42it/s]

suffling, ernest r suffling, ernest r. 


 55%|█████▍    | 16070/29341 [02:13<03:55, 56.38it/s]

carryl, charles e. ( carryl, charles e


 55%|█████▍    | 16099/29341 [02:13<03:58, 55.52it/s]

cameron, h. lovett cameron, h. lovett, 


 57%|█████▋    | 16751/29341 [02:26<04:54, 42.81it/s]

angus, orme agnus, orme


 57%|█████▋    | 16788/29341 [02:27<04:53, 42.75it/s]

prevost, marcel prévost, marcel


 60%|█████▉    | 17572/29341 [02:43<04:17, 45.78it/s]

baroness von, hutten baroness, von hutten


 60%|██████    | 17614/29341 [02:44<05:43, 34.16it/s]

demorgan, william fr de morgan, william f


 61%|██████    | 17916/29341 [02:51<03:25, 55.52it/s]

kernahan, coulson kernahan, coulson, 


 63%|██████▎   | 18395/29341 [03:02<04:36, 39.64it/s]

auscough, john ayscough, john


 63%|██████▎   | 18573/29341 [03:06<04:44, 37.85it/s]

macdonald, robert m macdonald, robert


 64%|██████▎   | 18685/29341 [03:09<04:24, 40.30it/s]

panton, jane ellen f panton, jane ellen


 64%|██████▍   | 18863/29341 [03:13<05:15, 33.22it/s]

reid, christian reid, christine


 65%|██████▍   | 18983/29341 [03:16<05:07, 33.65it/s]

robbins, tod robbins, tom


 65%|██████▌   | 19129/29341 [03:20<03:33, 47.72it/s]

grimshaw, beatrice grimshaw, beatrice e


 66%|██████▌   | 19264/29341 [03:23<04:29, 37.46it/s]

lombardini, s. h lombardini, s h


 66%|██████▋   | 19481/29341 [03:29<04:48, 34.17it/s]

ryan, marah ellis ma ryan, marah ellis


 67%|██████▋   | 19572/29341 [03:32<04:58, 32.73it/s]

sheldon, charles m sheldon, charles mon


 67%|██████▋   | 19612/29341 [03:33<03:47, 42.72it/s]

artsybashev, mikhail art͡ybashev, mikhail
macdonald, ronald

 67%|██████▋   | 19617/29341 [03:33<04:13, 38.33it/s]

 mcdonald, ronald


 67%|██████▋   | 19685/29341 [03:35<03:56, 40.86it/s]

sterrett, frances ro sterrett, frances r.


 68%|██████▊   | 19822/29341 [03:38<04:20, 36.48it/s]

de selincourt, hugh de sélincourt, hugh


 68%|██████▊   | 19834/29341 [03:38<04:48, 32.94it/s]

venable, edward c venable, edward carr
curtis, alice turner curtis, alice (turne


 68%|██████▊   | 19843/29341 [03:39<04:17, 36.92it/s]

sandes, john saunders, john


 68%|██████▊   | 20072/29341 [03:42<02:48, 54.95it/s] 

rogers, robert c rogers, robert l


 69%|██████▊   | 20119/29341 [03:44<03:46, 40.65it/s]

davis, robert h davis, robert


 69%|██████▉   | 20244/29341 [03:46<02:39, 57.02it/s]

sheldon, charles m sheldon, charles mon


 69%|██████▉   | 20303/29341 [03:47<01:35, 94.84it/s]

de s??lincourt, hugh de sélincourt, hugh


 70%|███████   | 20654/29341 [03:54<02:35, 55.94it/s]

van loan, charles em van loan, charles e


 71%|███████   | 20721/29341 [03:55<02:55, 49.04it/s]

forrester, izola l.  forrester, izola l


 71%|███████   | 20741/29341 [03:55<02:23, 60.09it/s]

kernahan, coulson kernahan, coulson, 
curtis, alice turner curtis, alice (turne


 71%|███████▏  | 20941/29341 [03:58<01:50, 75.85it/s]

basset, sara ware bassett, sara ware


 72%|███████▏  | 21133/29341 [04:01<02:36, 52.56it/s]

botsford, charles al botsford, charles a.


 72%|███████▏  | 21181/29341 [04:02<01:53, 72.19it/s]

hubbard, bert hubbard, elbert


 72%|███████▏  | 21210/29341 [04:03<02:28, 54.87it/s]

bosch??re, jean de boschère, jean de


 73%|███████▎  | 21539/29341 [04:09<02:52, 45.12it/s]

woods, margaret l (m woods, margaret l. (


 74%|███████▍  | 21806/29341 [04:12<02:37, 47.93it/s]

bellamannn, henry bellamann, henry


 74%|███████▍  | 21844/29341 [04:13<01:34, 79.04it/s]

johnston, william johnston, william an


 75%|███████▌  | 22112/29341 [04:17<00:57, 124.75it/s]

oldridge, james aldridge, james


 77%|███████▋  | 22521/29341 [04:26<04:03, 28.05it/s] 

andersen nex,̜ marti andersen nexø, marti


 77%|███████▋  | 22581/29341 [04:28<03:37, 31.06it/s]

harris, john farris, john


 78%|███████▊  | 22761/29341 [04:36<03:19, 32.99it/s]

miller, kenneth millar, kenneth


 79%|███████▉  | 23110/29341 [04:50<02:56, 35.33it/s]

creasey, john casey, john


 79%|███████▉  | 23126/29341 [04:50<03:52, 26.69it/s]

holles, robert holmes, robert
sewell, elizabeth sewell, elizabeth mi


 79%|███████▉  | 23149/29341 [04:51<04:17, 24.00it/s]

davison, gladys davidson, gladys


 80%|███████▉  | 23422/29341 [05:04<04:06, 24.05it/s]

tanizaki, junʾichirō tanizaki, junʼichirō


 80%|████████  | 23507/29341 [05:08<04:26, 21.89it/s]

bowen, john brown, john


 80%|████████  | 23586/29341 [05:11<04:29, 21.34it/s]

nicole, christopher coe, christopher


 80%|████████  | 23597/29341 [05:12<04:15, 22.49it/s]

russ, martin ross, martin


 81%|████████  | 23675/29341 [05:15<03:55, 24.08it/s]

creasey, john casey, john


 81%|████████  | 23692/29341 [05:16<04:29, 20.96it/s]

walder, david walker, david


 82%|████████▏ | 23968/29341 [05:29<03:23, 26.39it/s]

corley, edwin corle, edwin
james, john jakes, john


 82%|████████▏ | 23986/29341 [05:30<04:48, 18.59it/s]

briley, john bailey, john


 82%|████████▏ | 24041/29341 [05:32<04:36, 19.13it/s]

green, philip geen, philip


 82%|████████▏ | 24192/29341 [05:39<03:44, 22.95it/s]

colter, cyrus cole, cyrus


 83%|████████▎ | 24285/29341 [05:43<03:05, 27.28it/s]

jones, glyn jones, gwyn


 83%|████████▎ | 24318/29341 [05:44<02:31, 33.10it/s]

streatfeild, noel streatfield, noel


 84%|████████▎ | 24508/29341 [05:51<02:59, 26.97it/s]

sayers, dorothy l sayers, dorothy l. (


 84%|████████▍ | 24629/29341 [05:56<03:26, 22.83it/s]

kennett, john bennett, john


 84%|████████▍ | 24641/29341 [05:56<02:54, 27.01it/s]

colter, cyrus cole, cyrus


 84%|████████▍ | 24743/29341 [06:00<02:40, 28.63it/s]

espino, frederico li espino, federico lic


 85%|████████▌ | 24960/29341 [06:08<02:56, 24.80it/s]

beck, warren a beck, warren


 85%|████████▌ | 24971/29341 [06:09<02:34, 28.28it/s]

clarke, austin clare, austin


 85%|████████▌ | 24992/29341 [06:10<02:44, 26.38it/s]

gardner, john e gardner, john


 86%|████████▌ | 25109/29341 [06:15<02:40, 26.29it/s]

swarthout, glendon f swarthout, glendon


 86%|████████▌ | 25185/29341 [06:18<01:46, 38.98it/s]

starling, thomas sterling, thomas


 86%|████████▌ | 25269/29341 [06:21<02:17, 29.58it/s]

arlen, michael j arlen, michael


 87%|████████▋ | 25398/29341 [06:26<02:11, 29.87it/s]

morice, anne rice, anne


 87%|████████▋ | 25585/29341 [06:33<03:16, 19.10it/s]

wilson, a. n wilson, a. j


 88%|████████▊ | 25722/29341 [06:38<01:19, 45.53it/s]

price, richard pryce, richard


 88%|████████▊ | 25748/29341 [06:39<02:24, 24.90it/s]

waten, judah l waten, judah


 89%|████████▉ | 26061/29341 [06:51<02:12, 24.79it/s]

clarke, a. m clarke, a. w


 90%|████████▉ | 26304/29341 [07:01<01:52, 26.88it/s]

hope, christopher coe, christopher


 90%|█████████ | 26419/29341 [07:05<01:47, 27.15it/s]

robinson, margaret a robinson, margaret b


 90%|█████████ | 26430/29341 [07:06<01:30, 32.27it/s]

henderson, robert anderson, robert


 90%|█████████ | 26470/29341 [07:08<02:00, 23.84it/s]

rosen, norma rosten, norman


 90%|█████████ | 26485/29341 [07:08<01:31, 31.28it/s]

morice, anne rice, anne


 91%|█████████ | 26584/29341 [07:13<02:00, 22.80it/s]

waten, judah l waten, judah


 91%|█████████ | 26620/29341 [07:14<01:26, 31.56it/s]

price, richard pryce, richard


 91%|█████████ | 26687/29341 [07:16<01:13, 35.92it/s]

brown, michael e brown, michael


 91%|█████████ | 26747/29341 [07:18<01:48, 23.97it/s]

wilson, a. n wilson, a. j


 91%|█████████▏| 26810/29341 [07:21<01:21, 30.91it/s]

lawson, robert g lawson, robert


 92%|█████████▏| 26862/29341 [07:23<01:37, 25.36it/s]

lodge, david dodge, david


 92%|█████████▏| 27084/29341 [07:32<01:07, 33.63it/s]

mahy, margaret mayo, margaret


 92%|█████████▏| 27117/29341 [07:34<01:43, 21.55it/s]

tallent, elizabeth allen, elizabeth


 93%|█████████▎| 27242/29341 [07:39<01:34, 22.31it/s]

demarinis, rick de marinis, rick


 94%|█████████▎| 27463/29341 [07:50<01:34, 19.91it/s]

tallent, elizabeth allen, elizabeth


 94%|█████████▍| 27653/29341 [07:59<01:19, 21.27it/s]

madsen, david madden, david


 94%|█████████▍| 27714/29341 [08:02<01:15, 21.50it/s]

lodge, david dodge, david


 95%|█████████▍| 27780/29341 [08:05<01:02, 24.83it/s]

roth, henry h roth, henry


 95%|█████████▍| 27833/29341 [08:07<01:06, 22.62it/s]

fisher, mark fisher, mary


 97%|█████████▋| 28382/29341 [08:35<00:36, 26.54it/s]

smith, j. p smith, j. f


 97%|█████████▋| 28465/29341 [08:40<00:43, 20.37it/s]

winder, robert wilder, robert


 97%|█████████▋| 28507/29341 [08:42<00:51, 16.25it/s]

hickling, r. h hickling, r. a


 97%|█████████▋| 28582/29341 [08:45<00:38, 19.81it/s]

guérard, albert j (a guérard, albert j. (


 98%|█████████▊| 28660/29341 [08:49<00:31, 21.66it/s]

robinson, peter robins, peter


 98%|█████████▊| 28663/29341 [08:49<00:32, 21.15it/s]

roberts, ken roberts, kevin


 98%|█████████▊| 28669/29341 [08:50<00:35, 18.77it/s]

suárez, virgil suarez, virgil


 98%|█████████▊| 28681/29341 [08:50<00:39, 16.79it/s]

stewart, jean stewart, sean


 98%|█████████▊| 28746/29341 [08:54<00:30, 19.46it/s]

enriquez, antonio enriquez, antonio re


 99%|█████████▊| 28905/29341 [09:03<00:25, 17.17it/s]

bayley, john bailey, john


 99%|█████████▉| 29064/29341 [09:12<00:14, 19.52it/s]

carter, william carpenter, william


 99%|█████████▉| 29074/29341 [09:12<00:15, 17.17it/s]

anderson, david d anderson, david


100%|██████████| 29341/29341 [09:27<00:00, 51.73it/s]


In [95]:
# Sanity check: count the entries in newstandards. It is assigned as a column
# of meta further down, so this should equal the number of rows in meta.
len(newstandards)

29341

In [96]:
# (rows, columns) of newcomplete, the frame written to 'allmatchedbooks.tsv'
# in the next cell.
newcomplete.shape

(5519, 16)

In [98]:
# Write newcomplete to disk as a tab-separated file; the row index is not saved.
newcomplete.to_csv('allmatchedbooks.tsv', sep='\t', index=False)

In [99]:
# Number of distinct values in newstandards, i.e. how many different
# standardized author strings were produced.
len(set(newstandards))

12370

In [100]:
# Number of distinct raw hathi_author strings, for comparison with the count
# of distinct standardized names in the previous cell.
# NOTE(review): the comparison is not quite like-for-like. hathi_author keeps
# its original capitalisation, while the standard_auth values displayed by
# meta.tail() are lower-case, so part of the difference may come from
# case-folding rather than from merging variant spellings - confirm.
len(set(meta.hathi_author))

12532

In [101]:
# (rows, columns) of meta before the standard_auth column is attached.
meta.shape

(29341, 208)

In [102]:
# Preview the first rows of newcomplete, including its standard_auth column.
# NOTE(review): in the output below, several standard_auth values look cut off
# at 20 characters (e.g. 'lippincott, joseph w'); presumably that comes from
# the matching step earlier in the notebook - confirm it is intentional.
newcomplete.head()

Unnamed: 0,docid,tm_author,tm_title,tm_year,review_year,rownum,review_title,prize_author,bestseller_year,is_bestseller,obscure,firstpub,birthyear,us_national,lowest_date,standard_auth
0,uc1.b3347126,"shaw, irwin","welcome to the city, and other",1942.0,1942.0,3289.0,"welcome to the city, and other stories.",False,,False,False,1942,1913.0,True,1942,"shaw, irwin"
1,uc1.$b322688,"bennett, arnold",woman who stole everything,1927.0,1927.0,4140.0,"woman who stole everything, and other stories.",False,,False,False,1927,1867.0,False,1927,"bennett, arnold"
2,uc1.b3407035,"lippincott, joseph wharton",wilderness champion; the story,1944.0,1944.0,3481.0,wilderness champion; the story of a great houn...,False,,False,False,1944,1887.0,True,1944,"lippincott, joseph w"
3,uc1.$b116192,"becker, may lamberton",golden tales of the southwest;,1939.0,1939.0,4283.0,golden tales of the southwest; selected with a...,False,,False,False,1939,1873.0,True,1939,"becker, may lamberto"
4,mdp.39015059415383,"miller, alice duer, mrs",instruments,1926.0,1926.0,2405.0,"instruments of darkness, and other stories.",False,,False,False,1926,1874.0,True,1926,"miller, alice duer,"


In [103]:
# Attach the standardized author names to meta as a new column.
# This must run before meta is filtered by firstpub below: len(newstandards)
# and meta both show 29341 rows at this point, and the filter shrinks meta.
# NOTE(review): assumes newstandards is in the same row order as meta - confirm.
meta['standard_auth'] = newstandards

In [104]:
# Spot-check the last rows: standard_auth should correspond to hathi_author.
meta.tail()

Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,...,t199,docid,birthyear,firstpub,hathi_author,hathi_title,us_national,authof3ormore,age,standard_auth
29336,0.004679,6e-06,0.000409,0.046729,0.000439,6e-06,3e-06,1e-05,0.001933,0.052563,...,2e-06,mdp.39015043059172,1932.0,1999,"Smithies, Michael",Gulfs of Thailand : a collection of short stories,False,False,67.0,"smithies, michael"
29337,0.000111,0.001208,0.000319,0.01009,0.000717,0.000409,5e-06,1.6e-05,1.4e-05,0.000113,...,0.000703,mdp.39015047842185,,1999,"Claxton, William",Laugh : portraits of the greatest comedians an...,False,False,,"claxton, william"
29338,5e-05,2.7e-05,0.001393,0.000329,0.009165,0.000376,0.00016,1.9e-05,0.000131,0.006394,...,8.1e-05,mdp.39015051894841,,1999,"Keyes, Marian",Last chance saloon,False,False,,"keyes, marian"
29339,0.000611,0.002533,0.007632,0.003172,0.007383,0.000112,2e-06,0.000187,0.000292,0.000292,...,2e-06,mdp.39015047609089,,1999,"Mayo, Wendell",B. Horror : and other stories,False,False,,"mayo, wendell"
29340,0.000164,6e-06,0.004271,4e-06,0.027734,6e-06,3e-06,0.003068,0.000576,0.000198,...,2e-06,mdp.39015046006196,,1999,"Vega, Suzanne",The passionate eye : the collected writing of ...,False,False,,"vega, suzanne"


In [105]:
# Keep only volumes whose firstpub is strictly after 1885 and strictly before
# 1977. meta is rebound to the filtered frame because later cells use that name.
published_in_window = meta.firstpub.gt(1885) & meta.firstpub.lt(1977)
meta = meta.loc[published_in_window, :]
meta.shape

(22850, 209)

In [106]:
# Write meta (now including standard_auth) to disk as a tab-separated file;
# the row index is not saved.
meta.to_csv('topicdatastandardauths.tsv', sep='\t', index=False)

In [107]:
# Bucket newcomplete by 'lowest_date': each key maps to the sub-frame of rows
# sharing that value. Written as an explicit loop so that `year` and `df`
# stay bound at notebook scope afterwards, exactly as before.
yeardict = {}

for year, df in newcomplete.groupby('lowest_date'):
    yeardict[year] = df

In [108]:
# Number of groups, i.e. distinct 'lowest_date' values in newcomplete.
# Rows with a missing lowest_date, if any, are not represented, because
# groupby skips NaN keys by default.
len(yeardict)

35

In [109]:
# Drop the name: in the cells visible here the dict is only used for the count
# above, and deleting it removes this reference to the per-date sub-frames.
del yeardict