# Cleaning Data with Pandas
## Reading the data

### NOTE: Missing Values in Date fields with MongoDB
When you try to save a DataFrame that includes a columnn of datetime values and there
are some of those values missing, pymongo gives you a "ValueError: NaTType does not support utcoffset".  This is because for type datetype, it first calls utcoffset on each value, but the NaT (Not a Time -- designator for missing values in datetime columns) does not support having utcoffset called on it.

To solve this, I am going to modify the "dataframe_to_mongo()" function to test for any columns of datetime type with missing values.  If it finds any of those, it will save those missing value rows separately and save them without the datetime column.

The key to understanding this solution is that MongoDB is a NoSQL database, so all the rows do __not__ have to have the same fields -- it is open structure and there is no table schema like in SQL databases.  So I will save the rows that have the datetime value so that they _include_ the datetime column, but save the rows that are _missing_ the datetime value so that they _exclude_ the datetime column.  That way, I will avoid the ValueError.

In [143]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication'''
    if username and password:
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(username, password, host, db_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query={}, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    df = pd.DataFrame(list(cursor))
    if no_id:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''save a dataframe to mongodb collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    records = dframe.to_dict('records')  # 'records' puts it into our list-of-dicts format
    db[collection_name].insert_many(records)


DB_NOBEL_PRIZE = 'nobel_prize' # use string constants or a spell error in retrieval will create new table.
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

with open('data/nwinners_raw.json') as f:
    df = pd.read_json(f)

# Make sure mongodb is clear (so we don't duplicate data), then
# save to Mongo for next section
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # deletes everything (no filter)
dataframe_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------
if False:
    df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)


## Inspecting the data

In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1089 entries, 0 to 1088
Data columns (total 12 columns):
born_in           1089 non-null object
category          1089 non-null object
country           1089 non-null object
date_of_birth     1079 non-null object
date_of_death     716 non-null object
gender            1079 non-null object
link              1089 non-null object
name              1089 non-null object
place_of_birth    1079 non-null object
place_of_death    716 non-null object
text              1089 non-null object
year              1089 non-null int64
dtypes: int64(1), object(11)
memory usage: 110.6+ KB


In [145]:
df.describe() # only acts on numeric columns
df.describe(include=['object']) # include specifies other types to include (besides numeric)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,1089.0,1089,1089,1079,716,1079,1089,1089,1079,716,1089
unique,35.0,7,59,879,590,2,925,1019,615,314,1078
top,,Physiology or Medicine,United States,7 November 1867,4 July 1934,male,https://en.wikipedia.org/wiki/Marie_Curie,César Milstein,New York City,Cambridge,"Bernardo Houssay , Physiology or Medicine, 1947"
freq,955.0,259,359,4,4,1021,4,3,43,37,2


In [146]:
df.tail()
df.head(3)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physiology or Medicine,Summary,8 October 1927,24 March 2002,male,https://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,César Milstein,Bahía Blanca,Cambridge,"César Milstein , Physiology or Medicine, 1984",1984
1,Austria,Physiology or Medicine,,20 November 1886,12 June 1982,male,https://en.wikipedia.org/wiki/Karl_von_Frisch,Karl von Frisch *,Vienna,Munich,"Karl von Frisch *, Physiology or Medicine, 1973",1973
2,,Physiology or Medicine,Austria,7 November 1903,27 February 1989,male,https://en.wikipedia.org/wiki/Konrad_Lorenz,Konrad Lorenz,Vienna,Vienna,"Konrad Lorenz , Physiology or Medicine, 1973",1973


## Set indices (optional, but useful)

In [147]:
print(df.columns) # the column index
df = df.set_index('name') # row index (note we need to assign to df)
df.loc['Albert Einstein'] # loc accesses by label index

Index([u'born_in', u'category', u'country', u'date_of_birth', u'date_of_death',
       u'gender', u'link', u'name', u'place_of_birth', u'place_of_death',
       u'text', u'year'],
      dtype='object')


Unnamed: 0_level_0,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albert Einstein,,Physics,Switzerland,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton borough,"Albert Einstein , born in Germany , Physics, ...",1921
Albert Einstein,,Physics,Germany,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton borough,"Albert Einstein , Physics, 1921",1921


In [148]:
df.reset_index(inplace=True) # go back to default int index.  Note the 'inplace'
df.iloc[2] # iloc accesses by integer index
df.head(2)

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
0,César Milstein,,Physiology or Medicine,Summary,8 October 1927,24 March 2002,male,https://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,Bahía Blanca,Cambridge,"César Milstein , Physiology or Medicine, 1984",1984
1,Karl von Frisch *,Austria,Physiology or Medicine,,20 November 1886,12 June 1982,male,https://en.wikipedia.org/wiki/Karl_von_Frisch,Vienna,Munich,"Karl von Frisch *, Physiology or Medicine, 1973",1973


In [149]:
df[0:10] #first 10 rows
df[-4:]  #last 4 rows

# masks
mask = df.year > 2000
df[mask]
df[df.year>2000] # more direct

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
6,Elfriede Jelinek,,Literature,Austria,20 October 1946,,female,https://en.wikipedia.org/wiki/Elfriede_Jelinek,Mürzzuschlag,,"Elfriede Jelinek , Literature, 2004",2004
7,International Atomic Energy Agency,,Peace,Austria,,,,https://en.wikipedia.org/wiki/International_At...,,,"International Atomic Energy Agency , Peace, 2005",2005
8,Martin Karplus *,Austria,Chemistry,,15 March 1930,,male,https://en.wikipedia.org/wiki/Martin_Karplus,Vienna,,"Martin Karplus *, Chemistry, 2013",2013
16,Tawakkol Karman,,Peace,Yemen,7 February 1979,,female,https://en.wikipedia.org/wiki/Tawakkol_Karman,Ta'izz,,"Tawakkol Karman , Peace, 2011",2011
281,Leland H. Hartwell,,Physiology or Medicine,United States,30 October 1939,,male,https://en.wikipedia.org/wiki/Leland_H._Hartwell,Los Angeles,,"Leland H. Hartwell , Physiology or Medicine, 2001",2001
282,Carl E. Wieman,,Physics,United States,26 March 1951,,male,https://en.wikipedia.org/wiki/Carl_Wieman,Corvallis,,"Carl E. Wieman , Physics, 2001",2001
283,Eric A. Cornell,,Physics,United States,19 December 1961,,male,https://en.wikipedia.org/wiki/Eric_Allin_Cornell,Eilat israel,,"Eric A. Cornell , Physics, 2001",2001
284,A. Michael Spence,,Economics,United States,7 November 1943,,male,https://en.wikipedia.org/wiki/A._Michael_Spence,Montclair,,"A. Michael Spence , Economics, 2001",2001
285,George A. Akerlof,,Economics,United States,17 June 1940,,male,https://en.wikipedia.org/wiki/George_A._Akerlof,New Haven,,"George A. Akerlof , Economics, 2001",2001
286,Joseph E. Stiglitz,,Economics,United States,9 February 1943,,male,https://en.wikipedia.org/wiki/Joseph_E._Stiglitz,Gary,,"Joseph E. Stiglitz , Economics, 2001",2001


## Cleaning the Data
### Remove the asterisk from names

People that got the award in a different country than they were born in were entered two times in the original wikipedia page where we scraped this data.  An example is John Cornforth who won the award in the UK, but was born in Australia.  Look at his records:

In [150]:
df[df.name.str.contains('Cornforth')]

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
14,John Cornforth *,Australia,Chemistry,,7 September 1917,8 December 2013,male,https://en.wikipedia.org/wiki/John_Cornforth,Sydney,Brighton,"John Cornforth *, Chemistry, 1975",1975
449,John Cornforth,,Chemistry,United Kingdom,7 September 1917,8 December 2013,male,https://en.wikipedia.org/wiki/John_Cornforth,Sydney,Brighton,"John Cornforth , born in Australia , Chemistr...",1975


In [151]:
# How many names with asterisk?
df[df.name.str.contains('\*')]['name'].count()

134

The entry that had an asterisk by the name was an entry in the country the person was born in as opposed to the country from which they received the award.  In these cases, we need to get rid of the duplicate entry but also enter the born_in field in the original entry without the asterisk.

In [152]:
# Take out the asterisks in people's name
df.name = df.name.str.replace('*', '')
df.name = df.name.str.strip() # strip any remaining whitespace at end
df[df.name.str.contains('\*')]['name'].count()

0

### Eliminating duplications for people with a born_in entry

In [153]:
set(df.born_in.apply(type))

{unicode}

In [154]:
bornin_col = df.born_in # or df['born_in']

# if you look at bornin_col, most entries are an empty string
# replace empty strings with NaN
bornin_col.replace('', np.nan, inplace=True)
bornin_col

0             NaN
1         Austria
2             NaN
3             NaN
4         Austria
5         Austria
6             NaN
7             NaN
8         Austria
9             NaN
10            NaN
11            NaN
12            NaN
13            NaN
14      Australia
15            NaN
16            NaN
17            NaN
18            NaN
19            NaN
20            NaN
21            NaN
22            NaN
23            NaN
24            NaN
25            NaN
26            NaN
27            NaN
28            NaN
29            NaN
          ...    
1059          NaN
1060          NaN
1061          NaN
1062          NaN
1063    Australia
1064          NaN
1065          NaN
1066          NaN
1067          NaN
1068          NaN
1069          NaN
1070          NaN
1071          NaN
1072          NaN
1073          NaN
1074          NaN
1075          NaN
1076          NaN
1077          NaN
1078          NaN
1079          NaN
1080          NaN
1081          NaN
1082          NaN
1083      

In [156]:
# now do that NaN replacement for all '' entries in the dataframe
df.replace('',np.nan, inplace=True)

### DANNY - new section to transfer born_in entries into the duplicate row of same name which has a country entry but no born_in entry

In [157]:
df[df.name.str.contains('Cornforth')]

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
14,John Cornforth,Australia,Chemistry,,7 September 1917,8 December 2013,male,https://en.wikipedia.org/wiki/John_Cornforth,Sydney,Brighton,"John Cornforth *, Chemistry, 1975",1975
449,John Cornforth,,Chemistry,United Kingdom,7 September 1917,8 December 2013,male,https://en.wikipedia.org/wiki/John_Cornforth,Sydney,Brighton,"John Cornforth , born in Australia , Chemistr...",1975


In [158]:
df[df.name.str.contains('Leonid Hurwicz')]

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
327,Leonid Hurwicz,,Economics,United States,21 August 1917,24 June 2008,male,https://en.wikipedia.org/wiki/Leonid_Hurwicz,Moscow,Minneapolis,"Leonid Hurwicz , born in Russia , Economics, ...",2007
615,Leonid Hurwicz,Russia and Soviet Union,Economics,,21 August 1917,24 June 2008,male,https://en.wikipedia.org/wiki/Leonid_Hurwicz,Moscow,Minneapolis,"Leonid Hurwicz *, Economics, 2007",2007
638,Leonid Hurwicz,Poland,Economics,,21 August 1917,24 June 2008,male,https://en.wikipedia.org/wiki/Leonid_Hurwicz,Moscow,Minneapolis,"Leonid Hurwicz *, born in then Russian Republ...",2007


In [159]:
# an example of join from a stackoverflow answer
# https://stackoverflow.com/questions/39816671/pandas-populate-new-dataframe-column-based-on-matching-columns-in-another-datafr
example_df = pd.DataFrame(dict(
        AUTHOR_NAME=list('AAABBCCCCDEEFGG'),
        title=      list('zyxwvutsrqponml')
    ))

example_df2 = pd.DataFrame(dict(
        AUTHOR_NAME=list('AABCCEGG'),
        title      =list('zwvtrpml'),
        CATEGORY   =list('11223344')
    ))
cols = ['AUTHOR_NAME', 'title']
example_df.join(example_df2.set_index(cols), on=cols, how='left')

Unnamed: 0,AUTHOR_NAME,title,CATEGORY
0,A,z,1.0
1,A,y,
2,A,x,
3,B,w,
4,B,v,2.0
5,C,u,
6,C,t,2.0
7,C,s,
8,C,r,3.0
9,D,q,


In [160]:
df_born_in = df[df.born_in.notnull()] # save off the entries with born_in
df_test = df[df.born_in.isnull()] # delete the rows with born_in (by saving only isnull())
df_test = df_test.drop('born_in', axis=1)
match_cols = ['name', 'year', 'category']
df_born_in = df_born_in.set_index(match_cols).loc[:,'born_in']
df_born_in

name                        year  category              
Karl von Frisch             1973  Physiology or Medicine                   Austria
Walter Kohn                 1998  Chemistry                                Austria
Eric Kandel                 2000  Physiology or Medicine                   Austria
Martin Karplus              2013  Chemistry                                Austria
John Cornforth              1975  Chemistry                              Australia
T. S. Eliot                 1948  Literature                         United States
Har Gobind Khorana          1968  Physiology or Medicine             United States
Ben Roy Mottelson           1975  Physics                            United States
Czesław Miłosz              1980  Literature                         United States
Subrahmanyan Chandrasekhar  1983  Physics                            United States
Daniel Kahneman             2002  Economics                          United States
Robert Aumann               20

In [161]:
df_born_in = df[df.born_in.notnull()] # save off the entries with born_in
df = df[df.born_in.isnull()] # delete the rows with born_in (by saving only isnull())
df = df.drop('born_in', axis=1) #drop the born_in column
# note, if you didn't drop the born_in column in df, then the join below
# would have overlapping columns.  With overlapping columns, you must supply
# either an rsuffix or lsuffix so it can specify which of the overlapping columns
# came from which side.  To avoid that, just drop the overlapping column in df and
# specify only the born_in column in df_born_in inside the join statement.

match_cols = ['name', 'year', 'category']
newdf = df.join(df_born_in.set_index(match_cols).loc[:,'born_in'], on=match_cols, sort=True)


In [162]:
newdf[newdf.name.str.contains('Cornforth')]

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year,born_in
449,John Cornforth,Chemistry,United Kingdom,7 September 1917,8 December 2013,male,https://en.wikipedia.org/wiki/John_Cornforth,Sydney,Brighton,"John Cornforth , born in Australia , Chemistr...",1975,Australia


In [163]:
newdf.head()

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year,born_in
284,A. Michael Spence,Economics,United States,7 November 1943,,male,https://en.wikipedia.org/wiki/A._Michael_Spence,Montclair,,"A. Michael Spence , Economics, 2001",2001,
1020,Aage Bohr,Physics,Denmark,19 June 1922,8 September 2009,male,https://en.wikipedia.org/wiki/Aage_Bohr,Copenhagen,Copenhagen,"Aage Bohr , Physics, 1975",1975,
766,Aaron Ciechanover,Chemistry,Israel,1 October 1947,,male,https://en.wikipedia.org/wiki/Aaron_Ciechanover,Haifa,,"Aaron Ciechanover , Chemistry, 2004",2004,
477,Aaron Klug,Chemistry,United Kingdom,11 August 1926,,male,https://en.wikipedia.org/wiki/Aaron_Klug,Želva,,"Aaron Klug , born in Lithuania , Chemistry, 1982",1982,Lithuania
579,Aaron Klug,Chemistry,South Africa,11 August 1926,,male,https://en.wikipedia.org/wiki/Aaron_Klug,Želva,,"Aaron Klug , Chemistry, 1982",1982,Lithuania


## Back to book flow

In [164]:
# Rows with born_in are duplicate entries, so remove all rows
# which are NaN in bornin_col
#df = df[df.born_in.isnull()]
print(df.count()) # note that born_in count is now 0
#df = df.drop('born_in', axis=1) # drop the born_in column

name              955
category          950
country           955
date_of_birth     945
date_of_death     629
gender            945
link              955
place_of_birth    945
place_of_death    629
text              955
year              955
dtype: int64


### Finding other duplicates
Internet search shows that 889 people and organizations have received Nobel prize, but our data set contains more, so let's look for more duplications

In [165]:
dupes_by_name = df[df.duplicated('name')]
dupes_by_name.count()

name              59
category          59
country           59
date_of_birth     58
date_of_death     36
gender            58
link              59
place_of_birth    58
place_of_death    36
text              59
year              59
dtype: int64

In [166]:
# By default, 'duplicated' gives the first occurance of a duplicated item
# but it has an option 'keep="last"'.  Using both options, we can get a full
# list of all the duplicated items
all_dupes = df[df.duplicated('name') | df.duplicated('name', keep='last')]
all_dupes.count()

# we could have also done this in the following way
# using the 'isin' method
all_dupes = df[df.name.isin(dupes_by_name.name)]
all_dupes.count()

# we could have also done this in the following way
# groupby returns iterator of (groupname, dataframeofrows) tuples, e.g.
for name, rows in df.groupby('name'):
    if len(rows)>1:
        print('name: {}, number of rows: {}'.format(name.encode('utf-8'), len(rows)))
# use groupby as follows:
pd.concat([g[['name','category','country','year']] for _, g in df.groupby('name') if len(g) > 1])

name: Aaron Klug, number of rows: 2
name: Adolfo Pérez Esquivel, number of rows: 2
name: Albert Einstein, number of rows: 2
name: Angus Deaton, number of rows: 2
name: Aziz Sancar, number of rows: 2
name: Baruj Benacerraf, number of rows: 2
name: Bernardo Houssay, number of rows: 2
name: Carlos Saavedra Lamas, number of rows: 2
name: Charles K. Kao, number of rows: 2
name: César Milstein, number of rows: 3
name: Daniel Bovet, number of rows: 2
name: Dennis Gabor, number of rows: 2
name: Ei-ichi Negishi, number of rows: 2
name: Emilio Segrè, number of rows: 2
name: Eugene Wigner, number of rows: 2
name: F. Duncan M. Haldane, number of rows: 2
name: Felix Bloch, number of rows: 2
name: Franco Modigliani, number of rows: 2
name: Fraser Stoddart, number of rows: 2
name: Frederick Sanger, number of rows: 2
name: Friedrich Hayek, number of rows: 2
name: Georg von Békésy, number of rows: 2
name: George Andrew Olah, number of rows: 2
name: Georges Charpak, number of rows: 2
name: Gérard Debreu

Unnamed: 0,name,category,country,year
477,Aaron Klug,Chemistry,United Kingdom,1982
579,Aaron Klug,Chemistry,South Africa,1982
1085,Adolfo Pérez Esquivel,Peace,Argentina,1980
1086,Adolfo Pérez Esquivel,Peace,Summary,1980
535,Albert Einstein,Physics,Switzerland,1921
809,Albert Einstein,Physics,Germany,1921
376,Angus Deaton,Economics,United States,2015
497,Angus Deaton,Economics,United Kingdom,2015
374,Aziz Sancar,Chemistry,United States,2015
509,Aziz Sancar,Chemistry,Turkey,2015


In [167]:
# Sort the data now
# here is how sort_values works:
df2 = pd.DataFrame(\
                  {'name':['zak', 'alice', 'bob', 'mike',
                          'bob', 'bob'],
                  'score':[4, 3, 5, 2, 3, 7]})
df2 = df2.sort_values(['name', 'score'], ascending=[1,0])
print(df2)
# Now use it on all_dupes
all_dupes.sort_values('name')[['name', 'country', 'year']]

    name  score
1  alice      3
5    bob      7
2    bob      5
4    bob      3
3   mike      2
0    zak      4


Unnamed: 0,name,country,year
579,Aaron Klug,South Africa,1982
477,Aaron Klug,United Kingdom,1982
1086,Adolfo Pérez Esquivel,Summary,1980
1085,Adolfo Pérez Esquivel,Argentina,1980
809,Albert Einstein,Germany,1921
535,Albert Einstein,Switzerland,1921
376,Angus Deaton,United States,2015
497,Angus Deaton,United Kingdom,2015
374,Aziz Sancar,United States,2015
509,Aziz Sancar,Turkey,2015


In [168]:
# This output shows that some recipients are in our dataset
# twice for the same year, but different countries.

# Change Curie's country to France for the 1911 entry
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'

df.drop(df[(df.name=='Sidney Altman') & (df.year==1990)].index, inplace=True)
# could have done this the following way as well:
df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # ~ is the logical 'not'


In [100]:
# define function for doing these cleanup activities
def clean_data(df):
    df = df.replace('', np.nan)
    df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
    df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    return df

df = clean_data(df)

AttributeError: 'DataFrame' object has no attribute 'born_in'

In [169]:
# get rid of half of the dual country winners so each prize just counts once.
# to be fair, randomize the rows first so that country deleted is random.
df = df.reindex(np.random.permutation(df.index)) # randomize the index
# drop duplicate rows (both 'name' and 'year' the same) after the first encountered
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index() # go back to unrandomized index
df.count()

name              902
category          898
country           902
date_of_birth     893
date_of_death     599
gender            893
link              902
place_of_birth    893
place_of_death    599
text              902
year              902
dtype: int64

In [170]:
# lest see how many duplicate winners we have now (some are valid)
df[df.duplicated('name') | df.duplicated('name', keep='last')]\
    .sort_values(by='name')[['name', 'country', 'year', 'category']]
    
# The output are the 4 valid multi-award winners!

Unnamed: 0,name,country,year,category
439,Frederick Sanger,United Kingdom,1958,Chemistry
457,Frederick Sanger,United Kingdom,1980,Chemistry
82,John Bardeen,United States,1956,Physics
117,John Bardeen,United States,1972,Physics
75,Linus C. Pauling,United States,1954,Chemistry
83,Linus C. Pauling,United States,1962,Peace
896,Marie Curie,France,1903,Physics
905,Marie Curie,France,1911,Chemistry
648,Marie Skłodowska-Curie,Poland,1903,Physics
649,Marie Skłodowska-Curie,France,1911,Chemistry


In [171]:
df[df.year.isin([2016, 2017])] #my total differs from book, so look at recent winners...

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
377,John M. Kosterlitz,Physics,United States,22 June 1943,,male,https://en.wikipedia.org/wiki/John_M._Kosterlitz,Aberdeen,,"John M. Kosterlitz , born in United Kingdom ,...",2016
381,Bob Dylan,Literature,United States,24 May 1941,,male,https://en.wikipedia.org/wiki/Bob_Dylan,Duluth,,"Bob Dylan , Literature, 2016",2016
382,Jeffrey C. Hall,Physiology or Medicine,United States,3 May 1945,,male,https://en.wikipedia.org/wiki/Jeffrey_C._Hall,Brooklyn,,"Jeffrey C. Hall , Physiology or Medicine, 2017",2017
383,Michael Rosbash,Physiology or Medicine,United States,7 March 1944,,male,https://en.wikipedia.org/wiki/Michael_Rosbash,Kansas City,,"Michael Rosbash , Physiology or Medicine, 2017",2017
385,Michael W. Young,Physiology or Medicine,United States,28 March 1949,,male,https://en.wikipedia.org/wiki/Michael_W._Young,Miami,,"Michael W. Young , Physiology or Medicine, 2017",2017
386,Barry Barish,Physics,United States,27 January 1936,,male,https://en.wikipedia.org/wiki/Barry_Barish,Omaha,,"Barry Barish , Physics, 2017",2017
387,Kip Thorne,Physics,United States,1 June 1940,,male,https://en.wikipedia.org/wiki/Kip_Thorne,Logan,,"Kip Thorne , Physics, 2017",2017
388,Rainer Weiss,Physics,United States,29 September 1932,,male,https://en.wikipedia.org/wiki/Rainer_Weiss,Berlin,,"Rainer Weiss , born in Germany , Physics, 2017",2017
389,Joachim Frank,Chemistry,United States,12 September 1940,,male,https://en.wikipedia.org/wiki/Joachim_Frank,Weidenau,,"Joachim Frank , born in Germany , Chemistry, ...",2017
390,Richard H. Thaler,Economics,United States,12 September 1945,,male,https://en.wikipedia.org/wiki/Richard_H._Thaler,East Orange,,"Richard H. Thaler , Economics, 2017",2017


## Dealing with missing fields
First: missing categories

In [172]:
print(df.count())  # we are missing some categories and genders
df.category.unique() # recall when we scraped, we checked against this limited list.

name              902
category          898
country           902
date_of_birth     893
date_of_death     599
gender            893
link              902
place_of_birth    893
place_of_death    599
text              902
year              902
dtype: int64


array([u'Physiology or Medicine', u'Literature', u'Peace', u'Physics',
       u'Economics', u'Chemistry', nan], dtype=object)

In [179]:
# look at the missing categories
df[df.category.isnull()][['name', 'text', 'link', 'category']]

Unnamed: 0,name,text,link,category
505,Selman Waksman,"Selman Waksman , Physiology and Medicine, 1952",https://en.wikipedia.org/wiki/Selman_Waksman,


In [174]:
df.loc[df.name == u'Leopold Ružička born in Kingdom of Hungary', ['name', 'text', 'category']] # doesn't work

Unnamed: 0,name,text,category


In [175]:
print(u'Leopold Ružička born in Kingdom of Hungary'.encode('ascii', errors='backslashreplace'))
print(u'Róbert Bárány'.encode('ascii', errors='backslashreplace'))
df.loc[df.name==u'Leopold Ru\u017ei\u010dka born in Kingdom of Hungary', ['name', 'text', 'category']] # still doesn't work

Leopold Ru\u017ei\u010dka born in Kingdom of Hungary
R\xf3bert B\xe1r\xe1ny


Unnamed: 0,name,text,category


In [176]:
df.loc[df.name.str.startswith('Leopold Ru'), :] # works!

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
537,Leopold Ružička,Chemistry,Switzerland,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Vukovar,Mammern,"Leopold Ružička , born in then Austria-Hungar...",1939


In [177]:
df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny')]

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
1049,Róbert Bárány,Physiology or Medicine,Austria,22 April 1876,8 April 1936,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , Physiology or Medicine, 1914",1914


In [178]:
def clean_data(df):
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
        df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    # the missing categories are mostly people whose category name wasn't in standard format
    # fix that:
    df.loc[df.name=='Selman Waksman', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Ilya Ilyich Mechnikov', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Amartya Sen', 'category'] = 'Economics'
    df.drop(df.loc[df.name.str.startswith('Leopold Ru') & (df.year != 1939)].index, inplace=True)
    df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny'), 'category'] = 'Physiology or Medicine'
    return df

df = clean_data(df)

Now, missing gender

In [180]:
df[df['gender'].isnull()][['name', 'year', 'category']]

Unnamed: 0,name,year,category
7,International Atomic Energy Agency,2005,Peace
57,American Friends Service Committee (The Quakers),1947,Peace
424,Friends Service Council,1947,Peace
469,Amnesty International,1977,Peace
527,Tunisian National Dialogue Quartet,2015,Peace
550,Médecins Sans Frontières,1999,Peace
694,Organisation for the Prohibition of Chemical W...,2013,Peace
1050,Pugwash Conferences on Science and World Affairs,1995,Peace
1057,Institut de Droit International,1904,Peace


In [113]:
def clean_data(df):
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
        df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    # the missing categories are mostly people whose category name wasn't in standard format
    # fix that:
    df.loc[df.name=='Selman Waksman', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Ilya Ilyich Mechnikov', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Amartya Sen', 'category'] = 'Economics'
    df.drop(df.loc[df.name.str.startswith('Leopold Ru') & (df.year != 1939)].index, inplace=True)
    df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny'), 'category'] = 'Physiology or Medicine'
    # drop null gender entries (they are all institutions/organizations)
    df = df[df.gender.notnull()] # remove genderless entries
    return df

df = clean_data(df)

## Dealing with Times and Dates

In [181]:
df.loc[:20, ['name', 'date_of_birth', 'date_of_death']]

Unnamed: 0,name,date_of_birth,date_of_death
2,Konrad Lorenz,7 November 1903,27 February 1989
6,Elfriede Jelinek,20 October 1946,
7,International Atomic Energy Agency,,
9,William Lawrence Bragg,31 March 1890,1 July 1971
10,Howard Florey,24 September 1898,21 February 1968
11,Sir Frank Macfarlane Burnet,3 September 1899,31 August 1985
12,John Carew Eccles,27 January 1903,2 May 1997
13,Patrick White,28 May 1912,30 September 1990
15,John Harsanyi,29 May 1920,9 August 2000
16,Tawakkol Karman,7 February 1979,


In [182]:
pd.to_datetime(df.date_of_birth, errors='raise') # no errors
pd.to_datetime(df.date_of_death, errors='raise')


2      1989-02-27
6             NaT
7             NaT
9      1971-07-01
10     1968-02-21
11     1985-08-31
12     1997-05-02
13     1990-09-30
15     2000-08-09
16            NaT
17     1990-10-13
18     2011-08-02
19     1919-01-06
20     1931-05-09
21     1937-02-07
22     1928-04-02
23     1924-02-03
24     1953-12-19
25     1951-04-23
26     1962-03-15
27     1937-12-21
28     1951-01-10
29     1947-12-07
30     1935-05-21
31     1957-08-16
32     1945-12-04
33     1976-02-01
34     1987-10-09
35     1950-02-25
36     1981-01-05
          ...    
1050          NaT
1051          NaT
1052   1914-06-21
1053          NaT
1054          NaT
1057          NaT
1058   1912-10-06
1060   1949-05-06
1061   1943-05-14
1062   1961-04-06
1064   1968-07-18
1065          NaT
1066   1987-12-02
1067   1969-01-30
1068   1983-05-22
1069          NaT
1071          NaT
1072          NaT
1074   1921-05-05
1075   1940-09-27
1076   1943-06-26
1077   1961-01-04
1078   1964-12-17
1079   1958-12-15
1080      

In [183]:
# if there was an error raised, this is one way to know which row caused it:
for i, row in df.iterrows():
    try:
        pd.to_datetime(row.date_of_death, errors='raise')
    except:
        print('{}({}, {})'.format(row.date_of_death.ljust(30), row['name'], i))

#

with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death', 'name']]

Unnamed: 0,category,date_of_death,name


___
# FINAL
# COPY OF GETTING THE DIRTY DATA:
## Note, this has mongo functions with DAY mods for datetimes


In [200]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication'''
    if username and password:
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(username, password, host, db_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query={}, host='localhost',
                       port=27017, username=None, password=None, no_id=True, datecols=None):
    '''Create a Pandas DataFrame from MongoDB collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    df = pd.DataFrame(list(cursor))
    if no_id:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None, datecols=None):
    '''save a dataframe to mongodb collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    if datecols:
        # 1. rows with valid date entries in datecols:
        valid_mask = np.array(dframe.loc[:,datecols].notnull())
        valids = dframe[valid_mask].to_dict('records')  # list-of-dicts format
        db[collection_name].insert_many(valids)
        # 2. rows *without* valid date entries in datecols:
        invalids = dframe[~valid_mask].loc[:,df.columns.difference(datecols)].to_dict('records')  # list-of-dicts format
        db[collection_name].insert_many(invalids)
    else:
        records = dframe.to_dict('records')  # 'records' -> list-of-dicts format
        db[collection_name].insert_many(records)
        
def delete_collection(db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''save a dataframe to mongodb collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    db[collection_name].delete_many({}) # empty filter deletes all entries


DB_NOBEL_PRIZE = 'nobel_prize' # use string constants or a spell error in retrieval will create new table.
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

with open('data/nwinners_raw.json') as f:
    df = pd.read_json(f)

# Make sure mongodb is clear (so we don't duplicate data), then
# save to Mongo for next section
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # deletes everything (no filter)
dataframe_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------

#df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)
df.count()


born_in           1089
category          1089
country           1089
date_of_birth     1079
date_of_death      716
gender            1079
link              1089
name              1089
place_of_birth    1079
place_of_death     716
text              1089
year              1089
dtype: int64

# The final clean_data() function

In [201]:
def clean_data(df):
    df.name = df.name.str.replace('*', '') # Take out the asterisks in people's name
    df.name = df.name.str.strip() # Strip out ending whitespace
    df = df.replace('', np.nan)
    df_born_in = df[df.born_in.notnull()]   # save off the entries with born_in
    df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values (they are dups)
    df = df.drop('born_in', axis=1) # drop this column so no overlap in upcoming join()
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    # drop duplicate awards where the recipient had 2 countries.  Randomize which
    # country is dropped by first randomizing the index
    df = df.reindex(np.random.permutation(df.index))
    df = df.drop_duplicates(['name', 'year']) # if duplicate on these 2 fields, drop after 1st entry
    df = df.sort_index() #restores the index so no longer random
    # similarly drop entries where two born_ins are given (DAY this fixed problem
    # where doing a join to get both country and born_in was generating duplicates
    # for some rows and therefore causing the index size to mismatch later)
    df_born_in = df_born_in.reindex(np.random.permutation(df_born_in.index))
    df_born_in = df_born_in.drop_duplicates(['name', 'year']) # if duplicate on these 2 fields, drop after 1st entry
    df_born_in = df_born_in.sort_index() #restores the index so no longer random
    # DAY added section here to copy born_in info in the main df
    match_cols = ['name', 'year']
    df = df.join(df_born_in.set_index(match_cols).loc[:,'born_in'], how='left', on=match_cols)
    df_born_in = df_born_in.drop('country', axis=1)
    df_born_in = df_born_in.join(df.set_index(match_cols).loc[:,'country'], how='left', on=match_cols)
    # end DAY section
    # the missing categories are mostly people whose category name wasn't in standard format
    # fix that:
    df.loc[df.name=='Selman Waksman', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Ilya Ilyich Mechnikov', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Amartya Sen', 'category'] = 'Economics'
    df.drop(df.loc[df.name.str.startswith('Leopold Ru') & (df.year != 1939)].index, inplace=True)
    df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny'), 'category'] = 'Physiology or Medicine'
    # drop null gender entries (they are all institutions/organizations)
    df = df[df.gender.notnull()] # remove genderless entries
    df.date_of_birth = pd.to_datetime(df.date_of_birth)
    #df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
    df.date_of_death = pd.to_datetime(df.date_of_death, errors='ignore')
    df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
    df['death_age'] = df.date_of_death.where(df.date_of_death.isnull(),
        pd.DatetimeIndex(df.date_of_death).year - 
        pd.DatetimeIndex(df.date_of_birth).year)
    return df, df_born_in
    
df_clean, df_born_in = clean_data(df)

df_clean.count()

category          888
country           888
date_of_birth     888
date_of_death     594
gender            888
link              888
name              888
place_of_birth    888
place_of_death    594
text              888
year              888
born_in            93
award_age         888
death_age         594
dtype: int64

In [202]:
df_clean.head()



Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year,born_in,award_age,death_age
2,Physiology or Medicine,Austria,1903-11-07,1989-02-27,male,https://en.wikipedia.org/wiki/Konrad_Lorenz,Konrad Lorenz,Vienna,Vienna,"Konrad Lorenz , Physiology or Medicine, 1973",1973,,70,1970-01-01 00:00:00.000000086
6,Literature,Austria,1946-10-20,NaT,female,https://en.wikipedia.org/wiki/Elfriede_Jelinek,Elfriede Jelinek,Mürzzuschlag,,"Elfriede Jelinek , Literature, 2004",2004,,58,NaT
9,Physics,Australia,1890-03-31 00:00:00,1971-07-01,male,https://en.wikipedia.org/wiki/William_Lawrence...,William Lawrence Bragg,Adelaide,Ipswich,"William Lawrence Bragg , Physics, 1915",1915,,25,1970-01-01 00:00:00.000000081
10,Physiology or Medicine,Australia,1898-09-24 00:00:00,1968-02-21,male,https://en.wikipedia.org/wiki/Howard_Florey,Howard Florey,Adelaide,Oxford,"Howard Florey , Physiology or Medicine, 1945",1945,,47,1970-01-01 00:00:00.000000070
11,Physiology or Medicine,Australia,1899-09-03 00:00:00,1985-08-31,male,https://en.wikipedia.org/wiki/Frank_Macfarlane...,Sir Frank Macfarlane Burnet,Traralgon,Melbourne,"Sir Frank Macfarlane Burnet , Physiology or Me...",1960,,61,1970-01-01 00:00:00.000000086


# Save the data

In [203]:
with open('data/nwinners_clean.json', 'w') as f:
    df_clean.to_json(f, date_format='iso', orient='records')
with open('data/nwinners_born_in.json', 'w') as f:
    df_born_in.to_json(f, date_format='iso', orient='records')

if True:
    db = get_mongo_database(DB_NOBEL_PRIZE)
    db[COLL_WINNERS].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
    db['winners_born_in'].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
    dataframe_to_mongo(df_clean, DB_NOBEL_PRIZE, COLL_WINNERS, datecols=['date_of_death']) # save to Mongo for next section
    dataframe_to_mongo(df_born_in, DB_NOBEL_PRIZE, 'winners_born_in', datecols=['date_of_death'])

 

# also save a local copy on sqlite
if False:
    import sqlalchemy

    engine = sqlalchemy.create_engine('sqlite:///data/nobel_prize.db')
    df_clean.to_sql('winners', engine)


# Merge MiniBio Dataframe

In [204]:
with open('data/minibios.json') as f:
    df_winners_bios = pd.read_json(f)
    
df_winners_all = pd.merge(df_clean, df_winners_bios, how='outer', on='link')
df_winners_all.count()

category          1043
country           1043
date_of_birth     1043
date_of_death      694
gender            1043
link              1094
name              1043
place_of_birth    1043
place_of_death     694
text              1043
year              1043
born_in            186
award_age         1043
death_age          694
bio_image         1020
image_urls        1081
mini_bio          1081
dtype: int64

In [205]:
# remove redundancies
# remove duplicates of any rows that share a 'link' and 'year' field after removing
# any rows without a name field
df_winners_all = df_winners_all[~df_winners_all.name.isnull()].drop_duplicates(subset=['link', 'year'])
df_winners_all.count()
df_winners_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 887 entries, 0 to 1041
Data columns (total 17 columns):
category          887 non-null object
country           887 non-null object
date_of_birth     887 non-null datetime64[ns]
date_of_death     594 non-null datetime64[ns]
gender            887 non-null object
link              887 non-null object
name              887 non-null object
place_of_birth    887 non-null object
place_of_death    594 non-null object
text              887 non-null object
year              887 non-null float64
born_in           93 non-null object
award_age         887 non-null float64
death_age         594 non-null datetime64[ns]
bio_image         825 non-null object
image_urls        874 non-null object
mini_bio          874 non-null object
dtypes: datetime64[ns](3), float64(2), object(12)
memory usage: 124.7+ KB


In [207]:
db['winners_all'].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
dataframe_to_mongo(df_winners_all, DB_NOBEL_PRIZE, 'winners_all', datecols=['date_of_death'])



In [208]:
df_winners_all.columns

Index([      u'category',        u'country',  u'date_of_birth',
        u'date_of_death',         u'gender',           u'link',
                 u'name', u'place_of_birth', u'place_of_death',
                 u'text',           u'year',        u'born_in',
            u'award_age',      u'death_age',      u'bio_image',
           u'image_urls',       u'mini_bio'],
      dtype='object')

In [209]:
# Save json file (need to deal with datetimes which don't json)

# here is one way I could have done it.
import datetime
print(df_winners_all.date_of_birth.apply(lambda x: x.isoformat()))

#but I am using Pandas' built-in to_json which deals with datetime
with open('data/nwinners_all.json', 'w') as f:
    df_winners_all.to_json(f, date_format='iso', orient='records')
    
    
# ARCHIVE
if False:
    import datetime
    from dateutil import parser
    import json

    class JSONDateTimeEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, (datetime.date, datetime.datetime)):
                return obj.isoformat()
            else:
                return json.JSONEncoder.default(self, obj)

    def mydumps(obj):
        return json.dumps(obj, cls=JSONDateTimeEncoder)
    now_str = mydumps({'time': datetime.datetime.now()})
    print(now_str)

    from datetime import datetime
    import dateutil

    d = datetime.datetime.now()
    d_iso = d.isoformat()  # convert to text
    print(d_iso)

    d = dateutil.parser.parse(d_iso) # back to a datetime object
    print(d)




0       1903-11-07T00:00:00
1       1946-10-20T00:00:00
2       1890-03-31T00:00:00
3       1898-09-24T00:00:00
4       1899-09-03T00:00:00
5       1903-01-27T00:00:00
6       1912-05-28T00:00:00
8       1979-02-07T00:00:00
9       1911-10-14T00:00:00
10      1858-10-27T00:00:00
11      1852-12-19T00:00:00
12      1845-02-15T00:00:00
13      1868-01-31T00:00:00
14      1856-12-28T00:00:00
15      1868-03-22T00:00:00
16      1865-08-27T00:00:00
17      1892-09-10T00:00:00
18      1856-12-22T00:00:00
19      1885-02-07T00:00:00
20      1862-04-02T00:00:00
21      1860-09-06T00:00:00
22      1881-01-31T00:00:00
23      1866-09-25T00:00:00
24      1878-08-28T00:00:00
25      1892-02-06T00:00:00
26      1885-12-02T00:00:00
27      1893-04-29T00:00:00
28      1905-09-03T00:00:00
29      1888-10-16T00:00:00
30      1881-10-22T00:00:00
               ...         
1004    1932-10-24T00:00:00
1005    1843-06-09T00:00:00
1007    1931-07-10T00:00:00
1008    1943-08-29T00:00:00
1009    1829-07-26T0

In [210]:
df = df_winners_all