# Cleaning Data with Pandas
## Reading the data

In [47]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication'''
    if username and password:
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(username, password, host, db_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query={}, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    df = pd.DataFrame(list(cursor))
    if no_id:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''save a dataframe to mongodb collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    records = dframe.to_dict('records')  # 'records' puts it into our list-of-dicts format
    db[collection_name].insert_many(records)


DB_NOBEL_PRIZE = 'nobel_prize' # use string constants or a spell error in retrieval will create new table.
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

with open('data/nwinners.json') as f:
    df = pd.read_json(f)

# Make sure mongodb is clear (so we don't duplicate data), then
# save to Mongo for next section
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # deletes everything (no filter)
dataframe_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------

df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)


## Inspecting the data

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069 entries, 0 to 1068
Data columns (total 12 columns):
born_in           1069 non-null object
category          1069 non-null object
country           1069 non-null object
date_of_birth     1060 non-null object
date_of_death     712 non-null object
gender            1060 non-null object
link              1069 non-null object
name              1069 non-null object
place_of_birth    1060 non-null object
place_of_death    712 non-null object
text              1069 non-null object
year              1069 non-null int64
dtypes: int64(1), object(11)
memory usage: 100.3+ KB


In [3]:
df.describe() # only acts on numeric columns
df.describe(include=['object']) # include specifies other types to include (besides numeric)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,1069.0,1069,1069,1060,712,1060,1069,1069,1060,712,1069
unique,33.0,7,59,868,589,2,912,1002,607,316,1057
top,,Physiology or Medicine,United States,7 November 1867,4 July 1934,male,https://en.wikipedia.org/wiki/Marie_Curie,César Milstein,New York City,Cambridge,"Adolfo Pérez Esquivel , Peace, 1980"
freq,938.0,256,352,4,4,1003,4,3,44,37,2


In [4]:
df.tail()
df.head(3)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Chemistry,Hungary,1 August 1885,5 July 1966,male,https://en.wikipedia.org/wiki/George_de_Hevesy,George de Hevesy,Budapest,Freiburg im Breisgau,"George de Hevesy , Chemistry, 1943",1943
1,,Physiology or Medicine,Hungary,16 September 1893,22 October 1986,male,https://en.wikipedia.org/wiki/Albert_Szent-Gy%...,Albert Szent-Györgyi,Budapest,Woods Hole,"Albert Szent-Györgyi , Physiology or Medicine,...",1937
2,,Peace,Ghana,8 April 1938,,male,https://en.wikipedia.org/wiki/Kofi_Annan,Kofi Annan,Kumasi,,"Kofi Annan , Peace, 2001",2001


## Set indices (optional, but useful)

In [5]:
print(df.columns) # the column index
df = df.set_index('name') # row index (note we need to assign to df)
df.loc['Albert Einstein'] # loc accesses by label index

Index([u'born_in', u'category', u'country', u'date_of_birth', u'date_of_death',
       u'gender', u'link', u'name', u'place_of_birth', u'place_of_death',
       u'text', u'year'],
      dtype='object')


Unnamed: 0_level_0,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albert Einstein,,Physics,Switzerland,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , born in Germany , Physics, ...",1921
Albert Einstein,,Physics,Germany,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , Physics, 1921",1921


In [6]:
df.reset_index(inplace=True) # go back to default int index.  Note the 'inplace'
df.iloc[2] # iloc accesses by integer index
df.head(2)

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
0,George de Hevesy,,Chemistry,Hungary,1 August 1885,5 July 1966,male,https://en.wikipedia.org/wiki/George_de_Hevesy,Budapest,Freiburg im Breisgau,"George de Hevesy , Chemistry, 1943",1943
1,Albert Szent-Györgyi,,Physiology or Medicine,Hungary,16 September 1893,22 October 1986,male,https://en.wikipedia.org/wiki/Albert_Szent-Gy%...,Budapest,Woods Hole,"Albert Szent-Györgyi , Physiology or Medicine,...",1937


In [7]:
df[0:10] #first 10 rows
df[-4:]  #last 4 rows

# masks
mask = df.year > 2000
df[mask]
df[df.year>2000] # more direct

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
2,Kofi Annan,,Peace,Ghana,8 April 1938,,male,https://en.wikipedia.org/wiki/Kofi_Annan,Kumasi,,"Kofi Annan , Peace, 2001",2001
46,J. Robin Warren,,Physiology or Medicine,Australia,11 June 1937,,male,https://en.wikipedia.org/wiki/Robin_Warren,Adelaide,,"J. Robin Warren , Physiology or Medicine, 2005",2005
102,Martin Karplus *,Austria,Chemistry,,15 March 1930,,male,https://en.wikipedia.org/wiki/Martin_Karplus,Vienna,,"Martin Karplus *, Chemistry, 2013",2013
103,International Atomic Energy Agency,,Peace,Austria,,,,https://en.wikipedia.org/wiki/International_At...,,,"International Atomic Energy Agency , Peace, 2005",2005
116,Stefan Hell,,Chemistry,Germany,23 December 1962,,male,https://en.wikipedia.org/wiki/Stefan_Hell,Arad,,"Stefan Hell , born in Romania , Chemistry, 2014",2014
117,Thomas C. Südhof,,Physiology or Medicine,Germany,22 December 1955,,male,https://en.wikipedia.org/wiki/Thomas_C._S%C3%B...,Göttingen,,"Thomas C. Südhof , Physiology or Medicine, 2013",2013
126,Herta Müller,,Literature,Germany,17 August 1953,,female,https://en.wikipedia.org/wiki/Herta_M%C3%BCller,Nițchidorf,,"Herta Müller , born in Romania , Literature, ...",2009
167,Patrick Modiano,,Literature,France,30 July 1945,,male,https://en.wikipedia.org/wiki/Patrick_Modiano,Boulogne-Billancourt,,"Patrick Modiano , Literature, 2014",2014
168,Serge Haroche,,Physics,France,11 September 1944,,male,https://en.wikipedia.org/wiki/Serge_Haroche,Casablanca,,"Serge Haroche , born in Morocco, then under F...",2012
169,Jules A. Hoffmann,,Physiology or Medicine,France,2 August 1941,,male,https://en.wikipedia.org/wiki/Jules_A._Hoffmann,Echternach,,"Jules A. Hoffmann , born in Luxembourg , Phys...",2011


## Cleaning the Data
### Remove the asterisk from names

In [15]:
# How many names with asterisk?
df[df.name.str.contains('\*')]['name'].count()

131

In [16]:
# Take out the asterisks in people's name
df.name = df.name.str.replace('*', '')
df[df.name.str.contains('\*')]['name'].count()

0

### Eliminating duplications for people with a born_in entry

In [17]:
set(df.born_in.apply(type))

{unicode}

In [18]:
bornin_col = df.born_in # or df['born_in']

# if you look at bornin_col, most entries are an empty string
# replace empty strings with NaN
bornin_col.replace('', np.nan, inplace=True)
bornin_col

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7           NaN
8           NaN
9           NaN
10          NaN
11          NaN
12          NaN
13          NaN
14          NaN
15          NaN
16          NaN
17          NaN
18          NaN
19          NaN
20      Germany
21          NaN
22          NaN
23          NaN
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29      Germany
         ...   
1039        NaN
1040      India
1041      India
1042        NaN
1043        NaN
1044        NaN
1045        NaN
1046        NaN
1047        NaN
1048        NaN
1049        NaN
1050        NaN
1051        NaN
1052        NaN
1053        NaN
1054        NaN
1055        NaN
1056      India
1057        NaN
1058      India
1059        NaN
1060        NaN
1061    Hungary
1062        NaN
1063        NaN
1064        NaN
1065        NaN
1066        NaN
1067        NaN
1068        NaN
Name: born_in, Length: 1

In [12]:
# Rows with born_in are duplicate entries, so remove all rows
# which are NaN in bornin_col
df = df[df.born_in.isnull()]
print(df.count()) # note that born_in count is now 0
df = df.drop('born_in', axis=1) # drop the born_in column

name              938
born_in             0
category          938
country           938
date_of_birth     929
date_of_death     627
gender            929
link              938
place_of_birth    929
place_of_death    627
text              938
year              938
dtype: int64


### Finding other duplicates
Internet search shows that 889 people and organizations have received Nobel prize, but our data set contains more, so let's look for more duplications

In [44]:
dupes_by_name = df[df.duplicated('name')]
dupes_by_name.count()

category          4
country           4
date_of_birth     4
date_of_death     4
gender            4
link              4
name              4
place_of_birth    4
place_of_death    4
text              4
year              4
award_age         4
bio_image         4
image_urls        4
mini_bio          4
dtype: int64

In [45]:
# By default, 'duplicated' gives the first occurance of a duplicated item
# but it has an option 'keep="last"'.  Using both options, we can get a full
# list of all the duplicated items
all_dupes = df[df.duplicated('name') | df.duplicated('name', keep='last')]
all_dupes.count()

# we could have also done this in the following way
# using the 'isin' method
all_dupes = df[df.name.isin(dupes_by_name.name)]
all_dupes.count()

# we could have also done this in the following way
# groupby returns iterator of (groupname, dataframeofrows) tuples, e.g.
for name, rows in df.groupby('name'):
    if len(rows)>1:
        print('name: {}, number of rows: {}'.format(name.encode('utf-8'), len(rows)))
# use groupby as follows:
pd.concat([g[['name','category','country','year']] for _, g in df.groupby('name') if len(g) > 1])

name: Frederick Sanger, number of rows: 2
name: John Bardeen, number of rows: 2
name: Linus C. Pauling, number of rows: 2
name: Marie Skłodowska-Curie, number of rows: 2


Unnamed: 0,name,category,country,year
734,Frederick Sanger,Chemistry,United Kingdom,1958.0
736,Frederick Sanger,Chemistry,United Kingdom,1980.0
332,John Bardeen,Physics,United States,1956.0
334,John Bardeen,Physics,United States,1972.0
343,Linus C. Pauling,Chemistry,United States,1954.0
345,Linus C. Pauling,Peace,United States,1962.0
887,Marie Skłodowska-Curie,Physics,Poland,1903.0
891,Marie Skłodowska-Curie,Chemistry,France,1911.0


In [46]:
# Sort the data now
# here is how sort_values works:
df2 = pd.DataFrame(\
                  {'name':['zak', 'alice', 'bob', 'mike',
                          'bob', 'bob'],
                  'score':[4, 3, 5, 2, 3, 7]})
df2 = df2.sort_values(['name', 'score'], ascending=[1,0])
print(df2)
# Now use it on all_dupes
all_dupes.sort_values('name')[['name', 'country', 'year']]

    name  score
1  alice      3
5    bob      7
2    bob      5
4    bob      3
3   mike      2
0    zak      4


Unnamed: 0,name,country,year
734,Frederick Sanger,United Kingdom,1958.0
736,Frederick Sanger,United Kingdom,1980.0
332,John Bardeen,United States,1956.0
334,John Bardeen,United States,1972.0
343,Linus C. Pauling,United States,1954.0
345,Linus C. Pauling,United States,1962.0
887,Marie Skłodowska-Curie,Poland,1903.0
891,Marie Skłodowska-Curie,France,1911.0


In [16]:
# This output shows that some recipients are in our dataset
# twice for the same year, but different countries.

# Change Curie's country to France for the 1911 entry
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'

df.drop(df[(df.name=='Sidney Altman') & (df.year==1990)].index, inplace=True)
# could have done this the following way as well:
df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # ~ is the logical 'not'


In [17]:
# define function for doing these cleanup activities
def clean_data(df):
    df = df.replace('', np.nan)
    df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
    df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    return df

df = clean_data(df)

In [18]:
# get rid of half of the dual country winners so each prize just counts once.
# to be fair, randomize the rows first so that country deleted is random.
df = df.reindex(np.random.permutation(df.index)) # randomize the index
# drop duplicate rows (both 'name' and 'year' the same) after the first encountered
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index() # go back to unrandomized index
df.count()

name              883
category          879
country           883
date_of_birth     875
date_of_death     593
gender            875
link              883
place_of_birth    875
place_of_death    593
text              883
year              883
dtype: int64

In [47]:
# lest see how many duplicate winners we have now (some are valid)
df[df.duplicated('name') | df.duplicated('name', keep='last')]\
    .sort_values(by='name')[['name', 'country', 'year', 'category']]
    
# The output are the 4 valid multi-award winners!

Unnamed: 0,name,country,year,category
734,Frederick Sanger,United Kingdom,1958.0,Chemistry
736,Frederick Sanger,United Kingdom,1980.0,Chemistry
332,John Bardeen,United States,1956.0,Physics
334,John Bardeen,United States,1972.0,Physics
343,Linus C. Pauling,United States,1954.0,Chemistry
345,Linus C. Pauling,United States,1962.0,Peace
887,Marie Skłodowska-Curie,Poland,1903.0,Physics
891,Marie Skłodowska-Curie,France,1911.0,Chemistry


In [48]:
df[df.year.isin([2016, 2017])] #my total differs from book, so look at recent winners...

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year,award_age,bio_image,image_urls,mini_bio
162,Economics,Finland,1949-04-18,NaT,male,https://en.wikipedia.org/wiki/Bengt_R._Holmstr...,Bengt R. Holmström,Helsinki,,"Bengt R. Holmström , Economics, 2016",2016.0,67.0,full/071002a0805cc5f8c9547c7ced7754740d864163.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>Bengt Robert Holmström</b> (born 18 Apri...
164,Chemistry,France,1944-10-21,NaT,male,https://en.wikipedia.org/wiki/Jean-Pierre_Sauvage,Jean-Pierre Sauvage,Paris,,"Jean-Pierre Sauvage , Chemistry, 2016",2016.0,72.0,full/fe9720dd654cdad9f65e70738e78c765de4366ba.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>Jean-Pierre Sauvage</b> (<small>French p...
192,Peace,Colombia,1951-08-10,NaT,male,https://en.wikipedia.org/wiki/Juan_Manuel_Santos,Juan Manuel Santos,Bogota,,"Juan Manuel Santos , Peace, 2016",2016.0,65.0,full/eff1c4010469810f2dfa814711106255060c618b.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>Juan Manuel Santos Calderón</b> (<small>...
533,Literature,United States,1941-05-24,NaT,male,https://en.wikipedia.org/wiki/Bob_Dylan,Bob Dylan,Duluth,,"Bob Dylan , Literature, 2016",2016.0,75.0,full/4cb79033375bc66bf1ec295e6c1633985665c747.jpg,[https://upload.wikimedia.org/wikipedia/common...,"<p><b>Bob Dylan</b> (<span class=""nowrap""><spa..."
667,Chemistry,United States,1942-05-24,NaT,male,https://en.wikipedia.org/wiki/Fraser_Stoddart,Fraser Stoddart,Edinburgh,,"Fraser Stoddart , born in United Kingdom , Ch...",2016.0,74.0,full/0455003b3d009c6cb9e33dabb746a876e030647c.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>Sir James Fraser Stoddart</b> <span styl...
669,Physics,United States,1951-09-14,NaT,male,https://en.wikipedia.org/wiki/F._Duncan_M._Hal...,F. Duncan M. Haldane,London,,"F. Duncan M. Haldane , born in United Kingdom...",2016.0,65.0,full/f5166c0ccafcc56c3b61ca5f2ce361a97a9658ad.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>Frederick Duncan Michael Haldane</b> <sp...
671,Physics,United States,1943-06-22,NaT,male,https://en.wikipedia.org/wiki/John_M._Kosterlitz,John M. Kosterlitz,Aberdeen,,"John M. Kosterlitz , born in United Kingdom ,...",2016.0,73.0,full/0caf9aac08352aa229bf8d8a33bb28e8336fcfa7.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>John Michael Kosterlitz</b> (born June 2...
799,Economics,United Kingdom,1948-10-09,NaT,male,https://en.wikipedia.org/wiki/Oliver_Hart_(eco...,Oliver Hart,London,,"Oliver Hart , Economics, 2016",2016.0,68.0,full/29d0c0c75945eada060987640bc150122876e151.jpg,[https://upload.wikimedia.org/wikipedia/common...,<p><b>Oliver Simon D'Arcy Hart</b> (born on Oc...
801,Physics,United Kingdom,1934-09-21,NaT,male,https://en.wikipedia.org/wiki/David_J._Thouless,David J. Thouless,Bearsden,,"David J. Thouless , Physics, 2016",2016.0,82.0,full/76dab6a284b4513e70375a7aae257743eccfe946.jpg,[https://upload.wikimedia.org/wikipedia/common...,"<p><b>David James Thouless</b> <span style=""fo..."
953,Chemistry,Netherlands,1951-05-18,NaT,male,https://en.wikipedia.org/wiki/Ben_Feringa,Ben Feringa,Barger-Compascuum,,"Ben Feringa , Chemistry, 2016",2016.0,65.0,full/2334592ca03c102b4d1acf82483f5d6bf3c327b9.jpg,[https://upload.wikimedia.org/wikipedia/common...,"<p><b>Bernard Lucas ""Ben"" Feringa</b> (<small>..."


## Dealing with missing fields
First: missing categories

In [49]:
print(df.count())  # we are missing some categories and genders
df.category.unique() # recall when we scraped, we checked against this limited list.

category          873
country           873
date_of_birth     873
date_of_death     592
gender            873
link              873
name              873
place_of_birth    873
place_of_death    592
text              873
year              873
award_age         873
bio_image         824
image_urls        873
mini_bio          873
dtype: int64


array([u'Chemistry', u'Physiology or Medicine', u'Peace', u'Literature',
       u'Physics', u'Economics'], dtype=object)

In [50]:
# look at the missing categories
df[df.category.isnull()][['name', 'text', 'link', 'category']]

Unnamed: 0,name,text,link,category


In [23]:
df.loc[df.name == u'Leopold Ružička born in Kingdom of Hungary', ['name', 'text', 'category']] # doesn't work

Unnamed: 0,name,text,category


In [24]:
print(u'Leopold Ružička born in Kingdom of Hungary'.encode('ascii', errors='backslashreplace'))
print(u'Róbert Bárány'.encode('ascii', errors='backslashreplace'))
df.loc[df.name==u'Leopold Ru\u017ei\u010dka born in Kingdom of Hungary', ['name', 'text', 'category']] # still doesn't work

Leopold Ru\u017ei\u010dka born in Kingdom of Hungary
R\xf3bert B\xe1r\xe1ny


Unnamed: 0,name,text,category


In [25]:
df.loc[df.name.str.startswith('Leopold Ru'), :] # works!

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
9,Leopold Ružička born in Kingdom of Hungary,,Hungary,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Vukovar,Mammern,Leopold Ružička born in Kingdom of Hungary,0
834,Leopold Ružička,Chemistry,Switzerland,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Vukovar,Mammern,"Leopold Ružička , born in then Austria-Hungar...",1939


In [26]:
df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny')]

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
263,Róbert Bárány,Physiology or Medicine,Austria,22 April 1876,8 April 1936,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , Physiology or Medicine, 1914",1914


In [27]:
def clean_data(df):
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
        df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    # the missing categories are mostly people whose category name wasn't in standard format
    # fix that:
    df.loc[df.name=='Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Ilya Ilyich Mechnikov', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Amartya Sen', 'category'] = 'Economics'
    df.drop(df.loc[df.name.str.startswith('Leopold Ru') & (df.year != 1939)].index, inplace=True)
    df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny'), 'category'] = 'Physiology or Medicine'
    return df

df = clean_data(df)

Now, missing gender

In [51]:
df[df['gender'].isnull()][['name', 'year', 'category']]

Unnamed: 0,name,year,category


In [29]:
def clean_data(df):
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
        df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    # the missing categories are mostly people whose category name wasn't in standard format
    # fix that:
    df.loc[df.name=='Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Ilya Ilyich Mechnikov', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Amartya Sen', 'category'] = 'Economics'
    df.drop(df.loc[df.name.str.startswith('Leopold Ru') & (df.year != 1939)].index, inplace=True)
    df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny'), 'category'] = 'Physiology or Medicine'
    # drop null gender entries (they are all institutions/organizations)
    df = df[df.gender.notnull()] # remove genderless entries
    return df

df = clean_data(df)

## Dealing with Times and Dates

In [52]:
df.loc[:20, ['name', 'date_of_birth']]

Unnamed: 0,name,date_of_birth
0,George de Hevesy,1885-08-01 00:00:00
1,Albert Szent-Györgyi,1893-09-16 00:00:00
2,Kofi Annan,1938-04-08
3,Odysseas Elytis,1911-11-02
4,Giorgos Seferis,1900-03-13
5,Rigoberta Menchú,1959-01-09
6,Richard Adolf Zsigmondy,1865-04-01 00:00:00
8,Róbert Bárány,1876-04-22 00:00:00
10,Philipp Lenard,1862-06-07 00:00:00
12,Fritz Haber,1868-12-09 00:00:00


In [53]:
pd.to_datetime(df.date_of_birth, errors='raise') # no errors
pd.to_datetime(df.date_of_death, errors='raise')


0      1966-07-05
1      1986-10-22
2             NaT
3      1996-03-18
4      1971-09-20
5             NaT
6      1929-09-23
8      1936-04-08
10     1947-05-20
12     1934-01-29
13     1947-10-04
14     1942-08-03
15     1960-04-24
16     1946-06-06
17     1928-08-30
18     1931-02-26
19     1927-07-05
20     1914-04-02
21     1918-04-20
22     1938-05-04
23     1941-09-09
24     1976-02-01
25     1970-08-01
26     1940-04-26
27     1949-03-30
28     1945-03-31
29     1955-08-12
30     1959-06-09
31     1941-03-04
32     1957-08-05
          ...    
988    1995-11-04
989    2016-09-28
991    2013-08-30
992    1988-01-15
994    1989-12-22
995    1995-06-25
996    1997-09-05
997    1970-11-21
998    1941-08-07
999           NaT
1000          NaT
1001          NaT
1002          NaT
1004          NaT
1005          NaT
1007          NaT
1009          NaT
1010          NaT
1011          NaT
1012          NaT
1013   1998-02-08
1014          NaT
1015          NaT
1016   1974-06-09
1017      

In [54]:
# if there was an error raised, this is one way to know which row caused it:
for i, row in df.iterrows():
    try:
        pd.to_datetime(row.date_of_death, errors='raise')
    except:
        print('{}({}, {})'.format(row.date_of_death.ljust(30), row['name'], i))

#
with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death', 'name']]

Unnamed: 0,category,date_of_death,name


___
# FINAL
# COPY OF GETTING THE DIRTY DATA:


In [10]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication'''
    if username and password:
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(username, password, host, db_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query={}, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    df = pd.DataFrame(list(cursor))
    if no_id:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''save a dataframe to mongodb collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    records = dframe.to_dict('records')  # 'records' puts it into our list-of-dicts format
    db[collection_name].insert_many(records)

def delete_collection(db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''save a dataframe to mongodb collection'''
    db = get_mongo_database(db_name, host, port, username, password)
    db[collection_name].delete_many({}) # empty filter deletes all entries


DB_NOBEL_PRIZE = 'nobel_prize' # use string constants or a spell error in retrieval will create new table.
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

with open('data/nwinners.json') as f:
    df = pd.read_json(f)

# Make sure mongodb is clear (so we don't duplicate data), then
# save to Mongo for next section
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # deletes everything (no filter)
dataframe_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------

#df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)
df.count()


born_in           1069
category          1069
country           1069
date_of_birth     1060
date_of_death      712
gender            1060
link              1069
name              1069
place_of_birth    1060
place_of_death     712
text              1069
year              1069
dtype: int64

# The final clean_data() function

In [19]:
def clean_data(df):
    df.name = df.name.str.replace('*', '') # Take out the asterisks in people's name
    df = df.replace('', np.nan)
    df_born_in = df[df.born_in.notnull()]
    df = df[df.born_in.isnull()] # eliminates the rows with 'born_in' values
    df = df.drop('born_in', axis=1) # don't need this column anymore
    df.drop(df[df.year==1809].index, inplace=True)
    df = df[~(df['name']=='Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
      (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name=='Sidney Altman') & (df.year==1990))] # selects all rows 'not' (~) Sidney Altma 1990
    # drop duplicate awards where the recipient had 2 countries.  Randomize which
    # country is dropped by first randomizing the index
    df = df.reindex(np.random.permutation(df.index))
    df = df.drop_duplicates(['name', 'year']) # if duplicate on these 2 fields, drop after 1st entry
    df = df.sort_index() #restores the index so no longer random
    # the missing categories are mostly people whose category name wasn't in standard format
    # fix that:
    df.loc[df.name=='Alexis Carrel', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Ilya Ilyich Mechnikov', 'category'] = 'Physiology or Medicine'
    df.loc[df.name=='Amartya Sen', 'category'] = 'Economics'
    df.drop(df.loc[df.name.str.startswith('Leopold Ru') & (df.year != 1939)].index, inplace=True)
    df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny'), 'category'] = 'Physiology or Medicine'
    # drop null gender entries (they are all institutions/organizations)
    df = df[df.gender.notnull()] # remove genderless entries
    df.date_of_birth = pd.to_datetime(df.date_of_birth)
    df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
    df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
    return df, df_born_in
    
df_clean, df_born_in = clean_data(df)

df_clean.count()

category          874
country           874
date_of_birth     874
date_of_death     592
gender            874
link              874
name              874
place_of_birth    874
place_of_death    592
text              874
year              874
award_age         874
dtype: int64

In [21]:
df_born_in.country.unique()


array([nan], dtype=object)

# Save the data

In [13]:
if False:
    db = get_mongo_database(DB_NOBEL_PRIZE)
    db[COLL_WINNERS].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
    #db['winners_born_in'].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
    dataframe_to_mongo(df_clean, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section
    dataframe_to_mongo(df_born_in, DB_NOBEL_PRIZE, 'winners_born_in')

with open('data/nobel_winners_plus_bornin.json', 'w') as f:
    df_born_in.to_json(f, date_format='iso', orient='records')
 

# also save a local copy on sqlite
if False:
    import sqlalchemy

    engine = sqlalchemy.create_engine('sqlite:///data/nobel_prize.db')
    df_clean.to_sql('winners', engine)


# Merge MiniBio Dataframe

In [14]:
with open('data/minibios.json') as f:
    df_winners_bios = pd.read_json(f)
    
df_winners_all = pd.merge(df_clean, df_winners_bios, how='outer', on='link')
df_winners_all.count()

category          1029
country           1029
date_of_birth     1029
date_of_death      691
gender            1029
link              1081
name              1029
place_of_birth    1029
place_of_death     691
text              1029
year              1029
award_age         1029
bio_image         1020
image_urls        1081
mini_bio          1081
dtype: int64

In [15]:
# remove redundancies
# remove duplicates of any rows that share a 'link' and 'year' field after removing
# any rows without a name field
df_winners_all = df_winners_all[~df_winners_all.name.isnull()].drop_duplicates(subset=['link', 'year'])
df_winners_all.count()
df_winners_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 873 entries, 0 to 1028
Data columns (total 15 columns):
category          873 non-null object
country           873 non-null object
date_of_birth     873 non-null datetime64[ns]
date_of_death     592 non-null datetime64[ns]
gender            873 non-null object
link              873 non-null object
name              873 non-null object
place_of_birth    873 non-null object
place_of_death    592 non-null object
text              873 non-null object
year              873 non-null float64
award_age         873 non-null float64
bio_image         824 non-null object
image_urls        873 non-null object
mini_bio          873 non-null object
dtypes: datetime64[ns](2), float64(2), object(11)
memory usage: 109.1+ KB


In [16]:
db['winners_all'].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
dataframe_to_mongo(df_winners_all, DB_NOBEL_PRIZE, 'winners_all')



ValueError: NaTType does not support utcoffset

In [17]:
df_winners_all.columns

Index([      u'category',        u'country',  u'date_of_birth',
        u'date_of_death',         u'gender',           u'link',
                 u'name', u'place_of_birth', u'place_of_death',
                 u'text',           u'year',      u'award_age',
            u'bio_image',     u'image_urls',       u'mini_bio'],
      dtype='object')

In [18]:
# Save json file (need to deal with datetimes which don't json)

# here is one way I could have done it.
import datetime
print(df_winners_all.date_of_birth.apply(lambda x: x.isoformat()))

#but I am using Pandas' built-in to_json which deals with datetime
with open('data/nwinners_all.json', 'w') as f:
    df_winners_all.to_json(f, date_format='iso', orient='records')
    
    
# ARCHIVE
if False:
    import datetime
    from dateutil import parser
    import json

    class JSONDateTimeEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, (datetime.date, datetime.datetime)):
                return obj.isoformat()
            else:
                return json.JSONEncoder.default(self, obj)

    def mydumps(obj):
        return json.dumps(obj, cls=JSONDateTimeEncoder)
    now_str = mydumps({'time': datetime.datetime.now()})
    print(now_str)

    from datetime import datetime
    import dateutil

    d = datetime.datetime.now()
    d_iso = d.isoformat()  # convert to text
    print(d_iso)

    d = dateutil.parser.parse(d_iso) # back to a datetime object
    print(d)




0       1946-10-20T00:00:00
1       1878-11-01T00:00:00
3       1927-10-08T00:00:00
6       1931-11-26T00:00:00
8       1906-09-06T00:00:00
11      1887-04-10T00:00:00
13      1967-02-24T00:00:00
14      1951-09-30T00:00:00
15      1936-03-11T00:00:00
16      1936-10-10T00:00:00
17      1882-08-26T00:00:00
18      1887-07-22T00:00:00
19      1918-10-04T00:00:00
20      1901-03-27T00:00:00
21      1862-06-05T00:00:00
22      1833-02-19T00:00:00
23      1843-05-21T00:00:00
24      1905-09-30T00:00:00
25      1892-09-06T00:00:00
26      1886-09-13T00:00:00
27      1906-06-19T00:00:00
29      1941-02-19T00:00:00
30      1947-06-08T00:00:00
31      1920-05-29T00:00:00
32      1928-06-13T00:00:00
33      1919-04-22T00:00:00
34      1924-08-23T00:00:00
35      1940-05-24T00:00:00
37      1932-06-18T00:00:00
38      1936-11-19T00:00:00
               ...         
990     1918-07-15T00:00:00
991     1932-04-26T00:00:00
993     1929-11-02T00:00:00
994     1939-05-07T00:00:00
996     1932-11-06T0

In [19]:
df = df_winners_all