# Cleaning Data with Pandas
## Reading the data

In [41]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication.

    db_name  -- name of the database to return; in the authenticated case it is
                also the database named in the connection URI
    host     -- server host name (or address)
    port     -- server port; used as the default port in both branches
    username -- user name; authentication is used only when BOTH username and
                password are truthy
    password -- password belonging to username

    Returns the pymongo database object ``conn[db_name]``.
    '''
    if username and password:
        # Function-scope import that works on both Python 3 and Python 2.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus
        # Credentials are percent-escaped so that characters such as '@', ':'
        # or '/' in them cannot corrupt the URI.  Pass raw (unescaped) values.
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(
            quote_plus(username), quote_plus(password), host, db_name)
        # Hand `port` over as well: it is the default port for URI hosts that
        # do not name one, so a non-default port is no longer silently ignored.
        conn = MongoClient(mongo_uri, port)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query={}, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection.

    db_name, host, port, username, password -- forwarded to get_mongo_database
    collection_name -- collection to read
    query           -- filter passed to ``find``; the default matches everything
                       (the dict is only read here, never mutated)
    no_id           -- when True, drop MongoDB's '_id' column from the result

    Returns a DataFrame with one row per matching document; it is empty (no
    columns) when nothing matches.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    df = pd.DataFrame(list(cursor))
    # An empty result produces a frame without any columns, so only remove
    # '_id' when it is actually present instead of raising KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''Save a DataFrame to a MongoDB collection, one document per row.

    dframe          -- the DataFrame to store
    db_name, host, port, username, password -- forwarded to get_mongo_database
    collection_name -- collection that receives the documents

    Existing documents in the collection are left in place; clear the
    collection first if a fresh copy is wanted.  Returns None.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    # Serialise the frame that was passed in -- not whatever notebook-level
    # variable happens to be called `df`.
    # Missing values (NaN / NaT) are stored as None: NaT in particular cannot be
    # BSON-encoded.  astype(object) keeps None from being coerced back to NaN.
    storable = dframe.astype(object).where(dframe.notnull(), None)
    records = storable.to_dict('records')  # 'records' puts it into our list-of-dicts format
    if records:  # insert_many rejects an empty list, so an empty frame is a no-op
        db[collection_name].insert_many(records)


# Database / collection names are kept in constants: asking MongoDB for a
# misspelled name does not fail, it silently creates a new database/collection.
DB_NOBEL_PRIZE = 'nobel_prize' # database name
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

# Load the scraped winners from the local JSON file into a DataFrame
with open('data/nwinners.json') as f:
    df = pd.read_json(f)

# Make sure mongodb is clear (so we don't duplicate data), then
# save to Mongo for next section
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # empty filter: deletes every document in the collection
dataframe_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------

# Read the same data back; from here on `df` is the frame loaded from MongoDB
df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)


## Inspecting the data

In [2]:
# Column names, non-null counts, dtypes and memory footprint of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069 entries, 0 to 1068
Data columns (total 12 columns):
born_in           1069 non-null object
category          1069 non-null object
country           1069 non-null object
date_of_birth     1060 non-null object
date_of_death     712 non-null object
gender            1060 non-null object
link              1069 non-null object
name              1069 non-null object
place_of_birth    1060 non-null object
place_of_death    712 non-null object
text              1069 non-null object
year              1069 non-null int64
dtypes: int64(1), object(11)
memory usage: 100.3+ KB


In [3]:
# Only the last expression of a cell is displayed, so the numeric summary on
# the first line is computed but not shown; the output is the object summary.
df.describe() # only acts on numeric columns
df.describe(include=['object']) # include specifies other types to include (besides numeric)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,1069.0,1069,1069,1060,712,1060,1069,1069,1060,712,1069
unique,33.0,7,59,868,589,2,912,1002,607,316,1057
top,,Physiology or Medicine,United States,7 November 1867,4 July 1934,male,https://en.wikipedia.org/wiki/Marie_Curie,César Milstein,New York City,Cambridge,"Adolfo Pérez Esquivel , Peace, 1980"
freq,938.0,256,352,4,4,1003,4,3,44,37,2


In [4]:
# Only the last expression is displayed: tail() is evaluated and discarded,
# the output shown is the first three rows.
df.tail()
df.head(3)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Chemistry,Hungary,1 August 1885,5 July 1966,male,https://en.wikipedia.org/wiki/George_de_Hevesy,George de Hevesy,Budapest,Freiburg im Breisgau,"George de Hevesy , Chemistry, 1943",1943
1,,Physiology or Medicine,Hungary,16 September 1893,22 October 1986,male,https://en.wikipedia.org/wiki/Albert_Szent-Gy%...,Albert Szent-Györgyi,Budapest,Woods Hole,"Albert Szent-Györgyi , Physiology or Medicine,...",1937
2,,Peace,Ghana,8 April 1938,,male,https://en.wikipedia.org/wiki/Kofi_Annan,Kofi Annan,Kumasi,,"Kofi Annan , Peace, 2001",2001


## Set indices (optional, but useful)

In [5]:
print(df.columns) # the column index
# set_index returns a new frame, hence the reassignment.
df = df.set_index('name') # row index (note we need to assign to df)
# Names are not unique in this data, so one label can return several rows.
df.loc['Albert Einstein'] # loc accesses by label index

Index([u'born_in', u'category', u'country', u'date_of_birth', u'date_of_death',
       u'gender', u'link', u'name', u'place_of_birth', u'place_of_death',
       u'text', u'year'],
      dtype='object')


Unnamed: 0_level_0,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albert Einstein,,Physics,Switzerland,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , born in Germany , Physics, ...",1921
Albert Einstein,,Physics,Germany,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , Physics, 1921",1921


In [6]:
# inplace=True mutates df directly (no reassignment); 'name' becomes an
# ordinary column again.
df.reset_index(inplace=True) # go back to default int index.  Note the 'inplace'
# Not the last expression of the cell, so this row is not displayed.
df.iloc[2] # iloc accesses by integer index
df.head(2)

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
0,George de Hevesy,,Chemistry,Hungary,1 August 1885,5 July 1966,male,https://en.wikipedia.org/wiki/George_de_Hevesy,Budapest,Freiburg im Breisgau,"George de Hevesy , Chemistry, 1943",1943
1,Albert Szent-Györgyi,,Physiology or Medicine,Hungary,16 September 1893,22 October 1986,male,https://en.wikipedia.org/wiki/Albert_Szent-Gy%...,Budapest,Woods Hole,"Albert Szent-Györgyi , Physiology or Medicine,...",1937


In [7]:
# Plain slices on a DataFrame select rows by position.
# Only the final expression of the cell is displayed.
df[0:10] #first 10 rows
df[-4:]  #last 4 rows

# masks: a boolean Series (one value per row) selects the rows where it is True
mask = df.year > 2000
df[mask]
df[df.year>2000] # more direct

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
2,Kofi Annan,,Peace,Ghana,8 April 1938,,male,https://en.wikipedia.org/wiki/Kofi_Annan,Kumasi,,"Kofi Annan , Peace, 2001",2001
46,J. Robin Warren,,Physiology or Medicine,Australia,11 June 1937,,male,https://en.wikipedia.org/wiki/Robin_Warren,Adelaide,,"J. Robin Warren , Physiology or Medicine, 2005",2005
102,Martin Karplus *,Austria,Chemistry,,15 March 1930,,male,https://en.wikipedia.org/wiki/Martin_Karplus,Vienna,,"Martin Karplus *, Chemistry, 2013",2013
103,International Atomic Energy Agency,,Peace,Austria,,,,https://en.wikipedia.org/wiki/International_At...,,,"International Atomic Energy Agency , Peace, 2005",2005
116,Stefan Hell,,Chemistry,Germany,23 December 1962,,male,https://en.wikipedia.org/wiki/Stefan_Hell,Arad,,"Stefan Hell , born in Romania , Chemistry, 2014",2014
117,Thomas C. Südhof,,Physiology or Medicine,Germany,22 December 1955,,male,https://en.wikipedia.org/wiki/Thomas_C._S%C3%B...,Göttingen,,"Thomas C. Südhof , Physiology or Medicine, 2013",2013
126,Herta Müller,,Literature,Germany,17 August 1953,,female,https://en.wikipedia.org/wiki/Herta_M%C3%BCller,Nițchidorf,,"Herta Müller , born in Romania , Literature, ...",2009
167,Patrick Modiano,,Literature,France,30 July 1945,,male,https://en.wikipedia.org/wiki/Patrick_Modiano,Boulogne-Billancourt,,"Patrick Modiano , Literature, 2014",2014
168,Serge Haroche,,Physics,France,11 September 1944,,male,https://en.wikipedia.org/wiki/Serge_Haroche,Casablanca,,"Serge Haroche , born in Morocco, then under F...",2012
169,Jules A. Hoffmann,,Physiology or Medicine,France,2 August 1941,,male,https://en.wikipedia.org/wiki/Jules_A._Hoffmann,Echternach,,"Jules A. Hoffmann , born in Luxembourg , Phys...",2011


## Cleaning the Data
### Remove the asterisk from names

In [8]:
# How many names contain an asterisk?
# str.contains treats the pattern as a regex, so the literal '*' has to be
# escaped; the raw string keeps the backslash without relying on Python's
# handling of the invalid escape sequence '\*'.
df[df.name.str.contains(r'\*')]['name'].count()

131

In [9]:
# Take out the asterisks in people's names.
# regex=False states explicitly that '*' is a literal character, so the result
# does not depend on the pandas version's default for `regex`.
df['name'] = df['name'].str.replace('*', '', regex=False)
# Re-count to confirm none are left (raw string: '*' must be escaped in a regex)
df[df.name.str.contains(r'\*')]['name'].count()

0

### Eliminating duplications for people with a born_in entry

In [10]:
# Collect the distinct Python types that occur in the born_in column
set(df.born_in.apply(type))

{unicode}

In [11]:
# Most born_in entries are an empty string; replace those with NaN so that
# isnull()/count() treat them as missing.
# The result is assigned back to the frame rather than calling
# replace(inplace=True) on a column pulled out of it: mutating an extracted
# Series is not guaranteed to update df (it does not under copy-on-write).
df['born_in'] = df['born_in'].replace('', np.nan)
bornin_col = df['born_in'] # or df.born_in
bornin_col

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7           NaN
8           NaN
9           NaN
10          NaN
11          NaN
12          NaN
13          NaN
14          NaN
15          NaN
16          NaN
17          NaN
18          NaN
19          NaN
20      Germany
21          NaN
22          NaN
23          NaN
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29      Germany
         ...   
1039        NaN
1040      India
1041      India
1042        NaN
1043        NaN
1044        NaN
1045        NaN
1046        NaN
1047        NaN
1048        NaN
1049        NaN
1050        NaN
1051        NaN
1052        NaN
1053        NaN
1054        NaN
1055        NaN
1056      India
1057        NaN
1058      India
1059        NaN
1060        NaN
1061    Hungary
1062        NaN
1063        NaN
1064        NaN
1065        NaN
1066        NaN
1067        NaN
1068        NaN
Name: born_in, Length: 1

In [12]:
# Rows with born_in are duplicate entries, so keep only the rows whose
# born_in value is NaN
df = df[df.born_in.isnull()]
print(df.count()) # count() reports non-null values per column; born_in is now 0
# The (now empty) born_in column itself is kept for the moment:
#df = df.drop('born_in', axis=1) # drop the born_in column

name              938
born_in             0
category          938
country           938
date_of_birth     929
date_of_death     627
gender            929
link              938
place_of_birth    929
place_of_death    627
text              938
year              938
dtype: int64


### Finding other duplicates
An internet search shows that 889 people and organizations have received a Nobel Prize, but our data set contains more entries than that, so let's look for further duplicates.

In [13]:
# duplicated('name') is True for every row whose name already appeared in an
# earlier row, so this keeps the repeats but not the first occurrence.
dupes_by_name = df[df.duplicated('name')]
dupes_by_name.count()

name              56
born_in            0
category          56
country           56
date_of_birth     55
date_of_death     35
gender            55
link              56
place_of_birth    55
place_of_death    35
text              56
year              56
dtype: int64

In [14]:
# By default, 'duplicated' flags every occurrence of a name EXCEPT the first;
# with keep='last' it flags every occurrence except the last.  OR-ing the two
# masks therefore selects all rows whose name occurs more than once.
all_dupes = df[df.duplicated('name') | df.duplicated('name', keep='last')]
all_dupes.count()

# we could have also done this in the following way
# using the 'isin' method
all_dupes = df[df.name.isin(dupes_by_name.name)]
all_dupes.count()

# we could have also done this in the following way
# groupby returns iterator of (groupname, dataframeofrows) tuples, e.g.
for name, rows in df.groupby('name'):
    # Format into a unicode template instead of encoding the name to bytes:
    # this prints non-ASCII names correctly on Python 2 and avoids the
    # b'...' representation on Python 3.
    print(u'name: {}, number of rows: {}'.format(name, len(rows)))
# use groupby as follows: keep only the groups that contain more than one row
pd.concat([g for _, g in df.groupby('name') if len(g) > 1])

name: A. Michael Spence, number of rows: 1
name: Aage Bohr, number of rows: 1
name: Aaron Ciechanover, number of rows: 1
name: Aaron Klug, number of rows: 2
name: Abdus Salam, number of rows: 1
name: Ada Yonath, number of rows: 1
name: Adam G. Riess, number of rows: 1
name: Adolf Butenandt, number of rows: 1
name: Adolf Otto Reinhold Windaus, number of rows: 1
name: Adolf von Baeyer, number of rows: 1
name: Adolfo Pérez Esquivel, number of rows: 2
name: Ahmed H. Zewail, number of rows: 1
name: Ahmed Zewail, number of rows: 1
name: Akira Suzuki, number of rows: 1
name: Al Gore, number of rows: 1
name: Alan Heeger, number of rows: 1
name: Alan Lloyd Hodgkin, number of rows: 1
name: Alan MacDiarmid, number of rows: 1
name: Albert A. Michelson, number of rows: 1
name: Albert Camus, number of rows: 1
name: Albert Claude, number of rows: 1
name: Albert Einstein, number of rows: 2
name: Albert Fert, number of rows: 1
name: Albert Lutuli, number of rows: 1
name: Albert Schweitzer, number of ro

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
785,Aaron Klug,,Chemistry,United Kingdom,11 August 1926,,male,https://en.wikipedia.org/wiki/Aaron_Klug,Želva,,"Aaron Klug , born in Lithuania , Chemistry, 1982",1982
799,Aaron Klug,,Chemistry,South Africa,11 August 1926,,male,https://en.wikipedia.org/wiki/Aaron_Klug,Želva,,"Aaron Klug , Chemistry, 1982",1982
277,Adolfo Pérez Esquivel,,Peace,Summary,26 November 1931,,male,https://en.wikipedia.org/wiki/Adolfo_P%C3%A9re...,Buenos Aires,,"Adolfo Pérez Esquivel , Peace, 1980",1980
289,Adolfo Pérez Esquivel,,Peace,Argentina,26 November 1931,,male,https://en.wikipedia.org/wiki/Adolfo_P%C3%A9re...,Buenos Aires,,"Adolfo Pérez Esquivel , Peace, 1980",1980
836,Albert Einstein,,Physics,Switzerland,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , born in Germany , Physics, ...",1921
986,Albert Einstein,,Physics,Germany,14 March 1879,18 April 1955,male,https://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , Physics, 1921",1921
692,Angus Deaton,,Economics,United States,19 October 1945,,male,https://en.wikipedia.org/wiki/Angus_Deaton,Edinburgh,,"Angus Deaton , born in United Kingdom , Econo...",2015
841,Angus Deaton,,Economics,United Kingdom,19 October 1945,,male,https://en.wikipedia.org/wiki/Angus_Deaton,Edinburgh,,"Angus Deaton , Economics, 2015",2015
675,Aziz Sancar,,Chemistry,United States,8 September 1946,,male,https://en.wikipedia.org/wiki/Aziz_Sancar,Savur,,"Aziz Sancar , born in Turkey , Chemistry, 2015",2015
814,Aziz Sancar,,Chemistry,Turkey,8 September 1946,,male,https://en.wikipedia.org/wiki/Aziz_Sancar,Savur,,"Aziz Sancar , Chemistry, 2015",2015


In [15]:
# Sorting: a small demonstration of sort_values before applying it to the data.
# Sort by name ascending and, within equal names, by score descending.
df2 = pd.DataFrame({
    'name': ['zak', 'alice', 'bob', 'mike', 'bob', 'bob'],
    'score': [4, 3, 5, 2, 3, 7],
})
df2 = df2.sort_values(by=['name', 'score'], ascending=[True, False])
print(df2)
# Same idea on the duplicated winners, showing only the columns of interest
all_dupes.sort_values(by='name')[['name', 'country', 'year']]

    name  score
1  alice      3
5    bob      7
2    bob      5
4    bob      3
3   mike      2
0    zak      4


Unnamed: 0,name,country,year
785,Aaron Klug,United Kingdom,1982
799,Aaron Klug,South Africa,1982
277,Adolfo Pérez Esquivel,Summary,1980
289,Adolfo Pérez Esquivel,Argentina,1980
986,Albert Einstein,Germany,1921
836,Albert Einstein,Switzerland,1921
841,Angus Deaton,United Kingdom,2015
692,Angus Deaton,United States,2015
814,Aziz Sancar,Turkey,2015
675,Aziz Sancar,United States,2015


In [16]:
# This output shows that some recipients are in our dataset
# twice for the same year, but different countries.

# Remove the Sidney Altman 1990 row with a boolean mask (~ is the logical 'not').
# An equivalent spelling is df.drop(df[<mask>].index); doing both is redundant.
# .copy() makes df an independent frame, so the .loc assignment below writes
# to df itself instead of to a slice of the earlier frame.
df = df[~((df.name=='Sidney Altman') & (df.year==1990))].copy()

# Change Curie's country to France for the 1911 entry
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &
       (df.year == 1911), 'country'] = 'France'


In [17]:
# define function for doing these cleanup activities
def clean_data(df):
    '''Return a cleaned copy of the winners frame; the input is not modified.

    Steps:
      * empty strings become NaN
      * rows that carry a born_in value are removed and the born_in column is
        dropped (skipped when the column is already gone, so the function can
        be applied to its own output)
      * rows with year 1809, rows named 'Marie Curie' and the Sidney Altman
        1990 row are removed
      * the country of Marie Sklodowska-Curie's 1911 entry is set to 'France'
    '''
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df['born_in'].isnull()].drop('born_in', axis=1)
    name, year = df['name'], df['year']
    discard = (
        (year == 1809)
        | (name == 'Marie Curie')
        | ((name == 'Sidney Altman') & (year == 1990))
    )
    # .copy(): the assignment below must write to an independent frame, not to
    # a slice of the filtered one.
    df = df[~discard].copy()
    df.loc[(df['name'] == u'Marie Sk\u0142odowska-Curie') & (df['year'] == 1911),
           'country'] = 'France'
    return df

# Rebind df to the frame returned by clean_data
df = clean_data(df)

In [18]:
# get rid of half of the dual country winners so each prize just counts once.
# to be fair, shuffle the rows first so that the country that survives is
# arbitrary; a fixed seed keeps that choice identical on every re-run.
shuffled_index = np.random.RandomState(0).permutation(df.index)
df = df.reindex(shuffled_index) # rows in shuffled order
# drop duplicate rows (both 'name' and 'year' the same) after the first encountered
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index() # go back to unrandomized index
df.count()

name              883
category          878
country           883
date_of_birth     875
date_of_death     593
gender            875
link              883
place_of_birth    875
place_of_death    593
text              883
year              883
dtype: int64

In [19]:
# let's see how many duplicate winners we have now (some are valid):
# select every row whose name occurs more than once, sorted by name
df[df.duplicated('name') | df.duplicated('name', keep='last')]\
    .sort_values(by='name')[['name', 'country', 'year', 'category']]

# The output shows the 4 valid multi-award winners!

Unnamed: 0,name,country,year,category
748,Frederick Sanger,United Kingdom,1958,Chemistry
769,Frederick Sanger,United Kingdom,1980,Chemistry
376,John Bardeen,United States,1956,Physics
448,John Bardeen,United States,1972,Physics
383,Linus C. Pauling,United States,1954,Chemistry
401,Linus C. Pauling,United States,1962,Peace
924,Marie Skłodowska-Curie,Poland,1903,Physics
942,Marie Skłodowska-Curie,France,1911,Chemistry


In [20]:
# Rows whose award year is 2016 or 2017
df[df.year.isin([2016, 2017])] #my total differs from book, so look at recent winners...

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
186,Bengt R. Holmström,Economics,Finland,18 April 1949,,male,https://en.wikipedia.org/wiki/Bengt_R._Holmstr...,Helsinki,,"Bengt R. Holmström , Economics, 2016",2016
191,Jean-Pierre Sauvage,Chemistry,France,21 October 1944,,male,https://en.wikipedia.org/wiki/Jean-Pierre_Sauvage,Paris,,"Jean-Pierre Sauvage , Chemistry, 2016",2016
221,Juan Manuel Santos,Peace,Colombia,10 August 1951,,male,https://en.wikipedia.org/wiki/Juan_Manuel_Santos,Bogota,,"Juan Manuel Santos , Peace, 2016",2016
555,Bob Dylan,Literature,United States,24 May 1941,,male,https://en.wikipedia.org/wiki/Bob_Dylan,Duluth,,"Bob Dylan , Literature, 2016",2016
687,Oliver Hart,Economics,United States,9 October 1948,,male,https://en.wikipedia.org/wiki/Oliver_Hart_(eco...,London,,"Oliver Hart , born in United Kingdom , Econom...",2016
690,F. Duncan M. Haldane,Physics,United States,14 September 1951,,male,https://en.wikipedia.org/wiki/F._Duncan_M._Hal...,London,,"F. Duncan M. Haldane , born in United Kingdom...",2016
691,John M. Kosterlitz,Physics,United States,22 June 1943,,male,https://en.wikipedia.org/wiki/John_M._Kosterlitz,Aberdeen,,"John M. Kosterlitz , born in United Kingdom ,...",2016
821,Fraser Stoddart,Chemistry,United Kingdom,24 May 1942,,male,https://en.wikipedia.org/wiki/Fraser_Stoddart,Edinburgh,,"Fraser Stoddart , Chemistry, 2016",2016
822,David J. Thouless,Physics,United Kingdom,21 September 1934,,male,https://en.wikipedia.org/wiki/David_J._Thouless,Bearsden,,"David J. Thouless , Physics, 2016",2016
996,Ben Feringa,Chemistry,Netherlands,18 May 1951,,male,https://en.wikipedia.org/wiki/Ben_Feringa,Barger-Compascuum,,"Ben Feringa , Chemistry, 2016",2016


## Dealing with missing fields
First: missing categories

In [34]:
# NOTE(review): the saved output below shows identical counts for every column;
# the cell's execution number (34) suggests it was last run after the later
# cleaning steps -- confirm by re-running the notebook top to bottom.
print(df.count())  # we are missing some categories and genders
df.category.unique() # recall when we scraped, we checked against this limited list.

name              874
category          874
country           874
date_of_birth     874
date_of_death     592
gender            874
link              874
place_of_birth    874
place_of_death    592
text              874
year              874
dtype: int64


array([u'Chemistry', u'Physiology or Medicine', u'Peace', u'Literature',
       u'Physics', u'Economics'], dtype=object)

In [22]:
# look at the missing categories: rows whose category is null, with the
# columns that help to work out what the category should have been
df[df.category.isnull()][['name', 'text', 'link', 'category']]

Unnamed: 0,name,text,link,category
7,Róbert Bárány,"Róbert Bárány , born in Austria-Hungary, Medi...",https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,
9,Leopold Ružička born in Kingdom of Hungary,Leopold Ružička born in Kingdom of Hungary,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,
138,Alexis Carrel,"Alexis Carrel , Medicine, 1912",https://en.wikipedia.org/wiki/Alexis_Carrel,
819,Ilya Ilyich Mechnikov,"Ilya Ilyich Mechnikov , Physiology and Medicin...",https://en.wikipedia.org/wiki/Ilya_Ilyich_Mech...,
1057,Amartya Sen,"Amartya Sen , Economic Sciences, 1998",https://en.wikipedia.org/wiki/Amartya_Sen,


In [23]:
# Exact-match lookup on the name as it is displayed; it returns no rows.
# NOTE(review): presumably the stored value differs from the displayed text in
# characters that are not visible (whitespace / unicode form) -- TODO confirm.
df.loc[df.name == u'Leopold Ružička born in Kingdom of Hungary', ['name', 'text', 'category']] # doesn't work

Unnamed: 0,name,text,category


In [24]:
# Print the names with non-ASCII characters shown as backslash escapes, then
# retry the exact match using the escaped spelling of the name.
print(u'Leopold Ružička born in Kingdom of Hungary'.encode('ascii', errors='backslashreplace'))
print(u'Róbert Bárány'.encode('ascii', errors='backslashreplace'))
df.loc[df.name==u'Leopold Ru\u017ei\u010dka born in Kingdom of Hungary', ['name', 'text', 'category']] # still doesn't work

Leopold Ru\u017ei\u010dka born in Kingdom of Hungary
R\xf3bert B\xe1r\xe1ny


Unnamed: 0,name,text,category


In [25]:
# Match on an ASCII-only prefix of the name instead of the full string
df.loc[df.name.str.startswith('Leopold Ru'), :] # works!

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
9,Leopold Ružička born in Kingdom of Hungary,,Hungary,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Vukovar,Mammern,Leopold Ružička born in Kingdom of Hungary,0
834,Leopold Ružička,Chemistry,Switzerland,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Vukovar,Mammern,"Leopold Ružička , born in then Austria-Hungar...",1939


In [26]:
# Same prefix technique for Róbert Bárány, with the accents written as escapes
df.loc[df.name.str.startswith(u'R\xf3bert B\xe1r\xe1ny')]

Unnamed: 0,name,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
7,Róbert Bárány,,Hungary,22 April 1876,8 April 1936,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , born in Austria-Hungary, Medi...",1914


In [27]:
def clean_data(df):
    '''Return a cleaned copy of the winners frame; the input is not modified.

    Steps:
      * empty strings become NaN
      * rows that carry a born_in value are removed and the born_in column is
        dropped (skipped when the column is already gone)
      * rows with year 1809, rows named 'Marie Curie', the Sidney Altman 1990
        row and every 'Leopold Ru...' row whose year is not 1939 are removed
      * the country of Marie Sklodowska-Curie's 1911 entry is set to 'France'
      * categories that were not in the standard format are filled in
    '''
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df['born_in'].isnull()].drop('born_in', axis=1)
    name, year = df['name'], df['year']
    discard = (
        (year == 1809)
        | (name == 'Marie Curie')
        | ((name == 'Sidney Altman') & (year == 1990))
        # na=False: a missing name simply does not match (keeps the mask boolean)
        | (name.str.startswith('Leopold Ru', na=False) & (year != 1939))
    )
    # One filtering step plus .copy(): the assignments below then write to an
    # independent frame rather than to a slice.
    df = df[~discard].copy()
    name = df['name']
    df.loc[(name == u'Marie Sk\u0142odowska-Curie') & (df['year'] == 1911),
           'country'] = 'France'
    # the missing categories are mostly people whose category name wasn't in
    # standard format; fix that:
    medicine = (name.isin(['Alexis Carrel', 'Ilya Ilyich Mechnikov'])
                | name.str.startswith(u'R\xf3bert B\xe1r\xe1ny', na=False))
    df.loc[medicine, 'category'] = 'Physiology or Medicine'
    df.loc[name == 'Amartya Sen', 'category'] = 'Economics'
    return df

# Rebind df to the frame returned by clean_data
df = clean_data(df)

Now, missing gender

In [29]:
# Rows without a gender value, with the columns needed to identify them
df[df['gender'].isnull()][['name', 'year', 'category']]

Unnamed: 0,name,year,category
103,International Atomic Energy Agency,2005,Peace
242,Pugwash Conferences on Science and World Affairs,1995,Peace
269,Institut de Droit International,1904,Peace
307,Amnesty International,1977,Peace
310,Friends Service Council,1947,Peace
366,American Friends Service Committee (The Quakers),1947,Peace
813,Tunisian National Dialogue Quartet,2015,Peace
851,Médecins Sans Frontières,1999,Peace


In [30]:
def clean_data(df):
    '''Return a cleaned copy of the winners frame; the input is not modified.

    Steps:
      * empty strings become NaN
      * rows that carry a born_in value are removed and the born_in column is
        dropped (skipped when the column is already gone)
      * rows with year 1809, rows named 'Marie Curie', the Sidney Altman 1990
        row and every 'Leopold Ru...' row whose year is not 1939 are removed
      * the country of Marie Sklodowska-Curie's 1911 entry is set to 'France'
      * categories that were not in the standard format are filled in
      * rows without a gender are removed
    '''
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        df = df[df['born_in'].isnull()].drop('born_in', axis=1)
    name, year = df['name'], df['year']
    discard = (
        (year == 1809)
        | (name == 'Marie Curie')
        | ((name == 'Sidney Altman') & (year == 1990))
        # na=False: a missing name simply does not match (keeps the mask boolean)
        | (name.str.startswith('Leopold Ru', na=False) & (year != 1939))
        # drop null gender entries (they are all institutions/organizations)
        | df['gender'].isnull()
    )
    # One filtering step plus .copy(): the assignments below then write to an
    # independent frame rather than to a slice.
    df = df[~discard].copy()
    name = df['name']
    df.loc[(name == u'Marie Sk\u0142odowska-Curie') & (df['year'] == 1911),
           'country'] = 'France'
    # the missing categories are mostly people whose category name wasn't in
    # standard format; fix that:
    medicine = (name.isin(['Alexis Carrel', 'Ilya Ilyich Mechnikov'])
                | name.str.startswith(u'R\xf3bert B\xe1r\xe1ny', na=False))
    df.loc[medicine, 'category'] = 'Physiology or Medicine'
    df.loc[name == 'Amartya Sen', 'category'] = 'Economics'
    return df

# Rebind df to the frame returned by clean_data
df = clean_data(df)

## Dealing with Times and Dates

In [31]:
# .loc slices by index LABEL (up to and including label 20), so labels that
# were removed during cleaning are simply absent from the result.
df.loc[:20, ['name', 'date_of_birth']]

Unnamed: 0,name,date_of_birth
0,George de Hevesy,1 August 1885
1,Albert Szent-Györgyi,16 September 1893
2,Kofi Annan,8 April 1938
3,Odysseas Elytis,2 November 1911
4,Giorgos Seferis,13 March 1900
5,Rigoberta Menchú,9 January 1959
6,Richard Adolf Zsigmondy,1 April 1865
7,Róbert Bárány,22 April 1876
10,Fritz Haber,9 December 1868
11,Max Karl Ernst Ludwig Planck,23 April 1858


In [32]:
# errors='raise' makes an unparseable date raise instead of being skipped.
# Neither result is stored; only the second (last expression) is displayed.
pd.to_datetime(df.date_of_birth, errors='raise') # no errors
pd.to_datetime(df.date_of_death, errors='raise')


0      1966-07-05
1      1986-10-22
2             NaT
3      1996-03-18
4      1971-09-20
5             NaT
6      1929-09-23
7      1936-04-08
10     1934-01-29
11     1947-10-04
12     1942-08-03
13     1960-04-24
14     1946-06-06
15     1928-08-30
16     1931-02-26
17     1927-07-05
18     1914-04-02
19     1918-04-20
21     1938-05-04
22     1941-09-09
23     1976-02-01
24     1970-08-01
25     1940-04-26
26     1949-03-30
27     1945-03-31
28     1955-08-12
30     1959-06-09
31     1941-03-04
32     1957-08-05
33     1929-10-03
          ...    
1023   1992-04-08
1024   1968-06-14
1026   1979-05-02
1029   1995-11-04
1030   2016-09-28
1031   2013-08-30
1032   1988-01-15
1033   1989-12-22
1034   1995-06-25
1036   1997-09-05
1038   1970-11-21
1039   1941-08-07
1042          NaT
1043          NaT
1044          NaT
1045          NaT
1046          NaT
1047          NaT
1048          NaT
1049          NaT
1050          NaT
1051          NaT
1052          NaT
1053          NaT
1054   199

In [36]:
# if there was an error raised, this is one way to know which row caused it:
for i, row in df.iterrows():
    try:
        pd.to_datetime(row.date_of_death, errors='raise')
    except (ValueError, TypeError, OverflowError):
        # Only catch parsing failures; a bare `except:` would also swallow
        # KeyboardInterrupt and genuine programming errors.
        print('{}({}, {})'.format(row.date_of_death.ljust(30), row['name'], i))

# Vectorised alternative: among rows that have a death date, errors='coerce'
# turns unparseable values into NaT, so the null results mark the bad rows.
with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death', 'name']]

Unnamed: 0,category,date_of_death,name


___
# FINAL
# COPY OF GETTING THE DIRTY DATA:


In [69]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication.

    db_name  -- name of the database to return; in the authenticated case it is
                also the database named in the connection URI
    host     -- server host name (or address)
    port     -- server port; used as the default port in both branches
    username -- user name; authentication is used only when BOTH username and
                password are truthy
    password -- password belonging to username

    Returns the pymongo database object ``conn[db_name]``.
    '''
    if username and password:
        # Function-scope import that works on both Python 3 and Python 2.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus
        # Credentials are percent-escaped so that characters such as '@', ':'
        # or '/' in them cannot corrupt the URI.  Pass raw (unescaped) values.
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(
            quote_plus(username), quote_plus(password), host, db_name)
        # Hand `port` over as well: it is the default port for URI hosts that
        # do not name one, so a non-default port is no longer silently ignored.
        conn = MongoClient(mongo_uri, port)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query={}, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection.

    db_name, host, port, username, password -- forwarded to get_mongo_database
    collection_name -- collection to read
    query           -- filter passed to ``find``; the default matches everything
                       (the dict is only read here, never mutated)
    no_id           -- when True, drop MongoDB's '_id' column from the result

    Returns a DataFrame with one row per matching document; it is empty (no
    columns) when nothing matches.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    df = pd.DataFrame(list(cursor))
    # An empty result produces a frame without any columns, so only remove
    # '_id' when it is actually present instead of raising KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

def dataframe_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''Save a DataFrame to a MongoDB collection, one document per row.

    dframe          -- the DataFrame to store
    db_name, host, port, username, password -- forwarded to get_mongo_database
    collection_name -- collection that receives the documents

    Existing documents in the collection are left in place; clear the
    collection first if a fresh copy is wanted.  Returns None.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    # Serialise the frame that was passed in -- not whatever notebook-level
    # variable happens to be called `df`.
    # Missing values (NaN / NaT) are stored as None: NaT in particular cannot be
    # BSON-encoded.  astype(object) keeps None from being coerced back to NaN.
    storable = dframe.astype(object).where(dframe.notnull(), None)
    records = storable.to_dict('records')  # 'records' puts it into our list-of-dicts format
    if records:  # insert_many rejects an empty list, so an empty frame is a no-op
        db[collection_name].insert_many(records)


# Database / collection names are kept in constants: asking MongoDB for a
# misspelled name does not fail, it silently creates a new database/collection.
DB_NOBEL_PRIZE = 'nobel_prize' # database name
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

# Load the scraped (still dirty) winners from the local JSON file
with open('data/nwinners.json') as f:
    df = pd.read_json(f)

# Make sure mongodb is clear (so we don't duplicate data), then
# save to Mongo for next section
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # empty filter: deletes every document in the collection
dataframe_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------

# Read the same data back; `df` is again the raw, uncleaned frame
df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)
df.count()


born_in           1069
category          1069
country           1069
date_of_birth     1060
date_of_death      712
gender            1060
link              1069
name              1069
place_of_birth    1060
place_of_death     712
text              1069
year              1069
dtype: int64

# The final clean_data() function

In [70]:
def clean_data(df, random_state=None):
    '''Clean the raw winners frame; the input is not modified.

    df           -- raw winners DataFrame
    random_state -- optional integer seed for the shuffle that decides which
                    country survives for dual-country winners.  None (the
                    default) uses numpy's global random state.

    Returns ``(df_clean, df_born_in)``:
      df_clean   -- the cleaned winners with parsed date_of_birth /
                    date_of_death columns and an added 'award_age' column
                    (award year minus birth year)
      df_born_in -- the rows that carried a born_in value (empty when the input
                    has no born_in column)
    '''
    df = df.replace('', np.nan)
    if 'born_in' in df.columns:
        has_born_in = df['born_in'].notnull()
        df_born_in = df[has_born_in]
        # eliminate the rows with 'born_in' values, then the column itself
        df = df[~has_born_in].drop('born_in', axis=1)
    else:
        df_born_in = df.iloc[0:0]  # nothing to split off

    discard = (
        (df['year'] == 1809)
        | (df['name'] == 'Marie Curie')
        | ((df['name'] == 'Sidney Altman') & (df['year'] == 1990))
    )
    # .copy(): later assignments must write to an independent frame
    df = df[~discard].copy()
    df.loc[(df['name'] == u'Marie Sk\u0142odowska-Curie') & (df['year'] == 1911),
           'country'] = 'France'

    # drop duplicate awards where the recipient had 2 countries.  Randomize which
    # country is dropped by first randomizing the row order
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    df = df.reindex(rng.permutation(df.index))
    df = df.drop_duplicates(['name', 'year']) # if duplicate on these 2 fields, drop after 1st entry
    df = df.sort_index() # restores the index order so it is no longer random

    # the missing categories are mostly people whose category name wasn't in
    # standard format; fix that:
    medicine = (df['name'].isin(['Alexis Carrel', 'Ilya Ilyich Mechnikov'])
                | df['name'].str.startswith(u'R\xf3bert B\xe1r\xe1ny', na=False))
    df.loc[medicine, 'category'] = 'Physiology or Medicine'
    df.loc[df['name'] == 'Amartya Sen', 'category'] = 'Economics'

    # remove the stray 'Leopold Ru...' rows (only the 1939 entry is kept) and
    # the null gender entries (they are all institutions/organizations)
    leopold_extra = df['name'].str.startswith('Leopold Ru', na=False) & (df['year'] != 1939)
    df = df[~leopold_extra & df['gender'].notnull()].copy()

    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
    # unparseable death dates become NaT instead of raising
    df['date_of_death'] = pd.to_datetime(df['date_of_death'], errors='coerce')
    df['award_age'] = df['year'] - df['date_of_birth'].dt.year
    return df, df_born_in

# clean_data returns two frames: the cleaned winners and the rows that were
# set aside because they carried a born_in value
df_clean, df_born_in = clean_data(df)

df_clean.count()

category          874
country           874
date_of_birth     874
date_of_death     592
gender            874
link              874
name              874
place_of_birth    874
place_of_death    592
text              874
year              874
award_age         874
dtype: int64

# Save the data

In [71]:
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
db['winners_born_in'].delete_many({})  # deletes everything (no filter) to make sure we're starting fresh
dataframe_to_mongo(df_clean, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section
dataframe_to_mongo(df_born_in, DB_NOBEL_PRIZE, 'winners_born_in')

# also save a local copy on sqlite
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite:///data/nobel_prize.db')
# if_exists='replace' mirrors the "start fresh" treatment of the Mongo
# collections above; the default ('fail') raises as soon as the table exists,
# i.e. on every re-run of this cell.
df_clean.to_sql('winners', engine, if_exists='replace')


# Merge MiniBio Dataframe

In [72]:
# Load the mini-biographies from the local JSON file
with open('data/minibios.json') as f:
    df_winners_bios = pd.read_json(f)
    
# Outer join on the shared 'link' column: rows from either frame are kept even
# when the other frame has no matching link
df_winners_all = pd.merge(df_clean, df_winners_bios, how='outer', on='link')
df_winners_all.count()

category          1029
country           1029
date_of_birth     1029
date_of_death      691
gender            1029
link              1081
name              1029
place_of_birth    1029
place_of_death     691
text              1029
year              1029
award_age         1029
bio_image         1020
image_urls        1081
mini_bio          1081
dtype: int64

In [73]:
# Remove redundancies in two steps:
#   1. keep only the rows that have a name
#   2. among those, keep the first row for every ('link', 'year') combination
df_winners_all = (
    df_winners_all
    .loc[df_winners_all['name'].notnull()]
    .drop_duplicates(subset=['link', 'year'])
)
df_winners_all.count()

category          873
country           873
date_of_birth     873
date_of_death     592
gender            873
link              873
name              873
place_of_birth    873
place_of_death    592
text              873
year              873
award_age         873
bio_image         824
image_urls        873
mini_bio          873
dtype: int64

In [74]:
# Empty the target collection first (as is done for the other collections), so
# that re-running the notebook does not append a second copy of every document.
get_mongo_database(DB_NOBEL_PRIZE)['winners_all'].delete_many({})
dataframe_to_mongo(df_winners_all, DB_NOBEL_PRIZE, 'winners_all')