# Cleaning Data with Pandas
## Reading the data

In [48]:
import numpy as np
import pandas as pd

from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication.

    Args:
        db_name: name of the database to return.
        host: MongoDB host name.
        port: MongoDB port; honoured both with and without authentication.
        username: optional user name; authentication is only attempted when
            both username and password are truthy.
        password: optional password (see username).

    Returns:
        The object produced by indexing the MongoClient with db_name.
    '''
    if username and password:
        # Local import keeps the cell self-contained; the fallback covers
        # Python 2, where quote_plus lives directly in urllib.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus
        # Percent-escape the credentials: characters such as '@', ':' or '/'
        # would otherwise be read as URI delimiters.
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(
            quote_plus(username), quote_plus(password), host, db_name)
        # Forward the port as well; previously it was silently ignored on
        # this branch. pymongo uses it as the default port for a URI that
        # does not name one itself.
        conn = MongoClient(mongo_uri, port)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query=None, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection.

    Args:
        db_name: database holding the collection.
        collection_name: collection to read.
        query: filter passed to find(); None (the default) means an empty
            filter, i.e. every document.
        host, port, username, password: forwarded to get_mongo_database.
        no_id: when True, drop the '_id' column from the result.

    Returns:
        A DataFrame built from the matching documents. It is empty (and has
        no columns) when nothing matches.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    # None sentinel instead of a shared mutable `{}` default.
    cursor = db[collection_name].find({} if query is None else query)
    df = pd.DataFrame(list(cursor))
    # An empty result has no '_id' column; an unconditional del would raise
    # KeyError in that case.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

def save_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''Insert every row of a DataFrame into a MongoDB collection.

    Args:
        dframe: the DataFrame to store; each row becomes one document.
        db_name: target database name.
        collection_name: target collection name.
        host, port, username, password: forwarded to get_mongo_database.
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    # Use the argument, not the notebook-global `df` the original read by
    # mistake. 'records' puts it into our list-of-dicts format.
    records = dframe.to_dict('records')
    # insert_many rejects an empty document list, so an empty frame is a no-op.
    if records:
        db[collection_name].insert_many(records)


# Database and collection names are kept as constants: a misspelled name in a
# later lookup would not fail, it would silently create a new database/collection.
DB_NOBEL_PRIZE = 'nobel_prize'
COLL_WINNERS = 'winners' # winners collection

#----------------------------
# From json file.
#----------------------------

# Path is relative to the directory the notebook is run from.
with open('data/nwinners.json') as f:
    df = pd.read_json(f)

# Make sure the MongoDB collection is empty (so re-running this cell does not
# duplicate data), then save the frame to Mongo for the next section.
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].delete_many({})  # empty filter matches, and so deletes, every document
save_to_mongo(df, DB_NOBEL_PRIZE, COLL_WINNERS) # save to Mongo for next section

#----------------------------
# From mongodb collection
#----------------------------

# Rebind df to the copy read back from MongoDB; later cells use this frame.
df = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)


## Inspecting the data

In [49]:
# Print a structural summary: index range, per-column non-null counts, dtypes, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069 entries, 0 to 1068
Data columns (total 11 columns):
born_in           1069 non-null object
category          1069 non-null object
country           1069 non-null object
date_of_birth     1060 non-null object
gender            1060 non-null object
link              1069 non-null object
name              1069 non-null object
place_of_birth    1060 non-null object
place_of_death    712 non-null object
text              1069 non-null object
year              1069 non-null int64
dtypes: int64(1), object(10)
memory usage: 91.9+ KB


In [50]:
# Only the last expression of a cell is displayed, so the first call's result
# is computed and discarded; it is kept here to show the default form.
df.describe() # default: summarises numeric columns only
df.describe(include=['object']) # include selects which dtypes to summarise (here the object/string columns)

Unnamed: 0,born_in,category,country,date_of_birth,gender,link,name,place_of_birth,place_of_death,text
count,1069.0,1069,1069,1060,1060,1069,1069,1060,712,1069
unique,33.0,7,59,868,2,912,1002,607,316,1057
top,,Physiology or Medicine,United States,7 November 1867,male,http://en.wikipedia.org/wiki/Marie_Curie,César Milstein,New York City,Cambridge,"Luis Federico Leloir , Chemistry, 1970"
freq,938.0,256,352,4,1003,4,3,44,37,2


In [51]:
# Only the last expression is displayed: tail() runs but its result is not shown.
df.tail()   # last rows (default count)
df.head(3)  # first 3 rows

Unnamed: 0,born_in,category,country,date_of_birth,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Ireland,6 October 1903,male,http://en.wikipedia.org/wiki/Ernest_Walton,Ernest Walton,Dungarvan,Belfast,"Ernest Walton , Physics, 1951",1951
1,,Chemistry,Germany,10 October 1936,male,http://en.wikipedia.org/wiki/Gerhard_Ertl,Gerhard Ertl,Stuttgart,,"Gerhard Ertl , Chemistry, 2007",2007
2,,Physics,Germany,18 May 1939,male,http://en.wikipedia.org/wiki/Peter_Gr%C3%BCnberg,Peter Grünberg,Plzeň,,"Peter Grünberg , born in then Protectorate of...",2007


## Set indices (optional, but useful)

In [56]:
print(df.columns) # the column index
df = df.set_index('name') # row index (set_index returns a new frame, so assign it back to df)
# loc accesses by label index. Labels need not be unique: this lookup returns
# two rows, one per country entry recorded for the same laureate.
df.loc['Albert Einstein']

Index([          u'name',        u'born_in',       u'category',
              u'country',  u'date_of_birth',         u'gender',
                 u'link', u'place_of_birth', u'place_of_death',
                 u'text',           u'year'],
      dtype='object')


Unnamed: 0_level_0,born_in,category,country,date_of_birth,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Albert Einstein,,Physics,Germany,14 March 1879,male,http://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , Physics, 1921",1921
Albert Einstein,,Physics,Switzerland,14 March 1879,male,http://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , born in Germany , Physics, ...",1921


In [57]:
# Go back to the default integer index; 'name' becomes an ordinary column again.
# Note the 'inplace': df itself is modified, so no assignment is needed.
df.reset_index(inplace=True)
df.iloc[2] # iloc accesses by integer position (result not displayed: not the last expression)
df.head(2)

Unnamed: 0,name,born_in,category,country,date_of_birth,gender,link,place_of_birth,place_of_death,text,year
0,Ernest Walton,,Physics,Ireland,6 October 1903,male,http://en.wikipedia.org/wiki/Ernest_Walton,Dungarvan,Belfast,"Ernest Walton , Physics, 1951",1951
1,Gerhard Ertl,,Chemistry,Germany,10 October 1936,male,http://en.wikipedia.org/wiki/Gerhard_Ertl,Stuttgart,,"Gerhard Ertl , Chemistry, 2007",2007


In [58]:
# Select a single column as a Series; attribute access and df['born_in'] are equivalent here.
bornin_col = df.born_in # or df['born_in']
bornin_col

0              
1              
2              
3              
4              
5              
6              
7         India
8              
9         India
10             
11        India
12             
13             
14             
15             
16             
17             
18             
19             
20             
21             
22             
23      Germany
24             
25             
26             
27             
28             
29             
         ...   
1039           
1040           
1041           
1042           
1043           
1044           
1045           
1046    Ireland
1047           
1048           
1049           
1050    Germany
1051           
1052           
1053           
1054           
1055           
1056           
1057           
1058           
1059           
1060           
1061    Hungary
1062           
1063           
1064           
1065           
1066           
1067           
1068           
Name: born_in, Length: 1

In [72]:
# Row slicing with plain [] works by position. Only the final expression of
# the cell is displayed; the earlier ones are shown for their syntax.
df[0:10] # first 10 rows
df[-4:]  # last 4 rows

# masks: a boolean Series selects the rows where it is True
mask = df.year > 2000
df[mask]
df[df.year>2000] # more direct: build the mask inline

Unnamed: 0,name,born_in,category,country,date_of_birth,gender,link,place_of_birth,place_of_death,text,year
1,Gerhard Ertl,,Chemistry,Germany,10 October 1936,male,http://en.wikipedia.org/wiki/Gerhard_Ertl,Stuttgart,,"Gerhard Ertl , Chemistry, 2007",2007
2,Peter Grünberg,,Physics,Germany,18 May 1939,male,http://en.wikipedia.org/wiki/Peter_Gr%C3%BCnberg,Plzeň,,"Peter Grünberg , born in then Protectorate of...",2007
7,Venkatraman Ramakrishnan *,India,Chemistry,,1952,male,http://en.wikipedia.org/wiki/Venkatraman_Ramak...,Chidambaram,,"Venkatraman Ramakrishnan *, as a UK Citizen ,...",2009
10,Kailash Satyarthi,,Peace,India,11 January 1954,male,http://en.wikipedia.org/wiki/Kailash_Satyarthi,Vidisha,,"Kailash Satyarthi , Peace, 2014",2014
12,Kofi Annan,,Peace,Ghana,8 April 1938,male,http://en.wikipedia.org/wiki/Kofi_Annan,Kumasi,,"Kofi Annan , Peace, 2001",2001
17,Imre Kertész,,Literature,Hungary,9 November 1929,male,http://en.wikipedia.org/wiki/Imre_Kert%C3%A9sz,Budapest,Budapest,"Imre Kertész , Literature, 2002",2002
92,Martin Karplus *,Austria,Chemistry,,15 March 1930,male,http://en.wikipedia.org/wiki/Martin_Karplus,Vienna,,"Martin Karplus *, Chemistry, 2013",2013
93,International Atomic Energy Agency,,Peace,Austria,,,http://en.wikipedia.org/wiki/International_Ato...,,,"International Atomic Energy Agency , Peace, 2005",2005
94,Elfriede Jelinek,,Literature,Austria,20 October 1946,female,http://en.wikipedia.org/wiki/Elfriede_Jelinek,Mürzzuschlag,,"Elfriede Jelinek , Literature, 2004",2004
174,Yves Chauvin,,Chemistry,France,10 October 1930,male,http://en.wikipedia.org/wiki/Yves_Chauvin,Menen,Tours,"Yves Chauvin , Chemistry, 2005",2005


## Cleaning the Data


In [73]:
# Collect the distinct Python types held in born_in; a single-element set
# means every entry, including the blank ones, is a string.
set(df.born_in.apply(type))

{unicode}

In [75]:
# Replace empty strings with NaN so pandas treats them as missing values.
# The result is assigned back to the column rather than calling
# replace(inplace=True) on a Series taken from the frame in an earlier cell:
# that only updates df when the Series is a view of it, which is not
# guaranteed (and never happens under pandas Copy-on-Write).
df['born_in'] = df['born_in'].replace('', np.nan)
bornin_col = df['born_in']  # refresh the handle so it reflects the cleaned column
bornin_col

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7         India
8           NaN
9         India
10          NaN
11        India
12          NaN
13          NaN
14          NaN
15          NaN
16          NaN
17          NaN
18          NaN
19          NaN
20          NaN
21          NaN
22          NaN
23      Germany
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29          NaN
         ...   
1039        NaN
1040        NaN
1041        NaN
1042        NaN
1043        NaN
1044        NaN
1045        NaN
1046    Ireland
1047        NaN
1048        NaN
1049        NaN
1050    Germany
1051        NaN
1052        NaN
1053        NaN
1054        NaN
1055        NaN
1056        NaN
1057        NaN
1058        NaN
1059        NaN
1060        NaN
1061    Hungary
1062        NaN
1063        NaN
1064        NaN
1065        NaN
1066        NaN
1067        NaN
1068        NaN
Name: born_in, Length: 1

In [76]:
# How many names with asterisk?
# The pattern is a regex, so the asterisk is escaped. It is written as a raw
# string because '\*' in a plain literal is an invalid escape sequence that
# newer Python versions warn about. A single .loc lookup replaces the chained
# df[mask]['name'] indexing.
df.loc[df.name.str.contains(r'\*'), 'name']

7          Venkatraman Ramakrishnan *
9        Subrahmanyan Chandrasekhar *
11               Har Gobind Khorana *
23                       Otto Loewi *
31           Hans von Euler-Chelpin *
55                     John Polanyi *
59              Georges J.F. Köhler *
61                     Arno Penzias *
62                  Henry Kissinger *
68                     Bernard Katz *
69                     Max Delbrück *
70              Hans Albrecht Bethe *
71                      Nelly Sachs *
73                     Konrad Bloch *
75             Maria Goeppert-Mayer *
79                   Polykarp Kusch *
80                         Max Born *
83             Fritz Albert Lipmann *
84                 Hans Adolf Krebs *
85                       Otto Stern *
88                 Jack Steinberger *
90                   John Cornforth *
92                   Martin Karplus *
95                      Eric Kandel *
96                      Walter Kohn *
105                Gerhard Herzberg *
117         

In [78]:
# Remove the asterisk marker from every name.
# NOTE(review): only the '*' itself is removed; the space that preceded it
# stays at the end of the name. Confirm that a later cell strips whitespace.
df['name'] = df['name'].str.replace('*', '')
# Sanity check: no name should contain an asterisk any more (expect an empty
# Series). Raw string avoids the invalid '\*' escape sequence in a plain literal.
df.loc[df.name.str.contains(r'\*'), 'name']

Series([], Name: name, dtype: object)