# Pandas Basics

## Index access

In [1]:
import pandas as pd

# Load the Nobel-winners dataset from a JSON file into a DataFrame
df = pd.read_json('data/nwinners.json')
df.head()  # preview the first five rows

Unnamed: 0,born_in,category,country,date_of_birth,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Austria,25 April 1900,male,http://en.wikipedia.org/wiki/Wolfgang_Pauli,Wolfgang Pauli,Vienna,Zurich,"Wolfgang Pauli , Physics, 1945",1945
1,Austria,Chemistry,,3 December 1900,male,http://en.wikipedia.org/wiki/Richard_Kuhn,Richard Kuhn *,Vienna,Heidelberg,"Richard Kuhn *, Chemistry, 1938",1938
2,,Physiology or Medicine,Australia,27 January 1903,male,http://en.wikipedia.org/wiki/John_Eccles_(neur...,John Carew Eccles,Melbourne,Locarno,"John Carew Eccles , Physiology or Medicine, 1963",1963
3,,Physiology or Medicine,Australia,3 September 1899,male,http://en.wikipedia.org/wiki/Frank_Macfarlane_...,Sir Frank Macfarlane Burnet,Traralgon,Melbourne,"Sir Frank Macfarlane Burnet , Physiology or Me...",1960
4,,Physiology or Medicine,Australia,24 September 1898,male,http://en.wikipedia.org/wiki/Howard_Florey,Howard Florey,Adelaide,Oxford,"Howard Florey , Physiology or Medicine, 1945",1945


Initially, the columns of a Pandas DataFrame are labelled by its columns property, which is a Pandas Index instance. DataFrame rows also initially have a single numeric index (Pandas can have multiple indexes, including string or datetime indices, if necessary), which is accessed via the index property.

In [2]:
# The column labels, stored as a pandas Index
df.columns

Index([u'born_in', u'category', u'country', u'date_of_birth', u'gender',
       u'link', u'name', u'place_of_birth', u'place_of_death', u'text',
       u'year'],
      dtype='object')

In [3]:
# The row labels; right after loading this is the default integer index
df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068],
           dtype='int64', length=1069)

Often, to aid selections, a column of the DataFrame will be set as the index via the set_index method.

In [4]:
# Use the 'name' column as the row index; df is rebound to the re-indexed frame
df = df.set_index('name')
# .loc selects by label; index labels need not be unique, so a name that
# occurs more than once returns every matching row
df.loc['Albert Einstein']

Unnamed: 0_level_0,born_in,category,country,date_of_birth,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Albert Einstein,,Physics,Switzerland,14 March 1879,male,http://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , born in Germany , Physics, ...",1921
Albert Einstein,,Physics,Germany,14 March 1879,male,http://en.wikipedia.org/wiki/Albert_Einstein,Ulm,Princeton,"Albert Einstein , Physics, 1921",1921


In [5]:
# reset_index() hands back a new DataFrame rather than modifying df. The result
# is not assigned here, so df keeps 'name' as its index in the cells below.
df.reset_index()
pass  # no-op final statement, so the cell displays no output

In [6]:
# .iloc selects by integer position (third row), regardless of the 'name' labels
df.iloc[2]

born_in                                                            
category                                     Physiology or Medicine
country                                                   Australia
date_of_birth                                       27 January 1903
gender                                                         male
link              http://en.wikipedia.org/wiki/John_Eccles_(neur...
place_of_birth                                            Melbourne
place_of_death                                              Locarno
text               John Carew Eccles , Physiology or Medicine, 1963
year                                                           1963
Name: John Carew Eccles, dtype: object

You can get a column with dot notation or conventional array access by keyword string

In [7]:
# A single column comes back as a Series that keeps the frame's row index
gender_col = df['gender']  # or df.gender
gender_col.head()

name
Wolfgang Pauli                 male
Richard Kuhn *                 male
John Carew Eccles              male
Sir Frank Macfarlane Burnet    male
Howard Florey                  male
Name: gender, dtype: object

## Grouping

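To select groups (or subsets of rows), use groupby. It returns a GroupBy object keyed by the distinct values of the chosen column; get_group then returns any one group as a new, filtered DataFrame.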

In [8]:
# NOTE: this rebinds df to the GroupBy object returned by groupby(), so the
# underlying DataFrame is no longer reachable through the name df afterwards.
df = df.groupby('category')
df.groups.keys()  # the group labels: one per distinct value of 'category'

[u'',
 u'Physiology or Medicine',
 u'Literature',
 u'Economics',
 u'Peace',
 u'Chemistry',
 u'Physics']

In [9]:
# df is the GroupBy object created in the previous cell; get_group returns the
# rows of a single group as an ordinary DataFrame
phy_group = df.get_group('Physics')
phy_group.head()

Unnamed: 0_level_0,born_in,category,country,date_of_birth,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Wolfgang Pauli,,Physics,Austria,25 April 1900,male,http://en.wikipedia.org/wiki/Wolfgang_Pauli,Vienna,Zurich,"Wolfgang Pauli , Physics, 1945",1945
Arthur H. Compton,,Physics,United States,10 September 1892,male,http://en.wikipedia.org/wiki/Arthur_H._Compton,Wooster,Berkeley,"Arthur H. Compton , Physics, 1927",1927
Robert A. Millikan,,Physics,United States,22 March 1868,male,http://en.wikipedia.org/wiki/Robert_A._Millikan,Morrison,San Marino,"Robert A. Millikan , Physics, 1923",1923
Albert A. Michelson,,Physics,United States,19 December 1852,male,http://en.wikipedia.org/wiki/Albert_A._Michelson,Province of Posen,Pasadena,"Albert A. Michelson , born in then Germany, n...",1907
Ernest Lawrence,,Physics,United States,8 August 1901,male,http://en.wikipedia.org/wiki/Ernest_Lawrence,Canton,Palo Alto,"Ernest Lawrence , Physics, 1939",1939


Another way to do this is with boolean masks (like numpy and R)

In [10]:
# Reload the data: df was rebound to a GroupBy object in the grouping cells above
df = pd.read_json('data/nwinners.json')
# df.category == 'Physics' is a boolean Series; indexing with it keeps the True rows
phy_group = df[df.category == 'Physics']
phy_group.head()

Unnamed: 0,born_in,category,country,date_of_birth,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Austria,25 April 1900,male,http://en.wikipedia.org/wiki/Wolfgang_Pauli,Wolfgang Pauli,Vienna,Zurich,"Wolfgang Pauli , Physics, 1945",1945
11,,Physics,United States,10 September 1892,male,http://en.wikipedia.org/wiki/Arthur_H._Compton,Arthur H. Compton,Wooster,Berkeley,"Arthur H. Compton , Physics, 1927",1927
13,,Physics,United States,22 March 1868,male,http://en.wikipedia.org/wiki/Robert_A._Millikan,Robert A. Millikan,Morrison,San Marino,"Robert A. Millikan , Physics, 1923",1923
25,,Physics,United States,19 December 1852,male,http://en.wikipedia.org/wiki/Albert_A._Michelson,Albert A. Michelson,Province of Posen,Pasadena,"Albert A. Michelson , born in then Germany, n...",1907
29,,Physics,United States,8 August 1901,male,http://en.wikipedia.org/wiki/Ernest_Lawrence,Ernest Lawrence,Canton,Palo Alto,"Ernest Lawrence , Physics, 1939",1939


## Series
DataFrames are a collection of Series, which are 1-dimensional.  The key idea of Series is the index.

In [20]:
import numpy as np
import pandas as pd

s1 = pd.Series(np.arange(4))  # values 0, 1, 2, 3
s = pd.Series([1, 2, 3, 4]) # default index is integers
print(s1 == s) # element-wise comparison -> all False here (0..3 vs 1..4)
print('First: \n{}'.format(s)) # 1 column of int indices and 1 column of values

s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) 
print('Second: \n{}'.format(s))  # now the index will be a, b, c, d

s = pd.Series({'a':1, 'b':2, 'c':3, 'd':4})  # dict keys become the index
print('Third: \n{}'.format(s))

s = pd.Series({'a':1, 'b':2}, index=['a', 'b', 'c']) # index 'c' will show NaN (and the dtype becomes float64)
print('Fourth: \n{}'.format(s))

s = pd.Series(9, index=['a', 'b', 'c']) # the scalar is repeated: every value will be 9
print('Fifth: \n{}'.format(s))

np.sqrt(s) # can use numpy functions -> a: 3, b: 3, c:3 (not displayed: not the cell's last expression)

s[1:3] # can slice like python or numpy; the labels of the sliced rows are preserved (not displayed either)

## BIG DEAL
## You can have mixed types (like lists, unlike numpy arrays); the Series then has dtype object
pd.Series([1, 2.1, 'foo']) + pd.Series([2, 3, 'bar'])


0    False
1    False
2    False
3    False
dtype: bool
First: 
0    1
1    2
2    3
3    4
dtype: int64
Second: 
a    1
b    2
c    3
d    4
dtype: int64
Third: 
a    1
b    2
c    3
d    4
dtype: int64
Fourth: 
a    1.0
b    2.0
c    NaN
dtype: float64
Fifth: 
a    9
b    9
c    9
dtype: int64


0         3
1       5.1
2    foobar
dtype: object

## Panels
Panels are 3D objects (multiple DataFrames)

In [23]:
# Two frames with different column names and different lengths (3 rows vs 4 rows)
df1 = pd.DataFrame({'foo':[1,2,3], 'bar':['a','b','c']})
df2 = pd.DataFrame({'baz':[7,8,9,10], 'qux':['p','q', 'r', 't']})
# NOTE(review): pd.Panel was deprecated and has been removed from recent pandas
# releases -- confirm the installed pandas version before running this cell.
pn = pd.Panel({'item1':df1, 'item2':df2})
print(pn)
# Every item is aligned to the union of all rows and columns, so 'item1' is
# padded with NaN for the row and the columns that only exist in df2
pn['item1']

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 4 (minor_axis)
Items axis: item1 to item2
Major_axis axis: 0 to 3
Minor_axis axis: bar to qux


Unnamed: 0,bar,baz,foo,qux
0,a,,1.0,
1,b,,2.0,
2,c,,3.0,
3,,,,


## Working with MongoDB inside Pandas

In [None]:
import pandas as pd
from pymongo import MongoClient

def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    '''Get (or create) named database from MongoDB with/out authentication

    db_name  -- name of the database to return
    host     -- server host name; may already carry its own ":port" suffix
    port     -- server port, used whenever host does not already include one
    username -- optional user name; authentication is only attempted when
                both username and password are given
    password -- optional password

    Returns the database object obtained by indexing the client with db_name.
    '''
    if username and password:
        # Local import keeps the cell self-contained on both Python 3 and 2
        try:
            from urllib.parse import quote_plus  # Python 3
        except ImportError:
            from urllib import quote_plus  # Python 2
        # Honour the port argument here as well (it used to be dropped from the
        # URI), but leave a host that already names a port untouched
        netloc = host if ':' in host else '{}:{}'.format(host, port)
        # Percent-escape the credentials so characters such as '@', ':' or '/'
        # cannot corrupt the URI
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(
            quote_plus(username), quote_plus(password), netloc, db_name)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db_name]

def mongo_to_dataframe(db_name, collection_name, query=None, host='localhost',
                       port=27017, username=None, password=None, no_id=True):
    '''Create a Pandas DataFrame from MongoDB collection

    db_name, host, port, username, password -- passed on to get_mongo_database
    collection_name -- collection to read from
    query           -- filter document handed to find(); None (the default)
                       means an empty filter, i.e. every document
    no_id           -- when True, drop the '_id' column from the result

    Returns a DataFrame with one row per document returned by the query.
    '''
    # Build the empty filter per call instead of sharing one mutable default
    if query is None:
        query = {}
    db = get_mongo_database(db_name, host, port, username, password)
    cursor = db[collection_name].find(query)
    frame = pd.DataFrame(list(cursor))
    # A query that matches nothing yields a frame with no columns at all, so
    # only drop '_id' when it is actually present
    if no_id and '_id' in frame.columns:
        del frame['_id']
    return frame

def save_to_mongo(dframe, db_name, collection_name, host='localhost',
                 port=27017, username=None, password=None):
    '''Insert the rows of a Pandas DataFrame into a MongoDB collection

    dframe          -- the DataFrame to store; each row becomes one document
    db_name, host, port, username, password -- passed on to get_mongo_database
    collection_name -- collection to insert into
    '''
    db = get_mongo_database(db_name, host, port, username, password)
    # Serialise the frame that was passed in (previously the global df was
    # used by mistake); 'records' puts it into our list-of-dicts format
    records = dframe.to_dict('records')
    # insert_many rejects an empty list, so an empty frame is simply a no-op
    if records:
        db[collection_name].insert_many(records)
    

# Name the database and the collection once, as string constants: a misspelt
# name in a later lookup would quietly create a new, empty one instead of failing.
DB_NOBEL_PRIZE = 'nobel_prize'
COLL_WINNERS = 'winners'  # the winners collection

# Turn the DataFrame into our list-of-dicts format (one dict per row) ...
records = df.to_dict('records')
# ... and write those documents to the winners collection
db = get_mongo_database(DB_NOBEL_PRIZE)
db[COLL_WINNERS].insert_many(records)

# Read the collection straight back into a DataFrame
df2 = mongo_to_dataframe(DB_NOBEL_PRIZE, COLL_WINNERS)
