## Pandas 
* pandas is a open source library built on top of Numpy
* Allows for fast analysis and data cleaning and preparation
* has great performance and productivity
* it can work with a variety of data sources.

## Pandas Series 

In [4]:
import pandas as pd 
import numpy as np

In [5]:
arr = np.arange(0,5)
arr

array([0, 1, 2, 3, 4])

In [6]:
data = [0,1,2,3,4]
labels = ['a', 'b', 'c', 'd', 'e']
d = {0:'a', 1: 'b', 2: 'c', 3:'d', 4:'e'}

In [7]:
pd.Series(data, labels)

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [8]:
pd.Series(d) 

0    a
1    b
2    c
3    d
4    e
dtype: object

In [9]:
series = pd.Series(data=labels)
type(series)

pandas.core.series.Series

In [15]:
series[2]  # servies in pandas is like a map in python where we ahve keys and values. 

'c'

## Data Frames - Workhouse of the Pandas Library

In [11]:
from numpy.random import randn

In [21]:
rows = ['A','B','C','D','E']
cols = ['W','X','Y','Z']
df = pd.DataFrame(randn(5,4),rows,cols)
df.head()

Unnamed: 0,W,X,Y,Z
A,0.026057,-0.407968,0.379272,0.003552
B,0.219872,1.103009,-1.24291,0.459821
C,0.426043,-2.280854,0.989676,0.87539
D,-0.928994,0.223091,1.216253,-0.03559
E,-1.651167,-1.354544,-1.609429,-2.297259


In [23]:
df.head()

Unnamed: 0,W,X,Y,Z
A,0.026057,-0.407968,0.379272,0.003552
B,0.219872,1.103009,-1.24291,0.459821
C,0.426043,-2.280854,0.989676,0.87539
D,-0.928994,0.223091,1.216253,-0.03559
E,-1.651167,-1.354544,-1.609429,-2.297259


In [24]:
df['P'] = df['W']+df['Y']   # creates a new column of name P and adds to the data frame.
df

Unnamed: 0,W,X,Y,Z,P
A,0.026057,-0.407968,0.379272,0.003552,0.405329
B,0.219872,1.103009,-1.24291,0.459821,-1.023037
C,0.426043,-2.280854,0.989676,0.87539,1.415719
D,-0.928994,0.223091,1.216253,-0.03559,0.287258
E,-1.651167,-1.354544,-1.609429,-2.297259,-3.260596


In [25]:
df.drop('P', axis=1, inplace=True)  # this will drop the new column P and do it inplace. 

In [26]:
df

Unnamed: 0,W,X,Y,Z
A,0.026057,-0.407968,0.379272,0.003552
B,0.219872,1.103009,-1.24291,0.459821
C,0.426043,-2.280854,0.989676,0.87539
D,-0.928994,0.223091,1.216253,-0.03559
E,-1.651167,-1.354544,-1.609429,-2.297259


In [27]:
df.shape

(5, 4)

In [30]:
df['X']  # getting colum values for X - this returns a Series type.

pandas.core.series.Series

In [38]:
df.loc['A'] # string based index search. this is a series type too.

W   -0.927018
X   -0.375506
Y   -0.636176
Z    0.486354
Name: A, dtype: float64

In [44]:
df.loc[['A','C'],['X','Z']]  # getting a subset of elements in that arrays.

Unnamed: 0,X,Z
A,-0.375506,0.486354
C,0.616125,-0.28031


In [45]:
df.iloc[2] # numerical based index search - row at index 2

W    0.785058
X    0.616125
Y   -0.587598
Z   -0.280310
Name: C, dtype: float64

In [31]:
df > 0 

Unnamed: 0,W,X,Y,Z
A,True,False,True,True
B,True,True,False,True
C,True,False,True,True
D,False,True,True,False
E,False,False,False,False


In [32]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,0.026057,,0.379272,0.003552
B,0.219872,1.103009,,0.459821
C,0.426043,,0.989676,0.87539
D,,0.223091,1.216253,
E,,,,


In [33]:
df['W']>0 

A     True
B     True
C     True
D    False
E    False
Name: W, dtype: bool

In [34]:
resultdf = df[df['X']>0]  # other places df[df['Y']<0]
resultdf

Unnamed: 0,W,X,Y,Z
B,0.219872,1.103009,-1.24291,0.459821
D,-0.928994,0.223091,1.216253,-0.03559


In [35]:
resultdf[['X','Y']]   

Unnamed: 0,X,Y
B,1.103009,-1.24291
D,0.223091,1.216253


In [36]:
df[ (df['W']<0) & (df['Y']>1)]   # & operatior for more conditions to get more values. 

Unnamed: 0,W,X,Y,Z
D,-0.928994,0.223091,1.216253,-0.03559


In [37]:
df

Unnamed: 0,W,X,Y,Z
A,0.026057,-0.407968,0.379272,0.003552
B,0.219872,1.103009,-1.24291,0.459821
C,0.426043,-2.280854,0.989676,0.87539
D,-0.928994,0.223091,1.216253,-0.03559
E,-1.651167,-1.354544,-1.609429,-2.297259


In [38]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.026057,-0.407968,0.379272,0.003552
1,B,0.219872,1.103009,-1.24291,0.459821
2,C,0.426043,-2.280854,0.989676,0.87539
3,D,-0.928994,0.223091,1.216253,-0.03559
4,E,-1.651167,-1.354544,-1.609429,-2.297259


In [39]:
new_index = 'CO NY CA NO NE'.split()
new_index

['CO', 'NY', 'CA', 'NO', 'NE']

In [40]:
df['States'] = new_index
df

Unnamed: 0,W,X,Y,Z,States
A,0.026057,-0.407968,0.379272,0.003552,CO
B,0.219872,1.103009,-1.24291,0.459821,NY
C,0.426043,-2.280854,0.989676,0.87539,CA
D,-0.928994,0.223091,1.216253,-0.03559,NO
E,-1.651167,-1.354544,-1.609429,-2.297259,NE


In [41]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CO,0.026057,-0.407968,0.379272,0.003552
NY,0.219872,1.103009,-1.24291,0.459821
CA,0.426043,-2.280854,0.989676,0.87539
NO,-0.928994,0.223091,1.216253,-0.03559
NE,-1.651167,-1.354544,-1.609429,-2.297259


In [42]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.026057,-0.407968,0.379272,0.003552,CO
B,0.219872,1.103009,-1.24291,0.459821,NY
C,0.426043,-2.280854,0.989676,0.87539,CA
D,-0.928994,0.223091,1.216253,-0.03559,NO
E,-1.651167,-1.354544,-1.609429,-2.297259,NE
