Pandas Notes

- Pandas is an open source library built on top of NumPy.
- It allows for fast analysis and data cleaning and preparation.
- It excels in performance and productivity.
- It also has built-in visualization features.
- It can work with data from a wide variety of sources.

In [22]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [17]:
labels = ['a32','b33','c34']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10,'b':20,'c':30}

In [19]:
# Have to tell pandas what we want the index to be
pd.Series(data = my_data,index=labels)

a32    10
b33    20
c34    30
dtype: int64

In [21]:
# Passing a dict uses its keys as the Series index automatically
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [35]:
# Seeds NumPy's random number generator so the random numbers used for the DataFrame are reproducible
np.random.seed(101)

In [36]:
# Creates a 5x4 DataFrame of randn (standard normal random) numbers with A,B,C,D,E as the index and W,X,Y,Z as the columns
df = pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])

In [37]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [52]:
#Returns the W column
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64


In [53]:
# Returns the W and Z columns (pass a list of column names to get a DataFrame)
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [55]:
#Creates a new column called "New" adding columns W and Y together
df['New'] = df['W'] + df['Y']

In [57]:
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [59]:
# Returns a copy without the 'New' column; df itself is unchanged because inplace=True was not passed
df.drop('New',axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [61]:
# Drops it permanently with the inplace argument
df.drop('New',axis=1,inplace=True)

In [65]:
# Returns the 3rd row (integer position 2) as a Series
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [68]:
# Create a boolean DataFrame called 'booldf' that is True wherever df is greater than 0
booldf = df > 0

In [70]:
# Index the DataFrame with the booldf mask (cells where the mask is False become NaN), then replace all the NaN values with 0
df[booldf].fillna(0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,0.0,0.0,0.605965
C,0.0,0.740122,0.528813,0.0
D,0.188695,0.0,0.0,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [74]:
# Returns all the values in the df that are greater than 2 and fills all NaN values with 0
df[df>2].fillna(0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0
D,0.0,0.0,0.0,0.0
E,0.0,0.0,2.605967,0.0


In [88]:
# Combine multiple conditions with & (and) or | (or); wrap each condition in parentheses
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [90]:
# Resets the index to a column; not saved since we didn't pass the inplace=True argument
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [91]:
# Creates a variable holding the list ['CA', 'NY', 'WY', 'OR', 'CO'] by splitting the string on whitespace
newind = 'CA NY WY OR CO'.split()

In [92]:
# Shows the new list of values
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [93]:
#Creates a new column called 'States' where we assign 'newind' as the column values
df['States'] = newind

In [94]:
#Show the new df with States
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [96]:
# Returns a DataFrame with States as the index; df itself is unchanged since inplace=True was not passed
df.set_index('States')

States,W,X,Y,Z
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [97]:
#Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [102]:
#Creates Multi-Index dataframe
df = pd.DataFrame(randn(6,2),hier_index,['A','B'])