# Introduction to Pandas
- Pandas is an open source library built on top of NumPy
- It allows for fast analysis and data cleaning and preparation
- It excels in performance and productivity.
- It also has built-in visualization features.
- It can work with data from a wide variety of sources.

## Agenda
- Series
- DataFrames
- Missing Data
- GroupBy
- Merging,Joining,and Concatenating
- Operations
- Data Input and Output

In [14]:
# It is similar to Numpy arrays with a slight addition of labels to NumPy Array
import numpy as np
import pandas as pd

In [15]:
labels=['a1','b1','c1']
my_data=[10,20,30]
arr=np.array(my_data)
d={'a':10,'b':20,'c':30}

In [16]:
print("Printing of Normal Lists")
print(pd.Series(data=my_data))
"""
A Pandas Series is like a column in a table.
It is a one-dimensional array holding data of any type.
"""
# For custom Row Labels We can also use dictionary ;)
print("Printing with Custom Row Labels using dictionary")
myvar=pd.Series(d)
print(myvar)

Printing of Normal Lists
0    10
1    20
2    30
dtype: int64
Printing with Custom Row Labels using dictionary
a    10
b    20
c    30
dtype: int64


In [17]:
print("Printing with Custom Row Labels without using Dictionary")
labels=['a1','b1','c1']
pd.Series(data=my_data,index=labels)

Printing with Custom Row Labels without using Dictionary


a1    10
b1    20
c1    30
dtype: int64

In [18]:
# Since the Data and Index are in sequence so we do not need to define Explcitly as labels and data
print("Syntax of Pandas for defining Row Labels along with columns")
pd.Series(my_data,labels)

Syntax of Pandas for defining Row Labels along with columns


a1    10
b1    20
c1    30
dtype: int64

In [19]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [20]:
pd.Series(arr,labels)

a1    10
b1    20
c1    30
dtype: int32

In [21]:
# We can also pass Dictionaries too!
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [22]:
# Pandas can also hold labels as data items 
pd.Series(data=labels)

0    a1
1    b1
2    c1
dtype: object

In [23]:
# data can be declared in-place inside thge function call!
# Note Pandas should have equal Number of Rows-labels and Columns-Row Elements  
ser1=pd.Series([2,3,4,5,6,7],['age','Children','a','b','c','d'])
print(ser1)

age         2
Children    3
a           4
b           5
c           6
d           7
dtype: int64


In [24]:
ser1['age']

2

In [25]:
ser3=pd.Series(data=labels)

In [26]:
ser3[0]

'a1'

In [27]:
# When we are performing some operations in pandas our operations are going to be converted to float
ser1

age         2
Children    3
a           4
b           5
c           6
d           7
dtype: int64

In [28]:
ser3

0    a1
1    b1
2    c1
dtype: object

In [29]:
ser1+ser3

0           NaN
1           NaN
2           NaN
Children    NaN
a           NaN
age         NaN
b           NaN
c           NaN
d           NaN
dtype: object

In [30]:
from numpy.random import randn

In [31]:
print(np.random.seed(101))

None


In [32]:
#Can be thought of as a dict-like container for Series objects. The primary pandas data structure.
# For storing 2-D Elements in Row-Column Format we use Dataframes 
df=pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])

In [33]:
df
# Each column is a Pandas Series

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [34]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [35]:
type(df['W'])

pandas.core.series.Series

In [36]:
type(df)

pandas.core.frame.DataFrame

In [37]:
# Sequel-like Syntax to grab Columns 
# df.column_name 
# Note It is not recommended to use this type of notation
# Instead use the [] bracket notation!
df.X

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [38]:
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [39]:
# For grabing multiple columns
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [40]:
# When a key is used which doesn't exist in DataFrame then it will show an error 
# Key doesn't Exist!
df['New']

KeyError: 'New'

In [None]:
print(df['W'])
print(df['Y'])
df['New']=df['W']+df['Y']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64
A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64


In [None]:
df['New']

A    3.614819
B   -0.196959
C   -1.489355
D   -0.744542
E    2.796762
Name: New, dtype: float64

In [None]:
# TO remove Columns Use df.drop('name_of_the_column') also note we need to use axis=1
print(df)
df.drop('New',axis=1)

          W         X         Y         Z       New
A  2.706850  0.628133  0.907969  0.503826  3.614819
B  0.651118 -0.319318 -0.848077  0.605965 -0.196959
C -2.018168  0.740122  0.528813 -0.589001 -1.489355
D  0.188695 -0.758872 -0.933237  0.955057 -0.744542
E  0.190794  1.978757  2.605967  0.683509  2.796762


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# Even after Drop you encounter that it is still there so there's another parameter we need to use
# inplace parameter 
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [None]:
df.drop('New',axis=1,inplace=True)

In [None]:
# Here new column is permanently removed
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
# We can also drop rows 
df.drop('E',axis=0,inplace=True)

In [None]:
# Rows are referred to as 0th Axis 
# Columns are referred to as 1st Axis ;)
df.shape

(4, 4)

In [None]:
# To select rows in a dataframe
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [None]:
df.loc['A']
# The above will help you grab the Row Series with the help of Row Label

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [None]:
# If you dont want to grab rows with the help of row-Label such if it is a string then
# use iloc <Numerical based index>
df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [None]:
# Selecting subsets of rows and columns
df.loc['B','Y']
# The above will give the element present in row-labelled 'B' and Column-Labelled 'Y'

-0.8480769834036315

In [None]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


In [None]:
import itertools
l1=[4,5,6,7,8]
l2=[1,2,3,4,5]
# l3=[(x*y) for x,y in l1,l2]
itertools.product(l1,l2)
list(itertools.product(l1,l2))

[(4, 1),
 (4, 2),
 (4, 3),
 (4, 4),
 (4, 5),
 (5, 1),
 (5, 2),
 (5, 3),
 (5, 4),
 (5, 5),
 (6, 1),
 (6, 2),
 (6, 3),
 (6, 4),
 (6, 5),
 (7, 1),
 (7, 2),
 (7, 3),
 (7, 4),
 (7, 5),
 (8, 1),
 (8, 2),
 (8, 3),
 (8, 4),
 (8, 5)]

In [None]:
# Multiply two lists
l1,l2
list3=[(l1[x]*l2[x]) for x in range(len(l1))]
print(list3)

[4, 10, 18, 28, 40]


In [None]:
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [None]:
df.loc['E']

W      0.190794
X      1.978757
Y      2.605967
Z      0.683509
New    2.796762
Name: E, dtype: float64

In [None]:
#conditional Notation
bool_df=df>0
print(bool_df)
df[bool_df]

       W      X      Y      Z    New
A   True   True   True   True   True
B   True  False  False   True  False
C  False   True   True  False  False
D   True  False  False   True  False
E   True   True   True   True   True


Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,,,0.605965,
C,,0.740122,0.528813,,
D,0.188695,,,0.955057,
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [41]:
print(df['W']>0)
print(df['X']>0)
# COnditional Selection to be used as Series Value
# Below Signifies Har ek element in the Column should be greater than 0 as NaN.
print(df[df>0])
# Below Will show all the rows where the Series is True 
print((df['W']>0) & (df['X']>0))
print(df[(df['W']>0) & (df['X']>0)])

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool
A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118       NaN       NaN  0.605965
C       NaN  0.740122  0.528813       NaN
D  0.188695       NaN       NaN  0.955057
E  0.190794  1.978757  2.605967  0.683509
A     True
B    False
C    False
D    False
E     True
dtype: bool
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
E  0.190794  1.978757  2.605967  0.683509


In [42]:
# Formal Language : Grab all the rows where Z<0
print(df[df['Z']<0])
# When passing whole data frame as a condition we might encounter
# Null values instead try passing series of data 
print(df['Z']<0)

          W         X         Y         Z
C -2.018168  0.740122  0.528813 -0.589001
A    False
B    False
C     True
D    False
E    False
Name: Z, dtype: bool


In [43]:
df
resultDf=df[df['W']>0]
print(resultDf['X'])

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64


In [44]:
boolser=df['W']>0
result=df[boolser]
mycols=['Y','X']
result[mycols]

# Note the one-liner code for the above is :-
df[df['W']>0][['Y','X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [45]:
result[mycols]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


## Note
### Why "**and**" Operator and "OR" Operator doesn't work in panda?
### Explanation
when we Write:-
~~~
df['W']>0
~~~
It return multiple True Values, but the "**and**" operator can onlt deal with single and operator at a time!
True and True -> True or False
[True,True,True] and [False,False,True] -> Error 
Instead we use **and(&)** operator ;) to deal with this issue
Same goes for **Or(|)** Operator 

In [46]:
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [47]:
df[(df['W']>0) | (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [48]:
# To get the actual Index instead of custom defined Index 
df.reset_index()
# This operation will not take In-place 
# To take it in-place use the value in parenthesis
# df.reset_index(inplace=True)

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [49]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [50]:
newind='CA WX WY OR CO'.split()

In [51]:
df['States']=newind

In [52]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,WX
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [53]:
# We have an column and we want to set it as an Index
# There is a method called set-index
df.set_index('States')
# Again the above is not inplace 
# Method goes same!

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
WX,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [54]:
# df.set_index('States',inplace=True)
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,WX
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


### Pandas FataFrames part-3

In [55]:
import numpy as np
import pandas as pd


In [56]:
# Index level
outside=['G1','G1','G1','G2','G2','G2']
inside=[1,2,3,1,2,3]
hier_index=list(zip(outside,inside))
hier_index=pd.MultiIndex.from_tuples(hier_index)

In [57]:
outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [58]:
inside

[1, 2, 3, 1, 2, 3]

In [59]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [92]:
df=pd.DataFrame(randn(6,2),hier_index,['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-1.467514,-0.494095
G1,2,-0.162535,0.485809
G1,3,0.392489,0.221491
G2,1,-0.855196,1.54199
G2,2,0.666319,-0.538235
G2,3,-0.568581,1.407338


In [94]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-1.467514,-0.494095
G1,2,-0.162535,0.485809
G1,3,0.392489,0.221491
G2,1,-0.855196,1.54199
G2,2,0.666319,-0.538235
G2,3,-0.568581,1.407338


In [95]:
df.index.names=['Groups','Num']

In [96]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-1.467514,-0.494095
G1,2,-0.162535,0.485809
G1,3,0.392489,0.221491
G2,1,-0.855196,1.54199
G2,2,0.666319,-0.538235
G2,3,-0.568581,1.407338


In [99]:
# Let's Suppose i want to grab -538235 from Group G2, Num-2, Row B
df.loc['G2'].loc[2]['B']

-0.5382346255173922

In [104]:
df.loc['G1'].loc[2]['A']

-0.1625347347726149

In [105]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-1.467514,-0.494095
G1,2,-0.162535,0.485809
G1,3,0.392489,0.221491
G2,1,-0.855196,1.54199
G2,2,0.666319,-0.538235
G2,3,-0.568581,1.407338


In [108]:
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-1.467514,-0.494095
G2,-0.855196,1.54199
