## Pandas DataFrames

In [1]:
import numpy as np
import pandas as pd

In [3]:
from numpy.random import randn

In [44]:
np.random.seed(0) # seed NumPy's global RNG so later randn() calls yield the same sequence on every fresh run

In [99]:
# Row labels (the index) and column labels for the demo DataFrame.
my_row = list('ABCDE')
my_col = list('XYZ')

In [100]:
# Seed and draw in the same cell so this cell is reproducible on its own,
# independent of how many random draws earlier cells have made.
# The values recorded in this notebook's outputs are the second (5, 3) draw
# after seeding with 0, so one draw is discarded first to reproduce them exactly.
np.random.seed(0)
randn(5, 3)  # discarded: advances the RNG to the state the recorded outputs came from
data = randn(5, 3)

In [101]:
# Build the frame from the random matrix: rows labelled by my_row, columns by my_col.
df = pd.DataFrame(data=data, index=my_row, columns=my_col)

In [102]:
df

Unnamed: 0,X,Y,Z
A,0.333674,1.494079,-0.205158
B,0.313068,-0.854096,-2.55299
C,0.653619,0.864436,-0.742165
D,2.269755,-1.454366,0.045759
E,-0.187184,1.532779,1.469359


## Each column is a Series, and all the columns share the same index
## So this DataFrame has 3 Series (columns X, Y, Z) sharing one index of 5 labels (A to E)
## So a pandas DataFrame is a combination of Series and a common index

In [103]:
df['X']

A    0.333674
B    0.313068
C    0.653619
D    2.269755
E   -0.187184
Name: X, dtype: float64

In [104]:
df['Y']

A    1.494079
B   -0.854096
C    0.864436
D   -1.454366
E    1.532779
Name: Y, dtype: float64

In [105]:
df['Z']['D'] # chained lookup: df['Z'] is the column Series, then ['D'] picks the row label; df.loc['D', 'Z'] does it in one step

0.04575851730144607

In [106]:
df['Y']['A']

1.4940790731576061

In [107]:
type(df['X']) # With this we can verify that every column is a pandas Series

pandas.core.series.Series

In [108]:
type(df['Y']['A'])

numpy.float64

In [109]:
type(df) # to check the type

pandas.core.frame.DataFrame

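## We can also access a column with attribute (dot) notation, similar to table.column in SQL. This only works when the column name is a valid Python identifier and does not clash with an existing DataFrame attribute or method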

In [110]:
df.X

A    0.333674
B    0.313068
C    0.653619
D    2.269755
E   -0.187184
Name: X, dtype: float64

In [111]:
df.Y

A    1.494079
B   -0.854096
C    0.864436
D   -1.454366
E    1.532779
Name: Y, dtype: float64

In [112]:
df.Z

A   -0.205158
B   -2.552990
C   -0.742165
D    0.045759
E    1.469359
Name: Z, dtype: float64

In [113]:
df.Z.A

-0.20515826376580087

In [114]:
df.Y.D

-1.4543656745987648

In [115]:
df.X.E

-0.1871838500258336

In [116]:
type(df.Z)

pandas.core.series.Series

In [117]:
type(df.X.E)

numpy.float64

## Selecting multiple columns (pass a list of column names; the result is a DataFrame)

In [118]:
df

Unnamed: 0,X,Y,Z
A,0.333674,1.494079,-0.205158
B,0.313068,-0.854096,-2.55299
C,0.653619,0.864436,-0.742165
D,2.269755,-1.454366,0.045759
E,-0.187184,1.532779,1.469359


In [119]:
df[['Y','Z']]

Unnamed: 0,Y,Z
A,1.494079,-0.205158
B,-0.854096,-2.55299
C,0.864436,-0.742165
D,-1.454366,0.045759
E,1.532779,1.469359


In [120]:
type(df[['Y','Z']])

pandas.core.frame.DataFrame

## Creating A New Column

In [121]:
# New column: element-wise X + Z, aligned on the row index (Y is not part of this sum).
df['SUM'] = df['X'].add(df['Z'])

In [122]:
df

Unnamed: 0,X,Y,Z,SUM
A,0.333674,1.494079,-0.205158,0.128516
B,0.313068,-0.854096,-2.55299,-2.239922
C,0.653619,0.864436,-0.742165,-0.088546
D,2.269755,-1.454366,0.045759,2.315513
E,-0.187184,1.532779,1.469359,1.282175


In [123]:
# New column: element-wise Y - Z, aligned on the row index.
df['SUB'] = df['Y'].sub(df['Z'])

In [124]:
df

Unnamed: 0,X,Y,Z,SUM,SUB
A,0.333674,1.494079,-0.205158,0.128516,1.699237
B,0.313068,-0.854096,-2.55299,-2.239922,1.698894
C,0.653619,0.864436,-0.742165,-0.088546,1.606601
D,2.269755,-1.454366,0.045759,2.315513,-1.500124
E,-0.187184,1.532779,1.469359,1.282175,0.06342


## To Drop the Column

In [125]:
# Demonstrates the failure on purpose: drop() defaults to axis=0, so it looks for a
# ROW labelled 'SUM' and raises KeyError. The error is caught and printed so that
# "Restart & Run All" does not stop at this cell.
try:
    df.drop('SUM')
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['SUM'] not found in axis"

## We can see the drop did not happen; it failed with a KeyError
## Reason: by default drop uses axis=0, which refers to rows, so it looked for a row labelled 'SUM'. In order to drop a column we have to pass axis=1

In [126]:
df.drop(columns='SUM')  # equivalent to axis=1; returns a new frame without the column

Unnamed: 0,X,Y,Z,SUB
A,0.333674,1.494079,-0.205158,1.699237
B,0.313068,-0.854096,-2.55299,1.698894
C,0.653619,0.864436,-0.742165,1.606601
D,2.269755,-1.454366,0.045759,-1.500124
E,-0.187184,1.532779,1.469359,0.06342


In [127]:
df # drop() returned a new DataFrame; df itself still has the SUM column

Unnamed: 0,X,Y,Z,SUM,SUB
A,0.333674,1.494079,-0.205158,0.128516,1.699237
B,0.313068,-0.854096,-2.55299,-2.239922,1.698894
C,0.653619,0.864436,-0.742165,-0.088546,1.606601
D,2.269755,-1.454366,0.045759,2.315513,-1.500124
E,-0.187184,1.532779,1.469359,1.282175,0.06342


In [128]:
df.drop('A') # To remove a ROW; the default is axis=0 (rows), so no axis argument is needed

Unnamed: 0,X,Y,Z,SUM,SUB
B,0.313068,-0.854096,-2.55299,-2.239922,1.698894
C,0.653619,0.864436,-0.742165,-0.088546,1.606601
D,2.269755,-1.454366,0.045759,2.315513,-1.500124
E,-0.187184,1.532779,1.469359,1.282175,0.06342


In [129]:
df # Change is not Permanent

Unnamed: 0,X,Y,Z,SUM,SUB
A,0.333674,1.494079,-0.205158,0.128516,1.699237
B,0.313068,-0.854096,-2.55299,-2.239922,1.698894
C,0.653619,0.864436,-0.742165,-0.088546,1.606601
D,2.269755,-1.454366,0.045759,2.315513,-1.500124
E,-0.187184,1.532779,1.469359,1.282175,0.06342


## The drops above did not affect the DataFrame because inplace=False by default, so drop returns a new DataFrame instead
## Pandas does this so that we do not accidentally lose information

In [130]:
df

Unnamed: 0,X,Y,Z,SUM,SUB
A,0.333674,1.494079,-0.205158,0.128516,1.699237
B,0.313068,-0.854096,-2.55299,-2.239922,1.698894
C,0.653619,0.864436,-0.742165,-0.088546,1.606601
D,2.269755,-1.454366,0.045759,2.315513,-1.500124
E,-0.187184,1.532779,1.469359,1.282175,0.06342


In [131]:
df.drop('X',axis=1,inplace=True) # inplace=True modifies df itself (same effect as df = df.drop('X', axis=1)); re-running this cell raises KeyError because 'X' is already gone

In [132]:
df

Unnamed: 0,Y,Z,SUM,SUB
A,1.494079,-0.205158,0.128516,1.699237
B,-0.854096,-2.55299,-2.239922,1.698894
C,0.864436,-0.742165,-0.088546,1.606601
D,-1.454366,0.045759,2.315513,-1.500124
E,1.532779,1.469359,1.282175,0.06342


In [133]:
df.drop('A',inplace=True) # axis defaults to 0, so this removes the row labelled 'A' from df itself

In [135]:
df # Row deleted Permanently

Unnamed: 0,Y,Z,SUM,SUB
B,-0.854096,-2.55299,-2.239922,1.698894
C,0.864436,-0.742165,-0.088546,1.606601
D,-1.454366,0.045759,2.315513,-1.500124
E,1.532779,1.469359,1.282175,0.06342


In [138]:
df.shape # (number of rows, number of columns)

(4, 4)

## Selecting Columns
## Single Column

In [139]:
df['Y']

B   -0.854096
C    0.864436
D   -1.454366
E    1.532779
Name: Y, dtype: float64

## Multiple Columns

In [140]:
df[['Y','Z']]

Unnamed: 0,Y,Z
B,-0.854096,-2.55299
C,0.864436,-0.742165
D,-1.454366,0.045759
E,1.532779,1.469359


## Selecting Rows

In [146]:
df.loc['B'] # To select a row by its label, use the .loc indexer (square brackets, not a method call)

Y     -0.854096
Z     -2.552990
SUM   -2.239922
SUB    1.698894
Name: B, dtype: float64

In [147]:
type(df.loc['B']) # Rows are also Series

pandas.core.series.Series

## Another way to select a row: by integer position (0-based) using iloc

In [148]:
df

Unnamed: 0,Y,Z,SUM,SUB
B,-0.854096,-2.55299,-2.239922,1.698894
C,0.864436,-0.742165,-0.088546,1.606601
D,-1.454366,0.045759,2.315513,-1.500124
E,1.532779,1.469359,1.282175,0.06342


In [150]:
df.iloc[2]

Y     -1.454366
Z      0.045759
SUM    2.315513
SUB   -1.500124
Name: D, dtype: float64

In [152]:
type(df.iloc[2])  # It is also a Series

pandas.core.series.Series

## To get the single value at a given row and column

In [153]:
df

Unnamed: 0,Y,Z,SUM,SUB
B,-0.854096,-2.55299,-2.239922,1.698894
C,0.864436,-0.742165,-0.088546,1.606601
D,-1.454366,0.045759,2.315513,-1.500124
E,1.532779,1.469359,1.282175,0.06342


In [154]:
df.loc['C','SUM'] # df.loc[row_label, column_label]

-0.08854642496608134

## To get multiple values

In [155]:
df

Unnamed: 0,Y,Z,SUM,SUB
B,-0.854096,-2.55299,-2.239922,1.698894
C,0.864436,-0.742165,-0.088546,1.606601
D,-1.454366,0.045759,2.315513,-1.500124
E,1.532779,1.469359,1.282175,0.06342


In [158]:
df.loc[['D','E'], ['Y','SUB']] # Which rows, Which Columns

Unnamed: 0,Y,SUB
D,-1.454366,-1.500124
E,1.532779,0.06342
