# Data Frame in Pandas
A DataFrame is a collection of pandas Series that share the same index.

In [2]:
import pandas as pd
import numpy as np

In [3]:
from numpy.random import randn

In [4]:
# Seed NumPy's global random generator so the random values below are reproducible
np.random.seed(101)

In [5]:
# Build a 5x4 sample dataframe of random values:
# rows are labelled A-E (the index) and columns are labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [6]:
# Display the sample dataframe
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [7]:
# Select a single column by passing its name in square brackets.
# E.g.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [8]:
# Check what kind of object a single-column selection returns
type(df['W'])

pandas.core.series.Series

In [9]:
# Select several columns at once by passing a list of column names inside the brackets.
# E.g.
df[['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [10]:
# Check what kind of object a multi-column selection returns
type(df[['W', 'X']])

pandas.core.frame.DataFrame

# Note: Requesting a single column returns a Series, whereas requesting a list of columns returns a DataFrame.

In [11]:
# Add a new column to the dataframe by assigning to a column name that does not exist yet.
# Here the new column is the element-wise sum of two existing columns.
df['new'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [13]:
# We may also add a completely new column from fresh data (one value per row)
df['cnew'] = randn(5)

In [14]:
# Display the dataframe, which now includes the 'cnew' column
df

Unnamed: 0,W,X,Y,Z,new,cnew
A,2.70685,0.628133,0.907969,0.503826,3.334983,0.302665
B,0.651118,-0.319318,-0.848077,0.605965,0.3318,1.693723
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046,-1.706086
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177,-1.159119
E,0.190794,1.978757,2.605967,0.683509,2.169552,-0.134841


In [15]:
# We may drop columns, but drop() does not modify the dataframe in place by default;
# it returns a new dataframe instead. To drop in place we have to pass inplace=True.
# E.g.
df.drop('cnew', axis=1)  # axis=1 is required to drop a column; the default axis=0 refers to rows

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [16]:
# df still contains 'cnew': the drop above returned a new dataframe and left df unchanged
df

Unnamed: 0,W,X,Y,Z,new,cnew
A,2.70685,0.628133,0.907969,0.503826,3.334983,0.302665
B,0.651118,-0.319318,-0.848077,0.605965,0.3318,1.693723
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046,-1.706086
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177,-1.159119
E,0.190794,1.978757,2.605967,0.683509,2.169552,-0.134841


In [19]:
# To drop columns in place, pass inplace=True; a list of labels drops several columns at once
df.drop(['cnew', 'new'], axis = 1, inplace=True)

In [20]:
# Both 'cnew' and 'new' are now gone from df itself
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [22]:
# ROWS
# We may grab a row with two indexers, loc and iloc: loc takes the row label (index name),
# whereas iloc takes the integer position of the row.
# NOTE: both are used with square brackets (indexing notation), not function-call parentheses.
# E.g. to pick row C from the dataframe above
# we may use loc as,
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [25]:
# Or we may use iloc with the row's zero-based position (row C is at position 2)
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [26]:
# Assigning through loc to a row label that does not exist yet adds a new row (one value per column)
df.loc['newrow'] = randn(4)

In [30]:
# Remove the 'newrow' row in place; no axis argument is needed because drop defaults to axis=0 (rows)
df.drop('newrow', inplace=True)

In [31]:
# Confirm that df is back to the five rows A-E
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
# We may grab a specific value by passing a comma-separated pair (row label, column label) to loc.
# This is similar to indexing a 2-D numpy array.
# E.g.
df.loc['A', 'Y']

0.9079694464765431

In [34]:
# To grab a section of the dataframe, pass a list of row labels and a list of column labels, separated by a comma
df.loc[['B', 'C'], ['X', 'Y']]

Unnamed: 0,X,Y
B,-0.319318,-0.848077
C,0.740122,0.528813
