<b>Creating dataframes</b>

In [1]:
import pandas as pd
import numpy as np

In [20]:
# 4x3 frame of random integers in [1, 15) with explicit row and column labels.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=15, size=(4, 3)),
    index=['r1', 'r2', 'r3', 'r4'],
    columns=['c1', 'c2', 'c3'],
)

In [21]:
df

Unnamed: 0,c1,c2,c3
r1,14,11,8
r2,1,2,4
r3,3,8,6
r4,14,12,4


<b>Reading columns</b>

In [22]:
# indexing with a list of labels selects several columns and returns a DataFrame
df[['c1','c2']]

Unnamed: 0,c1,c2
r1,14,11
r2,1,2
r3,3,8
r4,14,12


<b>Adding a column</b>

In [23]:
# assigning to a label that does not exist yet appends a new column (one value per row)
df['c4'] = [64,23,44,21]
df

Unnamed: 0,c1,c2,c3,c4
r1,14,11,8,64
r2,1,2,4,23
r3,3,8,6,44
r4,14,12,4,21


<b>Reading rows</b>

In [24]:
# iloc selects by integer position: 0 is the first row (label 'r1')
df.iloc[0]

c1    14
c2    11
c3     8
c4    64
Name: r1, dtype: int64

In [25]:
# loc selects by index label
df.loc['r2']

c1     1
c2     2
c3     4
c4    23
Name: r2, dtype: int64

<b>Dropping rows or columns</b>

In [26]:
# axis=1 targets columns; inplace=True modifies df itself instead of returning a copy.
# NOTE: this cell is not re-runnable - a second run raises KeyError because 'c1' is already gone.
df.drop('c1',axis=1,inplace=True)
df

Unnamed: 0,c2,c3,c4
r1,11,8,64
r2,2,4,23
r3,8,6,44
r4,12,4,21


In [27]:
# the default axis=0 drops a row by label; without inplace=True a new frame is returned
# and df itself keeps all four rows (confirmed by the next cell)
df.drop('r3')

Unnamed: 0,c2,c3,c4
r1,11,8,64
r2,2,4,23
r4,12,4,21


In [28]:
df

Unnamed: 0,c2,c3,c4
r1,11,8,64
r2,2,4,23
r3,8,6,44
r4,12,4,21


In [33]:
# boolean mask: keep rows where c2 < 10 OR c3 < 5; each comparison needs its own
# parentheses because | binds more tightly than <
df[(df['c2']<10) | (df['c3']<5)]

Unnamed: 0,c2,c3,c4
r2,2,4,23
r3,8,6,44
r4,12,4,21


In [36]:
# move the row labels into an ordinary column called 'index' and replace them
# with the default 0..n-1 integer index (modifies df in place)
df.reset_index(inplace=True)

In [37]:
df

Unnamed: 0,index,c2,c3,c4
0,r1,11,8,64
1,r2,2,4,23
2,r3,8,6,44
3,r4,12,4,21


In [39]:
# turn the 'index' column back into the row labels (modifies df in place)
df.set_index('index',inplace=True)

In [40]:
df

Unnamed: 0_level_0,c2,c3,c4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
r1,11,8,64
r2,2,4,23
r3,8,6,44
r4,12,4,21


<b>Multi-indexing and hierarchical indexes</b>

In [3]:
# Outer level: the group label of each row; inner level: the serial number within the group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the two levels row by row and build the two-level index in a single step.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [4]:
# 6x2 frame of random integers in [1, 10), indexed by the two-level hier_index.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(6, 2)),
    index=hier_index,
    columns=['c1', 'c2'],
)
df

Unnamed: 0,Unnamed: 1,c1,c2
G1,1,6,6
G1,2,2,4
G1,3,6,2
G2,1,2,4
G2,2,1,3
G2,3,6,9


In [6]:
# step through the index levels one at a time: outer label 'G1' first, then inner label 1
df.loc['G1'].loc[1]

c1    6
c2    6
Name: 1, dtype: int64

In [7]:
# name the two index levels so they can be referred to by name (e.g. in xs)
df.index.names = ['Groups','Sr. No.']

In [8]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,c1,c2
Groups,Sr. No.,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,6,6
G1,2,2,4
G1,3,6,2
G2,1,2,4
G2,2,1,3
G2,3,6,9


In [9]:
# one lookup: (outer label, inner label) picks the row, 'c1' picks the column
df.loc[('G1', 1), 'c1']

6

In [15]:
# cross-section on the inner level: the 'Sr. No.' == 1 row of every group
df.xs(1,level='Sr. No.')

Unnamed: 0_level_0,c1,c2
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,6,6
G2,2,4


<b>Missing data</b>

In [18]:
# Small people table with deliberately missing values (np.nan) for the NaN-handling demos.
d = dict(
    Name=['DT', 'Nehal', 'Karina', 'Mali'],
    Age=[np.nan, 21, 20, np.nan],
    Wt=[48, 53, np.nan, np.nan],
)

In [20]:
# build the frame from the dict: keys become column names, lists become the column values
df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Wt
0,DT,,48.0
1,Nehal,21.0,53.0
2,Karina,20.0,
3,Mali,,


In [22]:
# drop every row that contains at least one NaN value (returns a new frame)
df.dropna()

Unnamed: 0,Name,Age,Wt
1,Nehal,21.0,53.0


In [23]:
# axis=1 switches to columns: drop every column that contains at least one NaN
df.dropna(axis=1)

Unnamed: 0,Name
0,DT
1,Nehal
2,Karina
3,Mali


In [28]:
# fill the missing ages with the mean of the non-missing ages;
# this returns a new Series - the result is not assigned back, so df is unchanged
df['Age'].fillna(df['Age'].mean())

0    20.5
1    21.0
2    20.0
3    20.5
Name: Age, dtype: float64

In [29]:
# thresh counts non-NaN values: keep only the rows that have at least 2 non-NaN entries
df.dropna(thresh=2)

Unnamed: 0,Name,Age,Wt
0,DT,,48.0
1,Nehal,21.0,53.0
2,Karina,20.0,


<b>Groupby</b>

In [30]:
# Toy sales records: two sales people for each of three companies.
data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}

In [31]:
# build the sales frame from the dict: keys become column names, lists become the column values
df = pd.DataFrame(data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [35]:
# describe() yields one row of summary statistics per company,
# so the GOOG statistics can be picked directly by row label
df.groupby('Company').describe().loc['GOOG']

Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64

In [37]:
# number of non-NaN entries in each remaining column, per company
df.groupby('Company').count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2
