### Pandas DataFrame

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Draw the 6x4 matrix of standard-normal samples from a seeded generator so
# the frame built from it is identical on every Restart & Run All.
rng = np.random.default_rng(seed=42)
rand_mat = rng.standard_normal((6, 4))

In [3]:
# Wrap the random matrix in a DataFrame: rows are labelled A-F, columns W-Z.
row_labels = 'A B C D E F'.split()
col_labels = 'W X Y Z'.split()
df = pd.DataFrame(rand_mat, index=row_labels, columns=col_labels)

In [4]:
# Inspect the new frame: six rows (A-F) by four columns (W-Z).
df

Unnamed: 0,W,X,Y,Z
A,0.622001,-2.135633,0.440833,-1.002049
B,-0.665045,0.195553,1.012294,-1.354823
C,-0.957143,0.376753,-0.659888,-0.345925
D,0.089988,0.936318,0.2023,-1.158987
E,0.53259,0.090805,-2.650191,-0.486483
F,1.034381,-2.122694,1.105953,0.189065


In [5]:
# reset_index() returns a new frame in which the old row labels become an
# 'index' column and the rows are renumbered from 0. The result is only
# displayed, not assigned, so df itself keeps its A-F labels.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.622001,-2.135633,0.440833,-1.002049
1,B,-0.665045,0.195553,1.012294,-1.354823
2,C,-0.957143,0.376753,-0.659888,-0.345925
3,D,0.089988,0.936318,0.2023,-1.158987
4,E,0.53259,0.090805,-2.650191,-0.486483
5,F,1.034381,-2.122694,1.105953,0.189065


In [6]:
# Six state codes (one per row of df), produced by splitting a
# space-separated string into a list.
state_codes = 'DEL UP UK TN AP KL'
newind = state_codes.split()

In [7]:
# Add the state codes as an ordinary column named 'States'; the list is
# matched to the rows in order (A gets DEL, B gets UP, ...).
df['States'] = newind

In [8]:
# Confirm that 'States' now appears as the last column.
df

Unnamed: 0,W,X,Y,Z,States
A,0.622001,-2.135633,0.440833,-1.002049,DEL
B,-0.665045,0.195553,1.012294,-1.354823,UP
C,-0.957143,0.376753,-0.659888,-0.345925,UK
D,0.089988,0.936318,0.2023,-1.158987,TN
E,0.53259,0.090805,-2.650191,-0.486483,AP
F,1.034381,-2.122694,1.105953,0.189065,KL


In [10]:
# Promote the 'States' column to the row index. The result is assigned back
# explicitly instead of using inplace=True, so the cell shows where the new
# frame comes from and the call can be chained.
df = df.set_index('States')

In [11]:
# The state codes are now the row labels; W-Z remain as the data columns.
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DEL,0.622001,-2.135633,0.440833,-1.002049
UP,-0.665045,0.195553,1.012294,-1.354823
UK,-0.957143,0.376753,-0.659888,-0.345925
TN,0.089988,0.936318,0.2023,-1.158987
AP,0.53259,0.090805,-2.650191,-0.486483
KL,1.034381,-2.122694,1.105953,0.189065


### MultiIndex and Index Levels

In [14]:
# Outer level of the hierarchy: one region per row, three North then three South.
outside = ['North'] * 3 + ['South'] * 3
# Inner level: the state codes created earlier in the notebook.
inside = newind

In [18]:
# Pair every region with its state code: one (outer, inner) tuple per row.
hier_index = [(region, state) for region, state in zip(outside, inside)]
hier_index

[('North', 'DEL'),
 ('North', 'UP'),
 ('North', 'UK'),
 ('South', 'TN'),
 ('South', 'AP'),
 ('South', 'KL')]

In [20]:
# Build a two-level MultiIndex from the (region, state) tuples.
# Note: this rebinds hier_index from a list of tuples to a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [21]:
# Inspect the resulting MultiIndex.
hier_index

MultiIndex([('North', 'DEL'),
            ('North',  'UP'),
            ('North',  'UK'),
            ('South',  'TN'),
            ('South',  'AP'),
            ('South',  'KL')],
           )

In [25]:
# Build a 6x2 frame of standard-normal values indexed by the two-level
# MultiIndex. The values come from a seeded generator so the table is the
# same on every run. Note that df is rebound here and no longer refers to the
# earlier W-Z frame.
values = np.random.default_rng(seed=0).standard_normal((6, 2))
df = pd.DataFrame(values, index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
North,DEL,1.038792,-0.075932
North,UP,0.854332,0.813554
North,UK,-0.004641,-1.862789
South,TN,1.543313,0.426668
South,AP,0.817971,0.648497
South,KL,-1.134397,0.174879


In [26]:
# Selecting an outer-level label returns the rows under it as a sub-frame
# indexed by the inner level (the state codes).
df.loc['North']

Unnamed: 0,A,B
DEL,1.038792,-0.075932
UP,0.854332,0.813554
UK,-0.004641,-1.862789


In [27]:
# Drill down one level at a time: first the 'North' sub-frame, then the
# 'DEL' row inside it (returned as a Series of that row's column values).
north = df.loc['North']
north.loc['DEL']

A    1.038792
B   -0.075932
Name: DEL, dtype: float64

In [28]:
# The index levels have no names yet, so both entries are None.
df.index.names

FrozenList([None, None])

In [29]:
# Name the outer level 'Region' and the inner level 'States'.
df.index.names = ['Region','States']

In [30]:
# The level names now appear as headers above the index columns.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Region,States,Unnamed: 2_level_1,Unnamed: 3_level_1
North,DEL,1.038792,-0.075932
North,UP,0.854332,0.813554
North,UK,-0.004641,-1.862789
South,TN,1.543313,0.426668
South,AP,0.817971,0.648497
South,KL,-1.134397,0.174879


In [31]:
# xs() takes a cross-section: passing an outer-level key returns the matching
# rows with that level removed, leaving 'States' as the index.
df.xs('North')

Unnamed: 0_level_0,A,B
States,Unnamed: 1_level_1,Unnamed: 2_level_1
DEL,1.038792,-0.075932
UP,0.854332,0.813554
UK,-0.004641,-1.862789


### Data Input & Output

### CSV Input

In [34]:
# Read the CSV through an absolute Windows path. Each backslash is doubled so
# that Python does not interpret it as the start of an escape sequence.
# NOTE(review): this path points into one user's profile and will fail on any
# other machine; the relative 'example.csv' form used later in this notebook
# is the portable choice.
df = pd.read_csv('C:\\Users\\AEL04\\Desktop\\example.csv')

In [35]:
# Loaded table: four rows, columns a-d.
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [36]:
# Same file, with the path written using forward slashes, which need no
# escaping.
# NOTE(review): still an absolute, user-specific path; it will not resolve on
# another machine.
df = pd.read_csv('C:/Users/AEL04/Desktop/example.csv')

In [37]:
# Same contents as the previous load.
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [44]:
# Relative path: the file is looked up in the current working directory.
df = pd.read_csv('example.csv')

In [39]:
# Same four-row table, loaded this time through the relative path.
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


### CSV Output

In [45]:
# Write the frame to a new CSV file. index=False leaves the row labels out of
# the file, so only columns a-d are written.
df.to_csv('example1.csv',index=False)

In [46]:
# Read the file just written to check the round trip.
df = pd.read_csv('example1.csv')

In [47]:
# The round-tripped table has the same columns a-d and the same values.
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


### Excel Input

In [48]:
# Load the worksheet named 'Sheet1' from the Excel workbook.
df = pd.read_excel('Excel_Sample.xlsx',sheet_name='Sheet1')

In [49]:
# Besides a-d the sheet carries an extra 'Unnamed: 0' column holding 0..3;
# presumably a row index that was saved into the workbook (confirm against
# how the file was produced).
df

Unnamed: 0.1,Unnamed: 0,a,b,c,d
0,0,0,1,2,3
1,1,4,5,6,7
2,2,8,9,10,11
3,3,12,13,14,15


In [50]:
# Remove the stray 'Unnamed: 0' column and assign the result back.
# errors='ignore' keeps this cell runnable on later runs: a subsequent cell
# writes the cleaned frame back to Excel_Sample.xlsx, so the next time the
# workbook is read the column is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [51]:
# Only columns a-d remain.
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [52]:
# Write the cleaned frame to sheet 'Sheet1' without the row index.
# NOTE(review): the target is the same workbook and sheet that was read
# earlier, so this overwrites the notebook's own input file; a fresh run will
# then load the already-cleaned data.
df.to_excel('Excel_Sample.xlsx',sheet_name='Sheet1',index=False)

In [54]:
# Load and display the 2011 census table; the result is not assigned to a
# variable.
# NOTE(review): 'unicode_escape' is the codec for Python string-literal
# escapes, not a normal text-file encoding. Presumably it was chosen to get
# past a decode error; confirm the file's real encoding (e.g. 'latin-1' or
# 'cp1252') and use that instead.
pd.read_csv('population_india_census2011.csv',encoding='unicode_escape')

Unnamed: 0,Sno,Region,State / Union Territory,Population,Rural population,Urban population,Area,Gender Ratio
0,1,North,Uttar Pradesh,199812341,155317278,44495063,"240,928 km2",912
1,2,West,Maharashtra,112374333,61556074,50818259,"307,713 km2",929
2,3,North,Bihar,104099452,92341436,11758016,"94,163 km2",918
3,4,East,West Bengal,91276115,62183113,29093002,"88,752 km2",953
4,5,Cntral,Madhya Pradesh,72626809,52557404,20069405,"308,245 km2",931
5,6,South,Tamil Nadu,72147030,37229590,34917440,"130,058 km2",996
6,7,North,Rajasthan,68548437,51500352,17048085,"342,239 km2",928
7,8,South,Karnataka,61095297,37469335,23625962,"191,791 km2",973
8,9,West,Gujarat,60439692,34694609,25745083,"196,024 km2",919
9,10,South,Andhra Pradesh,49577103,34966693,14610410,"162,968 km2",993


### Missing Data

In [55]:
# Load a small table that contains missing values.
df = pd.read_csv('df11.csv')

In [56]:
# Column A has one missing value, column B has two, column C has none.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [57]:
# Default dropna(): drop every row that contains at least one missing value.
# Only row 0 is complete, so it is the only one returned. df is not modified.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [58]:
# axis=1 applies the same rule to columns: A and B each contain a missing
# value and are dropped, leaving only C.
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [59]:
# thresh=2 keeps a row only if it has at least two non-missing values:
# row 1 (two values) stays, row 2 (one value) is dropped.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [60]:
# Replace every missing value with a fixed placeholder string. The filled
# frame is only displayed; it is not assigned, so df keeps its NaNs.
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [61]:
# df still contains its missing values: the fillna above returned a new
# frame rather than changing df.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [62]:
# Mean of column A. The missing entry is skipped, so this is the average of
# 1.0 and 2.0.
value = df['A'].mean()

In [63]:
# Show the computed mean of column A.
value

1.5

In [64]:
# Fill the missing entries of column A with the column mean and assign the
# result back to the column. The previous form called fillna(inplace=True) on
# df['A'], i.e. on an intermediate Series (chained assignment); pandas warns
# about that pattern and, with Copy-on-Write enabled, it leaves df unchanged.
df['A'] = df['A'].fillna(df['A'].mean())

In [65]:
# Column A is now complete; column B still has missing values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,1.5,,3


In [67]:
# Fill the gaps in row 1 with the mean of that row's own non-missing values
# and write the row back.
# NOTE(review): the row is assigned back as a float Series, and the later
# display of df shows column C as floats (1.0, 2.0, 3.0) where it held
# integers before, so this appears to change C's dtype as a side effect.
df.iloc[1] = df.iloc[1].fillna(value=df.iloc[1].mean())

In [68]:
# Row 1 after the fill: B has taken the row mean (2.0).
df.iloc[1]

A    2.0
B    2.0
C    2.0
Name: 1, dtype: float64

In [69]:
# Fill the remaining gap in column B with the column mean and assign the
# result back to the column. The previous form called fillna(inplace=True) on
# df['B'], i.e. on an intermediate Series (chained assignment); pandas warns
# about that pattern and, with Copy-on-Write enabled, it leaves df unchanged.
df['B'] = df['B'].fillna(df['B'].mean())

In [70]:
# Final frame: no missing values remain.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,2.0,2.0
2,1.5,3.5,3.0
