In [1]:
import pandas as pd
# Small demo frame: one string column and two integer columns.
df = pd.DataFrame(
    {
        'col1': ['a', 'b', 'c', 'd'],
        'col2': [10, 20, 30, 40],
        'col3': [1, 2, 1, 1],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,a,10,1
1,b,20,2
2,c,30,1
3,d,40,1


## Handy DataFrame Operations

In [2]:
df.shape

(4, 3)

In [3]:
df

Unnamed: 0,col1,col2,col3
0,a,10,1
1,b,20,2
2,c,30,1
3,d,40,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
col1    4 non-null object
col2    4 non-null int64
col3    4 non-null int64
dtypes: int64(2), object(1)
memory usage: 176.0+ bytes


In [5]:
df.describe()

Unnamed: 0,col2,col3
count,4.0,4.0
mean,25.0,1.25
std,12.909944,0.5
min,10.0,1.0
25%,17.5,1.0
50%,25.0,1.0
75%,32.5,1.25
max,40.0,2.0


In [6]:
df['col3'].unique()

array([1, 2], dtype=int64)

In [7]:
df['col3'].nunique()

2

In [8]:
df['col3'].value_counts()

1    3
2    1
Name: col3, dtype: int64

In [9]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [10]:
df.index

RangeIndex(start=0, stop=4, step=1)

## Sorting

In [11]:
df.sort_values(by='col3')

Unnamed: 0,col1,col2,col3
0,a,10,1
2,c,30,1
3,d,40,1
1,b,20,2


In [12]:
# DataFrame.sort_values() requires the `by` argument. Catch the resulting
# TypeError and print it, so the message is still shown but the notebook
# keeps running under Restart & Run All.
try:
    df.sort_values()
except TypeError as err:
    print("TypeError:", err)

TypeError: sort_values() missing 1 required positional argument: 'by'

In [13]:
df.sort_values(by='col3', ascending=False)

Unnamed: 0,col1,col2,col3
1,b,20,2
0,a,10,1
2,c,30,1
3,d,40,1


In [14]:
df['col3'].sort_values()

0    1
2    1
3    1
1    2
Name: col3, dtype: int64

## apply()

In [15]:
def add(x):
    """Return ``x`` increased by 2."""
    increased = x + 2
    return increased

In [18]:
df['col2'].apply(lambda x:x+2)

0    12
1    22
2    32
3    42
Name: col2, dtype: int64

In [17]:
df['col2']

0    10
1    20
2    30
3    40
Name: col2, dtype: int64

In [19]:
df['col2'].sum()

100

In [22]:
# Passing the method name as a string yields the same total as df['col2'].sum().
df['col2'].apply('sum')

100

In [23]:
# The string 'mean' returns the average of the column as a single float.
df['col2'].apply('mean')

25.0

In [24]:
def rename(x):
    """Return ``x`` with the marker ``.?`` appended to its end."""
    suffix = '.?'
    return x + suffix
df['col1'].apply(rename)

0    a.?
1    b.?
2    c.?
3    d.?
Name: col1, dtype: object

In [25]:
# Latitude/longitude pairs written as degrees°minutes' strings.
df2 = pd.DataFrame(
    {
        'lat': ["22°35'", "25°55'"],
        'long': ["82°35'", "85°55'"],
    }
)
df2

Unnamed: 0,lat,long
0,22°35',82°35'
1,25°55',85°55'


In [28]:
def deg(x):
    """Convert a degrees-and-minutes string such as ``22°35'`` to decimal degrees.

    Parameters
    ----------
    x : str
        Coordinate written as ``<degrees>°<minutes>'``. The minutes part and
        its trailing ``'`` mark may be omitted (``"25°"``), and the degrees
        may carry a leading minus sign (``"-22°30'"``).

    Returns
    -------
    float
        The coordinate in decimal degrees.

    Raises
    ------
    ValueError
        If the degrees or minutes part is not a valid number.
    """
    # Split once on the degree sign instead of re-splitting per component.
    degree_part, _, minute_part = x.strip().partition("°")
    degree = float(degree_part)

    # Remove the minute mark only when it is actually there; blindly dropping
    # the last character would turn "22°35" into 3 minutes.
    minute_part = minute_part.strip().rstrip("'")
    minute = float(minute_part) if minute_part else 0.0

    # Minutes extend the magnitude of the angle, so they are subtracted for
    # negative coordinates. The sign is read from the text because
    # float("-0") compares equal to 0 and would lose it.
    if degree_part.lstrip().startswith("-"):
        return degree - (minute / 60)
    return degree + (minute / 60)

df2['lat'].apply(deg)

0    22.583333
1    25.916667
Name: lat, dtype: float64

In [29]:
df2['long'].apply(deg)

0    82.583333
1    85.916667
Name: long, dtype: float64

## Missing values

In [31]:
import numpy as np

In [32]:
# 4x3 frame with NaNs placed so that every column is missing at least one value.
df = pd.DataFrame(
    {
        'a': [1, np.nan, 3, 10],
        'b': [np.nan, np.nan, 6, 11],
        'c': [7, 8, np.nan, 12],
    }
)
df

Unnamed: 0,a,b,c
0,1.0,,7.0
1,,,8.0
2,3.0,6.0,
3,10.0,11.0,12.0


In [42]:
df.isnull()

Unnamed: 0,a,b,c
0,False,True,False
1,True,True,False
2,False,False,True
3,False,False,False


In [43]:
# Summing the boolean mask counts the missing values in each column.
df.isnull().sum()

a    1
b    2
c    1
dtype: int64

In [44]:
# axis=1 sums across the columns, giving the number of missing values per row.
df.isnull().sum(axis=1)

0    1
1    2
2    1
3    0
dtype: int64

In [45]:
# dropna: remove rows or columns that contain missing values

In [46]:
# Default behaviour: drop every row that contains at least one NaN.
df.dropna()

Unnamed: 0,a,b,c
3,10.0,11.0,12.0


In [47]:
# axis=1 drops columns rather than rows; every column here has a NaN, so none remain.
df.dropna(axis=1)

0
1
2
3


In [48]:
# thresh=2 keeps rows with at least 2 non-null values (row 1 has only one).
df.dropna(thresh=2)

Unnamed: 0,a,b,c
0,1.0,,7.0
2,3.0,6.0,
3,10.0,11.0,12.0


In [49]:
df

Unnamed: 0,a,b,c
0,1.0,,7.0
1,,,8.0
2,3.0,6.0,
3,10.0,11.0,12.0


In [50]:
# thresh=3 requires all three columns to be non-null, which leaves only row 3.
df.dropna(thresh=3)

Unnamed: 0,a,b,c
3,10.0,11.0,12.0
