# Operations

Pandas offers many operations that are really useful to you but don't fall into any distinct category. This lecture walks through them:

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame: integers in col1, floats with one missing value (NaN)
# in col2, and strings in col3.
df = pd.DataFrame(dict(
    col1=[1, 2, 5, 4, 3],
    col2=[444, 555, 666, 444, np.nan],
    col3=['abc', 'defss', 'ghirfd', 'xyzh', 'sdgdgff'],
))
df

Unnamed: 0,col1,col2,col3
0,1,444.0,abc
1,2,555.0,defss
2,5,666.0,ghirfd
3,4,444.0,xyzh
4,3,,sdgdgff


### Info on Unique Values

In [3]:
df['col2']

0    444.0
1    555.0
2    666.0
3    444.0
4      NaN
Name: col2, dtype: float64

In [4]:
np.array(df['col2'])

array([444., 555., 666., 444.,  nan])

In [5]:
df['col2'].unique() # distinct values of the column; NaN is kept as a value of its own

array([444., 555., 666.,  nan])

In [6]:
df['col2'].nunique()
# nunique() skips NaN by default, so it returns 3 here while len(df['col2'].unique()) is 4.
# = len(df['col2'].dropna().unique())   (pass dropna=False to count NaN as well)

3

In [7]:
# Number of occurrences of each distinct value, returned as a Series
# (index = value, data = count), most frequent first.
# The NaN entry does not appear in the counts below.
a = df['col2'].value_counts()
a

444.0    2
666.0    1
555.0    1
Name: col2, dtype: int64

In [8]:
type(a)

pandas.core.series.Series

### Selecting Data


In [9]:
df

Unnamed: 0,col1,col2,col3
0,1,444.0,abc
1,2,555.0,defss
2,5,666.0,ghirfd
3,4,444.0,xyzh
4,3,,sdgdgff


___

In [10]:
# Element-wise comparison produces a boolean Series (a mask) with the same index;
# the NaN in row 4 compares as False.
ser = df['col2'] > 450
ser

0    False
1     True
2     True
3    False
4    False
Name: col2, dtype: bool

In [11]:
type(ser)

pandas.core.series.Series

______________

In [12]:
df1 = pd.DataFrame(ser)
df1

Unnamed: 0,col2
0,False
1,True
2,True
3,False
4,False


In [13]:
type(df1)

pandas.core.frame.DataFrame

______

In [14]:
df[ser]
# = df[df['col2'] > 450 ]

Unnamed: 0,col1,col2,col3
1,2,555.0,defss
2,5,666.0,ghirfd


In [15]:
# Combine masks with & (element-wise AND). Each comparison needs its own
# parentheses because & binds tighter than < and ==.
# Note: col2 == 444 already implies col2 < 600, so the first condition is
# redundant here; it only illustrates the syntax.
df[(df['col2'] < 600) & (df['col2'] == 444)]

Unnamed: 0,col1,col2,col3
0,1,444.0,abc
3,4,444.0,xyzh


In [16]:
df[(df['col2'] < 600) & (df['col1'] == 2)]

Unnamed: 0,col1,col2,col3
1,2,555.0,defss


### Applying Functions

In [17]:
def time2(x):
    """Return ``x`` multiplied by 2 (whatever ``x * 2`` means for its type)."""
    doubled = x * 2
    return doubled

In [18]:
df['col1'].apply(time2)
# = df['col1'].apply(lambda x : x * 2)

0     2
1     4
2    10
3     8
4     6
Name: col1, dtype: int64

In [19]:
df['col3'].apply(len)
# = df['col3'].apply(lambda x : len(x))

0    3
1    5
2    6
3    4
4    7
Name: col3, dtype: int64

In [20]:
# sum() skips the NaN entry: 444 + 555 + 666 + 444 = 2109.0
df['col2'].sum()

2109.0

### Permanently Removing a Column

In [21]:
df

Unnamed: 0,col1,col2,col3
0,1,444.0,abc
1,2,555.0,defss
2,5,666.0,ghirfd
3,4,444.0,xyzh
4,3,,sdgdgff


In [22]:
# del removes the column from df itself (nothing is returned),
# so col1 is gone for every later cell that uses df.
del df['col1']
df

Unnamed: 0,col2,col3
0,444.0,abc
1,555.0,defss
2,666.0,ghirfd
3,444.0,xyzh
4,,sdgdgff


### Get column and index names

In [23]:
df.columns

Index(['col2', 'col3'], dtype='object')

In [24]:
df.index

RangeIndex(start=0, stop=5, step=1)

### Sorting and Ordering a DataFrame

In [25]:
df

Unnamed: 0,col2,col3
0,444.0,abc
1,555.0,defss
2,666.0,ghirfd
3,444.0,xyzh
4,,sdgdgff


In [26]:
# Sort the rows by col2 (ascending), modifying df in place.
# Index labels travel with their rows and the NaN row ends up last,
# as the output of the next cell shows.
df.sort_values(by = 'col2', inplace = True)

In [27]:
df

Unnamed: 0,col2,col3
0,444.0,abc
3,444.0,xyzh
1,555.0,defss
2,666.0,ghirfd
4,,sdgdgff


### Check for Null Values

In [28]:
# Find Null Values
df.isnull()

Unnamed: 0,col2,col3
0,False,False
3,False,False
1,False,False
2,False,False
4,True,False


In [29]:
# Drop rows with NaN Values
df.dropna(inplace = True)
df

Unnamed: 0,col2,col3
0,444.0,abc
3,444.0,xyzh
1,555.0,defss
2,666.0,ghirfd


In [30]:
# Re-create the DataFrame from scratch (the previous cells removed a column
# and dropped the NaN row in place).
df = pd.DataFrame(dict(
    col1=[1, 2, 5, 4, 3],
    col2=[444, 555, 666, 444, np.nan],
    col3=['abc', 'defss', 'ghirfd', 'xyzh', 'sdgdgff'],
))
df

Unnamed: 0,col1,col2,col3
0,1,444.0,abc
1,2,555.0,defss
2,5,666.0,ghirfd
3,4,444.0,xyzh
4,3,,sdgdgff


### Filling in NaN values with something else

In [31]:
# Replace every NaN in the frame with the string 'Empty', modifying df in place.
# col2 now mixes numbers with a string (see the output below).
# NOTE(review): presumably col2 becomes object dtype after this; confirm before doing numeric work on it.
df.fillna('Empty', inplace = True)
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,defss
2,5,666,ghirfd
3,4,444,xyzh
4,3,Empty,sdgdgff
