### 1. Handling of missing values

In [1]:
import pandas as pd
import numpy as np

In [2]:
# One float Series with a single gap (np.nan) to practise on.
# NOTE: despite its name, my_dataframe is a pandas Series, not a DataFrame.
my_dataframe = pd.Series(
    data=[1.5, 7.8, np.nan, 0],
)
my_dataframe

0    1.5
1    7.8
2    NaN
3    0.0
dtype: float64

In [3]:
# Detect missing values: isna() gives a boolean mask that is True where the value is NaN.

my_dataframe.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# Text Series holding two different missing-value markers: np.nan and None.
# NOTE: despite its name, my_other_dataframe is a pandas Series, not a DataFrame.
my_other_dataframe = pd.Series(
    data=['apple', 'mango', np.nan, 'strawberry', None, 'banana'],
)

my_other_dataframe

0         apple
1         mango
2           NaN
3    strawberry
4          None
5        banana
dtype: object

In [9]:
# Both np.nan and None are reported as missing (True) by isna().
my_other_dataframe.isna()

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [10]:
# Filter out missing values: dropna() returns a new Series without the NaN entry;
# the remaining rows keep their original index labels (0, 1, 3).

my_dataframe.dropna()

0    1.5
1    7.8
3    0.0
dtype: float64

In [13]:
# Show the Series again: the NaN at index 2 is still there, so it has not been modified.
my_dataframe

0    1.5
1    7.8
2    NaN
3    0.0
dtype: float64

In [14]:
# Make the drop permanent by rebinding the name to the filtered Series.
# Reassignment is used instead of inplace=True: the resulting Series is the same,
# and the statement shows explicitly what my_dataframe refers to afterwards.

my_dataframe = my_dataframe.dropna()

In [15]:
# The NaN entry is now gone from my_dataframe itself (index label 2 is missing).
my_dataframe

0    1.5
1    7.8
3    0.0
dtype: float64

In [19]:
# Permanently drop the missing entries (np.nan and None) by rebinding the name
# to the filtered Series instead of using inplace=True.
my_other_dataframe = my_other_dataframe.dropna()

In [20]:
# Both missing entries (index 2 and 4) have been removed from my_other_dataframe.
my_other_dataframe

0         apple
1         mango
3    strawberry
5        banana
dtype: object

In [21]:
# Small people table with gaps spread over three columns:
# Name, Weight and City each miss two values (None or np.nan); Profession is complete.
data = pd.DataFrame(
    dict(
        Name=['Rafael', 'Delia', 'David', 'Sameer', None, np.nan],
        Weight=[60, np.nan, 64, 56, 62, np.nan],
        City=['Madrid', 'Paris', None, np.nan, 'London', 'Delhi'],
        Profession=['Manager', 'Team Lead', 'Project Manager', 'Developer', 'Team Lead', 'Manager'],
    )
)
data

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager
1,Delia,,Paris,Team Lead
2,David,64.0,,Project Manager
3,Sameer,56.0,,Developer
4,,62.0,London,Team Lead
5,,,Delhi,Manager


In [22]:
# Element-wise missing-value mask for the whole frame (True = missing).
data.isna()

Unnamed: 0,Name,Weight,City,Profession
0,False,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,True,False
4,True,False,False,False
5,True,True,False,False


In [11]:
# Which columns contain at least one missing value?
data.isna().any()

Name           True
Weight         True
City           True
Profession    False
dtype: bool

In [23]:
# Number of missing values in each column (sum of the True entries in the mask).
data.isna().sum()

Name          2
Weight        2
City          2
Profession    0
dtype: int64

In [24]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4 non-null      object 
 1   Weight      4 non-null      float64
 2   City        4 non-null      object 
 3   Profession  6 non-null      object 
dtypes: float64(1), object(3)
memory usage: 320.0+ bytes


In [25]:
# Treat missing values by dropping every row that has at least one of them;
# only row 0 is fully populated, so it is the only row left.

data.dropna()

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager


In [15]:
# Show data again: all six rows, including those with gaps, are still there.
data

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager
1,Delia,,Paris,Team Lead
2,David,64.0,,Project Manager
3,Sameer,56.0,,Developer
4,,62.0,London,Team Lead
5,,,Delhi,Manager


In [26]:
# Drop every column that has at least one missing value; only Profession has none.
data.dropna(axis = 'columns')

Unnamed: 0,Profession
0,Manager
1,Team Lead
2,Project Manager
3,Developer
4,Team Lead
5,Manager


In [24]:
# Alternative treatment: fill the missing values instead of dropping them

In [27]:
# Starting point for filling: the frame with all of its gaps.
data

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager
1,Delia,,Paris,Team Lead
2,David,64.0,,Project Manager
3,Sameer,56.0,,Developer
4,,62.0,London,Team Lead
5,,,Delhi,Manager


In [28]:
# Fill every missing value with 0. The result is a new frame (data keeps its gaps),
# and the text columns Name and City receive 0 as well.
data.fillna(0)

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager
1,Delia,0.0,Paris,Team Lead
2,David,64.0,0,Project Manager
3,Sameer,56.0,0,Developer
4,0,62.0,London,Team Lead
5,0,0.0,Delhi,Manager


In [29]:
# data still has its gaps: the fill was not applied to the frame itself.
data

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager
1,Delia,,Paris,Team Lead
2,David,64.0,,Project Manager
3,Sameer,56.0,,Developer
4,,62.0,London,Team Lead
5,,,Delhi,Manager


In [30]:
# Fill per column: the dict maps a column name to its fill value.
# City is not listed, so its gaps remain.
data.fillna({'Name' : 'XYZ', 'Weight' : 77})

Unnamed: 0,Name,Weight,City,Profession
0,Rafael,60.0,Madrid,Manager
1,Delia,77.0,Paris,Team Lead
2,David,64.0,,Project Manager
3,Sameer,56.0,,Developer
4,XYZ,62.0,London,Team Lead
5,XYZ,77.0,Delhi,Manager


In [30]:
# Fill missing weights with the mean of the available weights (60.5 for this data).
data['Weight'].fillna(data['Weight'].mean())

0    60.0
1    60.5
2    64.0
3    56.0
4    62.0
5    60.5
Name: Weight, dtype: float64

### 2. Detecting Duplicates

In [31]:
# Sample frame for the duplicate checks: row 2 repeats row 0 exactly,
# while row 3 shares only the city with them.
# NOTE: this rebinds data, replacing the frame used in the missing-values section.
data = pd.DataFrame(
    dict(
        c1=['Delhi', 'Melbourne', 'Delhi', 'Delhi', 'Toronto', 'New York', 'Toronto', 'Sydney'],
        c2=[67, 34, 67, 66, 45, 34, 35, 102],
    )
)

data

Unnamed: 0,c1,c2
0,Delhi,67
1,Melbourne,34
2,Delhi,67
3,Delhi,66
4,Toronto,45
5,New York,34
6,Toronto,35
7,Sydney,102


In [32]:
# Flag duplicated rows: True marks a row equal to an earlier one in every column
# (row 2 repeats row 0; row 3 differs in c2, so it is not flagged).
data.duplicated()

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
dtype: bool

In [33]:
# Look at the row that was flagged as a duplicate.
data.iloc[2]

c1    Delhi
c2       67
Name: 2, dtype: object

In [34]:
# Drop the duplicated row; the result keeps the original index labels (2 is missing).
data.drop_duplicates()

Unnamed: 0,c1,c2
0,Delhi,67
1,Melbourne,34
3,Delhi,66
4,Toronto,45
5,New York,34
6,Toronto,35
7,Sydney,102


In [35]:
# data itself still contains the duplicated row 2.
data

Unnamed: 0,c1,c2
0,Delhi,67
1,Melbourne,34
2,Delhi,67
3,Delhi,66
4,Toronto,45
5,New York,34
6,Toronto,35
7,Sydney,102


In [36]:
# Permanently drop the duplicated row: inplace=True modifies data directly
# instead of handing back a new frame.
# NOTE(review): reassignment (data = data.drop_duplicates()) is the usual alternative to
# inplace=True; left as is because a later cell adds a column to this same frame — confirm before changing.

data.drop_duplicates(inplace = True)
data

Unnamed: 0,c1,c2
0,Delhi,67
1,Melbourne,34
3,Delhi,66
4,Toronto,45
5,New York,34
6,Toronto,35
7,Sydney,102


In [38]:
# Lookup table from full city name to its short code.
# NOTE(review): 'NYW' for New York looks like a typo (perhaps 'NYC' was meant) — confirm.
short_names = {
    'Delhi': 'DEL',
    'Melbourne': 'MEL',
    'Toronto': 'TOR',
    'New York': 'NYW',
    'Sydney': 'SYD',
}

In [39]:
# Add column c3 by translating each city in c1 through the short_names lookup.
data['c3'] = data['c1'].map(short_names)
data

Unnamed: 0,c1,c2,c3
0,Delhi,67,DEL
1,Melbourne,34,MEL
3,Delhi,66,DEL
4,Toronto,45,TOR
5,New York,34,NYW
6,Toronto,35,TOR
7,Sydney,102,SYD


In [41]:
# Discretization (binning): raw ages to be grouped into intervals.
age = [
    21, 24, 19, 38, 42, 65, 67, 19,
    18, 34, 78, 54, 39, 49, 63, 99,
]

# Interval edges, one every 25 years: [0, 25, 50, 75, 100]
bins = list(range(0, 101, 25))

In [45]:
# Assign each age to one of the four intervals defined by bins and name the intervals.
# NOTE(review): the first label, 'teenage', is also given to ages such as 21 and 24 — confirm the wording.
age_bins = pd.cut(
    x=age,
    bins=bins,
    labels=['teenage', 'adult', 'senior', 'old aged'],
)

age_bins

['teenage', 'teenage', 'teenage', 'adult', 'adult', ..., 'senior', 'adult', 'adult', 'senior', 'old aged']
Length: 16
Categories (4, object): ['teenage' < 'adult' < 'senior' < 'old aged']