In [218]:
import pandas as pd
import numpy as np

# Load data

In [219]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(url, header=None, names=['sepal_length','sepal_width', 'petal_length', 'petal_width', 'class'])

iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Data cleaning

## Missing values

### Is there any missing value in the dataframe?

In [220]:
pd.isna(iris).sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

### Lets set the values of the rows 10 to 29 of the column 'petal_length' to NaN

In [221]:
iris.iloc[10:30, 2] = np.nan
iris.iloc[10:30, :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
10,5.4,3.7,,0.2,Iris-setosa
11,4.8,3.4,,0.2,Iris-setosa
12,4.8,3.0,,0.1,Iris-setosa
13,4.3,3.0,,0.1,Iris-setosa
14,5.8,4.0,,0.2,Iris-setosa
15,5.7,4.4,,0.4,Iris-setosa
16,5.4,3.9,,0.4,Iris-setosa
17,5.1,3.5,,0.3,Iris-setosa
18,5.7,3.8,,0.3,Iris-setosa
19,5.1,3.8,,0.3,Iris-setosa


### Which column has the maximum number of missing values?

In [222]:
iris.columns[pd.isnull(iris).sum().argmax()]

'petal_length'

### Try to substitute the NaN values with two methods:
- replace null values with column mean (apply it to a copy of the dataframe)
- replace null values with 1.0



In [223]:
iris_copy = iris.copy()
iris_copy = iris_copy.fillna(iris_copy.loc[:, ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']].mean())
iris_copy.head(20)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [224]:
iris = iris.fillna(1.0)
iris.head(20)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


### Set the first 3 rows as NaN

In [225]:
iris.iloc[0:3, :] = np.nan
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,,,,,
1,,,,,
2,,,,,
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Delete the rows that have all NaN

In [226]:
iris = iris.dropna(how='all')
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### Reset the index so it begins with 0 again

In [227]:
iris = iris.reset_index(drop=True)
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,4.6,3.1,1.5,0.2,Iris-setosa
1,5.0,3.6,1.4,0.2,Iris-setosa
2,5.4,3.9,1.7,0.4,Iris-setosa
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,Iris-virginica
143,6.3,2.5,5.0,1.9,Iris-virginica
144,6.5,3.0,5.2,2.0,Iris-virginica
145,6.2,3.4,5.4,2.3,Iris-virginica


## Duplicates

### Does the dataframe contain duplicated rows? If any, visualize all duplicated rows (don't omit first or last occurrences)

In [228]:
dup = iris[iris.duplicated(keep=False)]
dup

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
6,4.9,3.1,1.5,0.1,Iris-setosa
8,4.8,3.4,1.0,0.2,Iris-setosa
21,4.8,3.4,1.0,0.2,Iris-setosa
31,4.9,3.1,1.5,0.1,Iris-setosa
34,4.9,3.1,1.5,0.1,Iris-setosa
98,5.8,2.7,5.1,1.9,Iris-virginica
139,5.8,2.7,5.1,1.9,Iris-virginica


### Which row is the most repeated?

In [229]:
dup.groupby(list(dup.columns.values)).size()

sepal_length  sepal_width  petal_length  petal_width  class         
4.8           3.4          1.0           0.2          Iris-setosa       2
4.9           3.1          1.5           0.1          Iris-setosa       3
5.8           2.7          5.1           1.9          Iris-virginica    2
dtype: int64

### Drop duplicated rows

In [230]:
iris = iris.drop_duplicates()
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,4.6,3.1,1.5,0.2,Iris-setosa
1,5.0,3.6,1.4,0.2,Iris-setosa
2,5.4,3.9,1.7,0.4,Iris-setosa
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa
...,...,...,...,...,...
142,6.7,3.0,5.2,2.3,Iris-virginica
143,6.3,2.5,5.0,1.9,Iris-virginica
144,6.5,3.0,5.2,2.0,Iris-virginica
145,6.2,3.4,5.4,2.3,Iris-virginica


## Detect outliers, e.g., values that are higher than 85th percentile and lower than 25th percentile.

In [231]:
iris_copy = iris.loc[:, ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']].copy()
q_85 = iris_copy.quantile(0.85, axis=0)
q_25 = iris_copy.quantile(0.25, axis=0)
outliers = iris[(iris_copy > q_85).all(axis=1)]
outliers = pd.concat([outliers, iris[(iris_copy < q_25).all(axis=1)]])
outliers = outliers.sort_index()
outliers

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
38,4.5,2.3,1.3,0.3,Iris-setosa
106,7.2,3.6,6.1,2.5,Iris-virginica
114,7.7,3.8,6.7,2.2,Iris-virginica


# Data transformation

## Replace class values by removing "Iris-" prefix (use a dictionary)

In [232]:
names = {}
for name in iris['class'].unique():
    names[name] = name.replace("Iris-", "")
iris['class'] = iris['class'].replace(names)
iris.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iris['class'] = iris['class'].replace(names)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,4.6,3.1,1.5,0.2,setosa
1,5.0,3.6,1.4,0.2,setosa
2,5.4,3.9,1.7,0.4,setosa
3,4.6,3.4,1.4,0.3,setosa
4,5.0,3.4,1.5,0.2,setosa


## Delete columns
Delete for example class column

In [233]:
iris = iris.drop(columns='class')

## How to normalize all columns in a dataframe?
- Normalize all columns of df by subtracting the column mean and divide by standard deviation.
- Range all columns of df such that the minimum value in each column is 0 and max is 1.

In [234]:
normalized = iris.apply(lambda x: ((x.max() - x)/(x.max() - x.min())).round(2))
normalized

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.92,0.54,0.92,0.96
1,0.81,0.33,0.93,0.96
2,0.69,0.21,0.88,0.88
3,0.92,0.42,0.93,0.92
4,0.81,0.42,0.92,0.96
...,...,...,...,...
142,0.33,0.58,0.29,0.08
143,0.44,0.79,0.32,0.25
144,0.39,0.58,0.29,0.21
145,0.47,0.42,0.25,0.08


## Binning and discretization
Discretize dataframe columns in 4 bins and get the new value frequency distribution

In [235]:
discrete = iris.apply(lambda x: pd.cut(x, 4))
discrete

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,"(4.296, 5.2]","(2.6, 3.2]","(0.994, 2.475]","(0.0976, 0.7]"
1,"(4.296, 5.2]","(3.2, 3.8]","(0.994, 2.475]","(0.0976, 0.7]"
2,"(5.2, 6.1]","(3.8, 4.4]","(0.994, 2.475]","(0.0976, 0.7]"
3,"(4.296, 5.2]","(3.2, 3.8]","(0.994, 2.475]","(0.0976, 0.7]"
4,"(4.296, 5.2]","(3.2, 3.8]","(0.994, 2.475]","(0.0976, 0.7]"
...,...,...,...,...
142,"(6.1, 7.0]","(2.6, 3.2]","(3.95, 5.425]","(1.9, 2.5]"
143,"(6.1, 7.0]","(1.998, 2.6]","(3.95, 5.425]","(1.3, 1.9]"
144,"(6.1, 7.0]","(2.6, 3.2]","(3.95, 5.425]","(1.9, 2.5]"
145,"(6.1, 7.0]","(3.2, 3.8]","(3.95, 5.425]","(1.9, 2.5]"


In [236]:
discrete['sepal_length'].value_counts(normalize=True)

sepal_length
(5.2, 6.1]      0.342657
(6.1, 7.0]      0.300699
(4.296, 5.2]    0.272727
(7.0, 7.9]      0.083916
Name: proportion, dtype: float64

## Binarize categorical data (dummy variables)
Based on the prevoius result, binarize all dataframe columns

In [237]:
pd.get_dummies(discrete)

Unnamed: 0,"sepal_length_(4.296, 5.2]","sepal_length_(5.2, 6.1]","sepal_length_(6.1, 7.0]","sepal_length_(7.0, 7.9]","sepal_width_(1.998, 2.6]","sepal_width_(2.6, 3.2]","sepal_width_(3.2, 3.8]","sepal_width_(3.8, 4.4]","petal_length_(0.994, 2.475]","petal_length_(2.475, 3.95]","petal_length_(3.95, 5.425]","petal_length_(5.425, 6.9]","petal_width_(0.0976, 0.7]","petal_width_(0.7, 1.3]","petal_width_(1.3, 1.9]","petal_width_(1.9, 2.5]"
0,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False
1,True,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False
2,False,True,False,False,False,False,False,True,True,False,False,False,True,False,False,False
3,True,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False
4,True,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,True
143,False,False,True,False,True,False,False,False,False,False,True,False,False,False,True,False
144,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,True
145,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True
