In [20]:
# Import required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from numpy import mean, std
from sklearn import metrics, preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

1. Load the above dataset
2. Get the number of columns and rows.
3. Use the `describe` function on the dataset.
4. Use the `info` function on the dataset.
5. Count the number of missing values
6. Replace the missing values with the average of non-null values.
7. Perform data normalization on all the features.
8. Perform data standardization on all the features.

# Task 1

In [3]:
# Load the dataset from data.csv (path is relative to the notebook's working directory)
dt = pd.read_csv('data.csv')

# Task 2

In [4]:
# Get the number of rows and columns; .shape is a (n_rows, n_columns) tuple
dt.shape

(511, 14)

# Task 3

In [6]:
# Summary statistics for each numeric column (count, mean, std, min, quartiles, max).
# A count lower than the number of rows means that column has missing values.
dt.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,511.0,511.0,511.0,511.0,511.0,506.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0
mean,3.584139,11.252446,11.151096,0.068493,0.554757,6.287589,68.616243,3.783876,9.485323,407.440313,18.5,356.6009,12.87955,22.682192
std,8.564433,23.234838,6.828175,0.252838,0.11531,0.703802,28.09913,2.098631,8.688469,167.903532,2.200348,90.882679,7.797416,9.484262
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082325,0.0,5.19,0.0,0.449,5.8855,45.05,2.10035,4.0,279.5,17.4,374.71,7.065,17.05
50%,0.26169,0.0,9.69,0.0,0.538,6.209,77.3,3.1523,5.0,330.0,19.1,391.34,11.45,21.2
75%,3.621175,12.5,18.1,0.0,0.624,6.62975,94.05,5.118,24.0,666.0,20.2,396.21,17.105,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,23.0,396.9,76.0,67.0


# Task 4

In [7]:
# Print each column's non-null count and dtype, plus the frame's memory usage
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64  
 4   NOX      511 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64  
 9   TAX      511 non-null    int64  
 10  PTRATIO  511 non-null    float64
 11  B        511 non-null    float64
 12  LSTAT    511 non-null    float64
 13  MEDV     511 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 56.0 KB


# Task 5

In [8]:
# Count the missing (NaN) values in each column
dt.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         5
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

# Task 6

In [9]:
# Replace the missing values with the average of non-null values
def clean(data, cols=('RM',)):
  """Fill missing values in the given columns with each column's mean.

  pandas computes the mean over the non-null entries only, which is the
  "average of non-null values" the task asks for.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to clean. It is modified in place, so any other name bound to
      the same frame sees the filled values as well.
  cols : iterable of str, optional
      Columns to impute. Defaults to ('RM',).

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, so ``data = clean(dt)`` works.
  """
  for col in cols:
    # Assign the filled column back rather than calling
    # data[col].fillna(..., inplace=True): that form operates on an
    # intermediate Series, is deprecated in pandas 2.x and has no effect
    # on the frame under copy-on-write.
    data[col] = data[col].fillna(data[col].mean())
  return data

In [10]:
# clean() returns the frame it was given, so `data` and `dt` name the same DataFrame
data = clean(dt)

In [11]:
# Confirm the imputation left no missing values.
# Inspect `data` (the frame returned by clean() and used by the later cells)
# rather than `dt`, so the check does not rely on clean() mutating its argument.
data.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

# Task 7

In [25]:
# Perform data normalization to all the features
data_scaler = MinMaxScaler()
data_scaled = data_scaler.fit_transform(data)
data_scaled

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, ...,
        1.00000000e+00, 4.37592568e-02, 3.06451613e-01],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, ...,
        1.00000000e+00, 9.97711054e-02, 2.67741935e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, ...,
        9.89737254e-01, 3.09680894e-02, 4.79032258e-01],
       ...,
       [4.92312679e-03, 0.00000000e+00, 4.41348974e-01, ...,
        8.64087952e-01, 2.59458732e-01, 7.90322581e-01],
       [8.66933843e-03, 0.00000000e+00, 4.48680352e-01, ...,
        8.64087952e-01, 1.00000000e+00, 1.00000000e+00],
       [7.28336376e-03, 0.00000000e+00, 4.52346041e-01, ...,
        8.08613647e-01, 5.82604012e-01, 3.06451613e-01]])

# Task 8

In [26]:
# Perform data standardization to all the features
scaler_standard = StandardScaler()
scaled_std = scaler_standard.fit_transform(data_scaled)
scaled_std

array([[-0.41816246,  0.29069132, -1.29606519, ...,  0.44385344,
        -1.01409118,  0.13908299],
       [-0.41570923, -0.48476656, -0.59827044, ...,  0.44385344,
        -0.48005831, -0.11421576],
       [-0.41571156, -0.48476656, -0.59827044, ...,  0.39902654,
        -1.13604581,  1.26837325],
       ...,
       [-0.36696944, -0.48476656,  0.19774332, ..., -0.14980003,
         1.04244887,  3.30531737],
       [-0.32801455, -0.48476656,  0.22706243, ..., -0.14980003,
         8.10297964,  4.67735227],
       [-0.34242657, -0.48476656,  0.24172198, ..., -0.39210757,
         4.12340775,  0.13908299]])