# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np

# Load the housing CSV (path is relative to the working directory) into `df`,
# the frame that the following cells read from.
df = pd.read_csv("melb_data_housing.csv")

## Data Analysis

Check data types

In [None]:
# Show the dtype pandas assigned to each column of `df`.
df.dtypes

Select only columns with numeric data

In [None]:
# Select every numeric column. 'number' matches integer and float dtypes alike,
# whereas 'float64' alone would silently drop integer-typed columns.
# Only the first rows are displayed to keep the output short.
df.select_dtypes(include='number').head()

Inspect the first few rows

In [None]:
# Preview the top five rows of the frame.
df.head(n=5)

## Missing Values

Count the missing values (Null or NaN) in each column

In [None]:
# Flag each missing entry, then add the flags up column by column.
missing_values_count = df.isna().sum(axis=0)
missing_values_count

Count the values in each column that are inf or -inf (infinity)

In [None]:
# Flag each entry equal to +inf or -inf, then add the flags up column by column.
missing_values_count = df.isin([np.inf, -np.inf]).sum(axis=0)
missing_values_count

Replace all inf values with nan values

In [None]:
# DataFrame.replace returns a new frame and leaves the original untouched, so
# the result has to be assigned back; otherwise `df` would still contain the
# infinite values. The replacement is idempotent, so re-running the cell is safe.
df = df.replace([np.inf, -np.inf], np.nan)
df.head()

Drop all rows with nan

In [None]:
# Keep only the rows that have no missing value in any column.
df1 = df.dropna(axis=0, how="any")
# Row count before and after the drop, one number per line.
print(len(df), len(df1), sep="\n")

Drop all columns with na

In [None]:
# Discard every column that contains at least one missing value.
df2 = df.dropna(axis="columns")

# Row count before and after the drop.
print(df.shape[0])
print(df2.shape[0])

# Column count before and after the drop.
print(df.shape[1])
print(df2.shape[1])

## Data Imputation

Automatic filling with zero

In [None]:
# Substitute 0 for every missing entry, then confirm per column that none are left.
df3 = df.fillna(value=0)
missing_values_count = df3.isna().sum(axis=0)
missing_values_count

Inspect "YearBuilt" and "BuildingArea"

In [None]:
# Summary statistics of the two columns in the zero-filled frame.
df3.loc[:, ["YearBuilt", "BuildingArea"]].describe()

Replace all NAs with the value that comes directly after them in the same column, then replace all the remaining NAs with 0

In [None]:
# Backward fill: each missing entry takes the next valid value further down the
# same column. Missing values at the end of a column have nothing after them,
# so they are set to 0 afterwards.
# DataFrame.bfill is used because fillna(method=...) is deprecated in current pandas.
df3 = df.bfill(axis=0).fillna(0)

In [None]:
# Summary statistics of the two columns in the current `df3`.
df3.loc[:, ["YearBuilt", "BuildingArea"]].describe()

Use the SimpleImputer from Scikit-Learn and replace NaN values with the most frequent values in a column

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [None]:
from sklearn.impute import SimpleImputer

# Replace each NaN with the most frequent value of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# On a frame that mixes text and numeric columns, fit_transform returns a single
# object-dtype array. The original dtypes are therefore restored after rebuilding
# the frame; otherwise numeric columns stay `object` and describe() reports
# count/unique/top/freq instead of numeric statistics. The original index is
# kept so the rows still line up with `df`.
df4 = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
).astype(df.dtypes.to_dict())

In [None]:
# Summary of the two columns after most-frequent imputation.
df4.loc[:, ["YearBuilt", "BuildingArea"]].describe()

Use the SimpleImputer from Scikit-Learn and replace NaN values with the "Mean" based on all other values in the column

In [None]:
from sklearn.impute import SimpleImputer

# The mean strategy is applied to the float64 columns only.
df_num = df.select_dtypes(include=['float64'])

# Replace each NaN with the mean of its column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
df4 = pd.DataFrame(data=imputer.fit_transform(df_num), columns=df_num.columns)

In [None]:
# Summary of the two columns after mean imputation.
df4.loc[:, ["YearBuilt", "BuildingArea"]].describe()

## Data Transformation

Normalize numerical data with the MinMaxScaler

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [None]:
from sklearn import preprocessing

# Rescale the "Distance" column with MinMaxScaler (default range).
mm_scaler = preprocessing.MinMaxScaler()
df_num = df[["Distance"]]
minmax = mm_scaler.fit_transform(df_num)

# Any infinite or NaN values left after scaling?
# Both checks are returned together: a notebook cell only displays its last
# expression, so two separate statements would hide the first result.
{"has_inf": np.isinf(minmax).any(), "has_nan": np.isnan(minmax).any()}


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the min-max scaled values with a KDE overlay.
# sns.distplot is deprecated; histplot is its replacement and plots counts by
# default, which matches the 'Frequency' axis label.
# The scaler output is 2-D with a single column, so it is flattened to plot
# one distribution.
fig, ax = plt.subplots()
sns.histplot(minmax.ravel(), kde=True, color='blue', edgecolor='black', ax=ax)
ax.set(title='Histogram after MinMax Scaler', xlabel='Value', ylabel='Frequency');

Standardize numerical data with the StandardScaler

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [None]:
from sklearn import preprocessing

# Standardize the single "Distance" column, passed as a one-column frame.
df_num = df.loc[:, ["Distance"]]
standard_scaler = preprocessing.StandardScaler()
standard = standard_scaler.fit_transform(df_num)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the standardized values with a KDE overlay.
# The title now names the StandardScaler; it was previously copied from the
# MinMax plot. sns.distplot is deprecated; histplot is its replacement and
# plots counts by default, which matches the 'Frequency' axis label.
# The scaler output is 2-D with a single column, so it is flattened to plot
# one distribution.
fig, ax = plt.subplots()
sns.histplot(standard.ravel(), kde=True, color='blue', edgecolor='black', ax=ax)
ax.set(title='Histogram after Standard Scaler', xlabel='Value', ylabel='Frequency');

Convert categorical data to One-Hot-Encoding

In [None]:
# Look at the top five rows again before encoding the categorical columns.
df.head(n=5)

Does it make sense to convert a column to one-hot-encoding?

In [None]:
# List the distinct values of the "Suburb" column.
df["Suburb"].unique()

Convert to One-Hot-Encoding in Pandas

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

In [None]:
# Expand "Suburb" into one indicator column per distinct value.
df5 = pd.get_dummies(data=df, columns=["Suburb"])
df5.head(n=5)

Convert to One-Hot-Encoding in Scikit-Learn

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode "Regionname"; fit_transform returns a sparse matrix.
onehot = OneHotEncoder()
result = onehot.fit_transform(df[["Regionname"]])

# `categories_` holds one array per encoded feature. Passing the whole list as
# `columns` would create MultiIndex (tuple) column labels, so the single array
# for "Regionname" is used. The dense array is built once, and the index of
# `df` is reused so the join lines the rows up correctly.
region_onehot = pd.DataFrame(
    result.toarray(),
    columns=onehot.categories_[0],
    index=df.index,
)

# Append the indicator columns to the original frame.
df6 = df.join(region_onehot)
df6.head()

Custom Binary Encoding

In [None]:
# 1 where the seller name contains the literal text "Biggin", otherwise 0.
# na=False: a missing seller name makes str.contains return NaN, and np.where
# treats NaN as true, so it would otherwise be encoded as 1.
# regex=False: "Biggin" is matched as plain text, not as a regular expression.
df["SellerG_Binary"] = np.where(
    df["SellerG"].str.contains("Biggin", na=False, regex=False), 1, 0
)
df[["SellerG", "SellerG_Binary"]].head()

Ordinal Encoding

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the categories of "Method" with OrdinalEncoder.
# The encoder expects 2-D input, so the column is passed as a one-column frame.
ordinal = OrdinalEncoder()
df["Method_ordinal"] = ordinal.fit_transform(df.loc[:, ["Method"]])
df.loc[:, ["Method", "Method_ordinal"]].head(n=5)

Label Encoding

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode "Method" as integer labels and show them beside the original values.
# LabelEncoder takes the column itself (1-D), not a one-column frame.
labelencoder = LabelEncoder()
df["Method_label"] = labelencoder.fit_transform(df.loc[:, "Method"])
df.loc[:, ["Method", "Method_label"]].head(n=5)

In [None]:
# Show the classes stored on the fitted encoder as a plain list.
list(labelencoder.classes_)

In [None]:
# Map the integer labels back to their original "Method" values.
labelencoder.inverse_transform(df['Method_label'])