## **Exercise 1.03 - Guided Exercise**
### Performing Quality Investigation

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

%matplotlib inline

In [None]:
import ssl

import certifi
from sklearn.datasets import fetch_openml


def _verified_https_context(*args, **kwargs):
    """Return a certificate-verifying TLS context that trusts certifi's CA bundle.

    Positional and keyword arguments are forwarded to
    ``ssl.create_default_context``; ``cafile`` defaults to ``certifi.where()``
    unless the caller supplies its own.
    """
    kwargs.setdefault("cafile", certifi.where())
    return ssl.create_default_context(*args, **kwargs)


# Route HTTPS requests made through the standard library via a context that
# still verifies certificates, instead of switching verification off globally
# with ssl._create_unverified_context.
ssl._create_default_https_context = _verified_https_context

### Loading the Data

In [None]:

# Location of the raw Miami housing CSV; kept in one place so it is easy to swap.
DATA_URL = 'https://raw.githubusercontent.com/fenago/datawrangling/main/miami-housing.csv'

# Download the CSV into a DataFrame.
df_X = pd.read_csv(DATA_URL)

# Preview the first rows (the last expression of a cell is what gets displayed).
df_X.head()

## **Structure Investigation**


In [None]:
# Show the size of the dataset as a (number of rows, number of columns) tuple
df_X.shape


In [None]:
# Count how many columns share each data type.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
df_X.dtypes.value_counts()

##  Structure of numerical features


In [None]:
# For each numerical feature compute the number of unique entries,
# sorted from the fewest to the most distinct values
unique_values = df_X.select_dtypes(include='number').nunique().sort_values()
# NOTE(review): this figure is opened in a different cell from the barplot that
# uses it; with `%matplotlib inline` it is presumably rendered empty at the end
# of this cell, so the 15x4 size may not reach the plot -- confirm.
plt.figure(figsize=(15, 4))
# Apply seaborn's 'whitegrid' style to subsequently created plots
sns.set_style('whitegrid')


In [None]:

# Set the style before creating the axes so they pick up the whitegrid theme.
sns.set_style('whitegrid')

# Create the figure in the same cell as the plot: a figure opened in an
# earlier cell is not reliably the one the bars are drawn on.
fig, ax = plt.subplots(figsize=(15, 4))

# One bar per numerical feature; height = number of distinct values.
g = sns.barplot(x=unique_values.index, y=unique_values, palette='inferno', ax=ax)
# The counts are plotted on a logarithmic axis so small and large counts both stay readable.
g.set_yscale("log")
# Tilt the feature names so neighbouring labels do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=45, horizontalalignment='right')
g.set_title('Unique values per feature')
plt.show()

##  Conclusion of structure investigation
To conclude the structure investigation, summarize each feature's data type, non-null count, and basic descriptive statistics.

In [None]:
# Print a per-column overview of the frame (info() writes its report to stdout)
df_X.info() 
# Descriptive statistics of the features; as the last expression of the cell
# this table is what gets displayed
df_X.describe()

### Quality Investigation
In this section we check the data for duplicates, missing values, and unwanted entries or recording errors:

In [None]:
# Compare rows on every column except the 'PARCELNO' identifier,
# then count the rows that repeat an earlier one
features_only = df_X.drop(columns=['PARCELNO'])
n_duplicates = features_only.duplicated().sum()

print(f"You seem to have {n_duplicates} duplicates in your database.")

### Removing Duplicates

In [None]:
# Column names of all features except 'PARCELNO' (the unique identifier).
# Index.drop works on the column labels only, so no copy of the frame is made.
columns_to_consider = df_X.columns.drop(['PARCELNO'])

# Keep the first occurrence of every row that is identical on those columns.
# Reassigning (rather than inplace=True) keeps the statement chainable and
# makes the data lineage explicit.
df_X = df_X.drop_duplicates(subset=columns_to_consider)
df_X.shape

### Missing Values

In [None]:
# NOTE(review): this cell sits under the "Missing Values" heading but re-draws
# the unique-value counts stored in `unique_values`; it does not look at
# missing values at all -- confirm whether it is intended here.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(15, 4))

# One bar per numerical feature; height = number of distinct values.
g = sns.barplot(x=unique_values.index, y=unique_values, palette='inferno', ax=ax)
g.set_yscale("log")
# Tilt the feature names so neighbouring labels do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=45, horizontalalignment='right')
g.set_title('Unique values per feature')
plt.show()

### Per Sample

In [None]:

fig = plt.figure(figsize=(15, 8))
sns.set_style('whitegrid')

# Boolean frame: True wherever a value is missing
missing_mask = df_X.isnull()

# Draw the mask as an image, one row per sample and one column per feature.
# (Alternative view: reorder the rows by their number of missing values
# before plotting, so that incomplete samples are grouped together.)
g = sns.heatmap(missing_mask, cmap='viridis', cbar=False, ax=fig.gca())
g.set_xlabel('Column Number')
g.set_ylabel('Sample Number')

In [None]:
!pip install missingno
import missingno as msno
msno.matrix(df_X, labels=True, sort='descending', color=(0.27, 0.52, 1.0));
# g = msno.bar(df_X, labels=True, color="dodgerblue", sort="ascending", figsize=(10,5), fontsize=12)
# g.set_xticklabels(g.get_xticklabels(),rotation=90);

In [None]:

# A sample is kept only if at least 80% of its features are filled in
min_filled_per_row = df_X.shape[1] * 0.80

df_X = df_X.dropna(axis=0, thresh=min_filled_per_row)
# Renumber the remaining rows from 0
df_X = df_X.reset_index(drop=True)
df_X.shape

## Per Feature
As a next step, let's now look at the number of missing values per feature. 



In [None]:
# isna().mean() yields the fraction (0-1) of missing entries per column, so
# the title speaks of a ratio rather than a percentage.
df_X.isna().mean().sort_values().plot(
    kind="bar", figsize=(15, 4),
    title="Ratio of missing values per feature",
    ylabel="Ratio of missing values per feature");


In [None]:

# A feature is kept only if at least 85% of the samples have a value for it
min_filled_per_column = df_X.shape[0] * 0.85

df_X = df_X.dropna(axis=1, thresh=min_filled_per_column)
df_X.shape

## Numerical Features
Use the pandas `plot` method to get a global view of every feature in the dataset.


In [None]:
# Draw every value as a small dot with no connecting line
marker_only = dict(lw=0, marker=".", markersize=1)

# One subplot per feature, arranged in 4 columns (row count inferred via -1)
df_X.plot(subplots=True, layout=(-1, 4), figsize=(15, 30), **marker_only);

## Non-numerical features

In [None]:
# Select the columns whose dtype is not numeric and preview their first rows.
# (Their descriptive properties can be inspected with
# df_X.describe(exclude="number") instead.)
non_numeric = df_X.select_dtypes(exclude="number")
non_numeric.head()

0
1
2
3
4
