# Data Cleaning 

## Objectives
- Clean dataset in preparation for analysis and modelling

## Inputs 
- outputs/datasets/collection/HousingPrices.csv

## Outputs
- outputs/datasets/cleaned/HousingPrices.csv
- outputs/datasets/cleaned/TestSetCleaned.csv
- outputs/datasets/cleaned/TrainSetCleaned.csv

---

## Change working directory

In [None]:
# Record the directory the notebook kernel is currently running in.
import os 
cwd = os.getcwd()
cwd  # bare expression: shown as the cell output

In [None]:
# Move one level up from the directory captured in `cwd`, so that the
# relative "outputs/..." paths used later resolve from there.
parent_dir = os.path.dirname(cwd)
os.chdir(parent_dir)
print("You set a new current working directory")


In [None]:
# Re-read the working directory to confirm the change took effect.
cwd = os.getcwd()
cwd  # bare expression: shown as the cell output

---

## Load data

In [None]:
import pandas as pd
# Load the collected housing dataset; the path is relative to the current
# working directory.
df = pd.read_csv("outputs/datasets/collection/HousingPrices.csv")
df.head()  # preview the first rows as the cell output

## Data Exploration

In [None]:
# Names of every column that contains at least one missing value.
has_missing = df.isna().any()
vars_with_missing_data = df.columns[has_missing].to_list()
vars_with_missing_data

In [None]:
from ydata_profiling import ProfileReport

# Only profile the columns that actually have gaps; say so when there are none.
if not vars_with_missing_data:
    print("There are no variables with missing data")
else:
    profile = ProfileReport(df=df[vars_with_missing_data], minimal=True)
    profile.to_notebook_iframe()

## Assessing Missing Data Levels

In [None]:
def EvaluateMissingData(df):
    """Summarise the columns of ``df`` that contain missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with three columns:
        ``RowsWithMissingData`` (count of null entries),
        ``PercentageOfDataset`` (that count as a percentage of ``len(df)``,
        rounded to 2 decimals) and ``DataType`` (the column dtype).
        Only columns with at least one missing value are included, ordered
        by ``PercentageOfDataset`` descending.
    """
    missing_data_absolute = df.isnull().sum()
    missing_data_percentage = (missing_data_absolute / len(df) * 100).round(2)

    df_missing_data = pd.DataFrame(
        data={
            "RowsWithMissingData": missing_data_absolute,
            "PercentageOfDataset": missing_data_percentage,
            "DataType": df.dtypes,
        }
    )

    # Filter on the absolute count rather than the rounded percentage:
    # a very small share of missing rows rounds to 0.00 and would
    # otherwise make the column disappear from the report.
    return (
        df_missing_data
        .sort_values(by=["PercentageOfDataset"], ascending=False)
        .query("RowsWithMissingData > 0")
    )

In [None]:
# Show the missing-data summary for the full dataset as the cell output.
EvaluateMissingData(df)

Observations:
- EnclosedPorch and WoodDeckSF have severe levels of missing data. It would be most sensible to drop these fields before any analysis or modelling. 
- Categorical variables with missing data are GarageFinish, BsmtFinType1 and BsmtExposure - we should use a CategoricalImputer for these variables.
- Numerical variables with missing data are LotFrontage, BedroomAbvGr, 2ndFlrSF, GarageAge and MasVnrArea - we should use a MeanMedianImputer or ArbitraryNumberImputer for these variables.

### Assessing which imputer to use on numerical values

In [None]:
%matplotlib inline

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

numerical_variables_missing = ['LotFrontage', 'BedroomAbvGr', '2ndFlrSF', 'GarageAge', 'MasVnrArea']

# One histogram (with KDE overlay) per variable, drawn from the observed
# (non-missing) values only.
for column_name in numerical_variables_missing:
    observed_values = df[column_name].dropna()
    sns.histplot(observed_values, kde=True)
    plt.title(f"Distribution of {column_name}")
    plt.show()

Observations: 
- 2ndFlrSF and GarageAge can be filled with 0. This will be more meaningful than filling with the median, since it likely represents the lack of 2nd floor/garage.
- It is likely that the missing values for LotFrontage, BedroomAbvGr and MasVnrArea are due to omitted data entry rather than a true value of 0, so a MeanMedianImputer works best. We will use a median imputer since these variables are not normally distributed. 

## Handling missing data 

In [None]:
from sklearn.preprocessing import FunctionTransformer
from feature_engine.imputation import MeanMedianImputer, ArbitraryNumberImputer, CategoricalImputer
from sklearn.pipeline import Pipeline

In [None]:
cols_to_drop = ['WoodDeckSF', 'EnclosedPorch']


def drop_columns(X):
    """Return ``X`` without the columns named in ``cols_to_drop``."""
    return X.drop(cols_to_drop, axis=1)


# Wrap the function so it can be used as a pipeline step.
dropper = FunctionTransformer(func=drop_columns)

In [None]:
# Data-cleaning pipeline; the steps run in the order listed.
pipeline = Pipeline([
    # Remove the columns handled by `dropper`.
    ('drop_cols', dropper),
    # Fill gaps in these numerical variables with each variable's median.
    ('median', MeanMedianImputer(
        imputation_method='median',
        variables=['LotFrontage', 'BedroomAbvGr', 'MasVnrArea'])),
    # Fill gaps in these numerical variables with the constant 0.
    ('zero_variables', ArbitraryNumberImputer(
        arbitrary_number=0,
        variables=['2ndFlrSF', 'GarageAge'])),
    # Categorical gaps use feature_engine's 'missing' imputation method
    # (presumably a placeholder label; check imputer_dict_ after fitting).
    ('cat_imputer', CategoricalImputer(
        imputation_method='missing',
        variables=['GarageFinish', 'BsmtFinType1', 'BsmtExposure'])),
])
pipeline

In [None]:
# Fit every pipeline step on the full dataset `df`.
# NOTE(review): the train/test split is performed later in this notebook, so
# the statistics learned here are computed from rows that end up in the test
# set as well — confirm this is intended.
pipeline.fit(df)

In [None]:
# Apply the fitted pipeline to the same dataset to obtain the cleaned frame.
df_clean = pipeline.transform(df)

In [None]:
# Count the missing values left in each column after cleaning (cell output).
df_clean.isnull().sum()

In [None]:
# Print the fill value each fitted imputer step will use, one line per step.
for label, step_name in (
    ("Imputer dict for median: ", 'median'),
    ("Imputer dict for zero variables: ", 'zero_variables'),
    ("Imputer dict for categorical variables: ", 'cat_imputer'),
):
    print(label, pipeline[step_name].imputer_dict_)

Observations:
- MasVnrArea median is 0 anyway
- Other variables are being transformed correctly

## Investigating effect of data cleaning pipeline

In [None]:
# Plotting libraries used by DataCleaningEffect, defined in this cell.
import seaborn as sns
sns.set(style="whitegrid")  # use seaborn's "whitegrid" style for the plots
import matplotlib.pyplot as plt

def DataCleaningEffect(df_original, df_cleaned, variables_applied_with_method):
    """Plot each listed variable before and after cleaning.

    Non-numeric variables (as typed in ``df_original``) are drawn as a
    grouped count plot; all others as two overlaid histograms with KDE.

    Args:
        df_original: DataFrame before the cleaning method was applied.
        df_cleaned: DataFrame after the cleaning method was applied.
        variables_applied_with_method: Iterable of column names to plot.

    Returns:
        None. A header is printed and one figure is shown per variable.
    """
    # Columns that are not numeric in the original data get a bar plot.
    non_numeric_columns = df_original.select_dtypes(exclude=['number']).columns

    print("\n=====================================================================================")
    print("* Distribution Effect Analysis After Data Cleaning Method in the following variables:")
    print(f"{variables_applied_with_method} \n\n")

    # plot_number labels the figures in the order the variables were given.
    for plot_number, var in enumerate(variables_applied_with_method, start=1):
        plot_title = f"Distribution Plot {plot_number}: {var}"

        if var not in non_numeric_columns:
            # Numerical variable: overlay original and cleaned histograms.
            fig, axes = plt.subplots(figsize=(10, 5))
            for frame, colour, legend_label in (
                (df_original, "#432371", 'Original'),
                (df_cleaned, "#FAAE7B", 'Cleaned'),
            ):
                sns.histplot(data=frame, x=var, color=colour, label=legend_label,
                             kde=True, element="step", ax=axes)
        else:
            # Categorical variable: stack both versions in long form so the
            # count plot can split the bars by 'Type'.
            stacked = pd.concat(
                [
                    pd.DataFrame({"Type": "Original", "Value": df_original[var]}),
                    pd.DataFrame({"Type": "Cleaned", "Value": df_cleaned[var]}),
                ],
                axis=0,
            )
            fig, axes = plt.subplots(figsize=(15, 5))
            sns.countplot(hue='Type', data=stacked, x="Value", palette=['#432371', "#FAAE7B"])
            plt.xticks(rotation=90)

        axes.set(title=plot_title)
        plt.legend()
        plt.show()

In [None]:
# Compare original and cleaned distributions for the numerical variables
# that had missing data.
DataCleaningEffect(df_original=df,
                   df_cleaned=df_clean,
                   variables_applied_with_method=numerical_variables_missing)

## Splitting cleaned df into test and train sets

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): X is the whole cleaned frame, so it still contains the
# 'SalePrice' column that is also used as y — confirm this is intended.
X = df_clean
y = X['SalePrice']

# Hold out 20% of the rows for testing; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

print(f"TrainSet shape: {X_train.shape} \nTestSet shape: {X_test.shape}")

### Checking train set is clean

In [None]:
# Show the missing-data summary for the train set; an empty table means no
# column has missing values.
EvaluateMissingData(X_train)

## Push cleaned data to repo

In [None]:
from pathlib import Path

# Make sure the target folder exists before writing.
out_dir = Path("outputs/datasets/cleaned")
out_dir.mkdir(parents=True, exist_ok=True)

# Write the full cleaned dataset and both splits, without the index column.
for file_name, frame in (
    ("HousingPrices.csv", df_clean),
    ("TrainSetCleaned.csv", X_train),
    ("TestSetCleaned.csv", X_test),
):
    frame.to_csv(out_dir / file_name, index=False)