# Importing all the Dependencies

In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# original df

In [152]:
# loading the data
df = pd.read_csv('../data/hou_all.csv')
df.head()

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24,1.1
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,1
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,1
4,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7,1


In [153]:
#adding columsn name 
col_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV','BIAS_COL']
df.columns = col_names

In [154]:
df = df.iloc[:,:-1]

In [155]:
df.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


- so there are 505 observations and 15 columns

In [156]:
# lets check columns that are numerical types

numerical_cols = df.select_dtypes(include=[np.number]).columns
numerical_cols

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

- looks like no categorical columns
- also since no categorical value, no encoding required

# heavily_moderately correlated df

In [157]:
# correlated dataframe
df_correlated = pd.read_csv('../data/highly_moderate_correlated.csv')


In [158]:
df_correlated = df_correlated.iloc[:,1:]

In [159]:
df_correlated.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


# MEDV outlier removed dataframe


In [160]:
df_removed_outlier= pd.read_csv('../data/df_outlier_removed_from_medv.csv')
df_removed_outlier = df_removed_outlier.iloc[:,1:]
df_removed_outlier.head(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


# Some transformations to remove skewness and reduce effect of outliers

In [161]:
from scipy.stats import mstats
# Log Transformation  ==>  to reduce skewness and the impact of high values.
def log_transform(df, columns):
    for col in columns:
        df[col] = np.log(df[col])  # log1p handles log(0) case
    return df

# Square Root Transformation ==> to lessen the influence of high values in a less aggressive manner than log transformation.
def sqrt_transform(df, columns):
    for col in columns:
        df[col] = np.sqrt(df[col])
    return df

# Winsorization ==> to limit extreme values and reduce the effect of outliers.
def winsorize_transform(df, columns):
    for col in columns:
        df[col] = mstats.winsorize(df[col], limits=[0.05, 0.05])  # limits can be adjusted
    return df

In [162]:
# Select columns to apply transformations based on correlation and outlier analysis
log_transform_cols = ['CRIM', 'ZN', 'DIS', 'TAX']
sqrt_transform_cols = ['AGE', 'LSTAT']
winsorize_transform_cols = ['RM', 'NOX', 'PTRATIO', 'RAD', 'B']

## transformation for original df

In [163]:
df_transformed = df.copy()

In [164]:

# Apply Log Transformation
df_transformed = log_transform(df_transformed, log_transform_cols)

# Apply Square Root Transformation
df_transformed = sqrt_transform(df_transformed, sqrt_transform_cols)

# Apply Winsorization
df_transformed = winsorize_transform(df_transformed, winsorize_transform_cols)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [165]:
# df_transformed.to_csv('../data/transformed_original.csv', index=True)

## transformation for correlated df

In [166]:
df_transformed_correlated = df_correlated.copy()

In [167]:


# Apply Log Transformation
df_transformed_correlated = log_transform(df_transformed_correlated, log_transform_cols)

# Apply Square Root Transformation
df_transformed_correlated = sqrt_transform(df_transformed_correlated, sqrt_transform_cols)

# Apply Winsorization
df_transformed_correlated = winsorize_transform(df_transformed_correlated, winsorize_transform_cols)

In [168]:
# df_transformed_correlated.to_csv('../data/transformed_correlated.csv', index=True)

## transformation for df with no outlier 

In [169]:
df_transformed_removed_outlier = df_removed_outlier.copy()

In [170]:
# Apply Log Transformation
df_transformed_removed_outlier = log_transform(df_transformed_removed_outlier, log_transform_cols)

# Apply Square Root Transformation
df_transformed_removed_outlier = sqrt_transform(df_transformed_removed_outlier, sqrt_transform_cols)

# Apply Winsorization
df_transformed_removed_outlier = winsorize_transform(df_transformed_removed_outlier, winsorize_transform_cols)

In [171]:
# df_transformed_removed_outlier.to_csv('../data/transformed_removed_outlier.csv', index=True)

# Standarizing the data


## standarization for original df

In [172]:
df_scaled = df.copy()

In [173]:
col_to_standarized = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
scaler = StandardScaler()
df_scaled[col_to_standarized] = scaler.fit_transform(df_scaled[col_to_standarized])

In [174]:
# df_scaled.to_csv('../data/scaled_original.csv', index=True)

## standarization for correlated df

In [175]:
df_scaled_correlated = df_correlated.copy()

In [176]:
col_to_standarized = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
scaler = StandardScaler()
df_scaled_correlated[col_to_standarized] = scaler.fit_transform(df_scaled_correlated[col_to_standarized])

In [177]:
# df_scaled_correlated.to_csv('../data/scaled_correlated.csv', index=True)

## Standarization for no_outlier df

In [178]:
df_scaled_removed_outlier = df_removed_outlier.copy()

In [179]:
col_to_standarized = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
scaler = StandardScaler()
df_scaled_removed_outlier[col_to_standarized] = scaler.fit_transform(df_scaled_removed_outlier[col_to_standarized])

In [180]:
# df_scaled_removed_outlier.to_csv('../data/scaled_no_outlier.csv', index=True)