In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.options.display.max_columns = None

In [None]:
# Load data/train.csv (path is relative to the notebook's working directory).
df = pd.read_csv('data/train.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Disabled hold-out split, kept for reference only; the visible cells never use it.
# NOTE(review): as written, train_test_split receives a single array (df) and
# would return two frames, so unpacking into four names would fail if re-enabled.
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(df, test_size = 0.1, random_state = 0)

### Missing values in categorical features

In [None]:
# Object-dtype (categorical) columns that contain at least one missing value.
# `.any()` is used instead of `sum() > 1` so that a column with exactly one NaN
# is still selected and gets imputed by the fill step that consumes this list.
features_nan = [feature for feature in df.columns if df[feature].isnull().any() and df[feature].dtypes == 'O']

In [None]:
# Report how much of each categorical column is missing.
# isnull().mean() is a fraction in [0, 1]; scale it by 100 so the number agrees
# with the '% null value' label printed next to it.
for feature in features_nan:
    print(feature, np.round(df[feature].isnull().mean() * 100, 3), '% null value')

In [None]:
# Give missing categorical entries their own explicit 'Missing' label.
filled_categoricals = df[features_nan].fillna('Missing')
df[features_nan] = filled_categoricals

In [None]:
# Re-check the same columns after the 'Missing' fill; each should now report 0.0.
# The fraction from isnull().mean() is scaled by 100 so it is a true percentage,
# matching the '% null value' label.
for feature in features_nan:
    print(feature, np.round(df[feature].isnull().mean() * 100, 3), '% null value')

### Missing values in numerical features

In [None]:
# Non-object (numeric) columns that contain at least one missing value.
# `.any()` replaces `sum() > 1`, which skipped columns with exactly one NaN and
# left them un-imputed.
numerical_features_nan = [feature for feature in df.columns if df[feature].isnull().any() and df[feature].dtypes != 'O']

numerical_features_nan

In [None]:
# For each numeric column with gaps: record where values were missing in a
# 0/1 '<column>_nan' indicator, then fill the gaps with the column median.
for column in numerical_features_nan:
    was_missing = df[column].isnull()
    fill_value = df[column].median()
    df[column + '_nan'] = np.where(was_missing, 1, 0)
    df[column] = df[column].fillna(fill_value)
    

In [None]:
# Sanity check: remaining NaN count per imputed numeric column (expected to be zero after the median fill).
df[numerical_features_nan].isnull().sum()

In [None]:
# Inspect the frame after imputation; it now also carries the '<feature>_nan' indicator columns.
df

### Temporal Variables

In [None]:
# Columns whose name contains 'Yr' or 'Year'.
# Note: a '<feature>_nan' indicator column whose name contains either substring is matched as well.
year_features = [column for column in df.columns if any(token in column for token in ('Yr', 'Year'))]
year_features

In [None]:
# Replace each calendar year with the number of years between it and the sale year.
# Caution: this overwrites the columns in place, so running the cell twice
# restores the original years instead of keeping the differences.
sale_year = df['YrSold']
for column in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    df[column] = sale_year - df[column]

In [None]:
# YearBuilt, YearRemodAdd and GarageYrBlt now hold YrSold minus their original value; the other matched columns are unchanged.
df[year_features]

### Transformations

In [None]:
# Natural-log transform of selected numeric columns, including the target SalePrice.
# The columns are overwritten in place, so the cell must not be run twice.
num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']
df[num_features] = np.log(df[num_features])

### Handling Rare Categories
We will group categorical values that occur in 1% or less of the rows into a single `rare_var` category.

In [None]:
# Names of all object-dtype (categorical) columns.
cat_features = [column for column, dtype in df.dtypes.items() if dtype == 'O']

In [None]:
# Display the list of object-dtype column names.
cat_features

In [None]:
# Collapse infrequent labels: within each categorical column, any value whose
# share of all rows is not above 0.01 is replaced by the single label 'rare_var'.
row_count = len(df)
for column in cat_features:
    label_share = df.groupby(column)['SalePrice'].count() / row_count
    frequent_labels = label_share[label_share > 0.01].index
    is_frequent = df[column].isin(frequent_labels)
    df[column] = np.where(is_frequent, df[column], 'rare_var')

In [None]:
# Preview the frame after infrequent labels were replaced with 'rare_var'.
df.head()

In [None]:
# Number of distinct labels left in each categorical column (NaN, if any, counts as a label).
for column in cat_features:
    print(column, df[column].nunique(dropna = False))

In [None]:
# One-hot encoding is left disabled at this point; the same get_dummies call
# is executed later in the notebook, after the feature-selection sections.
# df = pd.get_dummies(df, drop_first = True)
# df.head()

In [None]:
# (rows, columns) of the frame at this stage.
df.shape

In [None]:
# Show the frame as it stands before the variance-based selection.
df

### Variance Threshold

In [None]:
# Rebuild num_features as every non-object column (this replaces the short
# list used for the log transform), then display those columns.
num_features = [column for column, dtype in df.dtypes.items() if dtype != 'O']
df[num_features]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Numeric columns to scale: everything except the row identifier and the target.
features_scale = [column for column in num_features if column != 'Id' and column != 'SalePrice']

In [None]:
# Numeric predictors only: a new frame without the identifier and the target.
data = df[num_features].drop(columns = ['Id', 'SalePrice'])
data

In [None]:
# Fit a min-max scaler on the numeric predictors and rebuild `data` from the
# scaled values, so that per-column variances become comparable.
scaler = MinMaxScaler().fit(data)
scaled_values = scaler.transform(df[features_scale])
data = pd.DataFrame(scaled_values, columns = features_scale)

In [None]:
# Bar chart of the variance of every scaled feature.
fig, ax = plt.subplots()
ax.bar(data.columns, data.var())
plt.show()

In [None]:
# Per-column variance of the min-max scaled features (the numbers behind the bar chart).
data.var()

In [None]:
# Show the scaled feature frame.
data

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance-based selector on the scaled features; 0.01 is the variance
# cutoff handed to VarianceThreshold. The fit call is the cell's last
# expression, so the fitted estimator is also displayed.
var_thres = VarianceThreshold(threshold = 0.01)
var_thres.fit(data)

In [None]:
# Boolean mask over data's columns: True where the selector keeps the column.
var_thres.get_support()

In [None]:
# Columns the selector rejects, i.e. those where the support mask is False.
low_variance_cols = list(data.columns[~var_thres.get_support()])
low_variance_cols

In [None]:
# Variance of the target column (SalePrice was log-transformed earlier in the notebook).
df['SalePrice'].var()

In [None]:
# Dropping the low-variance columns is left disabled, so they remain in df.
# df = df.drop(low_variance_cols, axis = 1)

In [None]:
# Show df; the variance analysis did not modify it because the drop is commented out.
df

## Selection with correlation

In [None]:
# Separate the target from the predictors for the correlation analysis.
y = df["SalePrice"]
X = df.drop(columns = ["SalePrice"])

In [None]:
# Pairwise correlation between the predictors, drawn as an annotated heatmap.
# X still contains object (string) columns here because one-hot encoding only
# happens later. pandas >= 2.0 raises on such columns instead of silently
# skipping them, so the frame is restricted to numeric columns explicitly.
plt.figure(figsize = (40, 40))
cor = X.select_dtypes(include = 'number').corr()
sns.heatmap(cor, annot = True, cmap = plt.cm.gray)
plt.show()

In [None]:
# Collect one feature from every highly correlated pair as a drop candidate.
# Only the lower triangle (j < i) is scanned: the matrix is symmetric, and
# scanning all of it would flag BOTH members of each pair, so dropping the
# set would remove the shared information entirely instead of de-duplicating it.
# abs() is used so that strong negative correlation counts as redundancy too.
threshold = 0.87
corrilated_features = set()
for i in range(cor.shape[0]):
    for j in range(i):
        if abs(cor.iloc[i, j]) > threshold:
            # Keep the earlier column (j) and flag the later one (i).
            corrilated_features.add(cor.columns[i])

corrilated_features

## Mutual Info Regression

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Numeric predictors (identifier and target removed) and the target for mutual information.
y = df['SalePrice']
X = df[num_features].drop(columns = ['SalePrice', 'Id'])

In [None]:
# Estimate the mutual information between every numeric predictor and the target.
# random_state is fixed so the scores (and the ranking built from them) are
# identical on every run of the notebook.
mutual_info = mutual_info_regression(X, y, random_state = 0)

In [None]:
# Raw mutual-information scores returned for the columns of X.
mutual_info

In [None]:
# Label each score with its feature name and list the features from most to least informative.
mutual_info = pd.Series(mutual_info, index = X.columns)
mutual_info.sort_values(ascending = False)

In [None]:
# Bar chart of the mutual-information scores in descending order.
mutual_info.sort_values(ascending = False).plot(kind = 'bar', figsize = (15, 5))

In [None]:
# One-hot encode the remaining object columns; drop_first = True omits the first level of each encoded column.
# df is reassigned here, so from this point on it holds the encoded frame.
df = pd.get_dummies(df, drop_first = True)
df.head()

### Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Every column of the encoded frame except the identifier and the target gets scaled.
features_scale = [column for column in df.columns if column != 'Id' and column != 'SalePrice']

# Fit a fresh min-max scaler on those columns (the fitted scaler is the cell output).
scaler = MinMaxScaler()
scaler.fit(df[features_scale])

In [None]:
# Apply the fitted scaler and wrap the result in a frame with the original column names.
scaled_values = scaler.transform(df[features_scale])
data = pd.DataFrame(scaled_values, columns = features_scale)

In [None]:
# Show the scaled feature frame that will be used for training.
data

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model created with its default settings.
reg = LinearRegression()

In [None]:
# Training inputs are the scaled features; the target is the SalePrice column of df.
X_train, y_train = data, df["SalePrice"]

In [None]:
# Fit on all rows of the scaled features against SalePrice (log-transformed earlier);
# no hold-out set is created in the visible cells, so nothing here measures generalisation.
reg.fit(X_train, y_train)