# EDA (exploratory data analysis)

Import the libraries

In [None]:
from statistics import quantiles
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

Configure the Kaggle notebook output: library FutureWarnings are ignored and the pandas display options are adjusted.

In [None]:
# Keep FutureWarning messages out of the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# pandas display settings: no limit on the number of columns shown, no fixed
# display width, and floats rendered with three decimals.
for _option in ('display.max_columns', 'display.width'):
    pd.set_option(_option, None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


In [None]:
# Read the train and test splits, stack them into a single frame with a fresh
# 0..n-1 index, then drop the 'Id' column.
train = pd.read_csv(r"/kaggle/input/house-prices-dataset/train.csv")
test = pd.read_csv(r"/kaggle/input/house-prices-dataset/test.csv")
df = pd.concat([train, test], ignore_index=True)
df = df.drop(columns='Id')
df  # last expression of the cell: renders the combined frame in the notebook

In [None]:
def check_df(dataframe):
    """Print a quick structural overview of ``dataframe``.

    Sections printed, in order: shape, column dtypes, first three rows,
    last three rows, per-column null counts, and the
    0/5/50/95/99/100 % quantiles of every column whose dtype is not
    ``object``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        Everything is written to standard output.
    """

    def _report(banner, payload):
        # One banner line followed by the value and a blank line.
        print(banner)
        print(payload, end="\n\n")

    _report("##################### Shape #####################", dataframe.shape)
    _report("##################### Types #####################", dataframe.dtypes)
    _report("##################### Head #####################", dataframe.head(3))
    _report("##################### Tail #####################", dataframe.tail(3))
    _report("##################### NA #####################", dataframe.isnull().sum())

    print("##################### Quantiles #####################")
    levels = [0, 0.05, 0.50, 0.95, 0.99, 1]
    non_object = [name for name in dataframe.columns
                  if dataframe[name].dtype != "object"]
    for name in non_object:
        # Each column's quantile Series is separated by tabs, not newlines.
        print(dataframe[name].quantile(levels), end="\t\t")

# Print the structural overview of the combined frame.
check_df(dataframe=df)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical / numerical groups.

    A column is treated as:

    * *numerical but categorical* when its dtype is not ``object`` and it has
      fewer than ``cat_th`` distinct values;
    * *categorical but cardinal* when its dtype is ``object`` and it has more
      than ``car_th`` distinct values;
    * *categorical* when its dtype is ``object`` and it is not cardinal, or
      when it is numerical-but-categorical;
    * *numerical* when its dtype is not ``object`` and it is not
      numerical-but-categorical.

    Distinct values are counted with ``Series.unique()``, so a missing value
    counts as one distinct value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        Non-object columns with fewer distinct values than this are treated
        as categorical.
    car_th : int, default 20
        Object columns with more distinct values than this are treated as
        cardinal (too many levels to be a useful category).

    Returns
    -------
    cat_cols : list
        Categorical columns (including numerical-but-categorical ones).
    num_cols : list
        Numerical columns.
    cat_but_car : list
        Categorical-but-cardinal columns.
    """
    columns = list(dataframe)
    # Compute dtype flag and cardinality once per column instead of per filter.
    is_object = {col: dataframe[col].dtype == "O" for col in columns}
    n_unique = {col: len(dataframe[col].unique()) for col in columns}

    num_but_cat = [col for col in columns
                   if not is_object[col] and n_unique[col] < cat_th]

    # Cardinal columns are high-cardinality *object* columns; filtering on
    # non-object dtype here would pick up ordinary continuous features and
    # leave the real high-cardinality categoricals inside cat_cols.
    cat_but_car = [col for col in columns
                   if is_object[col] and n_unique[col] > car_th]

    cat_cols = [col for col in columns
                if is_object[col] and col not in cat_but_car]
    cat_cols = cat_cols + num_but_cat

    num_cols = [col for col in columns
                if not is_object[col] and col not in num_but_cat]

    print("observations : ", dataframe.shape[0], end="\n\n")
    print("features : ", dataframe.shape[1], end="\n\n")
    print("Categorical columns : ", len(cat_cols), end="\n\n")
    print("Numerical columns : ", len(num_cols), end="\n\n")
    print("Categorical but cardinal columns : ", len(cat_but_car), end="\n\n")
    print("Numerical but categorical columns : ", len(num_but_cat), end="\n\n")
    print("Categorical columns : ", cat_cols, end="\n\n")
    print("Numerical but categorical columns : ", num_but_cat, end="\n\n")
    print("Cardinal columns : ", cat_but_car, end="\n\n")
    return cat_cols, num_cols, cat_but_car

# Classify the columns of the combined frame using the default thresholds.
cat_cols, num_cols, cat_but_car = grab_col_names(dataframe=df)



## Analysis of categorical variables

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts and percentage share of one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When true, also draw a count plot of the column: vertical bars when
        the column has fewer than 10 distinct values, horizontal bars
        otherwise.

    Returns
    -------
    None
        The summary table is written to standard output.
    """
    counts = dataframe[col_name].value_counts()
    print(pd.DataFrame({col_name: counts,
                        "Ratio": 100 * counts / len(dataframe)}))

    if plot:
        plt.style.use("fivethirtyeight")
        plt.figure(figsize=(12, 9), dpi=80)
        if len(dataframe[col_name].unique()) < 10:
            sns.countplot(x=dataframe[col_name], data=dataframe, palette="pastel")
        else:
            # Many levels: put the categories on the y axis.
            sns.countplot(y=dataframe[col_name], data=dataframe, palette="pastel")
        # Show only after the plot has been drawn; showing first displays an
        # empty figure.
        plt.show(block=True)


# Summarise and plot every categorical column of the combined frame.
for col in cat_cols:
    cat_summary(df, col, plot=True)


## Analysis of numerical variables

In [None]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics for one numerical column.

    The ``describe`` output uses the 5, 10, 20, ..., 90, 95 and 99 %
    percentiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains ``numerical_col``.
    numerical_col : str
        Column to summarise.
    plot : bool, default False
        When true, also draw a 50-bin histogram of the column and print a
        separator line afterwards.

    Returns
    -------
    None
        The statistics are written to standard output.
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50,
                   0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    stats = dataframe[numerical_col].describe(percentiles=percentiles)
    print(stats)

    if not plot:
        return

    plt.style.use("ggplot")
    plt.figure(figsize=(12, 9), dpi=80)
    sns.histplot(data=dataframe, x=numerical_col, bins=50)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)
    print("#####################################", end="\n\n")


# Describe and plot every numerical column of the combined frame.
for col in num_cols:
    num_summary(df, col, plot=True)


## Analysis of target variable

In [None]:
def target_summary_cat_barplot(dataframe, target, categorial_col, plot=False):
    """Print the mean of ``target`` for each level of ``categorial_col``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains both columns.
    target : str
        Column whose mean is computed per group.
    categorial_col : str
        Column to group by.
    plot : bool, default False
        When true, also draw a bar plot of the group means.

    Returns
    -------
    None
        The group means are written to standard output.
    """
    group_means = dataframe.groupby(categorial_col)[target].mean()
    print(group_means)

    if not plot:
        return

    plt.style.use("ggplot")
    plt.figure(figsize=(12, 9), dpi=80)
    # One bar per group: group label on x, mean target on y.
    sns.barplot(x=group_means.index, y=group_means.values)
    plt.xlabel(categorial_col)
    plt.ylabel(target)
    plt.title(target + " by " + categorial_col)
    plt.show(block=True)
# Mean SalePrice per level, with a bar plot, for every categorical column.
for col in cat_cols:
    target_summary_cat_barplot(df, "SalePrice", col, plot=True)


In [None]:
# Absolute pairwise correlations between the numerical columns, computed once
# and reused for the heatmap.
corr_matrix = df[num_cols].corr().abs()
sns.heatmap(corr_matrix)
plt.show(block=True)


In [None]:
def high_corr_heatmap(dataframe, threshold=0.75, plot=False):
    """List columns whose absolute correlation with an earlier column exceeds ``threshold``.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so each pair of columns is considered once and the later
    column of a highly correlated pair is the one reported.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose pairwise column correlations are computed.
    threshold : float, default 0.75
        Absolute-correlation cut-off; values strictly above it count as high.
    plot : bool, default False
        When true, also draw a heatmap of the upper-triangle matrix.

    Returns
    -------
    list
        Column names with at least one upper-triangle value above
        ``threshold``.
    """
    abs_corr = dataframe.corr().abs()

    # Boolean mask that is True strictly above the diagonal.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(np.bool_)
    upper_triangle_matrix = abs_corr.where(upper_mask)

    drop_list = []
    for name in upper_triangle_matrix.columns:
        if (upper_triangle_matrix[name] > threshold).any():
            drop_list.append(name)

    if plot:
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(rc={'figure.figsize': (15, 15)})
        sns.heatmap(upper_triangle_matrix, cmap="RdBu")
        plt.show(block=True)

    return drop_list
# Columns of the numerical subset that are highly correlated with an earlier column.
high_corr_col = high_corr_heatmap(dataframe=df[num_cols], threshold=0.75, plot=True)