# In [1]:
# Author : Daniel Parada

# Import required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from matplotlib import pyplot

# We create the EDA class
class df_EDA(object):
    """Run a quick exploratory data analysis on a pandas data frame.

    Each method prints a summary or draws a plot; none of them returns a
    value.

    Attributes:
        df: pandas.DataFrame being explored.
    """

    def __init__(self, dat):
        """Store the data frame that the other methods will inspect.

        Args:
            dat: pandas.DataFrame to analyse.

        Raises:
            TypeError: if dat is not a pandas.DataFrame.
        """

        # Fail at construction time: silently skipping the assignment would
        # leave the instance without a `df` attribute and surface later as an
        # unrelated AttributeError.
        if not isinstance(dat, pd.DataFrame):
            raise TypeError(
                "dat must be a pandas.DataFrame, got {}".format(type(dat).__name__)
            )
        self.df = dat

    def df_null(self):
        """Print whether the data frame contains any null value."""

        if self.df.isnull().values.any():
            print("There are null values in the data. \n")
        else:
            print("No null values. \n")

    def df_vis(self):
        """Print a basic overview of the data.

        Prints the first 10 entries, the shape and the column names of the
        data frame.
        """

        print("First 10 entries :")
        print(self.df.head(10))

        print("Shape of the data frame : ")
        print(self.df.shape)

        print("Feature names :")
        print(self.df.columns)

    def df_stats(self):
        """Print descriptive statistics for every column of the data frame."""

        print("Basic stats of the features : \n")
        # include='all' also summarises the non-numeric columns
        print(self.df.describe(include='all'))

    def df_dtypes(self):
        """Print the type of data stored in each column."""

        print(self.df.dtypes)

    def df_pairplot(self):
        """Draw a seaborn pairplot of the numeric features.

        Shows the pairwise relationships between the numeric features, with a
        kernel density estimate of each feature along the diagonal. An 'Id'
        column (presumably a row identifier -- confirm against the data set)
        is left out of the plot when present. When no numeric feature is left
        to plot, a message is printed instead.
        """

        numeric = self.df.select_dtypes(include='number')
        # errors='ignore' keeps this usable on frames without an 'Id' column
        numeric = numeric.drop(columns='Id', errors='ignore')

        if numeric.shape[1] == 0:
            print("No numeric features to plot. \n")
            return

        sns.pairplot(numeric, diag_kind="kde")

    def df_corr(self):
        """Draw a heatmap of the correlation among the numeric features.

        Prints the shape of the numeric subset first. When the data frame has
        no numeric feature, a message is printed and no figure is created.
        """

        # Restrict to numeric columns explicitly so the printed shape matches
        # what goes into the correlation matrix
        numeric = self.df.select_dtypes(include='number')
        print(numeric.shape)

        if numeric.shape[1] == 0:
            print("No numeric features to correlate. \n")
            return

        feat_corr = numeric.corr()

        # Figure large enough for the annotations of all the variables
        pyplot.figure(figsize=(12, 12))  # width and height in inches
        sns.heatmap(
            feat_corr,
            cbar=True,
            square=True,
            annot=True,
            annot_kws={'size': 10},
            cmap='coolwarm',
        )

    def df_eda(self):
        """Run every exploratory data analysis method of the class in turn."""

        self.df_null()
        self.df_vis()
        self.df_stats()
        self.df_dtypes()
        self.df_pairplot()
        self.df_corr()
    