# Exploratory data analysis (EDA)

## Statistics report table

In [1]:
import os, math, subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# some settings for displaying Pandas results
# pd.set_option('display.width', 2000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.precision', 4)
# pd.set_option('display.max_colwidth', -1)

In [None]:
class ExplorerDF:
    """Build per-column statistic tables for DataFrames and export them to Excel.

    All public methods are static, so they can be called either on the class
    (``ExplorerDF.exploring(df)``) or on an instance.
    """

    def __init__(self,):
        pass

    @staticmethod
    def _count_numeric_cells(dataframe, predicate):
        """Count, per column, the int/float cells for which ``predicate`` holds.

        Cells that are not ``int`` or ``float`` instances never count.
        """
        def _check(x):
            return predicate(x) if isinstance(x, (int, float)) else False

        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1. The
        # method is looked up on the class so that a column literally named
        # "map" cannot shadow it through attribute access.
        elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
        return elementwise(dataframe, _check).sum()

    @staticmethod
    def exploring(dataframe, nsample=5, targetY = None):
        """Return a table with one row per column of ``dataframe``.

        The table holds, in this order: dtype, distinct count/percentage,
        missing count/percentage, coverage percentage, negative and zero
        count/percentage, optional correlation with ``targetY``, the
        ``describe()`` statistics and a few sampled rows.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Data to summarise.
        nsample : int, default 5
            Number of random rows shown as ``sample_1`` ... ``sample_k``.
            Capped at the number of rows so small frames do not raise.
        targetY : pandas.Series, optional
            When given, a ``corr_Y`` column with the correlation between each
            numeric column and ``targetY`` is added.

        Returns
        -------
        pandas.DataFrame
            Indexed by the column names of ``dataframe``.

        Raises
        ------
        ValueError
            If ``targetY`` does not have one value per row of ``dataframe``.
        """
        # check rows, cols
        total_records, total_columns = dataframe.shape
        print(f"Total {total_records} records, {total_columns} columns")

        # check dtypes
        dty = dataframe.dtypes.rename("sub_type")

        # check distinct
        n_dist = dataframe.nunique().rename("n_distinct")
        pct_dist = (100*n_dist/total_records).round(2).rename("pct_distinct")

        # check missing
        n_miss = dataframe.isna().sum().rename("n_miss")
        pct_miss = (100*n_miss/total_records).round(2).rename("pct_miss")
        pct_coverage = (100 - pct_miss).rename("pct_coverage")

        # check negative
        n_neg = ExplorerDF._count_numeric_cells(
            dataframe, lambda x: x < 0).rename("n_negative")
        pct_neg = (100*n_neg/total_records).round(2).rename("pct_negative")

        # check zero
        n_zero = ExplorerDF._count_numeric_cells(
            dataframe, lambda x: x == 0).rename("n_zero")
        pct_zero = (100*n_zero/total_records).round(2).rename("pct_zero")

        parts = [dty, n_dist, pct_dist, n_miss, pct_miss, pct_coverage,
                 n_neg, pct_neg, n_zero, pct_zero]

        # check correlation with targetY
        if targetY is not None:
            # Raise explicitly: an assert would be stripped under `python -O`.
            if total_records != targetY.shape[0]:
                raise ValueError(
                    f"targetY has {targetY.shape[0]} values but dataframe has "
                    f"{total_records} records")
            corr_Y = dataframe.corrwith(targetY, numeric_only = True)\
                .reindex(dataframe.columns).rename('corr_Y')
            parts.append(corr_Y)

        # check description; columns that describe() skips are filled with 0
        des_stat = dataframe.describe().transpose().reindex(dataframe.columns).fillna(0)

        # take samples; sample() raises if asked for more rows than exist
        n_take = min(nsample, total_records)
        pdf_sample = dataframe.sample(n=n_take).transpose()
        pdf_sample.columns = ["sample_{}".format(i+1) for i in range(n_take)]

        # output
        parts.extend([des_stat, pdf_sample])
        return pd.concat(parts, axis=1)

    @staticmethod
    def export_statistic_report(list_data, output_folder = None):
        """Write one statistic sheet per dataset to ``data_statistic_report.xlsx``.

        Parameters
        ----------
        list_data : list
            DataFrames and/or paths to CSV files to explore. Elements of any
            other type, and paths that do not exist, are reported on stdout
            and skipped.
        output_folder : str, optional
            Folder for the workbook; the current directory when omitted.
        """
        report_path = 'data_statistic_report.xlsx'
        if output_folder is not None:
            report_path = os.path.join(output_folder, report_path)

        with pd.ExcelWriter(report_path) as writer:
            for i, ele in enumerate(list_data):
                if isinstance(ele, str):
                    # Skip missing files instead of falling through with
                    # `name`/`df` undefined or left over from the last element.
                    if not os.path.exists(ele):
                        print('Error file not found for element', i)
                        continue
                    name = os.path.splitext(os.path.basename(ele))[0]
                    df = pd.read_csv(ele)
                elif isinstance(ele, pd.DataFrame):
                    name = f'DataFrame_{i}'
                    df = ele
                else:
                    print('Error type of element', i)
                    continue
                print(name, end = ": ")
                rp = ExplorerDF.exploring(df)
                # Excel limits worksheet names to 31 characters.
                rp.reset_index().to_excel(writer, sheet_name=name[:31], index = False)

## Dataprep report

### create_report

In [None]:
from dataprep.eda import create_report

In [None]:
from dataprep.eda import create_report
# `df` is not defined in this cell; it must already exist in the session.
# create_report(df.convert_dtypes())
# view the report
create_report(df.convert_dtypes()).show_browser()

In [None]:
# Use a dask DataFrame for large data.
# NOTE(review): `df_dask` is not defined anywhere in the visible notebook;
# it has to be created before this cell can run.
from dataprep.eda import create_report
create_report(df_dask).show_browser()

In [None]:
import os
from urllib.parse import quote_plus

# Empty, typed frame passed to dd.read_sql as `meta` (column name -> dtype).
df_meta = pd.DataFrame({c: pd.Series(dtype=t) for c, t in 
                        [('report_date',int),('brandname',str), ('month',str) ,('template_id',str) ,('tenkh',str) ,
                         ('makh',str), ('kythongbao',str) ,('som3', float),('sotien', float),('diachi', str),]})

# The database password must not live in the notebook: read the credentials
# from the environment (ORACLE_PASSWORD is required, ORACLE_USER is optional).
# quote_plus() keeps special characters in them from breaking the URL.
oracle_user = os.environ.get('ORACLE_USER', 'score')
oracle_password = os.environ['ORACLE_PASSWORD']
conn_Str = (f'oracle+cx_oracle://{quote_plus(oracle_user)}:{quote_plus(oracle_password)}'
            '@192.168.18.32:1521/?service_name=score')

# NOTE(review): `dd` is never imported in the visible notebook (presumably
# `import dask.dataframe as dd`); confirm it is in scope before running.
df_oracle = dd.read_sql('BRANDNAME_WATER_BILL_OVERDUE', conn_Str ,index_col= 'user_id', meta = df_meta).reset_index()

In [None]:
# Echo `df_oracle` as the cell output.
df_oracle

In [None]:
# Build the dataprep report for the Oracle-backed frame and show it in the browser.
create_report(df_oracle).show_browser()

In [None]:
import os
from urllib.parse import quote_plus

import sqlalchemy as sa

# The database password must not live in the notebook: read the credentials
# from the environment (ORACLE_PASSWORD is required, ORACLE_USER is optional).
# quote_plus() keeps special characters in them from breaking the URL.
oracle_user = os.environ.get('ORACLE_USER', 'score')
oracle_password = os.environ['ORACLE_PASSWORD']
sqluri = (f'oracle+cx_oracle://{quote_plus(oracle_user)}:{quote_plus(oracle_password)}'
          '@192.168.18.32:1521/?service_name=score')
engine = sa.create_engine(sqluri)
# Empty, typed frame passed to dask as `meta` for the three selected columns.
df_meta = pd.DataFrame({c: pd.Series(dtype=t) for c, t in [('month',str) , ('makh',str),('sotien', float)]})
sa_meta = sa.MetaData()
# NOTE(review): `autoload=True` and the list form `sa.select([...])` are
# legacy SQLAlchemy spellings; confirm they match the installed version.
sa_table = sa.Table("BRANDNAME_WATER_BILL_OVERDUE", sa_meta, autoload=True, autoload_with=engine)
# Only rows whose `month` column equals "202008".
sa_query = sa.select([sa_table]).where(sa_table.c.month == "202008")
sa_columns = [sa_table.c.month, sa_table.c.makh, sa_table.c.sotien]


# NOTE(review): `dd` is never imported in the visible notebook, and a query
# object (not a table name) is passed to read_sql_table; verify against the
# installed dask version.
df_oracle = dd.read_sql_table(sa_query, sqluri, index_col="user_id", columns=sa_columns, meta = df_meta).reset_index()
create_report(df_oracle).show_browser()

In [None]:
import dataprep.connector as cx
# Load at most 1,000,000 rows of DTTSD_TELCO_INFO from the local SQLite file.
# return_type='dask' is passed to the connector. This binds `df`, which the
# plotting cells further down use.
conn_str = "sqlite:///D:/INFO.db"
df = cx.read_sql(conn_str,'select * from DTTSD_TELCO_INFO limit 1000000',return_type = 'dask')

### plot() - distributions and statistics
The function plot() explores the distributions and statistics of the dataset. 

In [None]:
from dataprep import eda

In [None]:
# plot(df): plots the distribution of each column
eda.plot(df)

In [None]:
# plot(df, x): plots the distribution of column x in various ways and calculates column statistics
# (here x is the 'GENDER' column)
eda.plot(df, 'GENDER')

In [None]:
# Parse ACTIVE_DATE as YYYY-MM-DD dates before plotting its distribution.
# `infer_datetime_format` was dropped: it is redundant once an explicit
# `format` is given, and it is deprecated in pandas 2.x.
df['ACTIVE_DATE'] = pd.to_datetime(df['ACTIVE_DATE'], format='%Y-%m-%d')
eda.plot(df, 'ACTIVE_DATE')

In [None]:
# plot(df, x, y): generates plots depicting the relationship between columns x and y
# (here x is 'ACTIVE_DATE' and y is 'BILL_TOTAL')
eda.plot(df, 'ACTIVE_DATE','BILL_TOTAL')

In [None]:
# plot(df, x, y) again, this time for 'GENDER' against 'BILL_TOTAL'.
eda.plot(df, 'GENDER', 'BILL_TOTAL')

### Analyze correlations with plot_correlation()

In [None]:
# plot_correlation(df): correlation analysis over the whole frame.
eda.plot_correlation(df)

In [None]:
# plot_correlation(df, x): correlation analysis focused on the 'LL_THOAI' column.
eda.plot_correlation(df, 'LL_THOAI',)

In [None]:
# plot_correlation(df, x, y) for 'BILL_TOTAL' and 'LL_THOAI'; the config dict
# sets 'scatter.sample_size' to 1000 and the plot height/width to 400.
eda.plot_correlation(df, 'BILL_TOTAL', 'LL_THOAI',config ={'scatter.sample_size': 1000, 'height': 400, 'width': 400,} )

### Analyze missing values with plot_missing()

In [None]:
# plot_missing(df): missing-value analysis over the whole frame.
eda.plot_missing(df)

In [None]:
# plot_missing(df, x): missing-value analysis focused on the 'GENDER' column.
eda.plot_missing(df, 'GENDER')

In [None]:
# plot_missing(df, x, y): missing-value analysis for 'GENDER' together with 'BILL_TOTAL'.
eda.plot_missing(df, 'GENDER', 'BILL_TOTAL')

### Analyze difference with plot_diff()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Put 70% of the rows in df_train and the rest in df_test.
# No random_state is passed, so the split differs from run to run.
df_train, df_test = train_test_split(df, train_size = 0.7)

In [None]:
# plot_diff(): compare the train and test splits created above.
eda.plot_diff([df_train, df_test])

### read_sql with ConnectorX 

In [None]:
from dataprep.connector import read_sql

In [None]:
%%time
# Look up a single ID_CARD in VMGLEAD_SYNC through dataprep's read_sql, with
# partition_num=8. The commented-out calls are alternative queries kept for
# reference.
db = r'sqlite:///E:/4. Score/LEAD.db'
# read_sql(db,'select * from VMGLEAD_SYNC where ID_CARD = \'126756472\' ')
read_sql(db,'select * from VMGLEAD_SYNC where ID_CARD = \'030087000004\' ', partition_num = 8)
# read_sql(db,'select * from VMGLEAD_SYNC limit 10 ')

In [None]:
%%time
# Load the VMGLEAD_SYNC rows with RISK_SCORE > 400 into `df` through
# dataprep's read_sql, asking for 8 partitions on the RISK_SCORE column.
db = r'sqlite:///E:/4. Score/LEAD.db'
df = read_sql(db,'select * from VMGLEAD_SYNC where RISK_SCORE > 400 ', partition_num = 8,partition_on="RISK_SCORE")

In [None]:
%%time
import sqlite3
from contextlib import closing

# Same query as the read_sql cell, but through the stdlib driver and pandas.
# sqlite3.connect() expects a filesystem path, not an SQLAlchemy-style
# 'sqlite:///...' URL, so the scheme prefix is dropped here.
# closing() releases the connection as soon as the query has been read.
with closing(sqlite3.connect(r'E:/4. Score/LEAD.db')) as conn:
    df = pd.read_sql_query('select * from VMGLEAD_SYNC where RISK_SCORE > 400 ', conn)

In [None]:
from dataprep.eda import create_db_report

In [None]:
from dataprep.eda import create_db_report
from dataprep.datasets import load_db
# Obtain a database engine for LEAD.db and build a database-level report from it.
# NOTE(review): confirm that load_db accepts a 'sqlite:///...' URL like this one.
db_engine = load_db(r'sqlite:///E:/4. Score/LEAD.db')
create_db_report(db_engine)