### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import datetime

### Load Dataset

In [2]:
# %time

start_time = datetime.datetime.now()

# normal load
df = pd.read_csv('Datasets/hmeq.csv')

# load in chunks (when dataset is too large)
tp = pd.read_csv('Datasets/hmeq.csv',low_memory=False,iterator=True,chunksize=100)
df = pd.concat(tp,ignore_index = True)

end_time = datetime.datetime.now()

time_taken = (end_time - start_time).total_seconds()

print(time_taken,"s")

0.152769 s


### Remove "," for Numeric Columns

In [4]:
def numeric_remove_comma(df):
    if df.dtypes == 'int64' or df.dtypes =='float64':
        pd.to_numeric(df.replace(',',''))

df.apply(numeric_remove_comma)

0.001001 s


### Columns Info

In [6]:
# Get Column EDA and store in dict
dtypes_dict = {}
shortest_dict = {}
longest_dict = {}
isnull_dict = {}
unique_dict = {}
memory_dict = {}

for column in df.columns:
    dtypes_dict[column] = df[column].dtype
    shortest_dict[column] = df[column].astype(str).str.len().min()
    longest_dict[column] = df[column].astype(str).str.len().max()
    isnull_dict[column] = df[column].isnull().sum()
    unique_dict[column] = df[column].nunique()
    memory_dict[column] = df[column].memory_usage()

columns_info = ['data_types','shortest_length','longest_length','null_values','unique_values','memory_usage','count','mean','std','min','25%','median','75%','max']

df1 = pd.DataFrame(dtypes_dict ,index=[0]).T
df2 = pd.DataFrame(shortest_dict ,index=[0]).T
df3 = pd.DataFrame(longest_dict,index=[0]).T
df4 = pd.DataFrame(isnull_dict,index=[0]).T
df5 = pd.DataFrame(unique_dict,index=[0]).T
df6 = pd.DataFrame(memory_dict,index=[0]).T

desc = df.describe().T

# Output to csv
pd.concat([df1,df2,df3,df4,df5,df6,desc],axis=1).to_csv('output.csv',header = columns_info)

### Remove Duplicated Value Columns

In [None]:
# remove duplicated value columns
df = df.loc[:~df.T.duplicated(keep='first')]

### Check One Hot Encoded Columns

In [None]:
for col in df.columns:
    if len(df[col].unique()) ==2 and set(df[col].unique().flatten()) == {0,1}:
        print(col)