In [1]:
import json
from typing import Tuple

import pandas as pd
from pandas_profiling import ProfileReport

def createDataFrame(name: str) -> pd.DataFrame:
    """Load one of the sample datasets into a DataFrame.

    Args:
        name: Dataset name without extension; the file read is
            ``sample-dataset/<name>.csv`` relative to the working directory.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    csv_path = f"sample-dataset/{name}.csv"
    return pd.read_csv(csv_path)

def duplicateCount(df: pd.DataFrame) -> Tuple[int, float]:
    """Count rows that are exact repeats of another row.

    Args:
        df: Frame to inspect.

    Returns:
        ``(n_duplicates, p_duplicates)`` where ``n_duplicates`` is the number
        of rows removed by ``df.drop_duplicates()`` and ``p_duplicates`` is
        that count divided by the total number of rows. An empty frame yields
        ``(0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    n_rows = len(df)
    if n_rows == 0:
        # No rows means no duplicates; also avoids dividing by zero below.
        return (0, 0.0)
    n_unique = len(df.drop_duplicates())
    n_duplicates = n_rows - n_unique
    return (n_duplicates, n_duplicates / n_rows)

def missingCount(df: pd.DataFrame) -> Tuple[int, float]:
    """Count missing cells and their share of all cells.

    Args:
        df: Frame to inspect.

    Returns:
        ``(n_cells_missing, p_cells_missing)``: the number of null cells in
        the whole frame and that number divided by the total cell count
        (rows x columns). A frame with no cells yields ``(0, 0.0)``.
    """
    na_count = int(df.isnull().sum().sum())
    # The result is stored as a per-cell proportion, so the denominator must
    # be the number of cells; dividing by the row count alone lets the ratio
    # exceed 1 as soon as a row has more than one missing value.
    n_cells = df.size
    na_p = na_count / n_cells if n_cells else 0.0
    return (na_count, na_p)

def dfSize(df: pd.DataFrame) -> Tuple[int, float]:
    """Report the memory footprint of a frame.

    Args:
        df: Frame to measure.

    Returns:
        ``(memory_size, record_size)``: the sum of ``df.memory_usage()``
        (called with its default arguments) as an ``int``, and that total
        divided by the number of rows. ``record_size`` is ``0.0`` for a frame
        without rows instead of raising ``ZeroDivisionError``.
    """
    tot_size = int(df.memory_usage().sum())
    n_rows = len(df)
    avg_size = tot_size / n_rows if n_rows else 0.0
    return (tot_size, avg_size)


In [7]:
# Profile a capped random sample of the dataset and store whole-table
# statistics next to the profiler's report.
SAMPLE_CAP = 10000   # maximum number of rows handed to the profiler
SAMPLE_SEED = 42     # fixed seed so re-running the cell profiles the same rows

df = createDataFrame("iris")
# Chained (not in-place) so re-running the cell always rebuilds `sample` from `df`.
sample = (
    df.sample(n=min(SAMPLE_CAP, len(df)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

prof = ProfileReport(sample, title="iris profile")
prof.to_file(output_file="iris_prof.json")

# These figures are computed on the full frame `df`, not on `sample`.
n_cells_missing, p_cells_missing = missingCount(df)
n_duplicates, p_duplicates = duplicateCount(df)
memory_size, record_size = dfSize(df)
dfDict = {
    'n': len(df),
    'n_var': len(df.columns),
    'n_cells_missing': n_cells_missing,
    'p_cells_missing': p_cells_missing,
    'n_duplicates': n_duplicates,
    'p_duplicates': p_duplicates,
    'memory_size': memory_size,
    'record_size': record_size,
}
with open('full_dataframe.json', 'w') as f:
    json.dump(obj={'full_table': dfDict}, fp=f)


Summarize dataset:   0%|          | 0/18 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Total bytes pandas reports for `df` (memory_usage with default arguments);
# this is the same expression dfSize() uses for its first return value.
df.memory_usage().sum()

6128

In [None]:
# Merge the profiler's JSON report with the full-table statistics file.
# The merged dict is named `profile_vars` rather than `vars` so the builtin
# `vars()` is not shadowed for the rest of the kernel session.
with open('iris_prof.json') as profile_fp:
    profile_vars = json.load(profile_fp)
with open('full_dataframe.json') as full_table_fp:
    vars_full = json.load(full_table_fp)

# Top-level keys of the full-table file are added to (or overwrite) the report's keys.
profile_vars.update(vars_full)
print(profile_vars)

In [25]:
def human_readable_size(size: float, decimal_places: int = 2) -> str:
    """Format a byte count using binary (1024-based) units.

    Args:
        size: Number of bytes.
        decimal_places: Digits shown after the decimal point.

    Returns:
        The scaled value followed directly by its unit, e.g. ``"5.98KiB"``.
        Values of 1024 TiB or more stay expressed in TiB, because that is the
        largest unit available.
    """
    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB')
    unit_index = 0
    # Stop scaling at the last unit; dividing once more after 'TiB' would
    # under-report the value by a factor of 1024.
    while size >= 1024.0 and unit_index < len(units) - 1:
        size /= 1024.0
        unit_index += 1
    return f"{size:.{decimal_places}f}{units[unit_index]}"

# Spot-check the formatter on a KiB-scale, a sub-KiB fractional and a GiB-scale value.
for n_bytes in (6128, 40.85333333333333, 36507222016):
    print(human_readable_size(n_bytes))


5.98KiB
40.85B
34.00GiB


In [37]:
# Re-draw the sample (seeded, so the displayed rows are stable across runs)
# and remove the row labelled 1.
# NOTE: DataFrame.drop raises KeyError when label 1 is not among the sampled
# rows, which can happen once `df` has more than 10000 rows.
sample = df.sample(n=min(10000, len(df)), random_state=42).drop(1)
# Select every column of `df` by label from the sampled rows.
x = sample.loc[:, df.columns]
x

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
52,6.9,3.1,4.9,1.5,versicolor
119,6.0,2.2,5.0,1.5,virginica
50,7.0,3.2,4.7,1.4,versicolor
40,5.0,3.5,1.3,0.3,setosa
108,6.7,2.5,5.8,1.8,virginica
...,...,...,...,...,...
76,6.8,2.8,4.8,1.4,versicolor
81,5.5,2.4,3.7,1.0,versicolor
123,6.3,2.7,4.9,1.8,virginica
77,6.7,3.0,5.0,1.7,versicolor


In [38]:
# Show the full dataset returned by createDataFrame("iris") for comparison with the sample.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [33]:
# Column labels of the loaded dataset. `df` is used here because no cell in
# this notebook assigns a variable named `y`, so referring to it fails on a
# fresh kernel.
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [34]:
# Write the profile built earlier (`prof`) to "iris_prof.html"; requires the
# cell that creates `prof` to have been run first.
prof.to_file(output_file="iris_prof.html")

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Show the current `sample` frame. NOTE(review): `sample` is assigned in two
# different cells, so what is displayed depends on which of them ran last.
sample

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,7.2,3.6,6.1,2.5,virginica
2,4.9,3.6,1.4,0.1,setosa
3,5.2,4.1,1.5,0.1,setosa
4,5.0,3.2,1.2,0.2,setosa
5,5.8,2.7,3.9,1.2,versicolor
...,...,...,...,...,...
145,5.1,3.5,1.4,0.2,setosa
146,5.0,3.4,1.6,0.4,setosa
147,6.3,2.5,5.0,1.9,virginica
148,5.4,3.4,1.7,0.2,setosa


In [2]:
# Record the pandas version this notebook was executed with.
pd.__version__

'1.2.5'