# Data Profiling

### Import modules

In [1]:
import pandas
import numpy
import random
import datetime

### Import/Create Data

In [2]:
data = pandas.DataFrame({"colour":["blue","blue","red","orange","red",None,"red"],
                        "number":[12,14,15,12,31,14, None],
                        "name":["daniel","chris","gary","jess", "robert","jan","harry"]})

In [3]:
data.head(3)

Unnamed: 0,colour,number,name
0,blue,12.0,daniel
1,blue,14.0,chris
2,red,15.0,gary


### Helper function to generate data

In [4]:
def generate_dynamic_random_code(code_length):
    
    characters_string = "abcdefghijklmnopqrstuvwxyz1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    
    listchar = list(characters_string)
    
    placeholder_string = "-" * code_length
    
    placeholders = list(placeholder_string)
    
    for index in range(len(placeholders)):
    
        random_index = random.randint(0,len(listchar) - 1)
    
        character = listchar[random_index]
    
        placeholders[index] = character
    
    code = "".join(placeholders)
    
    return code

In [5]:
def generate_float_frame(row_count, column_count, assign_col_names = False):
    
    data = pandas.DataFrame(numpy.random.random((row_count, column_count)))
    
    if assign_col_names == True:
        
        columns = []
        while len(columns) < column_count:
            
            columns.append(generate_dynamic_random_code(8))
            
        data.columns = columns
    
    return data

## Data Preview

### Get Statistical Summary

In [6]:
describe = data.describe(include = "all")
describe_transposed = describe.T
describe_final = describe_transposed.reset_index()

In [7]:
describe_final

Unnamed: 0,index,count,unique,top,freq,mean,std,min,25%,50%,75%,max
0,colour,6,3.0,red,3.0,,,,,,,
1,number,6,,,,16.3333,7.28469,12.0,12.5,14.0,14.75,31.0
2,name,7,7.0,robert,1.0,,,,,,,


### Get Data Types

In [8]:
dtypes = data.dtypes

In [9]:
dtypes_dataframe = pandas.DataFrame(dtypes, columns = ["data_type"])

In [10]:
dtypes_final = dtypes_dataframe.reset_index()

In [11]:
dtypes_final

Unnamed: 0,index,data_type
0,colour,object
1,number,float64
2,name,object


### Get Null composition

In [12]:
filled_count_series = data.notnull().sum().astype(int)
null_count_series = data.isnull().sum().astype(int)
totalcount = data.shape[0]

In [13]:
percent_null =  null_count_series / totalcount
percent_filled = filled_count_series / totalcount

In [14]:
percent_null = percent_null.apply('{0:.2%}'.format)
percent_filled = percent_filled.apply('{0:.2%}'.format)

In [15]:
percent_null = percent_null.reset_index().rename(columns = {0:"null_percent"})
percent_filled = percent_filled.reset_index().rename(columns = {0:"non_null_percent"})
filled_count_series = filled_count_series.reset_index().rename(columns = {0:"non_null_counts"})
null_count_series = null_count_series.reset_index().rename(columns = {0:"null_counts"})

In [16]:
from functools import reduce

In [17]:
null_info_dataframe = reduce(lambda left,right: pandas.merge(left,right,on='index'), [percent_null,
                                                                                 percent_filled,
                                                                                 filled_count_series,
                                                                                 null_count_series])

In [18]:
null_info_dataframe

Unnamed: 0,index,null_percent,non_null_percent,non_null_counts,null_counts
0,colour,14.29%,85.71%,6,1
1,number,14.29%,85.71%,6,1
2,name,0.00%,100.00%,7,0



### Merge all

In [19]:
merged = reduce(lambda left,right: pandas.merge(left,right,on='index', how = "left"), [null_info_dataframe,
                                                                        dtypes_final,
                                                                        describe_final])

### Add row count

In [20]:
merged["row_count"] = merged["non_null_counts"] + merged["null_counts"]

merged.drop(["count"], axis = 1, inplace = True)

In [68]:
merged["data_type"] = merged["data_type"].astype(str)

### Print

In [69]:
profile_dict = {}

In [70]:
merged_attributes = list(merged.columns)[1:]

for index in range(merged.shape[0]):
    
    column = merged.loc[index, "index"]
    column_padded = "  " + column + "  "
    print("\n\n", column_padded.center(44, "-"),"\n",sep = "")    
    profile_dict[column] = []    
    for attr in merged_attributes:
        attr_ = attr + ":"
        value = merged.loc[index, attr]
        value_string = str(value)
        
        print(attr_.ljust(22, " "), value_string.rjust(22, " "), sep = "")    
        
        print(type(value))
        if type(value) == numpy.int64:
            profile_dict[column].append({attr:int(value)})
        else:
            profile_dict[column].append({attr:value})



-----------------  colour  -----------------

null_percent:                         14.29%
<class 'str'>
non_null_percent:                     85.71%
<class 'str'>
non_null_counts:                           6
<class 'numpy.int64'>
null_counts:                               1
<class 'numpy.int64'>
data_type:                            object
<class 'str'>
unique:                                    3
<class 'int'>
top:                                     red
<class 'str'>
freq:                                      3
<class 'numpy.int64'>
mean:                                    nan
<class 'float'>
std:                                     nan
<class 'float'>
min:                                     nan
<class 'float'>
25%:                                     nan
<class 'float'>
50%:                                     nan
<class 'float'>
75%:                                     nan
<class 'float'>
max:                                     nan
<class 'float'>
row_count:                   

In [79]:
profile_dict

{'colour': [{'null_percent': '14.29%'},
  {'non_null_percent': '85.71%'},
  {'non_null_counts': 6},
  {'null_counts': 1},
  {'data_type': 'object'},
  {'unique': 3},
  {'top': 'red'},
  {'freq': 3},
  {'mean': nan},
  {'std': nan},
  {'min': nan},
  {'25%': nan},
  {'50%': nan},
  {'75%': nan},
  {'max': nan},
  {'row_count': 7}],
 'number': [{'null_percent': '14.29%'},
  {'non_null_percent': '85.71%'},
  {'non_null_counts': 6},
  {'null_counts': 1},
  {'data_type': 'float64'},
  {'unique': nan},
  {'top': nan},
  {'freq': nan},
  {'mean': 16.333333333333332},
  {'std': 7.284687135812126},
  {'min': 12.0},
  {'25%': 12.5},
  {'50%': 14.0},
  {'75%': 14.75},
  {'max': 31.0},
  {'row_count': 7}],
 'name': [{'null_percent': '0.00%'},
  {'non_null_percent': '100.00%'},
  {'non_null_counts': 7},
  {'null_counts': 0},
  {'data_type': 'object'},
  {'unique': 7},
  {'top': 'robert'},
  {'freq': 1},
  {'mean': nan},
  {'std': nan},
  {'min': nan},
  {'25%': nan},
  {'50%': nan},
  {'75%': nan},

In [77]:
exportpath = "/users/danielcorcoran/desktop/github_repos/python_nb_data/"

In [80]:
with open(exportpath + "profile.json", "w") as fileobj:
    json.dump(profile_dict, fileobj)
    
fileobj.close()

In [67]:
merged.to_csv(exportpath + "profile.csv", index = False)