### Import Packages

In [1]:
import pandas as pd

### Create Data Source

Ideally, this should be parameterized so that you can specify the file type, the path, and any import options.

In [2]:
def import_data(file_type, file_path):
    """
    Imports data into a dataframe

    file_type: str, defines the type of file, currently only supports 'csv'
    file_path: str, file path of file to be used (handed directly to pd.read_csv)

    Returns the DataFrame produced by the reader.
    Raises ValueError when file_type is not a supported type.
    """
    if file_type == 'csv':
        return pd.read_csv(file_path)

    # An unsupported type used to fall through to `return df` with df never
    # assigned, which surfaced as an UnboundLocalError; fail with a clear
    # message instead.
    raise ValueError(
        "Unsupported file_type {!r}; currently only 'csv' is supported".format(file_type)
    )

In [163]:
# Load the test CSV into the notebook-level `df` used by the cells below.
df = import_data(file_type='csv', file_path='datasets/TestFile/TestFile.csv')

### View Sample Data

In [4]:
# Preview the first rows of the loaded frame as plain text.
print(df.head())

          Employee Name Employee Status      Salary  Pay Basis  \
0       Abrams, Adam W.        Employee   $66300.00  Per Annum   
1         Adams, Ian H.        Employee   $45000.00  Per Annum   
2       Agnew, David P.        Employee   $93840.00  Per Annum   
3        Albino, James         Employee   $91800.00  Per Annum   
4  Aldy, Jr., Joseph E.        Employee  $130500.00  Per Annum   

                                      Position Title  
0           WESTERN REGIONAL COMMUNICATIONS DIRECTOR  
1  EXECUTIVE ASSISTANT TO THE DIRECTOR OF SCHEDUL...  
2       DEPUTY DIRECTOR OF INTERGOVERNMENTAL AFFAIRS  
3                             SENIOR PROGRAM MANAGER  
4  SPECIAL ASSISTANT TO THE PRESIDENT FOR ENERGY ...  


### Get general statistics in case this is helpful

In [5]:
# (row count, column count) of the loaded frame, shown as the cell output.
df.shape


(469, 5)

### Define functions

In [41]:
def get_data_type(col, frame=None):
    """
    Determines the datatype of a column in the dataframe. Begins with what pandas provides,
    and expands. In future, should return a "subtype", like a string that is currency. This
    may then be useful for modifying data, such as turning currency into float. Is float
    really float or is it int with empty decimals?

    If no values are found at all, pandas defaults to float64, so when count=count_null the
    data_type should change to [none], but still keep in mind that for purposes of df it is
    still float. At that point, all values should be set to "NA" and the field flagged as an
    empty set. (Not implemented yet.)

    col: int, position of the column being analyzed
    frame: optional DataFrame to inspect; defaults to the notebook-level `df`

    Returns the dtype pandas reports for that column.
    """
    if frame is None:
        frame = df

    # `dtypes` is indexed by column *name*, so select by position explicitly
    # with .iloc rather than relying on `[]` treating an integer as a position.
    return frame.dtypes.iloc[col]

In [93]:
def check_for_nulls(col, frame=None):
    """
    Determines if column has null data

    col: int, position of the column being analyzed
    frame: optional DataFrame to inspect; defaults to the notebook-level `df`

    Returns True when at least one value in the column is null, else False.
    """
    if frame is None:
        frame = df

    # Select by position so the result is always a single Series, even when
    # two columns share a name (a label lookup would return a DataFrame).
    return bool(frame.iloc[:, col].isna().any())

In [185]:
def check_for_blanks(col, frame=None):
    """
    Determines if field has any blanks.

    A value counts as blank when it is an empty string after stripping
    surrounding whitespace. Only object-dtype columns are checked.

    col: int, position of the column being analyzed
    frame: optional DataFrame to inspect; defaults to the notebook-level `df`

    Returns the string 'True' or 'False' for object columns, and 'NA' for
    every other dtype.
    """
    if frame is None:
        frame = df

    column = frame.iloc[:, col]

    # Decide from the column's own dtype instead of a `data_type` global that
    # only exists once some other cell has assigned it.
    if column.dtype != 'object':
        return 'NA'

    is_blank = column.str.strip() == ''
    return 'True' if is_blank.any() else 'False'

In [147]:
def get_length(col, frame=None):
    """
    provides length information for column

    col: int, position of the column being analyzed
    frame: optional DataFrame to inspect; defaults to the notebook-level `df`

    Returns (min_length, max_length) of the string values for object-dtype
    columns, and ('NA', 'NA') for every other dtype.
    """
    if frame is None:
        frame = df

    column = frame.iloc[:, col]

    # Decide from the column's own dtype instead of a `data_type` global that
    # only exists once some other cell has assigned it.
    if column.dtype != 'object':
        return 'NA', 'NA'

    lengths = column.str.len()  # computed once, reused for both extremes
    return lengths.min(), lengths.max()



In [188]:
def get_min_max(col, frame=None):
    """
    provides min/max information for column

    col: int, position of the column being analyzed
    frame: optional DataFrame to inspect; defaults to the notebook-level `df`

    Returns (min_value, max_value) for int64, float64 and datetime64[ns]
    columns, and ('NA', 'NA') for every other dtype.
    """
    if frame is None:
        frame = df

    column = frame.iloc[:, col]

    # Decide from the column's own dtype instead of a `data_type` global that
    # only exists once some other cell has assigned it.
    if column.dtype in ('int64', 'datetime64[ns]', 'float64'):
        return column.min(), column.max()

    return 'NA', 'NA'

### Compile results

In [189]:
# Column order of the summary table; also the keys of each per-column record.
summary_columns = ['column_name', 'count_distinct', 'data_type', 'has_nulls', 'has_blanks',
                   'min', 'max', 'shortest_string', 'longest_string']

results = []  # one summary record (dict) per column, in column order

for col, column_name in enumerate(df.columns):
    # Assigned at notebook level and before the helper calls on purpose:
    # helpers defined earlier in this notebook may read `data_type` as a global.
    data_type = get_data_type(col)

    has_nulls = check_for_nulls(col)
    count_distinct = df.iloc[:, col].nunique()
    has_blanks = check_for_blanks(col)

    # Each helper returns a (min, max) pair; call once and unpack.
    min_length, max_length = get_length(col)
    min_value, max_value = get_min_max(col)

    results.append({
        'column_name': column_name,
        'count_distinct': count_distinct,
        'data_type': data_type,
        'has_nulls': has_nulls,
        'has_blanks': has_blanks,
        'min': min_value,
        'max': max_value,
        'shortest_string': min_length,
        'longest_string': max_length,
    })

# Last expression of the cell: renders the collected records as a table.
pd.DataFrame(results, columns=summary_columns)

Unnamed: 0,column_name,count_distinct,data_type,has_nulls,has_blanks,min,max,shortest_string,longest_string
0,Employee Name,470,object,False,False,,,5.0,30.0
1,Employee Status,3,object,False,False,,,8.0,20.0
2,Salary,120,object,False,False,,,6.0,12.0
3,Pay Basis,1,object,False,False,,,9.0,9.0
4,Position Title,269,object,False,False,,,6.0,128.0
5,Test Col_Has_Nulls,0,float64,True,,,,,
6,Test Col_Has_Blanks,2,object,True,True,,,1.0,3.0
7,Test_Col_Float,120,float64,False,,0.0,179700.0,,
8,Test_Col_Int,120,int64,False,,0.0,179700.0,,
