In [2]:
import pandas as pd
# import pandas.io.sql as psql
import numpy as np
from sqlalchemy import create_engine, event
import urllib.parse
from datetime import datetime

# Inputs & Outputs

In [3]:
# SQL
# Raw string: the backslash is a literal part of the server name and must not
# be parsed as the start of an escape sequence ('\R' is an invalid escape).
server = r'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
# ODBC connection string using Windows integrated authentication
# (Trusted_Connection=yes), so no credentials are stored here.
odbc_connection_string = (
    'DRIVER={SQL Server Native Client 11.0}'
    ';SERVER=' + server +
    ';DATABASE=' + db +
    ';Trusted_Connection=yes'
)
sql_table = "linkedclaims_casuals_2018m04_v2_flat_20140101_with_income_with_edu"

# Selection threshold
# A variable whose share of missing values is above this fraction is not selected.
selection_nan_treshold = 0.55
# When True, only variables profiled as 'Num' can be selected.
select_only_num = True

# Output
# NOTE(review): absolute local path; consider a configurable data directory.
varprofile_csvfilename = "D:/DATA/" + sql_table + "_variable_profile.csv"

# Profiling

In [4]:
# Connect to SQL and retrieve data
# The ODBC connection string is URL-quoted so it can be embedded in the
# SQLAlchemy URL through the odbc_connect query parameter.
params = urllib.parse.quote_plus(odbc_connection_string)
engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)

@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    """Set the cursor's fast_executemany flag when a statement runs as executemany.

    Registered on `engine` for the 'before_cursor_execute' event. Only
    `cursor` and `executemany` are used; the other arguments are part of the
    listener signature.
    """
    if executemany:
        cursor.fast_executemany = True

print ('Reading Data from table: %s' %(sql_table))
# Read through an explicitly managed connection: the context manager releases
# it even when the read fails, and the connection that is opened is the one
# actually used for the query.
with engine.connect() as conn:
    df = pd.read_sql_table(sql_table, conn)
print ('Reading Data done')

## Profile features
def _profile_type_label(dtype):
    """Map a column dtype to the coarse type label used in the profile.

    Returns 'Num' for integer and floating-point dtypes of any width, 'Date'
    for datetime64 dtypes and 'Char' for object dtype. Any other dtype
    (for example bool) is returned unchanged.
    """
    dtype_checks = pd.api.types
    if dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype):
        return 'Num'
    if dtype_checks.is_datetime64_any_dtype(dtype):
        return 'Date'
    if dtype_checks.is_object_dtype(dtype):
        return 'Char'
    return dtype


def _rounded_share(count, total):
    """Return count / total rounded to 2 decimals, or NaN when total is 0."""
    if total == 0:
        return np.nan
    return round(count / total, 2)


# convert data types
var_names = df.columns
var_types = df.dtypes

# Single-column frame (column 0) holding one type label per variable.
myvar_types = pd.DataFrame(
    {0: pd.Series([_profile_type_label(dtype) for dtype in var_types], dtype=object)}
)

# create stats feature selection output structure
stat_columns = [
    'Num_of_NaN', 'Num_NOT_NaN', 'NaN_of_Total',
    'Num_of_ZERO', 'Num_NOT_ZERO', 'ZERO_of_Total',
]
n_rows = len(df)

# calculate number of null/not null & percentage of null
# One record per variable, assembled into a frame once at the end. Columns
# are addressed by position so duplicate column names cannot cause ambiguity.
profile_rows = []
for position, type_label in enumerate(myvar_types[0]):
    column = df.iloc[:, position]
    nan_count = column.isnull().sum()

    row_stats = {
        'Variable': var_names[position],
        'Type': type_label,
        'Num_of_NaN': nan_count,
        'Num_NOT_NaN': n_rows - nan_count,
        'NaN_of_Total': _rounded_share(nan_count, n_rows),
        # Zero statistics stay NaN for variables that are not 'Num'.
        'Num_of_ZERO': np.nan,
        'Num_NOT_ZERO': np.nan,
        'ZERO_of_Total': np.nan,
    }

    if isinstance(type_label, str) and type_label == 'Num':
        # Missing values are filled with 0 first, so they are counted
        # together with the zeros rather than as non-zero values.
        non_zero_count = column.fillna(0).astype(bool).sum()
        zero_count = n_rows - non_zero_count
        row_stats['Num_NOT_ZERO'] = non_zero_count
        row_stats['Num_of_ZERO'] = zero_count
        row_stats['ZERO_of_Total'] = _rounded_share(zero_count, n_rows)

    profile_rows.append(row_stats)

df_sel = pd.DataFrame(profile_rows, columns=['Variable', 'Type'] + stat_columns)
# Statistics are stored as floats so NaN can mark "not applicable".
df_sel[stat_columns] = df_sel[stat_columns].astype(float)
    

# create Stats feature selection function, adding "selected" 0/1
def insert_select(data, treshold, only_num=None, excluded=('ppsn',)):
    """Add a 'selected' column (1.0 = keep, 0.0 = drop) to a variable profile.

    A variable is dropped when any of the following holds:
      * `only_num` is truthy and its 'Type' is not 'Num';
      * its 'NaN_of_Total' is greater than `treshold`;
      * its 'Variable' name is listed in `excluded`.
    A missing 'NaN_of_Total' never exceeds the threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Profile with 'Variable', 'Type' and 'NaN_of_Total' columns. It is
        modified in place.
    treshold : float
        Largest accepted share of missing values.
    only_num : bool, optional
        Restrict the selection to 'Num' variables. When None (default) the
        module-level `select_only_num` setting is used.
    excluded : iterable of str, optional
        Variable names that are never selected. Defaults to ('ppsn',).

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with the 'selected' column added.
    """
    if only_num is None:
        only_num = select_only_num

    too_sparse = data['NaN_of_Total'] > treshold
    is_excluded = data['Variable'].isin(list(excluded))
    rejected = too_sparse | is_excluded

    if only_num:
        # 'Type' may hold non-string values; only the exact label 'Num' counts.
        is_numeric = data['Type'].map(
            lambda label: isinstance(label, str) and label == 'Num'
        ).astype(bool)
        rejected = rejected | ~is_numeric

    data['selected'] = np.where(rejected, 0.0, 1.0)
    return data

# Flag each profiled variable as selected / not selected using the configured
# missing-value threshold.
df_sel = insert_select(data=df_sel, treshold=selection_nan_treshold)

# Write the finished variable profile to the configured CSV file, leaving out
# the row index.
df_sel.to_csv(path_or_buf=varprofile_csvfilename, index=False)

print('\nALL DONE')

Reading Data from table: linkedclaims_casuals_2018m04_v2_flat_20140101_with_income_with_edu
Reading Data done

ALL DONE
