In [40]:
import cudf
import pandas as pd
import numpy as np 
from cuml.preprocessing import model_selection
from itertools import chain
from scipy.stats import chi2_contingency
import cuml
import torch

In [2]:
# Report the library versions this notebook was executed with.
for label,version in (
    ("padas version",pd.__version__),
    ("numpy version",np.__version__),
    ("cudf version",cudf.__version__),
    ("cuml version",cuml.__version__),
    ("cuda version",torch.version.cuda),
):
    print(label,version)

padas version 0.25.3
numpy version 1.18.1
cudf version 0.13.0+0.ga2804c3.dirty
cuml version 0.13.0+0.g7544c43.dirty
cuda version 9.0.176


In [3]:
'''Function declarations'''

'Function declarations'

In [4]:
def append_frames(data1,data2):
    """Append Frames: stack ``data2`` underneath ``data1``.

    An empty side (``len(...) == 0``) is skipped so that the other frame is
    returned as-is (same object, not a copy). Only when both sides hold rows
    are they concatenated row-wise with a fresh 0..n-1 index.

    Parameters
    ----------
    data1, data2 : frame-like objects supporting ``len`` (cuDF DataFrames in
        this notebook).

    Returns
    -------
    ``data2`` when ``data1`` is empty, ``data1`` when ``data2`` is empty,
    otherwise the ``cudf.concat`` of both.
    """
    # Guard clauses replace the former placeholder ``cudf.DataFrame()`` that was
    # allocated on every call and then overwritten by every branch.
    if len(data1)==0:
        return data2
    if len(data2)==0:
        return data1
    return cudf.concat([data1,data2],ignore_index=True,axis=0)

In [5]:
def tabular_sorting(data):
    """Sort a summary table by ``var_name`` and by the leading number of ``cuts``.

    The sort key is built from each ``cuts`` label by removing every "%" and
    keeping the text before the first "-", cast to int32 (so "20%-30%" sorts
    as 20 and "10%" as 10).

    Parameters
    ----------
    data : DataFrame with string column ``cuts`` and column ``var_name``.

    Returns
    -------
    A new DataFrame sorted ascending by (``var_name``, numeric key), without
    the helper column.

    Notes
    -----
    The helper column ``temp`` is written onto the frame that was passed in,
    so the caller's object keeps a ``temp`` column; only the returned frame
    has it dropped.
    """
    data['temp']=data.cuts
    data['temp']=data.temp.str.replace("%","")
    # expand=True makes split return one column per piece, so [0] selects the
    # first piece of every row. Without it, libraries whose split returns a
    # Series of lists would treat [0] as "row with label 0".
    data['temp']=data.temp.str.split("-",expand=True)[0]
    data['temp']=data.temp.astype("int32")
    data=data.sort_values(['var_name','temp'],ascending=True)
    data=data.drop(columns=["temp"])
    return(data)

In [6]:
'''Univariate cuts for numeric type.
Since there is no qcut functionality in cuDF, this function is a workaround for qcut [numeric types only].'''
def univariate_cuts_mapper_numeric(data,var):
    """Label every row with a percentile bucket for each column listed in ``var``.

    For each column name ``v`` in ``var`` a string column ``v + '__decile_cuts'``
    is added. Percentile cut values come from ``describe`` and are walked in
    order; a row receives the label of the first percentile whose cut value is
    greater than or equal to the row's value. When no remaining row falls at or
    below a cut, that percentile label is merged into the previous label as
    "<previous>-<current>".

    Parameters
    ----------
    data : cuDF DataFrame that must contain an ``ID`` column (used to remove
        already-labelled rows) and every column named in ``var``.
    var : indexable sequence of column names (strings).

    Returns
    -------
    list ``[data, stats_frame]`` where ``data`` is the labelled frame rebuilt
    from the per-percentile pieces and ``stats_frame`` stacks, for every
    variable, the columns ``cut_val``, ``desc`` (percentile label) and
    ``var_name``.

    Notes
    -----
    * The first ``data[decile_var]='-1'`` assignment is made on the frame passed
      in by the caller, so the caller's frame gains that column.
    * After each variable ``data`` is replaced by ``processed_data``; rows that
      never satisfied any ``<=`` comparison are therefore not carried forward
      (presumably rows with missing values - TODO confirm).
    """
    stats_frame=cudf.DataFrame()
    for i in range(0,len(var)):
        
        # Summary statistics with the requested percentile rows.
        tmp=data[var[i]].describe(percentiles=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
        stats_frame1=cudf.DataFrame()
        stats_frame1['cut_val']=tmp.copy()
        stats_frame1['desc']=stats_frame1.index
        # Keep only the percentile rows, i.e. those whose label contains "%".
        stats_frame1=stats_frame1[stats_frame1.desc.str.contains("%")]
        decile_var=var[i]+'__decile_cuts'
        # '-1' is the placeholder label for rows not yet assigned to a bucket.
        data[decile_var]='-1'
        stats_frame1['var_name']=var[i]
        processed_data=cudf.DataFrame()
        # k remembers the label of the most recent non-empty bucket.
        k=''
        for j in stats_frame1.desc:
            # Rows still unassigned whose value is at or below this percentile's cut value.
            temp_data=data[data[var[i]]<=stats_frame1.cut_val[stats_frame1.desc==j][0]]
            if(len(temp_data)>0):
                temp_data[decile_var]=j
                k=j
                processed_data=append_frames(processed_data,temp_data)
                # Remove the rows just labelled so later (higher) cuts only see the rest;
                # presumably ID is unique per row - TODO confirm.
                data=data[~data.ID.isin(temp_data.ID)]
            else:
                # Empty bucket: fold label j into the previous bucket as "k-j" and
                # drop j's row from the cut table.
                # NOTE(review): k is not updated here, so after one merge the
                # `desc==k` comparison no longer matches the renamed row; and if the
                # very first percentile is empty, k is '' and processed_data has no
                # decile_var column yet - confirm both cases behave as intended.
                processed_data.loc[processed_data[decile_var].str.contains(k),decile_var]=k+"-"+j
                stats_frame1.loc[stats_frame1.desc==k,'desc']=k+"-"+j
                stats_frame1=stats_frame1[stats_frame1.desc!=j]
            
        stats_frame=append_frames(stats_frame,stats_frame1)
        # Continue with the labelled rows only for the next variable.
        data=processed_data
        # NOTE(review): every row in processed_data was given a percentile label
        # before being appended, so this '-1' -> "Null" relabel appears never to match.
        data.loc[data[decile_var]=='-1',decile_var]="Null"
    return([data,stats_frame])

In [7]:
def univariate_cuts_mapper_char(data,var):
    """Univariate for character type.

    For every column name in ``var`` add a sibling column named
    ``<name>__cuts`` holding the same values. The container passed in is
    modified in place and also returned.
    """
    for source_column in var:
        data[source_column+"__cuts"]=data[source_column]
    return data

In [8]:
def tabulate_univariates(data):
    """Tabulate the population distribution of every ``*_cuts`` column.

    For each column whose name ends in ``_cuts`` the rows are grouped by that
    column and the ``target`` values are counted per group.

    Parameters
    ----------
    data : cuDF DataFrame holding a ``target`` column and the cut columns
        (``<var>__decile_cuts`` for numeric, ``<var>__cuts`` for character).

    Returns
    -------
    cuDF DataFrame with columns ``cases`` (row count per cut), ``popn`` (share
    of the variable's rows), ``cuts``, ``var_name`` and ``var_type``
    ("numeric" / "character"), indexed by ``<var_name>_<cuts>`` and ordered by
    ``tabular_sorting``. An empty frame is returned when there are no cut
    columns.
    """
    cut_columns=data.columns[data.columns.str.contains('_cuts$')]
    summarised_frame=cudf.DataFrame()
    for cut_column in cut_columns:
        tmp_frame=cudf.DataFrame()
        tmp_frame['cases']=data.groupby(cut_column)['target'].count()
        # Series.sum() reduces on the device; the builtin sum() would iterate the
        # Series element by element on the host.
        tmp_frame['popn']=tmp_frame['cases']/tmp_frame['cases'].sum()
        tmp_frame['cuts']=tmp_frame.index
        tmp_frame['var_name']=cut_column.split("__")[0]
        if(cut_column.split("__")[1]=='decile_cuts'):
            tmp_frame['var_type']="numeric"
        else:
            tmp_frame['var_type']="character"
        summarised_frame=append_frames(summarised_frame,tmp_frame)
    # Nothing to index or sort when no cut column was found.
    if(len(summarised_frame)==0):
        return(summarised_frame)
    # Index and sort once over the complete table instead of after every variable;
    # append_frames resets the index on each concat, so only the last pass mattered.
    summarised_frame.index=summarised_frame.var_name+"_"+summarised_frame.cuts
    summarised_frame=tabular_sorting(summarised_frame)
    return(summarised_frame)

In [9]:
def tabulate_bivariates(data):
    """Tabulate target statistics, WOE and IV for every ``*_cuts`` column.

    For each column whose name ends in ``_cuts`` the rows are grouped by that
    column and ``target`` is aggregated with count and sum.

    Parameters
    ----------
    data : cuDF DataFrame holding a ``target`` column and the cut columns.
        The arithmetic below treats the per-group sum of ``target`` as the
        number of target cases (presumably target is 0/1 - TODO confirm).

    Returns
    -------
    cuDF DataFrame, ordered by ``tabular_sorting``, with one row per
    (variable, cut) and columns ``cuts``, ``cases``, ``target``,
    ``non_target``, ``popn``, ``non_target_distbn``, ``target_distbn``,
    ``target_rate``, ``WOE`` = log(non_target_distbn / target_distbn),
    ``IV`` = (non_target_distbn - target_distbn) * WOE, and ``var_name``.
    An empty frame is returned when there are no cut columns.
    """
    cut_columns=data.columns[data.columns.str.contains('_cuts$')]
    summarised_frame=cudf.DataFrame()
    for cut_column in cut_columns:
        tmp_frame=data.groupby(cut_column,as_index=False).agg({'target':['count','sum']})
        tmp_frame.columns=['cuts',"cases","target"]
        tmp_frame['non_target']=tmp_frame.cases-tmp_frame.target
        # Series.sum() reduces on the device; the builtin sum() would iterate the
        # Series element by element on the host.
        tmp_frame['popn']=tmp_frame.cases/tmp_frame.cases.sum()
        tmp_frame['non_target_distbn']=tmp_frame.non_target/tmp_frame.non_target.sum()
        tmp_frame['target_distbn']=tmp_frame.target/tmp_frame.target.sum()
        tmp_frame['target_rate']=tmp_frame['target']/tmp_frame['cases']
        # A cut with zero targets or zero non-targets yields an infinite WOE here.
        tmp_frame['WOE']=np.log((tmp_frame.non_target_distbn/tmp_frame.target_distbn))
        tmp_frame['IV']=(tmp_frame.non_target_distbn-tmp_frame.target_distbn)*tmp_frame.WOE
        tmp_frame['var_name']=cut_column.split("__")[0]

        summarised_frame=append_frames(summarised_frame,tmp_frame)
    # tabular_sorting reads the `cuts` column, which an empty frame does not have.
    if(len(summarised_frame)==0):
        return(summarised_frame)
    summarised_frame=tabular_sorting(summarised_frame)
    return(summarised_frame)

In [30]:
#########Cramer's V
def cramers_V(var1,var2) :
    """Cramér's V association between two categorical variables.

    V = sqrt(chi2 / (n * min(rows - 1, cols - 1))), computed from the
    contingency table of ``var1`` against ``var2``.

    Parameters
    ----------
    var1, var2 : array-likes of equal length accepted by ``pd.crosstab``.

    Returns
    -------
    float in [0, 1]; ``nan`` when the table is empty or either variable has a
    single level (the statistic is undefined there).

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, which apply Yates'
    continuity correction to 2x2 tables.
    """
    crosstab=np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
    mini=min(crosstab.shape)-1 # Smaller of (rows, columns), minus one
    # Undefined for an empty table or a constant variable; avoid dividing by zero.
    if crosstab.size==0 or mini<=0:
        return np.nan
    stat=chi2_contingency(crosstab)[0] # Chi2 test statistic
    obs=np.sum(crosstab) # Number of observations
    # The square root was missing: without it the function returned V squared.
    return np.sqrt(stat/(obs*mini))

In [10]:
'''Loading the sample UCI Credit Card default dataset
Dataset Description:
LIMIT_BAL: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
Sex: Gender (1 = male; 2 = female).
Education: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
MARRIAGE: Marital status (1 = married; 2 = single; 3 = others).
Age: Age (year).
PAY_0-PAY_6: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
BILL_AMT1-BILL_AMT6: Amount of bill statement (NT dollar). 1 = amount of bill statement in September, 2005; 2 = amount of bill statement in August, 2005; . . .; 6 = amount of bill statement in April, 2005.
PAY_AMT1-PAY_AMT6: Amount of previous payment (NT dollar). 1= amount paid in September, 2005; 2 = amount paid in August, 2005; . . .;6 = amount paid in April, 2005.
'''
# Read the comma-separated dataset with cuDF; the path is relative to the
# notebook's working directory.
file=cudf.read_csv("uci_default of credit card clients.csv",delimiter=",")
# Last expression of the cell: shows the column labels as the cell output.
file.columns



Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default'],
      dtype='object')

In [11]:
'''Random split for train/test
train_test_split takes the following parameters:
1. CuDF
2. y i.e. target
3. random_seed either a seed generator numeral or randomseed object
4. train population
'''
# Split `file` with the "default" column as y: fixed seed 300, 75% of rows for training.
x_train,x_test,y_train,y_test=model_selection.train_test_split(file,"default",random_state=300,train_size=0.75)
# `train` is a second name for x_train (no copy is made), so adding the
# 'target' column below also adds it to x_train.
train=x_train
train['target']=y_train
# Likewise `test` is the same object as x_test.
test=x_test
test['target']=y_test

In [12]:
'''Feature name generator i.e. input for univariates and bivariates'''
# Columns named PAY_<digits> (PAY_0, PAY_2, ...); anchoring the digits at the end
# of the name keeps PAY_AMT* out. Column labels are plain strings, so a direct
# comprehension replaces the earlier side-effect-only list bound to `_`.
pattern_cols=[name for name in train.columns[train.columns.str.contains("PAY_\\d+$")].values.tolist() if name]
# Variables treated as categorical: demographics plus the PAY_<n> columns.
char_names=['SEX', 'EDUCATION', 'MARRIAGE']
char_names.extend(pattern_cols)
target_name=['target']
excluded_name=['ID']
# Everything that is not categorical, the target or the row identifier is numeric.
# A concrete list is passed to isin rather than a one-shot chain iterator.
numeric_names=train.columns[~train.columns.isin(char_names+target_name+excluded_name)]

In [13]:
'''Setting character type for char columns'''
# Cast each categorical column to string, in train first and then in test.
for column_name in char_names:
    for frame in (train,test):
        frame[column_name]=frame[column_name].astype("str")

In [14]:
'''Computing Univariates'''
# Numeric variables first: returns the labelled frame and the cut-value table.
train1,cuts_numeric=univariate_cuts_mapper_numeric(train,numeric_names)
# Then mirror each categorical column into its <name>__cuts column.
train1=univariate_cuts_mapper_char(train1,char_names)
# Summarise every *_cuts column.
univariates=tabulate_univariates(train1)

In [19]:
'''Computing Bivariates'''
# Per-cut target statistics with WOE and IV for every *_cuts column of train1.
bivariates=tabulate_bivariates(train1)
# Sum the per-cut IV contributions into one IV value per variable name.
var_iv=bivariates.groupby('var_name')['IV'].sum()

In [None]:
%%time
'''Computing Cramer's V '''
cramers_table=pd.DataFrame()
train_file11=train1.to_pandas()

for i in np.arange(0,len(var_iv.index)):
    for j in np.arange(i+1,len(var_iv.index)):
        frame=pd.DataFrame({'var1':[var_iv.index[i]],'var2':[var_iv.index[j]],'cramers':[cramers_V(train_file11.iloc[1:len(train1),i],train_file11.iloc[1:len(train1),j])]})
        cramers_table=cramers_table.append(frame,ignore_index=True)