In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training data from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/TrainingData.csv')

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [4]:
# Use the 'Id' column as the row index.
# Guarded and non-inplace: once 'Id' has been moved into the index it is no
# longer a column, so an unconditional set_index would raise KeyError when
# this cell is executed a second time.
if 'Id' in df.columns:
    df = df.set_index('Id')

In [5]:
# Number of distinct values in each column (missing values are not counted).
df.nunique(axis=0, dropna=True)

Income               41920
Age                     59
Experience              21
Married/Single           2
House_Ownership          3
Car_Ownership            2
Profession              51
CITY                   317
STATE                   29
CURRENT_JOB_YRS         15
CURRENT_HOUSE_YRS        5
Risk_Flag                2
dtype: int64

In [6]:
# Summarise the frame: index range, per-column non-null counts and dtypes,
# and total memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252000 entries, 1 to 252000
Data columns (total 12 columns):
Income               252000 non-null int64
Age                  252000 non-null int64
Experience           252000 non-null int64
Married/Single       252000 non-null object
House_Ownership      252000 non-null object
Car_Ownership        252000 non-null object
Profession           252000 non-null object
CITY                 252000 non-null object
STATE                252000 non-null object
CURRENT_JOB_YRS      252000 non-null int64
CURRENT_HOUSE_YRS    252000 non-null int64
Risk_Flag            252000 non-null int64
dtypes: int64(6), object(6)
memory usage: 25.0+ MB


In [11]:
# Names of the numeric predictor columns. The target 'Risk_Flag' is removed
# first so that it is not listed as a feature.
numerical_features = list(
    df.drop(columns=['Risk_Flag'])
      .select_dtypes(include='number')
      .columns
)
numerical_features

['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']

In [14]:
# Names of the categorical (string-valued) columns.
# 'category' is matched in addition to 'object' so that re-running this cell
# after the columns have been cast to the category dtype still returns the
# same list instead of an empty one.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.to_list()
categorical_features

['Married/Single',
 'House_Ownership',
 'Car_Ownership',
 'Profession',
 'CITY',
 'STATE']

In [15]:
from sklearn.feature_selection import f_classif

In [16]:
# ANOVA F-test of every numeric feature against the two-valued target.
# f_classif returns a pair (F statistics, p-values) with one entry per feature.
f_values = f_classif(X=df[numerical_features], y=df['Risk_Flag'])

In [17]:
# Show the raw f_classif result: a tuple of two arrays (F statistics first,
# then p-values), one entry per numeric feature.
f_values

(array([  2.40794463, 119.9185068 , 300.69230652,  72.34852625,
          4.82385115]),
 array([1.20722032e-01, 6.68762365e-28, 2.54756349e-67, 1.81322122e-17,
        2.80694728e-02]))

In [18]:
# Second element of the f_classif result, i.e. the per-feature p-values.
f_values[1]

array([1.20722032e-01, 6.68762365e-28, 2.54756349e-67, 1.81322122e-17,
       2.80694728e-02])

In [19]:
# Label each p-value with the name of the feature it was computed for.
p_values = pd.Series(f_values[1], index=numerical_features)
p_values

Income               1.207220e-01
Age                  6.687624e-28
Experience           2.547563e-67
CURRENT_JOB_YRS      1.813221e-17
CURRENT_HOUSE_YRS    2.806947e-02
dtype: float64

In [20]:
# Significance cut-off applied to the F-test p-values.
# NOTE(review): 0.005 is ten times stricter than the customary 0.05 —
# confirm this value is intended.
threshold = 0.005
# True marks the features whose p-value is at or above the cut-off.
p_values.ge(threshold)

Income                True
Age                  False
Experience           False
CURRENT_JOB_YRS      False
CURRENT_HOUSE_YRS     True
dtype: bool

In [24]:
# Cast every categorical column to pandas' 'category' dtype; the remaining
# columns keep their current dtype.
df = df.astype({col: 'category' for col in categorical_features})

In [25]:
# Re-inspect column dtypes and memory usage now that the categorical columns
# use the 'category' dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252000 entries, 1 to 252000
Data columns (total 12 columns):
Income               252000 non-null int64
Age                  252000 non-null int64
Experience           252000 non-null int64
Married/Single       252000 non-null category
House_Ownership      252000 non-null category
Car_Ownership        252000 non-null category
Profession           252000 non-null category
CITY                 252000 non-null category
STATE                252000 non-null category
CURRENT_JOB_YRS      252000 non-null int64
CURRENT_HOUSE_YRS    252000 non-null int64
Risk_Flag            252000 non-null int64
dtypes: category(6), int64(6)
memory usage: 15.2 MB


In [28]:
# Distinct category labels of the CITY column.
df['CITY'].cat.categories

Index(['Adoni', 'Agartala', 'Agra', 'Ahmedabad', 'Ahmednagar', 'Aizawl',
       'Ajmer', 'Akola', 'Alappuzha', 'Aligarh',
       ...
       'Unnao', 'Vadodara', 'Varanasi', 'Vasai-Virar', 'Vellore',
       'Vijayanagaram', 'Vijayawada', 'Visakhapatnam[4]', 'Warangal[11][12]',
       'Yamunanagar'],
      dtype='object', length=317)

In [29]:
# Integer code of each row's CITY value (position of its label in the
# category index), as a plain NumPy array.
df['CITY'].cat.codes.values

array([251, 227,   8, ..., 144, 233,  26], dtype=int16)

In [30]:
from sklearn.feature_selection import mutual_info_classif

In [38]:
# One-hot encode the categorical columns: one indicator column per
# (feature, level) pair, named '<feature>_<level>'.
df_cat = pd.get_dummies(df.loc[:, categorical_features])

In [39]:
# Mutual information between each one-hot indicator and the target.
# All inputs are declared discrete.
mi = mutual_info_classif(X=df_cat, y=df['Risk_Flag'], discrete_features=True)

In [42]:
# Attach the indicator-column names to the scores and show the 50 highest.
mi_series = pd.Series(mi, index=df_cat.columns)
mi_series.sort_values(ascending=False).iloc[:50]

House_Ownership_rented              0.000381
CITY_Bhubaneswar                    0.000340
House_Ownership_owned               0.000300
Car_Ownership_yes                   0.000294
Car_Ownership_no                    0.000294
STATE_Madhya_Pradesh                0.000254
CITY_Kochi                          0.000239
Married/Single_single               0.000233
Married/Single_married              0.000233
CITY_Dehradun                       0.000213
CITY_Gwalior                        0.000199
STATE_Kerala                        0.000194
CITY_Buxar[37]                      0.000176
CITY_Barasat                        0.000174
Profession_Technology_specialist    0.000171
CITY_Satna                          0.000166
CITY_Sikar                          0.000161
CITY_Bareilly                       0.000161
CITY_Gandhinagar                    0.000159
Profession_Petroleum_Engineer       0.000150
CITY_Mehsana                        0.000148
CITY_Raiganj                        0.000147
Profession

In [49]:
from sklearn.preprocessing import LabelEncoder

labelencoder_X = LabelEncoder()

# Integer-encode the categorical columns one at a time, because LabelEncoder
# works on a single 1-D column per call. The same encoder object is re-fitted
# for every column, so afterwards it only holds the mapping of the last one.
df_labeled = pd.DataFrame(
    {name: labelencoder_X.fit_transform(df[name]) for name in categorical_features},
    index=df.index,
)

df_labeled

Unnamed: 0_level_0,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,2,0,33,251,13
2,1,2,0,43,227,14
3,0,2,0,47,8,12
4,1,2,1,43,54,17
5,1,2,0,11,296,22
...,...,...,...,...,...,...
251996,1,2,0,45,162,28
251997,1,2,0,3,251,13
251998,1,2,0,17,144,14
251999,1,2,0,27,233,18


In [52]:
# Mutual information between each label-encoded categorical feature (one
# column per original feature) and the target. All inputs are declared discrete.
mi = mutual_info_classif(X=df_labeled, y=df['Risk_Flag'], discrete_features=True)

In [54]:
# Attach the feature names to the scores and list them from highest to lowest.
mi_series = pd.Series(mi, index=df_labeled.columns)
mi_series.sort_values(ascending=False).iloc[:50]

CITY               0.010351
STATE              0.001447
Profession         0.001226
House_Ownership    0.000391
Car_Ownership      0.000294
Married/Single     0.000233
dtype: float64