In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/TrainingData.csv')

In [3]:
df.set_index('Id', inplace=True)

In [4]:
y = df['Risk_Flag']

In [5]:
numerical_features = df.drop('Risk_Flag', axis=1).select_dtypes('number').columns.to_list()
numerical_features

['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']

In [6]:
categorical_features = df.select_dtypes('object').columns.to_list()
categorical_features

['Married/Single',
 'House_Ownership',
 'Car_Ownership',
 'Profession',
 'CITY',
 'STATE']

# 1. Approach - label encoding

In [8]:
train = df[numerical_features + categorical_features].copy()

In [9]:
from sklearn.preprocessing import LabelEncoder

labelencoder_X = LabelEncoder()

for cat in categorical_features:
    train[cat] = labelencoder_X.fit_transform(df[cat])

In [10]:
train.head()

Unnamed: 0_level_0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1303834,23,3,3,13,1,2,0,33,251,13
2,7574516,40,10,9,13,1,2,0,43,227,14
3,3991815,66,4,4,10,0,2,0,47,8,12
4,6256451,41,2,2,12,1,2,1,43,54,17
5,5768871,47,11,3,14,1,2,0,11,296,22


### Logistic Regression

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [117]:
w = {0: (y_train==0).mean(), 1: (y_train==1).mean()}
w

{0: 0.8772668650793651, 1: 0.12273313492063492}

In [14]:
from sklearn.linear_model import LogisticRegression

In [140]:
model = LogisticRegression(random_state=42, max_iter=100000000, class_weight='balanced', ).fit(X_train, y_train)



In [142]:
coef = pd.Series(data = model.coef_.tolist()[0], index=train.columns).sort_values(ascending=True)
coef

Age                 -1.110139e-03
Experience          -1.002581e-03
Profession          -3.136859e-04
CURRENT_JOB_YRS     -2.688343e-04
STATE               -1.074317e-04
Car_Ownership       -5.386945e-05
Income               3.494104e-09
CURRENT_HOUSE_YRS    2.184426e-05
Married/Single       3.733771e-05
House_Ownership      5.344642e-05
CITY                 3.038142e-04
dtype: float64

In [143]:
y_train_pred = model.predict(X_train)

In [144]:
model.score(X_train, y_train)

0.5857093253968254

In [145]:
model.score(X_test, y_test)

0.5857539682539683

In [48]:
from sklearn.metrics import f1_score

In [159]:
f1_score(y_test, model.predict(X_test))

0.20306893656004274

In [156]:
from sklearn.metrics import confusion_matrix

In [157]:
confusion_matrix(y_test, model.predict(X_test))

array([[26862, 17285],
       [ 3593,  2660]])

## 2 Approach - one hot encoder

In [160]:
df_cat = pd.get_dummies(df[categorical_features])

In [169]:
df_cat.head()

Unnamed: 0_level_0,Married/Single_married,Married/Single_single,House_Ownership_norent_noown,House_Ownership_owned,House_Ownership_rented,Car_Ownership_no,Car_Ownership_yes,Profession_Air_traffic_controller,Profession_Analyst,Profession_Architect,...,STATE_Punjab,STATE_Rajasthan,STATE_Sikkim,STATE_Tamil_Nadu,STATE_Telangana,STATE_Tripura,STATE_Uttar_Pradesh,STATE_Uttar_Pradesh[5],STATE_Uttarakhand,STATE_West_Bengal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [167]:
X = df_cat.merge(df[numerical_features], left_index=True, right_index=True)

In [170]:
X.head()

Unnamed: 0_level_0,Married/Single_married,Married/Single_single,House_Ownership_norent_noown,House_Ownership_owned,House_Ownership_rented,Car_Ownership_no,Car_Ownership_yes,Profession_Air_traffic_controller,Profession_Analyst,Profession_Architect,...,STATE_Tripura,STATE_Uttar_Pradesh,STATE_Uttar_Pradesh[5],STATE_Uttarakhand,STATE_West_Bengal,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1303834,23,3,3,13
2,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,7574516,40,10,9,13
3,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,3991815,66,4,4,10
4,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,6256451,41,2,2,12
5,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,5768871,47,11,3,14


# Logistic Regression

In [171]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [172]:
w = {0: (y_train==0).mean(), 1: (y_train==1).mean()}
w

{0: 0.8772668650793651, 1: 0.12273313492063492}

In [174]:
model = LogisticRegression(random_state=42, max_iter=100000000, class_weight='balanced').fit(X_train, y_train)



In [175]:
model.score(X_train, y_train)

0.6850198412698413

In [176]:
model.score(X_test, y_test)

0.6875198412698412

In [177]:
f1_score(y_test, model.predict(X_test))

0.18327023803350104

In [178]:
confusion_matrix(y_test, model.predict(X_test))

array([[32884, 11263],
       [ 4486,  1767]])

# 3 Approach - numeric feature selection

In [179]:
from sklearn.feature_selection import f_classif

In [180]:
f_values = f_classif(df[numerical_features], df['Risk_Flag'])

In [182]:
p_values = pd.Series(index=numerical_features, data=f_values[1])
p_values
threshold = 0.005
p_values >= threshold

Income                True
Age                  False
Experience           False
CURRENT_JOB_YRS      False
CURRENT_HOUSE_YRS     True
dtype: bool

In [186]:
numerical_features = ['Income', 'CURRENT_HOUSE_YRS']

In [187]:
train = df[numerical_features + categorical_features].copy()

In [188]:
from sklearn.preprocessing import LabelEncoder

labelencoder_X = LabelEncoder()

for cat in categorical_features:
    train[cat] = labelencoder_X.fit_transform(df[cat])

In [189]:
train.head()

Unnamed: 0_level_0,Income,CURRENT_HOUSE_YRS,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1303834,13,1,2,0,33,251,13
2,7574516,13,1,2,0,43,227,14
3,3991815,10,0,2,0,47,8,12
4,6256451,12,1,2,1,43,54,17
5,5768871,14,1,2,0,11,296,22


In [197]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42)

In [198]:
model = LogisticRegression(random_state=42, max_iter=100000000, class_weight='balanced', ).fit(X_train, y_train)



In [199]:
model.score(X_train, y_train)

0.5140823412698413

In [200]:
model.score(X_test, y_test)

0.5111904761904762

In [201]:
f1_score(y_test, model.predict(X_test))

0.20111550684220764

In [202]:
confusion_matrix(y_test, model.predict(X_test))

array([[22663, 21484],
       [ 3152,  3101]])