In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, RFE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample

### `Load data numerical, categorical and target`

In [None]:
numericals = pd.read_csv('data/numerical.csv')
categoricals = pd.read_csv('data/categorical.csv')
targets = pd.read_csv('data/target.csv')

print(f"Numericals shape: {numericals.shape}")
print(f"Categoricals shape: {categoricals.shape}")
print(f"Targets shape: {targets.shape}")

In [None]:
numericals_with_nan = [col for col in numericals.columns if numericals[col].isna().sum() > 0]
categoricals_with_nan = [col for col in categoricals.columns if categoricals[col].isna().sum() > 0]
targets_with_nan = [col for col in targets.columns if targets[col].isna().sum() > 0]

print(f"There are '{len(numericals_with_nan)}' NaN columns in Numericals.")
print(f"There are '{len(categoricals_with_nan)}' NaN columns in Categoricals.")
print(f"There are '{len(targets_with_nan)}' NaN columns in Targets.")

In [None]:
categoricals.dropna(subset=categoricals_with_nan, inplace=True)
categoricals[categoricals_with_nan].value_counts(dropna=False)

In [None]:
important_columns = ['WEALTH1', 'WEALTH2', 'VETERANS', 'SOLIH']

### `Numericals - Variance Threshold`

In [None]:
min_max_scaler = MinMaxScaler()
numericals_scaled = min_max_scaler.fit_transform(numericals)

In [None]:
selector = VarianceThreshold(0.09)
selected_numericals = selector.fit_transform(numericals_scaled)

columns_to_drop = [col_tuple[0] for col_tuple in zip(numericals.columns, selector.get_support()) if col_tuple[1] == False]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

In [None]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_variance = numericals.drop(columns_to_drop, axis=1)
numericals_variance.shape

In [None]:
# Save results to csv

numericals_variance.to_csv('numericals_cleaned/numericals_variance.csv')

In [None]:
def display_heatmap(dataframe):
    corr=dataframe.corr()

    mask=np.triu(np.ones_like(corr, dtype=bool))     # generate a mask for the upper triangle

    f, ax=plt.subplots(figsize=(11, 9))                 # set up the matplotlib figure

    cmap=sns.diverging_palette(220, 10, as_cmap=True)   # generate a custom diverging colormap

    sns.heatmap(corr, mask=mask, cmap=cmap,             # draw the heatmap with the mask and correct aspect ratio
                vmax=.3, center=0, square=True,
                linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
display_heatmap(pd.concat([targets[['TARGET_B']], numericals_variance], axis=1))

In [None]:
numericals_variance_manual_drop = numericals_variance.drop(['POP90C3', 'HC6', 'HC7', 'CLUSTER2'], axis=1)
numericals_variance_manual_drop.shape

In [None]:
# Save results to csv

numericals_variance_manual_drop.to_csv('numericals_cleaned/numericals_variance_manual_drop.csv')

In [None]:
display_heatmap(pd.concat([targets[['TARGET_B']], numericals_variance_manual_drop], axis=1))

### `Numericals - Chi2`

In [None]:
selector = SelectKBest(chi2, k=10)
selector.fit(numericals_scaled, targets[['TARGET_B']])
mask = selector.get_support()
columns_to_drop = numericals.columns[~mask]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

In [None]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_chi2 = numericals.drop(columns_to_drop, axis=1)
numericals_chi2.shape

In [None]:
# Save results to csv

numericals_chi2.to_csv('numericals_cleaned/numericals_chi2.csv')

In [None]:
display_heatmap(pd.concat([targets[['TARGET_B']], numericals_chi2], axis=1))

### `Numericals - RFE`

In [None]:
model = LinearRegression()

# Create an instance of the RFE class
rfe = RFE(model, n_features_to_select=25, verbose=False)

# Fit the RFE to the DataFrame
rfe.fit(numericals_scaled, targets[['TARGET_B']])

# Get the boolean mask of the selected columns
mask = rfe.support_

# Use the mask to obtain the names of the selected columns
columns_to_drop = numericals.columns[~mask]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

In [None]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_rfe = numericals.drop(columns_to_drop, axis=1)
numericals_rfe.shape

In [None]:
# Save results to csv

numericals_rfe.to_csv('numericals_cleaned/numericals_rfe.csv')

In [None]:
display_heatmap(pd.concat([targets[['TARGET_B']], numericals_rfe], axis=1))

In [None]:
numericals_rfe_manual_drop = numericals_rfe.drop(['POP90C1', 'DW1', 'MC2'], axis=1)
numericals_rfe_manual_drop.shape

In [None]:
# Save results to csv

numericals_rfe_manual_drop.to_csv('numericals_cleaned/numericals_rfe_manual_drop.csv')

In [None]:
display_heatmap(pd.concat([targets[['TARGET_B']], numericals_rfe_manual_drop], axis=1))

### `Numericals - OLS`

In [None]:
df = sm.add_constant(numericals_scaled)

# Fit the OLS model
model = sm.OLS(targets[['TARGET_B']], df)
results = model.fit()

# Get the p-values of the features
pvalues = results.pvalues

# Select the features with a p-value more than 0.05
columns_to_drop = [col_tuple[0] for col_tuple in zip(numericals.columns, pvalues[1::]) if col_tuple[1] > 0.05]
print(f"There are '{len(columns_to_drop)}' columns to drop.")

In [None]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_ols = numericals.drop(columns_to_drop, axis=1)
numericals_ols.shape

In [None]:
# Save results to csv

numericals_ols.to_csv('numericals_cleaned/numericals_ols.csv')

In [None]:
display_heatmap(pd.concat([targets[['TARGET_B']], numericals_ols], axis=1))

### `Categoricals`

In [None]:
categoricals.nunique()

In [None]:
categoricals.dtypes

In [None]:
object_columns_from_categoricals = [col for col in categoricals.columns if categoricals[col].dtype == object]

In [None]:
for col in object_columns_from_categoricals:
    categoricals[col] = pd.factorize(categoricals[col])[0]

In [None]:
# 'RFA_2R' has only 1 unique value so we can drop it

categoricals.drop('RFA_2R', axis=1, inplace=True)
categoricals.shape

In [None]:
# Save results to csv

categoricals.to_csv('categoricals_cleaned/categoricals.csv')

In [None]:
# Check correlation matrix on categorical features

display_heatmap(pd.concat([targets[['TARGET_B']], categoricals], axis=1))

### `Load cached data`

In [51]:
numericals = pd.read_csv('data/numerical.csv')
numericals_variance = pd.read_csv('numericals_cleaned/numericals_variance.csv')
numericals_variance_manual_drop = pd.read_csv('numericals_cleaned/numericals_variance_manual_drop.csv')
numericals_chi2 = pd.read_csv('numericals_cleaned/numericals_chi2.csv')
numericals_rfe = pd.read_csv('numericals_cleaned/numericals_rfe.csv')
numericals_rfe_manual_drop = pd.read_csv('numericals_cleaned/numericals_rfe_manual_drop.csv')
numericals_ols = pd.read_csv('numericals_cleaned/numericals_ols.csv')

categoricals = pd.read_csv('categoricals_cleaned/categoricals.csv')

targets = pd.read_csv('data/target.csv')

### `Check accuracy with sets of selected columns`

#### `Helper functions`

In [52]:
def get_full_df(target_df, features_df):
    return pd.concat([target_df, features_df], axis=1).dropna(axis=0).sample(frac=1, random_state=10)


def get_full_dfs_with_selection_name(numerical_df_tuples):
    return [
        (sub_df[0], get_full_df(targets[['TARGET_B']], pd.concat([sub_df[1], categoricals], axis=1))) 
            for sub_df in numerical_df_tuples
    ]


def oversample(y_train, X_train):
    target_b_0 = y_train[y_train['TARGET_B'] == 0]
    target_b_1 = y_train[y_train['TARGET_B'] == 1]

    target_b_1_oversampled = resample(target_b_1, replace=True, n_samples=len(target_b_0))
    target_b_oversampled = pd.concat([target_b_0, target_b_1_oversampled], axis=0)

    full_df = get_full_df(target_b_oversampled, X_train)
    return full_df[['TARGET_B']], full_df.drop('TARGET_B', axis=1)


def undersample(y_train, X_train):
    target_b_0 = y_train[y_train['TARGET_B'] == 0]
    target_b_1 = y_train[y_train['TARGET_B'] == 1]

    target_b_0_undersampled = resample(target_b_0, replace=False, n_samples=len(target_b_1))
    target_b_downsampled = pd.concat([target_b_0_undersampled, target_b_1], axis=0)

    full_df = get_full_df(target_b_downsampled, X_train)
    return full_df[['TARGET_B']], full_df.drop('TARGET_B', axis=1)


def score_selected_columns(numerical_df_tuples, sample_action = ''):
    index = 1
    df_tuples = get_full_dfs_with_selection_name(numerical_df_tuples)
    for tuple in df_tuples:

        print("================================")
        print(f"{tuple[0]}")

        y = tuple[1][['TARGET_B']]
        X = tuple[1].drop('TARGET_B', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

        if sample_action == 'oversample':
            y_train, X_train = oversample(y_train=y_train, X_train=X_train)
        elif sample_action == 'undersample':
            y_train, X_train = undersample(y_train=y_train, X_train=X_train)

        # Scaling data = X_train
        X_train_transformer = MinMaxScaler().fit(X_train)
        X_train_normalized = X_train_transformer.transform(X_train)
        X_train_normalized = pd.DataFrame(X_train_normalized)

        # Scaling data = X_test
        X_test_transformer = MinMaxScaler().fit(X_test)
        X_test_normalized = X_test_transformer.transform(X_test)
        X_test_normalized = pd.DataFrame(X_test_normalized)

        # Create an instance of the DecisionTreeClassifier
        clf = DecisionTreeClassifier()

        # Fit the classifier to the training data
        clf.fit(X_train_normalized, y_train)

        # Make predictions on the test data
        y_pred = clf.predict(X_test_normalized)

        # Calculate the accuracy of the model
        display(confusion_matrix(y_test, y_pred))

        scores = cross_val_score(clf, X_train_normalized, y_train, cv=10)
        # Print the mean and standard deviation of the scores
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        
        accuracy = accuracy_score(y_test, y_pred)
        print(f"accuracy: {accuracy}")
        print("precision: ",precision_score(y_test, y_pred))
        print("recall: ",recall_score(y_test, y_pred))
        print("f1: ",f1_score(y_test, y_pred))

        print("================================")

        index += 1

#### `With imbalanced label`

In [53]:
numerical_df_tuples = [
    ('numericals', numericals), 
    ('numericals_variance', numericals_variance), 
    ('numericals_variance_manual_drop', numericals_variance_manual_drop),
    ('numericals_chi2', numericals_chi2),
    ('numericals_rfe', numericals_rfe),
    ('numericals_rfe_manual_drop', numericals_rfe_manual_drop),
    ('numericals_ols', numericals_ols)
]

In [54]:
score_selected_columns(numerical_df_tuples=numerical_df_tuples)

numericals


array([[13081,  5036],
       [  692,   273]])

Accuracy: 0.89 (+/- 0.00)
accuracy: 0.6998218216119904
precision:  0.05142211339235261
recall:  0.28290155440414505
f1:  0.0870258208479439
numericals_variance


array([[15301,  2816],
       [  800,   165]])

Accuracy: 0.89 (+/- 0.01)
accuracy: 0.8105020438109213
precision:  0.055350553505535055
recall:  0.17098445595854922
f1:  0.08362899138367967
numericals_variance_manual_drop


array([[14989,  3128],
       [  777,   188]])

Accuracy: 0.89 (+/- 0.00)
accuracy: 0.7953568808301017
precision:  0.05669481302774427
recall:  0.19481865284974093
f1:  0.08782994627423499
numericals_chi2


array([[15587,  2530],
       [  803,   162]])

Accuracy: 0.89 (+/- 0.01)
accuracy: 0.8253327743423121
precision:  0.060178306092124816
recall:  0.16787564766839377
f1:  0.08859721082854799
numericals_rfe


array([[15577,  2540],
       [  816,   149]])

Accuracy: 0.89 (+/- 0.01)
accuracy: 0.8241274499528352
precision:  0.055410933432502786
recall:  0.1544041450777202
f1:  0.0815544608648057
numericals_rfe_manual_drop


array([[15513,  2604],
       [  804,   161]])

Accuracy: 0.89 (+/- 0.00)
accuracy: 0.8214023687244524
precision:  0.05822784810126582
recall:  0.16683937823834197
f1:  0.08632707774798927
numericals_ols


array([[15740,  2377],
       [  812,   153]])

Accuracy: 0.89 (+/- 0.01)
accuracy: 0.8328791531286028
precision:  0.060474308300395255
recall:  0.15854922279792746
f1:  0.08755364806866953


#### `With Undersampling`

In [55]:
score_selected_columns(numerical_df_tuples=numerical_df_tuples, sample_action='undersample')

numericals


array([[8199, 9918],
       [ 413,  552]])

Accuracy: 0.50 (+/- 0.02)
accuracy: 0.45859972749187716
precision:  0.05272206303724929
recall:  0.572020725388601
f1:  0.0965456930476607
numericals_variance


array([[9221, 8896],
       [ 434,  531]])

Accuracy: 0.52 (+/- 0.04)
accuracy: 0.5110575411382454
precision:  0.0563275697464729
recall:  0.550259067357513
f1:  0.10219399538106236
numericals_variance_manual_drop


array([[9073, 9044],
       [ 462,  503]])

Accuracy: 0.51 (+/- 0.04)
accuracy: 0.5018341892883346
precision:  0.052686707866345446
recall:  0.5212435233160622
f1:  0.09570015220700152
numericals_chi2


array([[9479, 8638],
       [ 446,  519]])

Accuracy: 0.52 (+/- 0.05)
accuracy: 0.5239492715648255
precision:  0.05667795129409195
recall:  0.5378238341968912
f1:  0.1025489033787789
numericals_rfe


array([[ 6624, 11493],
       [  338,   627]])

Accuracy: 0.51 (+/- 0.02)
accuracy: 0.3799916151346819
precision:  0.05173267326732673
recall:  0.649740932642487
f1:  0.09583492548719907
numericals_rfe_manual_drop


array([[9270, 8847],
       [ 517,  448]])

Accuracy: 0.51 (+/- 0.03)
accuracy: 0.509275757258149
precision:  0.048197955890263586
recall:  0.46424870466321244
f1:  0.08732943469785577
numericals_ols


array([[8665, 9452],
       [ 433,  532]])

Accuracy: 0.52 (+/- 0.04)
accuracy: 0.4819725395660832
precision:  0.05328525641025641
recall:  0.5512953367875648
f1:  0.0971778244588547


#### `With Oversampling`

In [56]:
score_selected_columns(numerical_df_tuples=numerical_df_tuples, sample_action='oversample')

numericals


array([[17227,   890],
       [  915,    50]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.905408238130175
precision:  0.05319148936170213
recall:  0.05181347150259067
f1:  0.05249343832020997
numericals_variance


array([[17106,  1011],
       [  895,    70]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.9001152918981239
precision:  0.06475485661424607
recall:  0.07253886010362694
f1:  0.06842619745845553
numericals_variance_manual_drop


array([[17154,   963],
       [  912,    53]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.9017398595535059
precision:  0.05216535433070866
recall:  0.054922279792746116
f1:  0.05350832912670369
numericals_chi2


array([[17153,   964],
       [  898,    67]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.9024211298606016
precision:  0.06498545101842872
recall:  0.0694300518134715
f1:  0.06713426853707415
numericals_rfe


array([[17273,   844],
       [  913,    52]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.9079236977256053
precision:  0.05803571428571429
recall:  0.0538860103626943
f1:  0.05588393336915637
numericals_rfe_manual_drop


array([[17161,   956],
       [  917,    48]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.9018446703699822
precision:  0.04780876494023904
recall:  0.049740932642487044
f1:  0.04875571356018283
numericals_ols


array([[17364,   753],
       [  917,    48]])

Accuracy: 0.97 (+/- 0.00)
accuracy: 0.9124829682423226
precision:  0.0599250936329588
recall:  0.049740932642487044
f1:  0.05436013590033975


#### `Comments`

In [57]:
# Columns selected in 'numericals_variance' with Oversampled 'TARGET_B' seems to give us better results.
# Metrics show less false positive and overall better performance.