In [47]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### `Load data numerical, categorical and target`

In [48]:
numericals = pd.read_csv('files_for_lab/numerical.csv')
categoricals = pd.read_csv('files_for_lab/categorical.csv')
targets = pd.read_csv('files_for_lab/target.csv')

print(f"Numericals shape: {numericals.shape}")
print(f"Categoricals shape: {categoricals.shape}")
print(f"Targets shape: {targets.shape}")

Numericals shape: (95412, 315)
Categoricals shape: (95412, 22)
Targets shape: (95412, 2)


In [49]:
important_columns = ['WEALTH1', 'WEALTH2', 'VETERANS', 'SOLIH']

### `Numericals - Variance Threshold`

In [50]:
min_max_scaler = MinMaxScaler()
numericals_scaled = min_max_scaler.fit_transform(numericals)

In [51]:
selector = VarianceThreshold(0.09)
selected_numericals = selector.fit_transform(numericals_scaled)

columns_to_drop = [col_tuple[0] for col_tuple in zip(numericals.columns, selector.get_support()) if col_tuple[1] == False]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

There are '293' columns to drop.


In [52]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_variance = numericals.drop(columns_to_drop, axis=1)
numericals_variance.shape

(95412, 22)

In [53]:
numericals_variance_manual_drop = numericals_variance.drop(['POP90C3', 'HC6', 'HC7', 'CLUSTER2'], axis=1)
numericals_variance_manual_drop.shape

(95412, 18)

### `Numericals - Chi2`

In [54]:
selector = SelectKBest(chi2, k=10)
selector.fit(numericals_scaled, targets[['TARGET_B']])
mask = selector.get_support()
columns_to_drop = numericals.columns[~mask]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

There are '305' columns to drop.


In [55]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_chi2 = numericals.drop(columns_to_drop, axis=1)
numericals_chi2.shape

(95412, 12)

### `Numericals - RFE`

In [56]:
model = LinearRegression()

# Create an instance of the RFE class
rfe = RFE(model, n_features_to_select=25, verbose=False)

# Fit the RFE to the DataFrame
rfe.fit(numericals_scaled, targets[['TARGET_B']])

# Get the boolean mask of the selected columns
mask = rfe.support_

# Use the mask to obtain the names of the selected columns
columns_to_drop = numericals.columns[~mask]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

There are '290' columns to drop.


In [57]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_rfe = numericals.drop(columns_to_drop, axis=1)
numericals_rfe.shape

(95412, 27)

In [58]:
numericals_rfe_manual_drop = numericals_rfe.drop(['POP90C1', 'DW1', 'MC2'], axis=1)
numericals_rfe_manual_drop.shape

(95412, 24)

### `Numericals - OLS`

In [59]:
df = sm.add_constant(numericals_scaled)

# Fit the OLS model
model = sm.OLS(targets[['TARGET_B']], df)
results = model.fit()

# Get the p-values of the features
pvalues = results.pvalues

# Select the features with a p-value more than 0.05
columns_to_drop = [col_tuple[0] for col_tuple in zip(numericals.columns, pvalues[1::]) if col_tuple[1] > 0.05]
print(f"There are '{len(columns_to_drop)}' columns to drop.")

There are '293' columns to drop.


In [60]:
columns_to_drop = [col for col in columns_to_drop if col not in important_columns]
numericals_ols = numericals.drop(columns_to_drop, axis=1)
numericals_ols.shape

(95412, 23)

### `Categoricals`

In [61]:
object_columns_from_categoricals = [col for col in categoricals.columns if categoricals[col].dtype == object]
for col in object_columns_from_categoricals:
    categoricals[col] = pd.factorize(categoricals[col])[0]

In [62]:
# 'RFA_2R' has only 1 unique value so we can drop it

categoricals.drop('RFA_2R', axis=1, inplace=True)
categoricals.shape

(95412, 21)

### `Oversample label`

In [63]:
target_b = targets[['TARGET_B']]
target_b['TARGET_B'].value_counts()

target_b_0 = target_b[target_b['TARGET_B'] == 0]
target_b_1 = target_b[target_b['TARGET_B'] == 1]

target_b_1_undersampled = resample(target_b_1, replace=True, n_samples=len(target_b_0))
target_b_1_undersampled.shape

(90569, 1)

In [64]:
target_b_oversampled = pd.concat([target_b_0, target_b_1_undersampled], axis=0)
target_b_oversampled['TARGET_B'].value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

### `Check accuracy with sets of selected columns`

#### `Helper functions`

In [65]:
def get_full_dfs_tuples(targets, selected_numericals_df_tuples):
    return [
        (sub_df[0], pd.concat([targets, sub_df[1], categoricals], axis=1).dropna(axis=0).sample(frac=1, random_state=10))
            for sub_df in selected_numericals_df_tuples
    ]

def score_selected_columns(targets, selected_numericals_df_list):
    index = 1
    df_tuples = get_full_dfs_tuples(targets, selected_numericals_df_list)
    for tuple in df_tuples:

        print("================================")
        print(f"{tuple[0]}")

        y = tuple[1]['TARGET_B']
        X = tuple[1].drop('TARGET_B', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

        # Scaling data = X_train
        X_train_transformer = MinMaxScaler().fit(X_train)
        X_train_normalized = X_train_transformer.transform(X_train)
        X_train_normalized = pd.DataFrame(X_train_normalized)

        # Scaling data = X_test
        X_test_transformer = MinMaxScaler().fit(X_test)
        X_test_normalized = X_test_transformer.transform(X_test)
        X_test_normalized = pd.DataFrame(X_test_normalized)

        # Create an instance of the RandomForestClassifier
        clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20)

        # Fit the classifier to the training data
        clf.fit(X_train_normalized, y_train)

        # Make predictions on the test data
        y_pred = clf.predict(X_test_normalized)

        # Calculate the accuracy of the model
        display(confusion_matrix(y_test, y_pred))

        scores = cross_val_score(clf, X_train_normalized, y_train, cv=10)
        # Print the mean and standard deviation of the scores
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

        accuracy = accuracy_score(y_test, y_pred)
        print(f"accuracy: {accuracy}")
        print("precision: ",precision_score(y_test, y_pred, zero_division=0))
        print("recall: ",recall_score(y_test, y_pred))
        print("f1: ",f1_score(y_test, y_pred))
        
        print("================================")

        index += 1

#### `With imbalanced label`

In [66]:
selected_numericals_df_list = [
    ('numericals', numericals), 
    ('numericals_variance', numericals_variance), 
    ('numericals_variance_manual_drop', numericals_variance_manual_drop),
    ('numericals_chi2', numericals_chi2),
    ('numericals_rfe', numericals_rfe),
    ('numericals_rfe_manual_drop', numericals_rfe_manual_drop),
    ('numericals_ols', numericals_ols)
]

score_selected_columns(targets[['TARGET_B']], selected_numericals_df_list)

numericals


array([[18131,     0],
       [  952,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.9501126657234188
precision:  0.0
recall:  0.0
f1:  0.0
numericals_variance


array([[18104,     0],
       [  979,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.9486977938479275
precision:  0.0
recall:  0.0
f1:  0.0
numericals_variance_manual_drop


array([[18110,     0],
       [  973,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.9490122098202589
precision:  0.0
recall:  0.0
f1:  0.0
numericals_chi2


array([[18124,     0],
       [  959,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.9497458470890321
precision:  0.0
recall:  0.0
f1:  0.0
numericals_rfe


array([[18145,     0],
       [  938,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.950846302992192
precision:  0.0
recall:  0.0
f1:  0.0
numericals_rfe_manual_drop


array([[18144,     0],
       [  939,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.9507939003301368
precision:  0.0
recall:  0.0
f1:  0.0
numericals_ols


array([[18097,     0],
       [  986,     0]])

Accuracy: 0.95 (+/- 0.00)
accuracy: 0.9483309752135408
precision:  0.0
recall:  0.0
f1:  0.0


#### `With Oversampling`

In [67]:
score_selected_columns(target_b_oversampled, selected_numericals_df_list)

numericals


array([[12703,  5681],
       [ 8773,  9071]])

Accuracy: 0.61 (+/- 0.01)
accuracy: 0.6010268300761842
precision:  0.6148996746203904
recall:  0.5083501457072406
f1:  0.5565713584488894
numericals_variance


array([[10613,  7403],
       [ 7119, 11093]])

Accuracy: 0.60 (+/- 0.01)
accuracy: 0.599149828861654
precision:  0.5997512975778547
recall:  0.6091038875466726
f1:  0.6043914133158984
numericals_variance_manual_drop


array([[11545,  6606],
       [ 8008, 10069]])

Accuracy: 0.60 (+/- 0.01)
accuracy: 0.5966103566302308
precision:  0.6038380809595202
recall:  0.5570061403994025
f1:  0.5794774401473296
numericals_chi2


array([[11117,  7038],
       [ 7362, 10711]])

Accuracy: 0.60 (+/- 0.01)
accuracy: 0.6025173898641935
precision:  0.6034706180629895
recall:  0.5926520223537874
f1:  0.5980123946178326
numericals_rfe


array([[16237,  1847],
       [14346,  3798]])

Accuracy: 0.59 (+/- 0.01)
accuracy: 0.553025284310478
precision:  0.6728077945084145
recall:  0.20932539682539683
f1:  0.31930724284333095
numericals_rfe_manual_drop


array([[15071,  2984],
       [12647,  5526]])

Accuracy: 0.59 (+/- 0.01)
accuracy: 0.5685381472893895
precision:  0.6493537015276146
recall:  0.3040774775766247
f1:  0.4141963047633324
numericals_ols


array([[12867,  5212],
       [ 9548,  8601]])

Accuracy: 0.60 (+/- 0.01)
accuracy: 0.5925803246107982
precision:  0.6226742923333092
recall:  0.47391040828695796
f1:  0.5382016144171203


#### `Comments`

In [68]:
# Without Oversampling, models seem only to predict people that are not willing to donate.
# With Oversampling, it seems that results are better without filtering columns but overall less interesting than those seen on the previous lab (lab-handling-data-imbalance-classification).
# We are going to continue with colomns selected by the variance threshhold