In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

### `Load data numerical, categorical and target`

In [2]:
# Read the three tables used throughout the notebook:
# numerical features, categorical features and the targets.
numericals, categoricals, targets = (
    pd.read_csv('files_for_lab/numerical.csv'),
    pd.read_csv('files_for_lab/categorical.csv'),
    pd.read_csv('files_for_lab/target.csv'),
)

# Report (rows, columns) for each table, one per line.
print(
    f"Numericals shape: {numericals.shape}",
    f"Categoricals shape: {categoricals.shape}",
    f"Targets shape: {targets.shape}",
    sep="\n",
)

Numericals shape: (95412, 315)
Categoricals shape: (95412, 22)
Targets shape: (95412, 2)


In [4]:
# Columns that are kept even when feature selection marks them for removal
# (they are filtered out of the drop list before the drop is applied).
important_columns = [
    'WEALTH1',
    'WEALTH2',
    'VETERANS',
    'SOLIH',
]

### `Numericals - RFE`

In [7]:
# Fit a min-max scaler on the numerical table, then rescale that same table.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(numericals)
numericals_scaled = min_max_scaler.transform(numericals)

In [8]:
# Recursive feature elimination over the scaled numerical features,
# with a linear regression as the ranking estimator and 25 features retained.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=25, verbose=False)
rfe.fit(numericals_scaled, targets[['TARGET_B']])

# Boolean mask of the selected columns; its negation gives the drop candidates.
mask = rfe.support_
columns_to_drop = numericals.columns[np.logical_not(mask)]

print(f"There are '{len(columns_to_drop)}' columns to drop.")

There are '290' columns to drop.


In [9]:
# Take the hand-picked important columns out of the drop list so they survive,
# then remove every remaining drop candidate from the numerical table.
columns_to_drop = [
    name for name in columns_to_drop
    if name not in important_columns
]
numericals_rfe = numericals.drop(columns=columns_to_drop)
numericals_rfe.shape

(95412, 27)

In [10]:
# Variant of the RFE selection with three more columns removed by hand.
numericals_rfe_manual_drop = numericals_rfe.drop(columns=['POP90C1', 'DW1', 'MC2'])
numericals_rfe_manual_drop.shape

(95412, 24)

### `Categoricals`

In [13]:
# Names of the columns stored with object dtype.
object_columns_from_categoricals = categoricals.columns[categoricals.dtypes == object].tolist()

# Replace each of those columns with its integer factorization codes.
for col in object_columns_from_categoricals:
    categoricals[col] = categoricals[col].factorize()[0]

In [14]:
# 'RFA_2R' has only 1 unique value, so it carries no information and can be dropped.
# The frame is rebound (no `inplace=True`) and `errors='ignore'` is passed so that
# re-running this cell after the column is already gone does not raise KeyError.
categoricals = categoricals.drop(columns='RFA_2R', errors='ignore')
categoricals.shape

(95412, 21)

### `Try Oversampling`

In [15]:
# Split the binary target by class so that class 1 can be upsampled.
target_b = targets[['TARGET_B']]
target_b_0 = target_b[target_b['TARGET_B'] == 0]
target_b_1 = target_b[target_b['TARGET_B'] == 1]

# Draw class-1 rows with replacement until there are as many as class-0 rows.
# NOTE: despite its name, this variable holds the *oversampled* class-1 rows;
# the name is kept because the next cell refers to it.
# A fixed random_state makes the draw repeatable across kernel restarts.
target_b_1_undersampled = resample(
    target_b_1,
    replace=True,
    n_samples=len(target_b_0),
    random_state=42,
)
target_b_1_undersampled.shape

(90569, 1)

In [16]:
# Stack both classes and shuffle the rows. The shuffle is seeded so that
# the resulting row order is the same on every run.
target_b_oversampled = pd.concat([target_b_0, target_b_1_undersampled], axis=0).sample(
    frac=1, random_state=42
)
target_b_oversampled['TARGET_B'].value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

### `Check accuracy with sets of selected columns`

In [20]:
def get_full_dfs_tuples(targets, selected_numericals_df_tuples, categoricals_df=None):
    """Build one modelling frame per candidate numerical feature set.

    Each numerical frame is concatenated column-wise with ``targets`` and the
    categorical features, and rows containing any missing value are dropped.

    Parameters
    ----------
    targets : pandas.DataFrame
        Target column(s), placed first in every resulting frame.
    selected_numericals_df_tuples : iterable of (name, pandas.DataFrame)
        Candidate numerical feature sets, each paired with a display name.
    categoricals_df : pandas.DataFrame, optional
        Categorical features to append. When omitted, the notebook-level
        ``categoricals`` frame is used, which is what the original code did.

    Returns
    -------
    list of (name, pandas.DataFrame)
        Same names, in the same order, each paired with its combined frame.
    """
    if categoricals_df is None:
        # Fall back to the notebook global so existing two-argument calls still work.
        categoricals_df = categoricals

    return [
        (name, pd.concat([targets, numericals_df, categoricals_df], axis=1).dropna(axis=0))
        for name, numericals_df in selected_numericals_df_tuples
    ]

def score_selected_columns(targets, selected_numericals_df_list, random_state=None):
    """Fit and report a random-forest baseline for each candidate feature set.

    For every ``(name, numericals_df)`` pair, the combined frame is built with
    ``get_full_dfs_tuples``, split 80/20 into train and test, min-max scaled,
    and used to fit a shallow ``RandomForestClassifier``. The test-set
    confusion matrix is displayed and the 10-fold cross-validated score on the
    training split is printed.

    Parameters
    ----------
    targets : pandas.DataFrame
        Frame containing the ``TARGET_B`` column.
    selected_numericals_df_list : list of (name, pandas.DataFrame)
        Candidate numerical feature sets, each paired with a display name.
    random_state : int or None, optional
        Seed forwarded to the train/test split and to the classifier. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    None
        Results are printed / displayed only.

    Notes
    -----
    If ``targets`` was oversampled before this call, copies of the same row
    can land in both the train and the test split, so the reported test
    metrics for that case are likely optimistic.
    """
    df_tuples = get_full_dfs_tuples(targets, selected_numericals_df_list)
    for name, full_df in df_tuples:

        print("================================")
        print(f"{name}")

        y = full_df['TARGET_B']
        X = full_df.drop('TARGET_B', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )

        # Fit the scaler on the training split only and reuse it for the test
        # split, so both splits go through the same transformation and no
        # information from the test data is used.
        scaler = MinMaxScaler().fit(X_train)
        X_train_normalized = pd.DataFrame(scaler.transform(X_train))
        X_test_normalized = pd.DataFrame(scaler.transform(X_test))

        # Shallow forest with fairly large leaves
        clf = RandomForestClassifier(max_depth=5,
                                     min_samples_split=20,
                                     min_samples_leaf=20,
                                     random_state=random_state)

        # Fit the classifier to the training data
        clf.fit(X_train_normalized, y_train)

        # Make predictions on the test data
        y_pred = clf.predict(X_test_normalized)

        # Confusion matrix on the held-out test split
        display(confusion_matrix(y_test, y_pred))

        # 10-fold cross-validation on the training split (default classifier scoring)
        scores = cross_val_score(clf, X_train_normalized, y_train, cv=10)
        # Print the mean and twice the standard deviation of the scores
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

        print("================================")

#### `Without Oversampling`

In [21]:
# Candidate feature sets as (name, frame) pairs: the full numerical table,
# the RFE selection, and the RFE selection after the manual drop.
selected_numericals_df_list = list(
    {
        'numericals': numericals,
        'numericals_rfe': numericals_rfe,
        'numericals_rfe_manual_drop': numericals_rfe_manual_drop,
    }.items()
)

# Score every feature set against the original (not resampled) target.
score_selected_columns(targets[['TARGET_B']], selected_numericals_df_list)

numericals


array([[18166,     0],
       [  917,     0]])

Accuracy: 0.95 (+/- 0.00)
numericals_rfe


array([[18112,     0],
       [  971,     0]])

Accuracy: 0.95 (+/- 0.00)
numericals_rfe_manual_drop


array([[18111,     0],
       [  972,     0]])

Accuracy: 0.95 (+/- 0.00)


#### `With Oversampling`

In [22]:
# Same scoring run, this time against the oversampled target.
score_selected_columns(
    targets=target_b_oversampled,
    selected_numericals_df_list=selected_numericals_df_list,
)

numericals


array([[13176,  4968],
       [ 9712,  8372]])

Accuracy: 0.61 (+/- 0.01)
numericals_rfe


array([[11926,  6122],
       [ 8746,  9434]])

Accuracy: 0.59 (+/- 0.01)
numericals_rfe_manual_drop


array([[16011,  2238],
       [13667,  4312]])

Accuracy: 0.59 (+/- 0.01)


In [23]:
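# Without oversampling, the model predicts class 0 for every test row: all non-donors are classified correctly, but no donor is ever identified (the 0.95 accuracy only reflects the class imbalance).
# With oversampling, the model does identify part of the donors, at the cost of misclassifying many non-donors, so overall accuracy drops to about 0.6.
# Caveat: the oversampling was done before the train/test split, so copies of the same donor row can end up in both splits and the oversampled scores are probably optimistic.
# We could use these 2 different types of models (oversampling / no oversampling) together to predict more accurately both the people who will donate AND the people who will NOT.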