In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

In [59]:
# Assuming the files are in the "files_for_lab" folder
categorical_data = pd.read_csv("categorical.csv")
numerical_data = pd.read_csv("numerical.csv")
data = pd.read_csv("target.csv")

In [60]:
target_count = data['TARGET_B'].value_counts()
print("Class 0 (Not Responded):", target_count[0])
print("Class 1 (Responded):", target_count[1])

Class 0 (Not Responded): 90569
Class 1 (Responded): 4843


In [61]:
X = data.drop(columns=['TARGET_B'])
y = data['TARGET_B']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = data[data['TARGET_B'] == 0]
df_minority = data[data['TARGET_B'] == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=target_count[0],
                                 random_state=42)

# Combine majority class with upsampled minority class
data_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Split the upsampled data into features (X) and target (y) again
X_upsampled = data_upsampled.drop(columns=['TARGET_B'])
y_upsampled = data_upsampled['TARGET_B']

X_train_upsampled, _, y_train_upsampled, _ = train_test_split(X_upsampled, y_upsampled, test_size=0.01, random_state=42)


In [63]:
# Define the custom scorer based on F1-score
custom_scorer = make_scorer(f1_score)

In [64]:
# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Perform GridSearchCV
grid_search = GridSearchCV(rf_classifier, param_grid, scoring=custom_scorer, cv=5)
grid_search.fit(X_train_upsampled, y_train_upsampled)

In [65]:
best_rf_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [66]:
y_pred = best_rf_model.predict(X_test)

# Calculate the relevant metrics (precision, recall, F1-score)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 1.0
Recall: 1.0
F1-score: 1.0
