In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the dataset into a DataFrame. The path is relative to the current
# working directory; presumably this is the pre-scaled training split, judging
# by the directory and file names -- TODO confirm.
df = pd.read_csv('./scaleddata/train_df.csv')

In [None]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Application order,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),...,Father's occupation_7,Father's occupation_9,Father's occupation_others,Displaced_1,Educational special needs_1,Debtor_1,Tuition fees up to date_1,Gender_1,Scholarship holder_1,International_1
0,0.111111,0.347368,0.307368,0.09434,0.0,0.230769,0.377778,0.115385,0.600442,0.25,...,True,False,False,True,False,False,True,True,False,False
1,0.555556,0.557895,0.370526,0.037736,0.0,0.269231,0.0,0.0,0.0,0.0,...,False,False,True,True,False,False,True,False,False,False
2,0.111111,0.473684,0.542105,0.415094,0.0,0.192308,0.222222,0.115385,0.918322,0.0,...,True,False,False,False,False,False,True,True,False,False
3,0.444444,0.421053,0.3,0.037736,0.0,0.269231,0.177778,0.269231,0.650899,0.0,...,True,False,False,True,False,False,True,False,True,False
4,0.111111,0.294737,0.191579,0.056604,0.0,0.230769,0.244444,0.153846,0.569536,0.0,...,False,True,False,False,False,False,True,False,True,False


In [None]:
# Inspect the 'Target' column, which is used as the label vector further down.
df['Target']

0       0
1       1
2       1
3       0
4       0
       ..
3091    1
3092    0
3093    1
3094    1
3095    1
Name: Target, Length: 3096, dtype: int64

In [None]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, y_train, **rf_params):
    """
    Train a Random Forest classifier after ensuring all data is numerical.

    Boolean features are converted to 0/1 integers before fitting: columns
    with bool dtype are cast as a whole, and Python bools stored inside
    object-dtype columns are converted value by value. The frame passed in by
    the caller is never modified; conversion always happens on a new frame.

    Parameters:
    - X_train: pd.DataFrame, feature data.
    - y_train: pd.Series or np.array, target binary labels (0 or 1).
    - **rf_params: parameters for RandomForestClassifier.

    Returns:
    - model: trained RandomForestClassifier.
    """

    # Classify columns by dtype once, up front. The previous implementation
    # used DataFrame.applymap, which is deprecated since pandas 2.1 (it emits a
    # FutureWarning) and runs one Python call per cell of the whole frame.
    bool_columns = [name for name, dtype in X_train.dtypes.items() if dtype == bool]
    object_columns = [name for name, dtype in X_train.dtypes.items() if dtype == object]

    # Convert boolean features to integers (0/1) in a single vectorized cast.
    # astype returns a new frame, so the caller's data is left untouched.
    if bool_columns:
        X_train = X_train.astype({name: int for name in bool_columns})

    # Object columns may still hold individual bools mixed with other values;
    # only those columns need the slower element-wise conversion.
    if object_columns:
        X_train = X_train.copy()  # explicit copy: do not write into the caller's frame
        for name in object_columns:
            X_train[name] = X_train[name].map(
                lambda value: int(value) if isinstance(value, bool) else value
            )

    # Ensure the target is integer type
    y_train = y_train.astype(int)

    # Initialize the Random Forest model with given parameters
    model = RandomForestClassifier(**rf_params)

    # Train the model
    model.fit(X_train, y_train)

    return model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Feature matrix: every column of the loaded frame except the label.
X = df.drop(columns='Target')
# Label vector.
y = df['Target']

# Shuffled 5-fold stratified splitter; the fixed seed keeps the fold
# assignment identical across runs and across parameter sets.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def kfolds_results(param):
    """
    Cross-validate one Random Forest configuration and average its scores.

    Uses the module-level feature matrix `X`, labels `y` and splitter `skf`.
    For every split a model is trained with `train_random_forest` on the
    training part and scored on the held-out part.

    Parameters:
    - param: dict, keyword arguments forwarded to `train_random_forest`
      (and from there to the classifier).

    Returns:
    - dict mapping 'Accuracy', 'Precision', 'Recall' and 'F1-score' to the
      arithmetic mean of that score over all folds.
    """
    # Label -> scoring function; insertion order fixes the key order of the result.
    scorers = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-score': f1_score,
    }
    per_fold = {label: [] for label in scorers}

    for train_index, test_index in skf.split(X, y):
        features_fit, features_eval = X.iloc[train_index], X.iloc[test_index]
        labels_fit, labels_eval = y.iloc[train_index], y.iloc[test_index]

        forest = train_random_forest(features_fit, labels_fit, **param)
        predicted = forest.predict(features_eval)

        # Score this fold with every metric.
        for label, scorer in scorers.items():
            per_fold[label].append(scorer(labels_eval, predicted))

    # Collapse each metric's per-fold scores into their mean.
    return {label: sum(scores) / len(scores) for label, scores in per_fold.items()}

In [None]:
# First coarse sweep: vary the number of trees and the depth limit while
# keeping the classifier seed fixed.
param_sets = [
    {'n_estimators': n_trees, 'max_depth': depth, 'random_state': 42}
    for n_trees, depth in [
        (50, 5),
        (100, 10),
        (150, None),
        (200, 7),
        (100, 3),
    ]
]

# Cross-validate each candidate and print its fold-averaged metrics.
for i, params in enumerate(param_sets, 1):
    print(f"\nParameter Set {i}: {params}")
    metrics = kfolds_results(params)
    print(f"Metrics: {metrics}")



Parameter Set 1: {'n_estimators': 50, 'max_depth': 5, 'random_state': 42}
Metrics: {'Accuracy': 0.8530335087810725, 'Precision': 0.8622372612306008, 'Recall': 0.6478554388102127, 'F1-score': 0.7389930433964684}

Parameter Set 2: {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
Metrics: {'Accuracy': 0.8688634113294075, 'Precision': 0.8788773974781069, 'Recall': 0.6881021267955941, 'F1-score': 0.7710793402781922}

Parameter Set 3: {'n_estimators': 150, 'max_depth': None, 'random_state': 42}
Metrics: {'Accuracy': 0.8685408306842461, 'Precision': 0.8671547345462164, 'Recall': 0.7001675041876047, 'F1-score': 0.7735226208291937}

Parameter Set 4: {'n_estimators': 200, 'max_depth': 7, 'random_state': 42}
Metrics: {'Accuracy': 0.8662754703215384, 'Precision': 0.876179431043132, 'Recall': 0.6810466473783057, 'F1-score': 0.7657789001870151}

Parameter Set 5: {'n_estimators': 100, 'max_depth': 3, 'random_state': 42}
Metrics: {'Accuracy': 0.8262249205273855, 'Precision': 0.8233848168841

In [None]:
# Second sweep: candidates around the best configuration of the previous run.
param_grid = [
    {'n_estimators': n_trees, 'max_depth': depth, 'random_state': 42}
    for n_trees, depth in [
        (150, None),  # best from previous run
        (200, None),
        (150, 15),
        (100, None),
        (150, 20),
    ]
]

# (params, metrics) pairs, in evaluation order.
results = []

for i, params in enumerate(param_grid, 1):
    print(f"\nEvaluating Parameter Set {i}: {params}")
    metrics = kfolds_results(params)
    results.append((params, metrics))
    print(f"Metrics: {metrics}")

# Pick the pair with the highest F1-score (the first one wins on ties).
best_params, best_metrics = max(results, key=lambda pair: pair[1]['F1-score'])
print(
    "\nBest Parameters and Metrics based on F1-score:",
    best_params,
    best_metrics,
    sep="\n",
)



Evaluating Parameter Set 1: {'n_estimators': 150, 'max_depth': None, 'random_state': 42}
Metrics: {'Accuracy': 0.8685408306842461, 'Precision': 0.8671547345462164, 'Recall': 0.7001675041876047, 'F1-score': 0.7735226208291937}

Evaluating Parameter Set 2: {'n_estimators': 200, 'max_depth': None, 'random_state': 42}
Metrics: {'Accuracy': 0.8669253217989473, 'Precision': 0.8654280608853974, 'Recall': 0.6961524795695649, 'F1-score': 0.770469681529599}

Evaluating Parameter Set 3: {'n_estimators': 150, 'max_depth': 15, 'random_state': 42}
Metrics: {'Accuracy': 0.8682172077752879, 'Precision': 0.8654284527658088, 'Recall': 0.7001675041876047, 'F1-score': 0.7732420921825783}

Evaluating Parameter Set 4: {'n_estimators': 100, 'max_depth': None, 'random_state': 42}
Metrics: {'Accuracy': 0.8695064880921362, 'Precision': 0.86735812397607, 'Recall': 0.7021724785543881, 'F1-score': 0.7752635153494463}

Evaluating Parameter Set 5: {'n_estimators': 150, 'max_depth': 20, 'random_state': 42}
Metrics: 

In [None]:
# Third sweep: additionally vary the split/leaf size limits and the number of
# features considered per split.
param_grid = [
    {
        'n_estimators': n_trees,
        'max_depth': depth,
        'min_samples_split': split_min,
        'min_samples_leaf': leaf_min,
        'max_features': feature_rule,
        'random_state': 42,
    }
    for n_trees, depth, split_min, leaf_min, feature_rule in [
        # 100 unbounded-depth trees with increasingly strict split/leaf limits
        (100, None, 2, 1, 'sqrt'),
        (100, None, 5, 2, 'log2'),
        (100, None, 10, 4, None),
        # Additional parameter sets for further experimentation
        (150, 20, 8, 3, 'sqrt'),
        (200, 15, 4, 2, 'log2'),
    ]
]

# (params, metrics) pairs, in evaluation order.
results = []

for i, params in enumerate(param_grid, 1):
    print(f"\nEvaluating Parameter Set {i}: {params}")
    metrics = kfolds_results(params)
    results.append((params, metrics))
    print(f"Metrics: {metrics}")

# Pick the pair with the highest F1-score (the first one wins on ties).
best_params, best_metrics = max(results, key=lambda pair: pair[1]['F1-score'])
print(
    "\nBest Parameters and Metrics based on F1-score:",
    best_params,
    best_metrics,
    sep="\n",
)




Evaluating Parameter Set 1: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42}
Metrics: {'Accuracy': 0.8695064880921362, 'Precision': 0.86735812397607, 'Recall': 0.7021724785543881, 'F1-score': 0.7752635153494463}

Evaluating Parameter Set 2: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'random_state': 42}
Metrics: {'Accuracy': 0.8653098129136485, 'Precision': 0.8639021220595502, 'Recall': 0.6921323790670524, 'F1-score': 0.7675385507451958}

Evaluating Parameter Set 3: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'random_state': 42}
Metrics: {'Accuracy': 0.8717734118505394, 'Precision': 0.8635347298048452, 'Recall': 0.7152733363788639, 'F1-score': 0.7819105447490762}

Evaluating Parameter Set 4: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
# from sklearn.model_selection import train_test_split

# # Assuming X, y are already defined (from your dataset)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# # Retrain Random Forest with best parameters
# best_params = {
#     'n_estimators': 100,
#     'max_depth': None,
#     'min_samples_split': 10,
#     'min_samples_leaf': 4,
#     'max_features': None,
#     'random_state': 42
# }

# rf = RandomForestClassifier(**best_params)
# rf.fit(X_train, y_train)

# # Threshold Optimization Function
# def optimize_threshold(model, X_test, y_test):
#     y_probs = model.predict_proba(X_test)[:, 1]
#     precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
#     f1_scores = (2 * precisions * recalls) / (precisions + recalls)
#     best_idx = f1_scores.argmax()
#     best_threshold = thresholds[best_idx]
#     best_f1 = f1_scores[best_idx]

#     print(f'Optimal Threshold: {best_threshold:.4f}')
#     print(f'Best F1-score: {best_f1:.4f}')

#     return best_threshold, best_f1

# # Execute threshold optimization
# opt_threshold, best_f1 = optimize_threshold(rf, X_test, y_test)