## AGYW Vulnerability Screening Project



## Importing necessary libraries and data

In [None]:
import warnings

warnings.filterwarnings("ignore")

# this will help in making the Python code more structured automatically (good coding practice)
#%load_ext nb_black

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import pickle

# Library to split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline # Import the necessary function
from sklearn.compose import make_column_transformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline

# libaries to help with data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 100)

# Libraries different ensemble classifiers
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    StackingClassifier,

)
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Libraries to get different metric scores
from sklearn import metrics
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# To tune different models
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

#import required libraries
import sklearn
import joblib
#import session_info

## Data Overview

#### Load Dataset

In [182]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [183]:
screened_agyw = pd.read_csv('/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/tbl_screened.csv')

In [184]:
screened_agyw.head(2)

Unnamed: 0,id,Agency,IM,county,Sub_county,Ward,sub_location,village,date_of_birth,age_at_screening,marital_status,has_disability,disability,out_of_school,ever_had_sex,is_head,undergone_gbv_last_12mnths,sexual_partners_last_12mnths,received_gifts_for_sex,ever_had_sti,no_condom_use,is_orphan,has_child,used_drugs_last_12mnths,is_eligible,status,screening_date
0,1,USAID,USAID Nuru Ya Mtoto,Homabay,Karachuonyo,West karachuonyo,rabuor,kasaye,2003-12-31,19.0,Single,No,,Yes,Yes,No,Yes,Yes,Yes,No,Yes,No,Yes,No,Eligible,Enrolled,2023-11-09 00:00:00
1,2,USAID,USAID Nuru Ya Mtoto,Homabay,Karachuonyo,West karachuonyo,koguta,koyiwa,2010-12-01,12.0,Single,No,,Yes,No,No,Yes,No,No,No,No,No,No,No,Eligible,Enrolled,2023-11-09 00:00:00


#### Screened AGYWs

In [185]:
#Select only the required columns
sAGYWs = screened_agyw.copy()

## Data Cleaning & Processing

In [186]:
# Check duplicates in the dataset
sAGYWs.duplicated().sum()

0

In [187]:
#check the dataset columns datatypes
#sAGYWs.columns
print(sAGYWs.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179901 entries, 0 to 179900
Data columns (total 27 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            179901 non-null  int64  
 1   Agency                        179901 non-null  object 
 2   IM                            179901 non-null  object 
 3   county                        179901 non-null  object 
 4   Sub_county                    179901 non-null  object 
 5   Ward                          179901 non-null  object 
 6   sub_location                  171770 non-null  object 
 7   village                       178673 non-null  object 
 8   date_of_birth                 179896 non-null  object 
 9   age_at_screening              179896 non-null  float64
 10  marital_status                179901 non-null  object 
 11  has_disability                179901 non-null  object 
 12  disability                    2084 non-null 

In [188]:
#Select required columns
vScreened = sAGYWs.iloc[:, list(range(9, 25))]

#drop the column dissabilty
vScreened = vScreened.drop(['disability'], axis=1)
vScreened.head(2)

Unnamed: 0,age_at_screening,marital_status,has_disability,out_of_school,ever_had_sex,is_head,undergone_gbv_last_12mnths,sexual_partners_last_12mnths,received_gifts_for_sex,ever_had_sti,no_condom_use,is_orphan,has_child,used_drugs_last_12mnths,is_eligible
0,19.0,Single,No,Yes,Yes,No,Yes,Yes,Yes,No,Yes,No,Yes,No,Eligible
1,12.0,Single,No,Yes,No,No,Yes,No,No,No,No,No,No,No,Eligible


In [189]:
#Check missing valeus
vScreened.isnull().sum()

Unnamed: 0,0
age_at_screening,5
marital_status,0
has_disability,0
out_of_school,0
ever_had_sex,0
is_head,0
undergone_gbv_last_12mnths,0
sexual_partners_last_12mnths,0
received_gifts_for_sex,0
ever_had_sti,0


In [190]:
#Drop the rows where age_at_screening is null
vScreened = vScreened.dropna(subset=['age_at_screening'])
vScreened.isnull().sum()

Unnamed: 0,0
age_at_screening,0
marital_status,0
has_disability,0
out_of_school,0
ever_had_sex,0
is_head,0
undergone_gbv_last_12mnths,0
sexual_partners_last_12mnths,0
received_gifts_for_sex,0
ever_had_sti,0


In [191]:
#Divide the dataset to training data and testting data
training_data = vScreened.sample(frac=0.7, random_state=42).copy()
testing_data = vScreened.drop(training_data.index)

#Reset dataset index
training_data = training_data.reset_index(drop=True)
testing_data = testing_data.reset_index(drop=True).drop(['is_eligible'], axis=1)

#Print shape of the dataset
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 125927
No. of testing examples: 53969


In [272]:
testing_data['ID'] = ['SM-' + str(i) for i in range(1, len(testing_data) + 1 )]
vsTestData = testing_data.copy()
vsTestData.head(2)

Unnamed: 0,age_at_screening,marital_status,has_disability,out_of_school,ever_had_sex,is_head,undergone_gbv_last_12mnths,sexual_partners_last_12mnths,received_gifts_for_sex,ever_had_sti,no_condom_use,is_orphan,has_child,used_drugs_last_12mnths,ID
0,19.0,Single,No,Yes,No,No,No,No,No,No,No,No,No,No,SM-1
1,15.0,Single,No,Yes,Yes,No,No,No,Yes,No,No,No,Yes,No,SM-2


In [265]:
testing_data.to_csv('/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/testing_data.csv', index=False)
training_data.to_csv('/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/training_data.csv', index=False)

### Data Preprocessing

In [203]:
#Laod the data be used
vsData = pd.read_csv('/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/training_data.csv')
vsData.head()

Unnamed: 0,age_at_screening,marital_status,has_disability,out_of_school,ever_had_sex,is_head,undergone_gbv_last_12mnths,sexual_partners_last_12mnths,received_gifts_for_sex,ever_had_sti,no_condom_use,is_orphan,has_child,used_drugs_last_12mnths,is_eligible
0,11.0,Single,No,No,No,No,Yes,No,No,No,No,No,No,No,Eligible
1,11.0,Single,No,No,No,No,No,No,No,No,No,No,No,No,Eligible
2,12.0,Single,No,No,No,No,Yes,No,No,No,No,No,No,No,Eligible
3,14.0,Single,No,No,No,No,No,No,No,No,No,No,No,No,Eligible
4,10.0,Single,No,No,No,No,Yes,No,No,No,No,No,No,No,Eligible


In [204]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

#Createa a label encoder
le = preprocessing.LabelEncoder()
vsData_En  = vsData.apply(preprocessing.LabelEncoder().fit_transform )
vsData_En.sample(3)

Unnamed: 0,age_at_screening,marital_status,has_disability,out_of_school,ever_had_sex,is_head,undergone_gbv_last_12mnths,sexual_partners_last_12mnths,received_gifts_for_sex,ever_had_sti,no_condom_use,is_orphan,has_child,used_drugs_last_12mnths,is_eligible
98225,13,2,0,0,1,0,0,0,0,0,1,0,0,0,0
88842,15,2,0,0,0,0,1,0,0,0,0,0,0,0,0
8641,10,2,0,0,0,0,1,0,0,0,0,0,0,0,0


In [314]:
#exporting the departure encoder
# Save the label encoders to a file
with open("label_encoders.pkl", "wb") as file:
    pickle.dump(le, file)

In [313]:
#Save Label Encoder
le_enc= '/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/le_encoder.pkl'
with open(le_enc, 'wb') as file:
    # pickle.dump(filename, file) # This line was incorrect and is now fixed
    pickle.dump(le, file) # Save the 'grid_search' object which contains the model
print("Model saved as 'le_encoder.pkl'")

Model saved as 'le_encoder.pkl'


In [194]:
# # Create a label encoder
# le = LabelEncoder()

# # Iterate through columns and encode any that are of type 'object' (string)
# for col in vScreened.columns:
#     if vScreened[col].dtype == 'object':
#         vScreened[col] = le.fit_transform(vScreened[col])

# # Determine features correlation
# classifer = SelectKBest(k=6, score_func=f_classif)
# fits = classifer.fit(vScreened.drop(['is_eligible'], axis=1), vScreened['is_eligible'])
# x_1 = pd.DataFrame(fits.scores_)
# columns = pd.DataFrame(vScreened.drop(['is_eligible'], axis=1).columns)
# features_scores = pd.concat([columns, x_1], axis=1)
# features_scores.columns = ['Features', 'Score']
# print(features_scores.sort_values(by='Score', ascending=False))

In [205]:
from sklearn.feature_selection import SelectKBest, f_classif
#Determine features correlation
classifer = SelectKBest(k=6, score_func =f_classif)
fits = classifer.fit(vsData_En.drop(['is_eligible'], axis=1), vsData_En['is_eligible'])
x_1 = pd.DataFrame(fits.scores_)
columns  =  pd.DataFrame(vsData_En.drop(['is_eligible'], axis=1).columns)
features_scores = pd.concat([columns, x_1], axis=1)
features_scores.columns = ['Features', 'Score']
features_scores.sort_values(by='Score', ascending=False)

Unnamed: 0,Features,Score
6,undergone_gbv_last_12mnths,25989.92125
4,ever_had_sex,8056.171559
3,out_of_school,7646.518894
10,no_condom_use,4474.016516
11,is_orphan,3909.151097
12,has_child,2847.511186
8,received_gifts_for_sex,2420.731827
7,sexual_partners_last_12mnths,1981.449956
5,is_head,1046.641525
13,used_drugs_last_12mnths,766.722704


In [197]:
# label_encoders = {}
# categorical_columns = vScreened.select_dtypes(include='object').columns  # Get categorical column names
# for col in categorical_columns:
#     le = LabelEncoder()
#     vScreened[col] = le.fit_transform(vScreened[col])  # Use column name to index DataFrame
#     label_encoders[col] = le

# # Save the encoders
# with open('encoders.pkl', 'wb') as f:
#     pickle.dump(label_encoders, f)

In [206]:
#Define train set and test set for the dataset
X = vsData_En.drop(['is_eligible'], axis=1)
y = vsData_En['is_eligible']

#Splitting the dataset to training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=34, stratify=y)

#Display distributions
print("Training set\n",y_train.value_counts(normalize=True))
print("Validation set\n",y_val.value_counts(normalize=True))

Training set
 is_eligible
0    0.802945
1    0.197055
Name: proportion, dtype: float64
Validation set
 is_eligible
0    0.802959
1    0.197041
Name: proportion, dtype: float64


##### Functions to measure models outpu and confusion matrix

In [207]:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {"Accuracy": acc, "Recall": recall, "Precision": precision, "F1": f1},
        index=[0],
    )

    return df_perf

def confusion_matrix_sklearn(model, predictors, target):
    """
    To plot the confusion_matrix with percentages

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

## Model Building

In [208]:
#define model scorer
scorer = metrics.make_scorer(metrics.recall_score)

### Model Pipeline

In [209]:
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
)
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# To tune different models
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# To help with model building

In [210]:
models = []  # Empty list to store all the models

# Define the model
models.append(("Xgboost", XGBClassifier(random_state=3, eval_metric="logloss")))
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=3)))
#models.append(("Logistic Regression", LogisticRegression(random_state=3)))
models.append(("Random Forest", RandomForestClassifier(random_state=3)))
models.append(("Bagging", BaggingClassifier(random_state=3)))
#models.append(("SVC", SVC(probability=True, random_state=1)))
models.append(("Extra Trees", ExtraTreesClassifier(random_state=3)))
#models.append(("K-Nearest Neighbors", KNeighborsClassifier()))
models.append(("Naive Bayes", GaussianNB()))
#models.append(("Adaboost", AdaBoostClassifier(random_state=3))) # Removed eval_metric
models.append(("Decision Tree", DecisionTreeClassifier(random_state=3))) # Removed eval_metric

results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
score = []
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))

print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_test, model.predict(X_test))
    score.append(scores)
    print("{}: {}".format(name, scores))


Cross-Validation Performance:

Xgboost: 68.30742659758204
Gradient Boosting: 57.8468624064479
Random Forest: 68.26137017846861
Bagging: 69.63730569948187
Extra Trees: 69.72366148531951
Naive Bayes: 96.40184225676454
Decision Tree: 69.72366148531951

Validation Performance:

Xgboost: 0.5832940954892174
Gradient Boosting: 0.5096525096525096
Random Forest: 0.5835766079668518
Bagging: 0.5833882663150955
Extra Trees: 0.58367077879273
Naive Bayes: 0.9612957905640832
Decision Tree: 0.5832940954892174


#### Hyper tuned 2 - selected models

### Naive Bayes - BernouliliNB

In [294]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Model and hyperparameter grid
bnb = BernoulliNB()
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0], 'binarize': [0.0, 0.5, 1.0]}

# GridSearchCV
grid_search = GridSearchCV(bnb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters and accuracy
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Evaluating the model on test data
nb_model = grid_search.best_estimator_

# Predicting on test data
y_pred = nb_model.predict(X_val)

# Model evaluation
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print("Model Evaluation on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Specificity calculation
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)
print(f"Specificity: {specificity:.4f}")


Best parameters: {'alpha': 0.1, 'binarize': 0.0}
Best cross-validation accuracy: 0.8403480761071286
Model Evaluation on Test Data:
Accuracy: 0.8428
Precision: 0.5586
Recall (Sensitivity): 0.9647
F1 Score: 0.7075
Confusion Matrix:
[[24660  5675]
 [  263  7181]]
Specificity: 0.8129


In [300]:
#Save model
filename= '/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/vs_bnb_model.pkl'
with open(filename, 'wb') as file:
    # pickle.dump(filename, file) # This line was incorrect and is now fixed
    pickle.dump(nb_model, file) # Save the 'grid_search' object which contains the model
print("Model saved as 'vs_bnb_model.pkl'")

Model saved as 'vs_bnb_model.pkl'


In [301]:
lmodel = joblib.load(open(filename, 'rb'))

##### Model Testing

In [302]:
tData = vsTestData.apply(preprocessing.LabelEncoder().fit_transform ).drop(['ID'], axis=1)

In [298]:
#  Make Predictions using the test dataset

y_pred = lmodel.predict(tData)

# Select only the "ID" column from dTest_En
#pTest = dTest_En[['ID']].copy()

# Create a DataFrame with "ID" and "Predicted_Labels"
#res = pd.concat([pTest, pd.Series(y_pred, name='Predicted_Labels')], axis=1)

res = pd.concat([testing_data, pd.Series(y_pred, name='Predicted_Labels')], axis=1)


# Save the DataFrame to a CSV file
res.to_csv("/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/ModelPredictionsResult.csv", index=False)


In [274]:
le = preprocessing.LabelEncoder()

# Encoding test data
vsTD = tData.apply(preprocessing.LabelEncoder().fit_transform)

print("\nEncoded Training Dataset")
vsTD.head(2)


Encoded Training Dataset


Unnamed: 0,age_at_screening,marital_status,has_disability,out_of_school,ever_had_sex,is_head,undergone_gbv_last_12mnths,sexual_partners_last_12mnths,received_gifts_for_sex,ever_had_sti,no_condom_use,is_orphan,has_child,used_drugs_last_12mnths
0,18,2,0,1,0,0,0,0,0,0,0,0,0,0
1,14,2,0,1,1,0,0,0,1,0,0,0,1,0


In [275]:
#Load the test result data

modelTestResult = pd.read_csv("ModelPredictionsResult.csv")


# Add 'ID' column back to the DataFrame
modelTestResult['ID'] = vsTestData['ID']

# Save the updated DataFrame to a new CSV file with IDs
modelTestResult.to_csv('model_output_with_id.csv', index=False)

# Make the final dataset with prediction and IDs only
fSub =  modelTestResult[["ID", "Predicted_Labels"]]
fSub = fSub.rename(columns={'Predicted_Labels': 'default'})
#Save the final file

fSub.to_csv('FinalPredictions.csv')

In [277]:
fSub.head(10)

Unnamed: 0,ID,default
0,SM-1,0
1,SM-2,0
2,SM-3,0
3,SM-4,0
4,SM-5,0
5,SM-6,0
6,SM-7,1
7,SM-8,0
8,SM-9,0
9,SM-10,0


### Hyper-tuned XGBoost Model

In [281]:
from sklearn.model_selection import RandomizedSearchCV
# definring model
xgb = XGBClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV

param_grid = {
    "n_estimators": [150, 200, 250],
    "scale_pos_weight": [5, 10],
    "learning_rate": [0.1, 0.2],
    "gamma": [0, 3, 5],
    "subsample": [0.8, 0.9],
}

# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=10,
    n_jobs=-1,
    scoring=scorer,
    cv=5,
    random_state=1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)

Best parameters are {'subsample': 0.8, 'scale_pos_weight': 5, 'n_estimators': 250, 'learning_rate': 0.2, 'gamma': 5} with CV score=0.8094888714928941:


In [282]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    return specificity

scorer = metrics.make_scorer(specificity_score)

In [304]:
# Evaluating the model on test data
xgb_model = randomized_cv.best_estimator_

# Predicting on test data
y_pred = xgb_model.predict(X_val)

# Model evaluation
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
specificity = specificity_score(y_val, y_pred)

print("Model Evaluation on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model Evaluation on Test Data:
Accuracy: 0.8427
Precision: 0.5584
Recall (Sensitivity): 0.9647
Specificity: 0.8128
F1 Score: 0.7073
Confusion Matrix:
[[24655  5680]
 [  263  7181]]


In [311]:
#Save model
filename_1= '/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/vs_xgb_model.pkl'
with open(filename_1, 'wb') as file:
    # pickle.dump(filename, file) # This line was incorrect and is now fixed
    pickle.dump(xgb_model, file) # Save the 'grid_search' object which contains the model
print("Model saved as 'vs_xgb_model.pkl'")

Model saved as 'vs_xgb_model.pkl'


In [None]:
booster.save_model

In [307]:
lmod = joblib.load(open(filename_1, 'rb'))

In [308]:
#  Make Predictions using the test dataset

y_pred = lmod.predict(tData)

# Select only the "ID" column from dTest_En
#pTest = dTest_En[['ID']].copy()

# Create a DataFrame with "ID" and "Predicted_Labels"
#res = pd.concat([pTest, pd.Series(y_pred, name='Predicted_Labels')], axis=1)

res = pd.concat([testing_data, pd.Series(y_pred, name='Predicted_Labels')], axis=1)


# Save the DataFrame to a CSV file
res.to_csv("/content/drive/MyDrive/DREAMS Models Project/Vulnerability_Screening/datasets/XGBModelPredictionsResult.csv", index=False)