In [1]:
%%capture
%cd ../../


In [2]:
import sys
sys.path.extend(["recommender/src"])

In [3]:
import pandas as pd
import numpy as np
import warnings
import pickle
from scipy import stats
warnings.simplefilter("ignore")

import os
import numpy as np
from pathlib import Path
import json
import yaml
import typing as t

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,accuracy_score,f1_score,recall_score
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
##utils import
from utils.encodes import gen_encode_cols
from utils.io import gen_dict,drop_cols, merge_dfs,drop_cols_list
from utils.metrics_report import get_multiclass_report
##src
from feature_engineering import group_feats, count_selected_options
from majorityvote import build_majorityvote
from feature_selection import get_feat_importance
from omegaconf import DictConfig, OmegaConf


##hyperparams tuning
import optuna

In [10]:
current_dir = Path.cwd()
raw_data_dir = current_dir.joinpath('recommender/data/raw')
processed_data_dir = current_dir.joinpath('recommender/data/processed')
config_dir = current_dir.joinpath('recommender/configs')
artifacts_dir = current_dir.joinpath('recommender/models/artifacts')
features_dir = current_dir.joinpath('recommender/models/features')
filename = 'kaggle_survey_2017_2021.csv'

In [11]:
full_data_dir = processed_data_dir.joinpath('full-data')
train_dir =  processed_data_dir.joinpath('train-data')
test_dir = processed_data_dir.joinpath('test-data')
models_dir = current_dir.joinpath('model/artifacts/')

In [12]:
## config calls
map_config= OmegaConf.load(config_dir.joinpath("col-mapping.yaml"))
colvals_config = OmegaConf.load(config_dir.joinpath("col-values.yaml"))

In [14]:
sampled_data = pd.read_csv(full_data_dir.joinpath('Features_KaggleResponses_v4.csv'))
sampled_data = sampled_data.fillna(0)
sampled_data.shape

(159348, 279)

In [15]:
with open(features_dir.joinpath('features_select_v4.json')) as feat_file:
    file_contents = feat_file.read()
features = (json.loads(file_contents)).values()
features

dict_values(['What is your current yearly compensation (approximate $USD)?', 'Uses Computer Vision Algorithms Count', 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate', 'Uses ML Framework Count', 'Uses ML Algorithms Count', 'Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice', 'What type of computing platform do you use most often for your data science projects? - Selected Choice', 'In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice', 'For how many years have you used machine learning methods?', 'Uses NLP Algorithms Count', 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Javascript', 'Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice -  NVIDIA GPUs '

In [8]:
with open(features_dir.joinpath('features_select_mutual_info_v4.json')) as feat_file:
    file_contents = feat_file.read()
features_mutual = (json.loads(file_contents)).values()
features_mutual

dict_values(['Does your current employer incorporate machine learning methods into their business?', 'For how many years have you used machine learning methods?', 'What is your current yearly compensation (approximate $USD)?', 'Uses ML Framework Count', 'In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice', 'Knows Pgm Lang Count', 'Uses IDE for dev Count', 'Uses Data Visualization libs Count', 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate', 'Has taken DS Course Count', 'Uses ML Algorithms Count', 'Approximately how many individuals are responsible for data science workloads at your place of business?', 'What is the size of the company where you are employed?', 'What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice', 'Uses Big Data Products Count', 'Approximately how much money have you (or y

### Train/Validation Test Split

In [9]:
### Test Model Baseline/Ensemble/Boosting
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

X = sampled_data[features].values
y = sampled_data['Target'].values

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio, stratify= y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size= test_ratio/(test_ratio + validation_ratio)) 

print(x_train.shape, x_val.shape, x_test.shape)

(119511, 36) (23902, 36) (15935, 36)


In [10]:
### Test Model Baseline/Ensemble/Boosting
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

X_red = sampled_data[features_mutual].values
y_red = sampled_data['Target'].values

# train is now 75% of the entire data set
x_train_red, x_test_red, y_train_red, y_test_red = train_test_split(X_red, y_red, test_size=1 - train_ratio, stratify= y)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val_red, x_test_red, y_val_red, y_test_red = train_test_split(x_test, y_test, test_size= test_ratio/(test_ratio + validation_ratio)) 

print(x_train.shape, x_val.shape, x_test.shape)

(119511, 36) (23902, 36) (15935, 36)


In [9]:
log_regression = LogisticRegression(random_state = 42)
log_regression.fit(x_train, y_train)
y_preds = log_regression.predict(x_test)
get_multiclass_report("Baseline", "Chisquare",y_preds, y_test)

Metrics for Baseline & Chisquare are 
               precision    recall  f1-score   support

           0       0.30      0.55      0.39      2651
           1       0.33      0.20      0.25      1436
           2       0.42      0.43      0.42      3352
           3       0.40      0.31      0.35      1485
           4       0.36      0.11      0.17      1394
           5       0.46      0.51      0.49      1738
           6       0.45      0.43      0.44      2590
           7       0.48      0.28      0.35      1289

    accuracy                           0.39     15935
   macro avg       0.40      0.35      0.36     15935
weighted avg       0.40      0.39      0.38     15935



In [12]:
# log_regression = LogisticRegression(random_state = 42)
# log_regression.fit(x_train_red, y_train_red)
# y_preds = log_regression.predict(x_test_red)
get_multiclass_report("Baseline", "Mutual Information Gain",y_preds, y_test_red)

Metrics for Baseline & Mutual Information Gain are 
               precision    recall  f1-score   support

           0       0.17      0.01      0.03      1028
           1       0.08      0.05      0.06       580
           2       0.21      0.39      0.27      1361
           3       0.15      0.09      0.11       603
           4       0.00      0.00      0.00       550
           5       0.14      0.56      0.23       663
           6       0.15      0.06      0.09      1026
           7       0.00      0.00      0.00       563

    accuracy                           0.17      6374
   macro avg       0.11      0.14      0.10      6374
weighted avg       0.13      0.17      0.12      6374



In [11]:
## Load the feature and label encoder 
file = open(artifacts_dir.joinpath('target_encoder_v2.pkl'),'rb')
label_obj = pickle.load(file)
le_name_mapping = dict(zip(label_obj.classes_, label_obj.transform(label_obj.classes_)))
print(le_name_mapping)

{'Data Analyst': 0, 'Data Engineer': 1, 'Data Scientist': 2, 'Machine Learning Engineer': 3, 'Product/Project Manager': 4, 'Research Scientist': 5, 'Software Engineer': 6, 'Statistician': 7}


In [10]:
majority_vote = build_majorityvote()
majority_vote.fit(x_train, y_train)
y_preds = majority_vote.predict(x_test)
get_multiclass_report("Majority Vote", "Chisquare",y_preds, y_test)

Metrics for Majority Vote & Chisquare are 
               precision    recall  f1-score   support

           0       0.38      0.74      0.50      2651
           1       0.75      0.43      0.55      1436
           2       0.41      0.70      0.52      3352
           3       0.92      0.31      0.46      1485
           4       0.84      0.12      0.21      1394
           5       0.56      0.68      0.62      1738
           6       0.94      0.38      0.54      2590
           7       0.95      0.25      0.40      1289

    accuracy                           0.50     15935
   macro avg       0.72      0.45      0.47     15935
weighted avg       0.67      0.50      0.49     15935



In [15]:
majority_vote = build_majorityvote()
majority_vote.fit(x_train_red, y_train_red)
y_preds = majority_vote.predict(x_test_red)
get_multiclass_report("Majority Vote","Mutual Info Gain",y_preds, y_test_red)

Metrics for Majority Vote & Mutual Info Gain are 
               precision    recall  f1-score   support

           0       0.17      0.17      0.17      1028
           1       0.09      0.04      0.06       580
           2       0.20      0.29      0.24      1361
           3       0.27      0.02      0.04       603
           4       0.00      0.00      0.00       550
           5       0.14      0.62      0.23       663
           6       0.13      0.02      0.04      1026
           7       0.00      0.00      0.00       563

    accuracy                           0.16      6374
   macro avg       0.12      0.15      0.10      6374
weighted avg       0.14      0.16      0.12      6374



In [13]:
## Load the feature and label encoder 
file = open(artifacts_dir.joinpath('target_encoder_v2.pkl'),'rb')
label_obj = pickle.load(file)
le_name_mapping = dict(zip(label_obj.classes_, label_obj.transform(label_obj.classes_)))
print(le_name_mapping)

{'Data Analyst': 0, 'Data Engineer': 1, 'Data Scientist': 2, 'Machine Learning Engineer': 3, 'Product/Project Manager': 4, 'Research Scientist': 5, 'Software Engineer': 6, 'Statistician': 7}


In [11]:
rf_class = RandomForestClassifier(random_state=42)
rf_class.fit(x_train, y_train)
y_preds = rf_class.predict(x_test)
get_multiclass_report("Random Classification", "Chisquare",y_preds, y_test)

Metrics for Random Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.63      0.79      0.70      2651
           1       0.88      0.78      0.83      1436
           2       0.72      0.81      0.76      3352
           3       0.85      0.69      0.76      1485
           4       0.87      0.84      0.85      1394
           5       0.82      0.81      0.81      1738
           6       0.80      0.70      0.75      2590
           7       0.86      0.72      0.78      1289

    accuracy                           0.77     15935
   macro avg       0.80      0.77      0.78     15935
weighted avg       0.78      0.77      0.77     15935



In [14]:
rf_class = RandomForestClassifier(random_state=42)
rf_class.fit(x_train_red, y_train_red)
y_preds = rf_class.predict(x_test_red)
get_multiclass_report("Random Classification", "Mutual Info Gain",y_preds, y_test_red)

Metrics for Random Classification & Mutual Info Gain are 
               precision    recall  f1-score   support

           0       0.19      0.27      0.23      1028
           1       0.10      0.01      0.02       580
           2       0.33      0.18      0.23      1361
           3       0.11      0.01      0.01       603
           4       0.17      0.00      0.01       550
           5       0.11      0.66      0.19       663
           6       0.11      0.02      0.04      1026
           7       0.00      0.00      0.00       563

    accuracy                           0.16      6374
   macro avg       0.14      0.14      0.09      6374
weighted avg       0.17      0.16      0.12      6374



In [None]:
## from test separate underperforming class
## test model spearetely for these
## then have separate models (baed on above accuracy)


In [15]:
xgb_class = xgb.XGBClassifier(random_state=42)
xgb_class.fit(x_train, y_train)
y_preds = xgb_class.predict(x_test)
get_multiclass_report("XGB Classification", "Chisquare",y_preds, y_test)

Metrics for XGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.43      0.59      0.50      2651
           1       0.76      0.50      0.61      1436
           2       0.51      0.65      0.57      3352
           3       0.67      0.53      0.59      1485
           4       0.76      0.61      0.68      1394
           5       0.70      0.61      0.65      1738
           6       0.58      0.53      0.55      2590
           7       0.77      0.60      0.68      1289

    accuracy                           0.58     15935
   macro avg       0.65      0.58      0.60     15935
weighted avg       0.61      0.58      0.59     15935



In [13]:
xgb_class = xgb.XGBClassifier(random_state=42)
xgb_class.fit(x_train_red, y_train_red)
y_preds = xgb_class.predict(x_test_red)
get_multiclass_report("XGB Classification", "Mutual Info Gain",y_preds, y_test_red)

Metrics for XGB Classification & Mutual Info Gain are 
               precision    recall  f1-score   support

           0       0.18      0.22      0.20      1028
           1       0.17      0.17      0.17       580
           2       0.31      0.54      0.39      1361
           3       0.11      0.01      0.02       603
           4       0.10      0.01      0.01       550
           5       0.12      0.34      0.18       663
           6       0.09      0.01      0.02      1026
           7       0.19      0.02      0.04       563

    accuracy                           0.21      6374
   macro avg       0.16      0.17      0.13      6374
weighted avg       0.17      0.21      0.16      6374



### Hyperparameter Tuning

In [16]:
def objective(trial):
    """Define the objective function"""
    rf_params =   {
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'max_depth': trial.suggest_int('max_depth', 4, 50),
            # 'min_samples_split': trial.suggest_int('min_samples_split', 1, 150),
            # 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        }

    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        #'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # 'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        # 'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        # 'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        # 'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        # 'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        # 'eval_metric': 'mlogloss',
        # 'use_label_encoder': False
    }

    # Fit the model
    optuna_model = RandomForestClassifier(**rf_params, random_state = 42)
    optuna_model.fit(x_train,y_train)

    # Make predictions
    y_pred = optuna_model.predict(x_val)

    # Evaluate predictions
    precision = precision_score(y_pred, y_val, average='micro')
    return precision

In [20]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
params = study.best_trial.params


[32m[I 2023-08-09 05:21:25,476][0m A new study created in memory with name: no-name-04ec4ea3-f9c8-4d81-84ce-1685122987ba[0m
[32m[I 2023-08-09 05:22:45,709][0m Trial 0 finished with value: 0.7604802945360221 and parameters: {'n_estimators': 396, 'max_depth': 27}. Best is trial 0 with value: 0.7604802945360221.[0m
[32m[I 2023-08-09 05:23:06,723][0m Trial 1 finished with value: 0.6885197891389842 and parameters: {'n_estimators': 118, 'max_depth': 19}. Best is trial 0 with value: 0.7604802945360221.[0m
[32m[I 2023-08-09 05:25:13,123][0m Trial 2 finished with value: 0.7480963936072296 and parameters: {'n_estimators': 643, 'max_depth': 24}. Best is trial 0 with value: 0.7604802945360221.[0m
[32m[I 2023-08-09 05:25:25,075][0m Trial 3 finished with value: 0.7512342063425654 and parameters: {'n_estimators': 60, 'max_depth': 26}. Best is trial 0 with value: 0.7604802945360221.[0m
[32m[I 2023-08-09 05:28:15,782][0m Trial 4 finished with value: 0.7705212952890972 and parameters: {

In [21]:
params =  {'n_estimators': 463, 'max_depth': 50}
rf_tuned = RandomForestClassifier(**params,random_state=42)
rf_tuned.fit(x_train, y_train)
y_preds = rf_tuned.predict(x_test)
get_multiclass_report(" Tuned Random Classification", "Chisquare",y_preds, y_test)

Metrics for  Tuned Random Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.61      0.78      0.68      2604
           1       0.87      0.79      0.83      1411
           2       0.72      0.81      0.76      3432
           3       0.87      0.72      0.78      1507
           4       0.87      0.83      0.85      1373
           5       0.82      0.83      0.83      1687
           6       0.83      0.69      0.75      2540
           7       0.87      0.70      0.77      1381

    accuracy                           0.77     15935
   macro avg       0.81      0.77      0.78     15935
weighted avg       0.78      0.77      0.77     15935



### Filter for SE target

In [21]:
## Load the feature and label encoder 
file = open(artifacts_dir.joinpath('target_encoder_v2.pkl'),'rb')
label_obj = pickle.load(file)
le_name_mapping = dict(zip(label_obj.classes_, label_obj.transform(label_obj.classes_)))
print(le_name_mapping)

{'Business Analyst': 0, 'Data Analyst': 1, 'Data Engineer': 2, 'Data Scientist': 3, 'Machine Learning Engineer': 4, 'Other': 5, 'Product/Project Manager': 6, 'Research Scientist': 7, 'Software Engineer': 8, 'Statistician': 9, 'Student': 10}


In [22]:
se_list = [elem for elem in range(len(y_test)) if y_test[elem] == 3]
se_list

[10,
 28,
 59,
 69,
 74,
 76,
 89,
 91,
 110,
 131,
 147,
 152,
 161,
 214,
 231,
 242,
 243,
 259,
 268,
 278,
 297,
 305,
 309,
 312,
 329,
 353,
 375,
 376,
 400,
 420,
 441,
 456,
 459,
 464,
 468,
 473,
 475,
 477,
 482,
 495,
 502,
 519,
 529,
 537,
 543,
 565,
 566,
 573,
 595,
 604,
 607,
 625,
 628,
 637,
 645,
 647,
 660,
 694,
 696,
 698,
 703,
 726,
 739,
 745,
 748,
 769,
 786,
 795,
 797,
 823,
 846,
 861,
 868,
 874,
 876,
 902,
 914,
 916,
 924,
 928,
 955,
 968,
 969,
 978,
 979,
 988,
 994,
 1011,
 1019,
 1029,
 1037,
 1062,
 1070,
 1072,
 1073,
 1097,
 1099,
 1119,
 1120,
 1137,
 1147,
 1149,
 1159,
 1181,
 1193,
 1199,
 1202,
 1209,
 1219,
 1221,
 1282,
 1295,
 1303,
 1317,
 1332,
 1352,
 1378,
 1382,
 1397,
 1398,
 1406,
 1410,
 1416,
 1419,
 1433,
 1447,
 1456,
 1469,
 1479,
 1505,
 1519,
 1537,
 1539,
 1541,
 1545,
 1578,
 1586,
 1589,
 1590,
 1594,
 1597,
 1599,
 1603,
 1604,
 1626,
 1652,
 1661,
 1664,
 1668,
 1692,
 1726,
 1775,
 1776,
 1807,
 1818,
 1824,
 18

In [23]:
se_df = X_test[se_list]
len(se_df)

5645

In [28]:
se_preds = rf_class.predict(se_df)

In [29]:
precision_score(y_test[se_list], se_preds, average='micro')

0.4967227635075288

In [30]:
accuracy_score(y_test[se_list], se_preds)

0.4967227635075288

In [None]:
## Filter for datascientist role

In [31]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier   #1vs1 & 1vsRest Classifiers
from sklearn.svm import SVC

In [32]:
model = SVC(gamma='scale',random_state=0)

#Define 1-vs-1 Strategy / Classifier
ovo = OneVsOneClassifier(model)

In [33]:
#fit model to training data
ovo.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
ovo_pred = ovo.predict(X_test)

In [None]:
get_multiclass_report("SVM", "Chisquare",ovo_pred, y_test)

### Separate Models for 3- DS, 5-Other, 10-Student

In [39]:
ds_list = [elem for elem in range(len(y_test)) if y_test[elem] == 3]
len(ds_list)

2859

In [42]:
ds_list = [elem for elem in range(len(y_train)) if y_train[elem] == 3]
len(ds_list)

21170

In [48]:
2859/8

357.375

In [43]:
ds_list = [elem for elem in range(len(y_val)) if y_val[elem] == 3]
len(ds_list)

4198

In [16]:
def get_exclusive_indices(exclude_id:int, input_arr: np.array, length:int) -> t.List:
    other_class = []
    for i in range(9):
        if i == exclude_id:
            pass
        else:
            indices = [elem for elem in range(len(input_arr)) if input_arr[elem] == i][0:length]
            [other_class.append(i) for i in indices]
    return list(set(other_class))

def get_inclusive_indices(include_id:int, input_arr: np.array) -> t.List:
    inc_list = [elem for elem in range(len(input_arr)) if input_arr[elem] == include_id]
    return inc_list

In [49]:
train_inc = get_inclusive_indices(3,y_train)
valid_inc = get_inclusive_indices(3,y_val)
test_inc = get_inclusive_indices(3, y_test)
train_exc = get_exclusive_indices(3,y_train,2646)
valid_exc = get_exclusive_indices(3, y_val, 524)
test_exc = get_exclusive_indices(3, y_test,357)

In [50]:
len(test_exc)

2856

In [52]:
X_train_ds = x_train[train_inc]
X_train_non_ds = x_train[train_exc] 
X_test_ds = x_test[test_inc]
X_test_non_ds = x_test[test_exc]

In [53]:
arr_ds_train = np.repeat(1, len(X_train_ds))
arr_non_ds_train = np.repeat(0, len(X_train_non_ds))
arr_ds_test = np.repeat(1, len(X_test_ds))
arr_non_ds_test = np.repeat(0, len(X_test_non_ds))


In [56]:
X_train_ds_final = np.concatenate((X_train_ds,X_train_non_ds),axis=0)
y_train_ds_final = np.concatenate((arr_ds_train, arr_non_ds_train), axis=0)
X_test_ds_final = np.concatenate((X_test_ds,X_test_non_ds),axis=0)
y_test_ds_final = np.concatenate((arr_ds_test, arr_non_ds_test), axis=0)


In [60]:
# rf_class = RandomForestClassifier(**params, random_state=42)
# rf_class.fit(X_train_ds_final, y_train_ds_final)
# y_preds = rf_class.predict(X_test_ds_final)
print(accuracy_score(y_preds, y_test_ds_final))
print(precision_score(y_preds, y_test_ds_final))
print(recall_score(y_preds, y_test_ds_final))
print(f1_score(y_preds, y_test_ds_final))

0.7765529308836395
0.8433018537950332
0.7441358024691358
0.7906214133464502


In [61]:
get_multiclass_report(" Random Classification - Data Scientist", "Chisquare",y_preds, y_test_ds_final)

Metrics for  Random Classification - Data Scientist & Chisquare are 
               precision    recall  f1-score   support

           0       0.82      0.71      0.76      2856
           1       0.74      0.84      0.79      2859

    accuracy                           0.78      5715
   macro avg       0.78      0.78      0.78      5715
weighted avg       0.78      0.78      0.78      5715



### Business Analyst model


In [10]:
ds_list = [elem for elem in range(len(y_test)) if y_test[elem] == 0]
len(ds_list)


2824

In [11]:
ds_list = [elem for elem in range(len(y_train)) if y_train[elem] == 0]
len(ds_list)

21093

In [12]:
ds_list = [elem for elem in range(len(y_val)) if y_val[elem] == 0]
len(ds_list)

4206

In [14]:
2824/8

353.0

In [17]:
train_inc = get_inclusive_indices(0,y_train)
valid_inc = get_inclusive_indices(0,y_val)
test_inc = get_inclusive_indices(0, y_test)
train_exc = get_exclusive_indices(0,y_train,2636)
valid_exc = get_exclusive_indices(0, y_val, 526)
test_exc = get_exclusive_indices(0, y_test,353)

In [18]:
X_train_ds = x_train[train_inc]
X_train_non_ds = x_train[train_exc] 
X_test_ds = x_test[test_inc]
X_test_non_ds = x_test[test_exc]

In [19]:
arr_ba_train = np.repeat(1, len(X_train_ds))
arr_non_ba_train = np.repeat(0, len(X_train_non_ds))
arr_ba_test = np.repeat(1, len(X_test_ds))
arr_non_ba_test = np.repeat(0, len(X_test_non_ds))


In [20]:
X_train_ds_final = np.concatenate((X_train_ds,X_train_non_ds),axis=0)
y_train_ds_final = np.concatenate((arr_ba_train, arr_non_ba_train), axis=0)
X_test_ds_final = np.concatenate((X_test_ds,X_test_non_ds),axis=0)
y_test_ds_final = np.concatenate((arr_ba_test, arr_non_ba_test), axis=0)


In [21]:
rf_class = RandomForestClassifier(**params, random_state=42)
rf_class.fit(X_train_ds_final, y_train_ds_final)
y_preds = rf_class.predict(X_test_ds_final)
print(accuracy_score(y_preds, y_test_ds_final))
print(precision_score(y_preds, y_test_ds_final))
print(recall_score(y_preds, y_test_ds_final))
print(f1_score(y_preds, y_test_ds_final))

0.844003777148253
0.9171388101983002
0.813953488372093
0.8624708624708625
