In [1]:
%%capture
%cd ../../

In [2]:
import sys

sys.path.extend(["recommender"])

In [3]:
import pandas as pd
import numpy as np
import warnings
import pickle
from scipy import stats

warnings.simplefilter("ignore")

import os
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

##utils import
from utils.encodes import gen_encode_cols
from utils.io import gen_dict, drop_cols, merge_dfs, drop_cols_list
from utils.metrics_report import get_multiclass_report

##src
from src.feature_engineering import group_feats, count_selected_options
from src.majorityvote import build_majorityvote
from src.feature_selection import get_feat_importance


from omegaconf import DictConfig, OmegaConf

##imbalance
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

##hyperparams tuning
import optuna

In [4]:
current_dir = Path.cwd()
raw_data_dir = current_dir.joinpath("recommender/data/raw")
processed_data_dir = current_dir.joinpath("recommender/data/processed")
config_dir = current_dir.joinpath("recommender/configs")
artifacts_dir = current_dir.joinpath("recommender/models/artifacts")
features_dir = current_dir.joinpath("recommender/models/features")
filename = "kaggle_survey_2017_2021.csv"

In [5]:
data = pd.read_csv(raw_data_dir.joinpath(filename), skiprows=1)
data = data.iloc[:, 2:250]
data = data[~data["What is your age (# years)?"].isna()]
data = data.fillna(0)
data.shape
print(data.shape)

(105856, 248)


In [6]:
## config calls
map_config = OmegaConf.load(config_dir.joinpath("col-mapping.yaml"))
colvals_config = OmegaConf.load(config_dir.joinpath("col-values.yaml"))

In [7]:
encode_df = gen_encode_cols(data)
encode_df.shape

(105856, 248)

### Mapping the columns of raw data(before encoding) i.e Age, Higher Education, Coding experience with proper scale


In [8]:
q_dict = gen_dict(data)
q_dict

{0: 'What is your age (# years)?',
 1: 'What is your gender? - Selected Choice',
 2: 'In which country do you currently reside?',
 3: 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?',
 4: 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice',
 5: 'For how many years have you been writing code and/or programming?',
 6: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python',
 7: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R',
 8: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL',
 9: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C',
 10: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C++',
 1

In [9]:
age_label = map_config["age_label"]

data["What is your age (# years)?"] = data["What is your age (# years)?"].map(age_label)

code_exp_label = map_config["code_exp_label"]
data[q_dict.get(5)] = data[q_dict.get(5)].map(code_exp_label)
data[q_dict.get(5)] = data[q_dict.get(5)].fillna(0)

bachelors = colvals_config["bachelors"]
masters = colvals_config["masters"]
doctoral = colvals_config["doctoral"]
highschool = colvals_config["highschool"]
none = colvals_config["none"]

data.loc[data[q_dict.get(3)].isin(bachelors), q_dict.get(3)] = "Bachelors"
data.loc[data[q_dict.get(3)].isin(masters), q_dict.get(3)] = "Masters"
data.loc[data[q_dict.get(3)].isin(doctoral), q_dict.get(3)] = "Doctorate"
data.loc[data[q_dict.get(3)].isin(highschool), q_dict.get(3)] = "High-School"
data.loc[data[q_dict.get(3)].isin(none), q_dict.get(3)] = "None"

#### One hot columns for Higher education

In [10]:
ohe_cols = [q_dict.get(3)]
ohe_df = pd.get_dummies(data[ohe_cols])
ohe_df.head(2)

Unnamed: 0,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Bachelors,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_High-School,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Masters,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_No formal education past high school,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_None,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Professional degree
0,1,0,0,0,0,0,0
1,0,0,0,1,0,0,0


### Add 2 Kinds of Features
- Flag for group of features i.e if the particular answer is attempted or no
- Count of the options selected under that caetgory of question

In [11]:
platform_ds = [col for col in encode_df.columns if "data science courses" in col]
pgm_lang = [col for col in encode_df.columns if "programming languages" in col]
ide = [col for col in encode_df.columns if "IDE's" in col]
notebooks = [col for col in encode_df.columns if "hosted notebook" in col]
data_viz = [col for col in encode_df.columns if "data visualization" in col]
ml_framework = [
    col for col in encode_df.columns if "machine learning frameworks" in col
]
ml_algo = [col for col in encode_df.columns if "ML algorithms" in col]
comp_vision = [col for col in encode_df.columns if "computer vision" in col]
nlp_method = [col for col in encode_df.columns if "(NLP) methods" in col]
cloud_compute = [col for col in encode_df.columns if "cloud computing platforms" in col]
automate_ml = [col for col in encode_df.columns if "automated" in col]
business_intel = [col for col in encode_df.columns if "business intelligence" in col]
big_data = [col for col in encode_df.columns if "big data products" in col]
managed_ml = [col for col in encode_df.columns if "managed machine learning" in col]

### 1. Feature to flag the particular question

In [13]:
platform_df = group_feats(platform_ds, encode_df, "Has taken DS Course")
pgm_lang_df = group_feats(pgm_lang, encode_df, "Knows Pgm Lang")
ide_df = group_feats(ide, encode_df, "Uses IDE for dev")
notebooks_df = group_feats(notebooks, encode_df, "Uses Notebooks for dev")
data_viz_df = group_feats(platform_ds, encode_df, "Uses Data Visualization libs")
ml_framework_df = group_feats(ml_framework, encode_df, "Uses ML Framework")
ml_algo_df = group_feats(ml_algo, encode_df, "Uses ML Algorithms")
cv_df = group_feats(comp_vision, encode_df, "Uses Computer Vision Algorithms")
nlp_df = group_feats(nlp_method, encode_df, "Uses NLP Algorithms")
automate_df = group_feats(automate_ml, encode_df, "Uses Auto ML tools")
business_intel_df = group_feats(business_intel, encode_df, "Uses Business Intel tools")
big_data_df = group_feats(big_data, encode_df, "Uses Big Data Products")
managed_ml_df = group_feats(managed_ml, encode_df, "Has Managed ML tools")
cloud_compute_df = group_feats(
    cloud_compute, encode_df, "Knows Cloud Compute Platforms"
)

### 2. Count of multiple choice answers

In [14]:
count_platform_df = count_selected_options(
    platform_ds, encode_df, "Has taken DS Course Count"
)
count_pgm_lang_df = count_selected_options(pgm_lang, encode_df, "Knows Pgm Lang Count")
count_ide_df = count_selected_options(ide, encode_df, "Uses IDE for dev Count")
count_notebooks_df = count_selected_options(
    notebooks, encode_df, "Uses Notebooks for dev Count"
)
count_data_viz_df = count_selected_options(
    platform_ds, encode_df, "Uses Data Visualization libs Count"
)
count_ml_framework_df = count_selected_options(
    ml_framework, encode_df, "Uses ML Framework Count"
)
count_ml_algo_df = count_selected_options(
    ml_algo, encode_df, "Uses ML Algorithms Count"
)
count_cv_df = count_selected_options(
    comp_vision, encode_df, "Uses Computer Vision Algorithms Count"
)
count_nlp_df = count_selected_options(
    nlp_method, encode_df, "Uses NLP Algorithms Count"
)
count_automate_df = count_selected_options(
    automate_ml, encode_df, "Uses Auto ML tools Count"
)
count_business_intel_df = count_selected_options(
    business_intel, encode_df, "Uses Business Intel tools Count"
)
count_big_data_df = count_selected_options(
    big_data, encode_df, "Uses Big Data Products Count"
)
count_managed_ml_df = count_selected_options(
    managed_ml, encode_df, "Has Managed ML tools Count"
)
count_cloud_compute_df = count_selected_options(
    cloud_compute, encode_df, "Knows Cloud Compute Platforms Count"
)

### Add the above feature to our encoded dataframe 


In [15]:
group_dfs = [
    platform_df,
    pgm_lang_df,
    ide_df,
    notebooks_df,
    data_viz_df,
    ml_framework_df,
    ml_algo_df,
    cv_df,
    nlp_df,
    cloud_compute_df,
    automate_df,
    business_intel_df,
    big_data_df,
    managed_ml_df,
]
encode_df = merge_dfs(group_dfs, encode_df)

In [15]:
encode_df.head()

Unnamed: 0,What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?,Select the title most similar to your current role (or most recent title if retired): - Selected Choice,For how many years have you been writing code and/or programming?,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C,...,Uses Data Visualization libs,Uses ML Framework,Uses ML Algorithms,Uses Computer Vision Algorithms,Uses NLP Algorithms,Knows Cloud Compute Platforms,Uses Auto ML tools,Uses Business Intel tools,Uses Big Data Products,Has Managed ML tools
0,7,1,0,1,4,4,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,1,18,0,16,8,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,17,0,2,5,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,1,22,2,6,8,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,1,0,2,4,3,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
encode_df.shape

(105856, 276)

### Add correctly mapped columns/onehot encoded columns i.e Age, Code experience, Higher Education


In [17]:
encode_df = pd.concat([encode_df, data[[q_dict.get(3), q_dict.get(5)]]], axis=1)
encode_df = pd.concat([encode_df, ohe_df], axis=1)
encode_df.shape

(105856, 285)

In [18]:
encode_dict = gen_dict(encode_df)
encode_df = drop_cols([0, 1, 2, 3, 5], encode_df, index_label=True)
encode_df.head()

Unnamed: 0,Select the title most similar to your current role (or most recent title if retired): - Selected Choice,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C++,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Java,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Javascript,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Julia,What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Swift,...,Uses Business Intel tools,Uses Big Data Products,Has Managed ML tools,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Bachelors,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_High-School,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Masters,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_No formal education past high school,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_None,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Professional degree
0,4,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1,0,0,0,0,0,0
1,16,1,0,1,1,1,1,0,0,0,...,0.0,0.0,0.0,0,0,0,1,0,0,0
2,2,0,0,0,0,1,1,0,0,0,...,0.0,0.0,0.0,0,0,0,1,0,0,0
3,6,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,1,0,0,0,0,0
4,4,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0,1,0,0,0,0,0


#### Target Labels Generation

In [19]:
feat_df = encode_df.copy(deep=True)

In [20]:
feature_dict = gen_dict(data)
feature_dict

{0: 'What is your age (# years)?',
 1: 'What is your gender? - Selected Choice',
 2: 'In which country do you currently reside?',
 3: 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?',
 4: 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice',
 5: 'For how many years have you been writing code and/or programming?',
 6: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python',
 7: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R',
 8: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL',
 9: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C',
 10: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C++',
 1

In [21]:
data[feature_dict[4]].value_counts()

Student                                 21242
Data Scientist                          16910
Software Engineer                       10252
Data Analyst                             8489
Other                                    8346
0                                        6985
Research Scientist                       5371
Business Analyst                         4097
Currently not employed                   3638
Machine Learning Engineer                3192
Data Engineer                            2466
Product/Project Manager                  1843
Not employed                             1784
Software Developer/Software Engineer     1726
Statistician                             1451
Scientist/Researcher                      960
Program/Project Manager                   849
Consultant                                785
DBA/Database Engineer                     780
Researcher                                601
Research Assistant                        600
Manager                           

In [22]:
feat_df["Target"] = data[feature_dict[4]].copy()

In [23]:
se_list = colvals_config["software_engineer"]
research_eng = colvals_config["research_engineer"]
feat_df.loc[feat_df["Target"].isin(se_list), "Target"] = "Software Engineer"
feat_df.loc[feat_df["Target"].isin(research_eng), "Target"] = "Researcher"

In [24]:
feat_df.shape

(105856, 279)

In [25]:
feat_df = feat_df.rename(columns={feat_df.columns[(feat_df.shape[1] - 1)]: "Target"})
feat_df = feat_df[feat_df["Target"] != "Currently not employed"]
feat_df.loc[feat_df["Target"] == 0, "Target"] = "Student"
feat_df.loc[feat_df["Target"] == 0, "Target"] = "Student"
se_list = ["Software Developer/Software Engineer", "Software Engineer"]
analyst = ["Business Analyst", "Data Analyst"]
research_eng = ["Researcher", "Scientist/Researcher"]
feat_df.loc[feat_df["Target"].isin(se_list), "Target"] = "Software Engineer"
feat_df.loc[feat_df["Target"].isin(research_eng), "Target"] = "Researcher"
feat_df.loc[feat_df["Target"].isin(analyst), "Target"] = "Data Analyst"
labels = [
    "Student",
    "Data Scientist",
    "Data Analyst",
    "Software Engineer",
    "Research Scientist",
    "Machine Learning Engineer",
    "Data Engineer",
    "Product/Project Manager",
    "Statistician",
]
feat_df.loc[~feat_df["Target"].isin(labels), "Target"] = "Other"

In [26]:
exclude_list = ["Other", "Student"]
feat_df = feat_df[~feat_df["Target"].isin(exclude_list)]

In [27]:
feat_df.to_parquet(
    processed_data_dir.joinpath("Features_Labels-KaggleResponses.parquet"), index=False
)

In [28]:
feat_df["Target"].value_counts()

Data Scientist               16910
Data Analyst                 12586
Software Engineer            11978
Research Scientist            5371
Machine Learning Engineer     3192
Data Engineer                 2466
Product/Project Manager       1843
Statistician                  1451
Name: Target, dtype: int64

In [29]:
le = LabelEncoder()
feat_df["Target"] = le.fit_transform(feat_df["Target"])
pickle.dump(le, open(artifacts_dir.joinpath("target_encoder_v2.pkl"), "wb"))

In [30]:
## Load the feature and label encoder
file = open(artifacts_dir.joinpath("target_encoder_v2.pkl"), "rb")
label_obj = pickle.load(file)
le_name_mapping = dict(zip(label_obj.classes_, label_obj.transform(label_obj.classes_)))
print(le_name_mapping)

{'Data Analyst': 0, 'Data Engineer': 1, 'Data Scientist': 2, 'Machine Learning Engineer': 3, 'Product/Project Manager': 4, 'Research Scientist': 5, 'Software Engineer': 6, 'Statistician': 7}


In [31]:
## Filter business analyst and other profiles : ml eng/dataeng/statistician/product eng


def generate_additional_samples(
    majority_df: pd.DataFrame, minority_df: pd.DataFrame, minority_label: int
) -> pd.DataFrame:
    concat_df = pd.concat([majority_df, minority_df], axis=0)
    X = concat_df.iloc[:, 1:-1].values
    y = concat_df["Target"].values
    resample = SMOTETomek(tomek=TomekLinks(sampling_strategy="majority"))
    X, y = resample.fit_resample(X, y)
    sample_indices = [elem for elem in range(len(y)) if y[elem] == minority_label]
    X = X[sample_indices]
    y = y[sample_indices]
    oversample_df = pd.DataFrame(X, columns=minority_df.iloc[:, 1:-1].columns)
    oversample_df["Target"] = y
    return oversample_df


research_df = generate_additional_samples(
    feat_df[feat_df["Target"] == 6], feat_df[feat_df["Target"] == 5], 5
)
ml_df = generate_additional_samples(
    feat_df[feat_df["Target"] == 6], feat_df[feat_df["Target"] == 3], 3
)
pm_df = generate_additional_samples(
    feat_df[feat_df["Target"] == 6], feat_df[feat_df["Target"] == 4], 4
)
de_df = generate_additional_samples(
    feat_df[feat_df["Target"] == 6], feat_df[feat_df["Target"] == 1], 1
)
stats_df = generate_additional_samples(
    feat_df[feat_df["Target"] == 6], feat_df[feat_df["Target"] == 7], 7
)

In [32]:
ds_df = generate_additional_samples(
    feat_df[feat_df["Target"] == 2], feat_df[feat_df["Target"] == 2], 2
)
df_df.shape

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [33]:
dfs = [stats_df, de_df, pm_df, ml_df, research_df]
oversample_df = pd.concat(dfs, axis=0)
oversample_df = pd.concat([feat_df, oversample_df], axis=0)
oversample_df["Target"].value_counts()

5    17280
2    16910
3    15165
1    14433
4    13813
7    13429
0    12586
6    11978
Name: Target, dtype: int64

In [34]:
ds_df = generate_additional_samples(
    oversample_df[oversample_df["Target"] == 7],
    oversample_df[oversample_df["Target"] == 6],
    6,
)
se_df = generate_additional_samples(
    oversample_df[oversample_df["Target"] == 7],
    oversample_df[oversample_df["Target"] == 0],
    0,
)
da_df = generate_additional_samples(
    oversample_df[oversample_df["Target"] == 7],
    oversample_df[oversample_df["Target"] == 2],
    2,
)

In [35]:
dfs = [ds_df, se_df, da_df]
oversample_new_df = pd.concat(dfs, axis=0)
oversample_df = pd.concat([oversample_df, oversample_new_df], axis=0)
oversample_df["Target"].value_counts()

2    33815
0    26013
6    25400
5    17280
3    15165
1    14433
4    13813
7    13429
Name: Target, dtype: int64

In [36]:
oversample_df.shape

(159348, 279)

In [37]:
feat_df = oversample_df.copy(deep=True)
feat_df.to_csv(
    processed_data_dir.joinpath("Features_KaggleResponses_v4.csv"), index=False
)

In [38]:
encode_dict = gen_dict(feat_df)
encode_dict

{0: 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice',
 1: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python',
 2: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - R',
 3: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - SQL',
 4: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C',
 5: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - C++',
 6: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Java',
 7: 'What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Javascript',
 8: 'What programming languages do you use on a regular basis? (Select all that apply) - Se

In [39]:
feat_df["Target"].value_counts()

2    33815
0    26013
6    25400
5    17280
3    15165
1    14433
4    13813
7    13429
Name: Target, dtype: int64

In [106]:
feat_df.to_csv(
    processed_data_dir.joinpath("Features-KaggleResponses_v3.csv"), index=False
)

In [112]:
feat_df = pd.read_csv(processed_data_dir.joinpath("Features-KaggleResponses_v3.csv"))
col_list = [
    platform_ds,
    pgm_lang,
    ide,
    notebooks,
    data_viz,
    ml_framework,
    ml_algo,
    comp_vision,
    nlp_method,
    cloud_compute,
    automate_ml,
    business_intel,
    big_data,
    managed_ml,
]
# final_df= drop_cols_list(col_list, feat_df)
final_df = feat_df.copy()

In [113]:
final_df.shape

(55797, 279)

In [114]:
feat_df["Target"].value_counts()

3    16910
7    11978
1     8489
6     5371
0     4097
4     3192
2     2466
5     1843
8     1451
Name: Target, dtype: int64

### Sampling strategy for handling imbalance data

In [35]:
X = final_df.iloc[:, 1:-1].values
y = final_df["Target"].values
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy="majority"))
X, y = resample.fit_resample(X, y)

In [36]:
majority_sample = pd.DataFrame(X, columns=final_df.iloc[:, 1:-1].columns)
target_sample = pd.DataFrame(y, columns=["Target"])
sampled_data = pd.concat([majority_sample, target_sample], axis=1)
sampled_data.to_parquet(
    processed_data_dir.joinpath("Features-KaggleResponses_Sampled_v2.parquet"),
    index=False,
)

### Split the data into train and test


In [8]:
sampled_data = pd.read_csv(
    processed_data_dir.joinpath("Features_KaggleResponses_v4.csv")
)
le = LabelEncoder()
sampled_data["Target"] = le.fit_transform(sampled_data["Target"])
sampled_data = sampled_data.fillna(0)
X = sampled_data.iloc[:, 1:-1].values
y = sampled_data["Target"].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

### Feature Selection using Chisquare

In [9]:
X_train_fs, X_test_fs, sorted_cols = get_feat_importance(
    sampled_data, X_train, y_train, X_test
)

 1) Does your current employer incorporate machine learning methods into their business? 0.193237
 2) For how many years have you used machine learning methods? 0.187876
 3) What is your current yearly compensation (approximate $USD)? 0.181956
 4) Uses ML Framework Count        0.167921
 5) In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice 0.156839
 6) Knows Pgm Lang Count           0.150578
 7) Uses IDE for dev Count         0.136272
 8) Uses Data Visualization libs Count 0.135272
 9) What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate 0.134787
10) Has taken DS Course Count      0.130241
11) Uses ML Algorithms Count       0.128869
12) Approximately how many individuals are responsible for data science workloads at your place of business? 0.126312
13) What is the size of the company where you are employed? 0.126136
14) What is the primary tool that you use

In [10]:
## Feature store
import yaml, json

column_dictionary = {
    str(indice): sampled_data.iloc[:, 1:-1].columns[indice]
    for indice in sorted_cols[0:36]
}
with open(
    features_dir.joinpath("features_select_mutual_info_v4.json"), "w"
) as json_file:
    json.dump(column_dictionary, json_file)

In [50]:
with open(features_dir.joinpath("features_select_v2.json"), "r") as fp:
    data = json.load(fp)
data

{'112': 'What is your current yearly compensation (approximate $USD)?',
 '113': 'Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?',
 '100': 'In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice',
 '102': 'Approximately how many individuals are responsible for data science workloads at your place of business?',
 '145': 'Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice',
 '249': 'Uses Computer Vision Algorithms Count',
 '101': 'What is the size of the company where you are employed?',
 '248': 'Uses ML Algorithms Count',
 '103': 'Does your current employer incorporate machine learning methods into their business?',
 '247': 'Uses ML Framework Count',
 '271': 'What is the highest level of formal education that you have 

### Modelling with Baseline,Bagging & Boosting methods

In [53]:
file = open(artifacts_dir.joinpath("target_encoder_v2.pkl"), "rb")
label_obj = pickle.load(file)
le_name_mapping = dict(zip(label_obj.classes_, label_obj.transform(label_obj.classes_)))
print(le_name_mapping)

{'Business Analyst': 0, 'Data Analyst': 1, 'Data Engineer': 2, 'Data Scientist': 3, 'Machine Learning Engineer': 4, 'Other': 5, 'Product/Project Manager': 6, 'Research Scientist': 7, 'Software Engineer': 8, 'Statistician': 9, 'Student': 10}


In [54]:
log_regression = LogisticRegression()
log_regression.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = log_regression.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("Baseline", "Chisquare", y_preds, y_test)

Metrics for Baseline & Chisquare are 
               precision    recall  f1-score   support

           0       0.25      0.17      0.20      5625
           1       0.28      0.18      0.21      5645
           2       0.36      0.36      0.36      5645
           3       0.33      0.26      0.29      5645
           4       0.47      0.47      0.47      5646
           5       0.17      0.07      0.10      5646
           6       0.30      0.29      0.30      5645
           7       0.44      0.55      0.48      5646
           8       0.36      0.31      0.33      5645
           9       0.38      0.36      0.37      5645
          10       0.35      0.87      0.50      5646

    accuracy                           0.35     62079
   macro avg       0.33      0.35      0.33     62079
weighted avg       0.33      0.35      0.33     62079



In [20]:
log_regression = LogisticRegression()
log_regression.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = log_regression.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("Baseline", "Chisquare", y_preds, y_test)

Metrics for Baseline & Chisquare are 
               precision    recall  f1-score   support

           0       0.41      0.39      0.40      5735
           1       0.45      0.41      0.43      5755
           2       0.43      0.37      0.40      5755
           3       0.20      0.06      0.09      5756
           4       0.58      0.70      0.64      5756
           5       0.45      0.43      0.44      5756
           6       0.49      0.89      0.63      5755

    accuracy                           0.46     40268
   macro avg       0.43      0.46      0.43     40268
weighted avg       0.43      0.46      0.43     40268



In [18]:
majority_vote = build_majorityvote()
majority_vote.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = majority_vote.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("Majority Vote", "Chisquare", y_preds, y_test)

Metrics for Majority Vote & Chisquare are 
               precision    recall  f1-score   support

           0       0.55      0.74      0.63      5735
           1       0.67      0.72      0.70      5755
           2       0.67      0.41      0.51      5755
           3       0.42      0.09      0.14      5756
           4       0.63      0.94      0.75      5756
           5       0.87      0.37      0.51      5756
           6       0.48      0.89      0.63      5755

    accuracy                           0.59     40268
   macro avg       0.61      0.59      0.55     40268
weighted avg       0.61      0.59      0.55     40268



In [55]:
majority_vote = build_majorityvote()
majority_vote.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = majority_vote.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("Majority Vote", "Chisquare", y_preds, y_test)

Metrics for Majority Vote & Chisquare are 
               precision    recall  f1-score   support

           0       0.56      0.61      0.59      5625
           1       0.59      0.55      0.57      5645
           2       0.67      0.71      0.69      5645
           3       0.59      0.45      0.51      5645
           4       0.70      0.59      0.64      5646
           5       0.52      0.20      0.29      5646
           6       0.45      0.93      0.60      5645
           7       0.67      0.54      0.60      5646
           8       0.92      0.25      0.39      5645
           9       0.96      0.36      0.52      5645
          10       0.37      0.87      0.52      5646

    accuracy                           0.55     62079
   macro avg       0.63      0.55      0.54     62079
weighted avg       0.63      0.55      0.54     62079



In [22]:
xgb_class = xgb.XGBClassifier()
xgb_class.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = xgb_class.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("XGB Classification", "Chisquare", y_preds, y_test)

Metrics for XGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.61      0.46      0.52      5735
           1       0.71      0.76      0.73      5755
           2       0.59      0.46      0.51      5755
           3       0.48      0.45      0.46      5756
           4       0.79      0.80      0.80      5756
           5       0.63      0.48      0.54      5756
           6       0.55      0.95      0.70      5755

    accuracy                           0.62     40268
   macro avg       0.62      0.62      0.61     40268
weighted avg       0.62      0.62      0.61     40268



In [56]:
xgb_class = xgb.XGBClassifier()
xgb_class.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = xgb_class.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("XGB Classification", "Chisquare", y_preds, y_test)

Metrics for XGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.54      0.37      0.44      5625
           1       0.49      0.32      0.39      5645
           2       0.64      0.63      0.64      5645
           3       0.54      0.43      0.48      5645
           4       0.68      0.66      0.67      5646
           5       0.49      0.22      0.31      5646
           6       0.59      0.72      0.65      5645
           7       0.71      0.71      0.71      5646
           8       0.53      0.42      0.47      5645
           9       0.59      0.71      0.64      5645
          10       0.42      0.95      0.58      5646

    accuracy                           0.56     62079
   macro avg       0.57      0.56      0.54     62079
weighted avg       0.57      0.56      0.54     62079



In [53]:
file = open(artifacts_dir.joinpath("target_encoder_v2.pkl"), "rb")
label_obj = pickle.load(file)
le_name_mapping = dict(zip(label_obj.classes_, label_obj.transform(label_obj.classes_)))
print(le_name_mapping)

{'Business Analyst': 0, 'Data Analyst': 1, 'Data Engineer': 2, 'Data Scientist': 3, 'Machine Learning Engineer': 4, 'Other': 5, 'Product/Project Manager': 6, 'Research Scientist': 7, 'Software Engineer': 8, 'Statistician': 9, 'Student': 10}


In [58]:
rf_class = RandomForestClassifier()
rf_class.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = rf_class.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("XGB Classification", "Chisquare", y_preds, y_test)

Metrics for XGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.80      0.62      0.70      5625
           1       0.67      0.51      0.58      5645
           2       0.83      0.86      0.84      5645
           3       0.59      0.50      0.54      5645
           4       0.78      0.73      0.75      5646
           5       0.59      0.25      0.35      5646
           6       0.83      0.89      0.86      5645
           7       0.78      0.81      0.80      5646
           8       0.66      0.52      0.58      5645
           9       0.73      0.80      0.76      5645
          10       0.42      0.93      0.58      5646

    accuracy                           0.67     62079
   macro avg       0.70      0.67      0.67     62079
weighted avg       0.70      0.67      0.67     62079



### Reduced Features Modelling


In [17]:
col_list = [
    platform_ds,
    pgm_lang,
    ide,
    notebooks,
    data_viz,
    ml_framework,
    ml_algo,
    comp_vision,
    nlp_method,
    cloud_compute,
    automate_ml,
    business_intel,
    big_data,
    managed_ml,
]
final_df = drop_cols_list(col_list, sampled_data)
X = final_df.iloc[:, 1:-1].values
y = final_df["Target"].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [18]:
final_df.shape

(115600, 93)

In [19]:
X_train_red, X_test_red, sorted_cols = get_feat_importance(
    final_df, X_train, y_train, X_test
)

 1) What is your current yearly compensation (approximate $USD)? 36147.495596
 2) Uses Computer Vision Algorithms Count 19017.741564
 3) What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate 16539.287712
 4) Uses ML Algorithms Count       14096.367058
 5) Uses ML Framework Count        13515.752686
 6) What type of computing platform do you use most often for your data science projects? - Selected Choice 13465.958808
 7) In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice 12149.330708
 8) For how many years have you used machine learning methods? 10057.127200
 9) Uses NLP Algorithms Count      8767.756241
10) Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice -  NVIDIA GPUs  8440.465980
11) Who/what are your favorite media sources that report on data science topics? (Select all that apply) - Selected C

In [20]:
import yaml, json

column_dictionary = {
    str(indice): final_df.columns[indice] for indice in sorted_cols[0:36]
}
with open(features_dir.joinpath("features_select_reduced_v3.json"), "w") as json_file:
    json.dump(column_dictionary, json_file)

In [32]:
log_regression = LogisticRegression()
log_regression.fit(X_train_red[:, sorted_cols[0:36]], y_train)
y_preds = log_regression.predict(X_test_red[:, sorted_cols[0:36]])
get_multiclass_report("Baseline", "Chisquare", y_preds, y_test)

Metrics for Baseline & Chisquare are 
               precision    recall  f1-score   support

           0       0.39      0.38      0.38      5735
           1       0.44      0.43      0.43      5755
           2       0.42      0.36      0.39      5755
           3       0.23      0.15      0.18      5756
           4       0.57      0.70      0.63      5756
           5       0.42      0.37      0.39      5756
           6       0.56      0.83      0.67      5755

    accuracy                           0.46     40268
   macro avg       0.43      0.46      0.44     40268
weighted avg       0.43      0.46      0.44     40268



In [33]:
majority_vote = build_majorityvote()
majority_vote.fit(X_train_red[:, sorted_cols[0:36]], y_train)
y_preds = majority_vote.predict(X_test_red[:, sorted_cols[0:36]])
get_multiclass_report("Majority Vote", "Chisquare", y_preds, y_test)

Metrics for Majority Vote & Chisquare are 
               precision    recall  f1-score   support

           0       0.54      0.74      0.62      5735
           1       0.65      0.73      0.69      5755
           2       0.60      0.43      0.50      5755
           3       0.44      0.08      0.13      5756
           4       0.62      0.94      0.75      5756
           5       0.84      0.32      0.47      5756
           6       0.51      0.87      0.64      5755

    accuracy                           0.59     40268
   macro avg       0.60      0.59      0.54     40268
weighted avg       0.60      0.59      0.54     40268



In [34]:
xgb_class = xgb.XGBRFClassifier()
xgb_class.fit(X_train_red[:, sorted_cols[0:36]], y_train)
y_preds = xgb_class.predict(X_test_red[:, sorted_cols[0:36]])
get_multiclass_report("XGB Classification", "Chisquare", y_preds, y_test)

Metrics for XGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.42      0.39      0.40      5735
           1       0.53      0.53      0.53      5755
           2       0.48      0.39      0.43      5755
           3       0.30      0.35      0.32      5756
           4       0.68      0.76      0.71      5756
           5       0.44      0.41      0.43      5756
           6       0.75      0.78      0.77      5755

    accuracy                           0.52     40268
   macro avg       0.51      0.52      0.51     40268
weighted avg       0.51      0.52      0.51     40268



In [63]:
random_class = RandomForestClassifier()
random_class.fit(X_train_red[:, sorted_cols[0:36]], y_train)
y_preds = random_class.predict(X_test_red[:, sorted_cols[0:36]])
get_multiclass_report("RGB Classification", "Chisquare", y_preds, y_test)

Metrics for RGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.81      0.65      0.72      5625
           1       0.66      0.57      0.61      5645
           2       0.83      0.87      0.85      5645
           3       0.59      0.52      0.55      5645
           4       0.58      0.84      0.68      5646
           5       0.37      0.38      0.37      5646
           6       0.85      0.91      0.88      5645
           7       0.79      0.84      0.81      5646
           8       0.66      0.48      0.56      5645
           9       0.76      0.85      0.80      5645
          10       0.83      0.78      0.80      5646

    accuracy                           0.70     62079
   macro avg       0.70      0.70      0.69     62079
weighted avg       0.70      0.70      0.69     62079



In [41]:
random_class = RandomForestClassifier()
random_class.fit(X_train_fs[:, sorted_cols[0:36]], y_train)
y_preds = random_class.predict(X_test_fs[:, sorted_cols[0:36]])
get_multiclass_report("RGB Classification", "Chisquare", y_preds, y_test)

Metrics for RGB Classification & Chisquare are 
               precision    recall  f1-score   support

           0       0.70      0.54      0.61      5735
           1       0.82      0.86      0.84      5755
           2       0.60      0.51      0.55      5755
           3       0.49      0.41      0.45      5756
           4       0.82      0.84      0.83      5756
           5       0.69      0.52      0.59      5756
           6       0.55      0.93      0.69      5755

    accuracy                           0.66     40268
   macro avg       0.67      0.66      0.65     40268
weighted avg       0.67      0.66      0.65     40268



### OLD EXPERIMENTS

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.56      0.64      0.60      5729
           1       0.38      0.95      0.55      5755
           2       0.81      0.32      0.46      5755
           3       0.46      0.05      0.09      5756
           4       0.86      0.68      0.76      5756
           5       0.88      0.36      0.51      5756
           6       0.51      0.86      0.64      5755

    accuracy                           0.55     40262
   macro avg       0.64      0.55      0.51     40262
weighted avg       0.64      0.55      0.51     40262



### Using XGBClassifier

In [87]:
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=chi2, k="all")
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [88]:
def select_features_mutual_info(X_train, y_train, X_test):
    fs = SelectKBest(score_func=mutual_info_classif, k="all")
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [18]:
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
importances = fs.scores_
indices = np.argsort(importances)[::-1]
train_df = final_df.iloc[:, 1:-1].columns
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, train_df[indices[f]], importances[indices[f]]))

 1) What is your current yearly compensation (approximate $USD)? 232910.496204
 2) Approximately how many individuals are responsible for data science workloads at your place of business? 48746.844752
 3) Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)? 47714.058598
 4) In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice 44786.254887
 5) What is the size of the company where you are employed? 43840.329188
 6) Does your current employer incorporate machine learning methods into their business? 40207.116155
 7) What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate 34415.318950
 8) For how many years have you used machine learning methods? 31753.086800
 9) Uses ML Framework Count        21220.146958
10) Uses ML Algorithms Count       17377.12

In [41]:
X_train_fs, X_test_fs, fs = select_features_mutual_info(X_train, y_train, X_test)
importances = fs.scores_
indices = np.argsort(importances)[::-1]
train_df = final_df.iloc[:, 1:-1].columns
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, train_df[indices[f]], importances[indices[f]]))

 1) Does your current employer incorporate machine learning methods into their business? 0.266590
 2) What is your current yearly compensation (approximate $USD)? 0.259972
 3) In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice 0.195653
 4) Approximately how many individuals are responsible for data science workloads at your place of business? 0.192819
 5) What is the size of the company where you are employed? 0.191226
 6) For how many years have you used machine learning methods? 0.181922
 7) Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)? 0.171965
 8) What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Doctorate 0.164828
 9) Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - A

In [19]:
X_test_fs_red = X_test_fs[:, indices[0:35]]
X_train_red = X_train_fs[:, indices[0:35]]

In [20]:
majority_vote.fit(X_train_red, y_train)

MajorityVoteClassifier(classifiers=[LogisticRegression(C=0.001,
                                                       random_state=42),
                                    DecisionTreeClassifier(criterion='entropy',
                                                           max_depth=1,
                                                           random_state=42),
                                    KNeighborsClassifier(n_neighbors=1)])

### Chisquare test

In [None]:
## useline baseline
## majority,xgb
## hyperparam xgb

In [21]:
preds = majority_vote.predict(X_test_fs_red)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.56      0.63      0.59      5729
           1       0.38      0.95      0.54      5755
           2       0.83      0.32      0.46      5755
           3       0.43      0.08      0.13      5756
           4       0.93      0.67      0.78      5756
           5       0.87      0.33      0.48      5756
           6       0.51      0.87      0.64      5755

    accuracy                           0.55     40262
   macro avg       0.64      0.55      0.52     40262
weighted avg       0.64      0.55      0.52     40262



#### Mutual Info gain

In [42]:
X_test_mutinfo = X_test_fs[:, indices[0:35]]
X_train_mutinfo = X_train_fs[:, indices[0:35]]
majority_vote.fit(X_train_mutinfo, y_train)
preds = majority_vote.predict(X_test_mutinfo)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.56      0.62      0.59      5729
           1       0.37      0.94      0.54      5755
           2       0.81      0.32      0.46      5755
           3       0.45      0.04      0.08      5756
           4       0.86      0.68      0.76      5756
           5       0.88      0.34      0.49      5756
           6       0.51      0.87      0.64      5755

    accuracy                           0.54     40262
   macro avg       0.64      0.54      0.51     40262
weighted avg       0.64      0.54      0.51     40262



### XGB + Chisquare Feats

In [None]:
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_red, y_train)
preds = dt_classifier.predict(X_test_fs_red)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.51      0.43      0.47      5729
           1       0.70      0.74      0.72      5755
           2       0.43      0.35      0.38      5755
           3       0.36      0.45      0.40      5756
           4       0.73      0.73      0.73      5756
           5       0.50      0.39      0.44      5756
           6       0.65      0.81      0.73      5755

    accuracy                           0.56     40262
   macro avg       0.55      0.56      0.55     40262
weighted avg       0.55      0.56      0.55     40262



In [None]:
## RandomRandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train_red, y_train)
preds = random_forest.predict(X_test_fs_red)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.71      0.55      0.62      5729
           1       0.82      0.88      0.85      5755
           2       0.59      0.51      0.55      5755
           3       0.42      0.51      0.46      5756
           4       0.81      0.85      0.83      5756
           5       0.70      0.51      0.59      5756
           6       0.67      0.87      0.76      5755

    accuracy                           0.67     40262
   macro avg       0.68      0.67      0.67     40262
weighted avg       0.68      0.67      0.67     40262



In [22]:
tuned_model = xgb.XGBClassifier()
tuned_model.fit(X_train_red, y_train)
preds = tuned_model.predict(X_test_fs_red)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.62      0.45      0.52      5729
           1       0.70      0.77      0.73      5755
           2       0.58      0.46      0.51      5755
           3       0.41      0.54      0.47      5756
           4       0.79      0.80      0.79      5756
           5       0.61      0.46      0.53      5756
           6       0.67      0.89      0.76      5755

    accuracy                           0.62     40262
   macro avg       0.63      0.62      0.62     40262
weighted avg       0.63      0.62      0.62     40262



### XGB + Mutual Info gain

In [43]:
tuned_model = xgb.XGBClassifier()
tuned_model.fit(X_train_mutinfo, y_train)
preds = tuned_model.predict(X_test_mutinfo)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.60      0.48      0.54      5729
           1       0.71      0.77      0.74      5755
           2       0.59      0.46      0.51      5755
           3       0.42      0.54      0.47      5756
           4       0.79      0.80      0.79      5756
           5       0.57      0.51      0.54      5756
           6       0.74      0.85      0.79      5755

    accuracy                           0.63     40262
   macro avg       0.63      0.63      0.63     40262
weighted avg       0.63      0.63      0.63     40262



### Hyperparameter tuning with optuna

In [24]:
def objective(trial):
    """Define the objective function"""

    params = {
        "max_depth": trial.suggest_int("max_depth", 1, 9),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "subsample": trial.suggest_loguniform("subsample", 0.01, 1.0),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.01, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 1.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 1.0),
        "eval_metric": "mlogloss",
        "use_label_encoder": False,
    }

    # Fit the model
    optuna_model = xgb.XGBClassifier(**params)
    optuna_model.fit(X_train_red, y_train)

    # Make predictions
    y_pred = optuna_model.predict(X_test_fs_red)

    # Evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [26]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
params = study.best_trial.params

[32m[I 2023-08-02 12:33:06,994][0m A new study created in memory with name: no-name-c186c9ac-d1ee-4577-bb36-b93a3a2a9949[0m
[32m[I 2023-08-02 12:34:21,269][0m Trial 0 finished with value: 0.5441359097908698 and parameters: {'max_depth': 8, 'learning_rate': 0.21299226181626638, 'n_estimators': 500, 'min_child_weight': 9, 'gamma': 0.0014769202765595336, 'subsample': 0.0864466416277673, 'colsample_bytree': 0.04835182195102151, 'reg_alpha': 0.004932013900037481, 'reg_lambda': 1.1012446136221117e-08}. Best is trial 0 with value: 0.5441359097908698.[0m
[32m[I 2023-08-02 12:35:30,987][0m Trial 1 finished with value: 0.5359395956485024 and parameters: {'max_depth': 7, 'learning_rate': 0.09546011251564891, 'n_estimators': 437, 'min_child_weight': 8, 'gamma': 3.484445039207656e-08, 'subsample': 0.26869617259188067, 'colsample_bytree': 0.03798076183097272, 'reg_alpha': 4.3378460675602554e-05, 'reg_lambda': 0.04515879392799709}. Best is trial 0 with value: 0.5441359097908698.[0m
[32m[I 2

KeyboardInterrupt: 

In [29]:
params = {
    "max_depth": 8,
    "learning_rate": 0.21299226181626638,
    "n_estimators": 500,
    "min_child_weight": 9,
    "gamma": 0.0014769202765595336,
    "subsample": 0.0864466416277673,
    "colsample_bytree": 0.04835182195102151,
    "reg_alpha": 0.004932013900037481,
    "reg_lambda": 1.1012446136221117e-08,
}
tuned_model = xgb.XGBClassifier(**params)
tuned_model.fit(X_train_red, y_train)
preds = tuned_model.predict(X_test_fs_red)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.47      0.34      0.39      5729
           1       0.55      0.56      0.56      5755
           2       0.51      0.40      0.45      5755
           3       0.38      0.54      0.45      5756
           4       0.73      0.72      0.72      5756
           5       0.52      0.36      0.43      5756
           6       0.65      0.88      0.75      5755

    accuracy                           0.54     40262
   macro avg       0.54      0.54      0.54     40262
weighted avg       0.54      0.54      0.54     40262



In [38]:
final_df.columns[indices]

Index(['Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice - Other',
       'What is the size of the company where you are employed?',
       'What is your current yearly compensation (approximate $USD)?',
       'For how many years have you used machine learning methods?',
       'In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice',
       'Approximately how many individuals are responsible for data science workloads at your place of business?',
       'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?_Bachelors',
       'Approximately how many times have you used a TPU (tensor processing unit)?',
       'Uses Data Visualization libs Count', 'Uses ML Framework Count',
       'Who/what are your favorite media sources that report on data science topics? (Select all that apply) - Selected Choice - Blog

### RandomCV hyperparams

In [None]:
param_dist = {
    "n_estimators": stats.randint(150, 1000),
    "learning_rate": stats.uniform(0.01, 0.59),
    "subsample": stats.uniform(0.3, 0.6),
    "max_depth": [3, 4, 5, 6, 7, 8, 9],
    "colsample_bytree": stats.uniform(0.5, 0.4),
    "min_child_weight": [1, 2, 3, 4],
}
randomcv = RandomizedSearchCV(tuned_model, param_dist, random_state=42)
randomcv.fit(X_train_fs, y_train)



RandomizedSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           enable_categorical=False, gamma=0,
                                           gpu_id=-1, importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=1...
                   param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9af93f4f40>,
                         