In [1]:
import os

os.getcwd()

'/home/ec2-user/SageMaker/recommender/notebooks'

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import sklearn
import warnings
from sklearn.decomposition import NMF
import typing as t

In [3]:
current_dir = Path.cwd().parent
data_dir = current_dir.joinpath("data")
filename = "kaggle_survey_2022_responses.csv"

In [4]:
data = pd.read_csv(data_dir.joinpath(filename), skiprows=1)
data = data.iloc[:, 1:250]
data = data.fillna(0)
data.shape
print(data.shape)

  exec(code_obj, self.user_global_ns, self.user_ns)


(23997, 249)


In [5]:
q_count = [i for i in range(0, 297, 1)]
q_dict = dict(zip(q_count, data.columns.tolist()))

In [6]:
q_dict

{0: 'What is your age (# years)?',
 1: 'What is your gender? - Selected Choice',
 2: 'In which country do you currently reside?',
 3: 'Are you currently a student? (high school, university, or graduate)',
 4: 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Coursera',
 5: 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - edX',
 6: 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Kaggle Learn Courses',
 7: 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - DataCamp',
 8: 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Fast.ai',
 9: 'On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Udacity',
 10: 'On which p

In [5]:
def encode_cols(df: pd.DataFrame, col: str) -> pd.DataFrame:
    col_count = df[col].value_counts()
    value_dict = pd.Series(range(len(col_count)), index=col_count.index)
    df = df[col].map(value_dict)
    return df

In [6]:
final_df = pd.DataFrame()
for col in data.columns:
    sub_df = data[[col]]
    map_df = encode_cols(sub_df, col)
    final_df = pd.concat([final_df, map_df], axis=1)

In [8]:
student_df = final_df[final_df.iloc[:, 3] == 1]
work_df = final_df[final_df.iloc[:, 3] == 0]
print(student_df.shape)
print(work_df.shape)

(11961, 249)
(12036, 249)


### Grouping Columnns based on String Match

In [9]:
def create_importance_dataframe(cluster_type: str, nmf: NMF, input_df: pd.DataFrame):
    if cluster_type == "NMF":
        nmf_df = pd.DataFrame(nmf.components_)
    elif cluster_type == "PCA":
        nmf_df = pd.DataFrame(nmf.n_components)
    else:
        raise ValueError(
            f"Invalid Cluster type.Expected NMF or PCA, given {cluster_type}"
        )
    nmf_df.columns = input_df.columns
    nmf_df = nmf_df.apply(np.abs)
    nmf_df = nmf_df.transpose()
    num_comps = nmf_df.shape[1]
    new_columns = [f"NMF Component{i}" for i in range(1, num_comps + 1)]
    nmf_df.columns = new_columns
    return nmf_df

In [10]:
def group_feats(col_list: list, input_df: pd.DataFrame, new_col: str) -> pd.DataFrame:
    """
    Group the feats based on the

    """
    merge_df = input_df.copy(deep=True)
    false_index = []
    true_index = []
    for col in col_list:
        if "None" in col or "No" in col:
            false_list = list(set(input_df[input_df[col] == 1].index))
            false_index.append(false_list)
        else:
            true_list = list(set(input_df[input_df[col] == 1].index))
            true_index.append(true_list)
    merge_df.loc[true_list, new_col] = 1
    merge_df.loc[false_list, new_col] = 0
    merge_df = merge_df.fillna(0)
    return merge_df

In [11]:
platform_ds = [col for col in final_df.columns if "data science courses" in col]
study_ds = [col for col in final_df.columns if "studying data science" in col]
pgm_lang = [col for col in final_df.columns if "programming languages" in col]
ide = [col for col in final_df.columns if "IDE's" in col]
notebooks = [col for col in final_df.columns if "hosted notebook" in col]
data_viz = [col for col in final_df.columns if "data visualization" in col]
ml_framework = [col for col in final_df.columns if "machine learning frameworks" in col]
ml_algo = [col for col in final_df.columns if "ML algorithms" in col]
comp_vision = [col for col in final_df.columns if "computer vision" in col]
nlp_method = [col for col in final_df.columns if "(NLP) methods" in col]
pretrained_model = [col for col in final_df.columns if "pre-trained model" in col]
cloud_compute = [col for col in final_df.columns if "cloud computing platforms" in col]
automate_ml = [col for col in final_df.columns if "automated" in col]
business_intel = [col for col in final_df.columns if "business intelligence" in col]
data_storage = [col for col in final_df.columns if "data storage" in col]
db_prod = [col for col in final_df.columns if "data products" in col]
managed_ml = [col for col in final_df.columns if "managed machine learning" in col]
serve_ml = [col for col in final_df.columns if "serve your machine" in col]

In [12]:
platform_df = group_feats(platform_ds, student_df, "Has taken DS Course")
study_ds_df = group_feats(study_ds, student_df, "Has studied DS Course")
pgm_lang_df = group_feats(pgm_lang, student_df, "Knows Pgm Lang")
ide_df = group_feats(ide, student_df, "Uses IDE for dev")
notebooks_df = group_feats(notebooks, student_df, "Uses Notebooks for dev")
data_viz_df = group_feats(platform_ds, student_df, "Uses Data Visualization libs")
ml_framework_df = group_feats(ml_framework, student_df, "Uses ML Framework")
ml_algo_df = group_feats(ml_algo, student_df, "Uses ML Algorithms")
cv_df = group_feats(comp_vision, student_df, "Uses Computer Vision Algorithms")
nlp_df = group_feats(nlp_method, student_df, "Uses NLP Algorithms")
pretrained_model_df = group_feats(
    pretrained_model, student_df, "Uses Pretrained Models"
)
cloud_df = group_feats(cloud_compute, student_df, "Uses Cloud Compute")
automate_df = group_feats(automate_ml, student_df, "Uses Auto ML tools")
business_intel_df = group_feats(business_intel, student_df, "Uses Business Intel tools")
data_storage_df = group_feats(data_storage, student_df, "Uses Data Storage tools")
db_prod_df = group_feats(db_prod, student_df, "Uses Data Products")
managed_ml_df = group_feats(managed_ml, student_df, "Has Managed ML tools")
serve_ml_df = group_feats(serve_ml, student_df, "Has Served ML Models")

In [13]:
def merge_dfs(dfs: t.List[pd.DataFrame], main_df: pd.DataFrame) -> pd.DataFrame:
    final_df = pd.DataFrame()
    for df in dfs:
        final_df = pd.concat([final_df, df.iloc[:, -1:]], axis=1)
    main_df = pd.concat([main_df, final_df], axis=1)
    return main_df

In [26]:
def drop_cols(cols: t.List[list], main_df: pd.DataFrame) -> pd.DataFrame:
    for col in cols:
        main_df = main_df.drop(columns=col)
    return main_df


def gen_dict(input_df: pd.DataFrame) -> dict:
    input_shape = input_df.shape[1]
    q_count = [i for i in range(0, input_shape, 1)]
    q_dict = dict(zip(q_count, input_df.columns.tolist()))
    return q_dict


def encode_and_bind(index_col: t.List, student_dict, input_df: pd.DataFrame):
    ohe_df = pd.DataFrame()
    for col in index_col:
        col_name = student_dict.get(col)
        input_df[col_name] = input_df[col_name].astype(str)
        dummies = pd.get_dummies(input_df[[col_name]])
        ohe_df = pd.concat([ohe_df, dummies])
        ohe_df = ohe_df.fillna(0)
    return ohe_df


def drop_cols_index(index: t.List, input_df: pd.DataFrame) -> pd.DataFrame:
    input_df.drop(input_df.columns[index], axis=1, inplace=True)
    return input_df

In [27]:
group_dfs = [
    platform_df,
    study_ds_df,
    pgm_lang_df,
    ide_df,
    notebooks_df,
    data_viz_df,
    ml_framework_df,
    ml_algo_df,
    cv_df,
    nlp_df,
    pretrained_model_df,
    cloud_df,
    automate_df,
    business_intel_df,
    data_storage_df,
    db_prod_df,
    managed_ml_df,
    serve_ml_df,
]
student_df_feat = merge_dfs(group_dfs, student_df)
student_df_feat.head(2)

Unnamed: 0,What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,"Are you currently a student? (high school, university, or graduate)",On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Coursera,On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - edX,On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Kaggle Learn Courses,On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - DataCamp,On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Fast.ai,On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Udacity,...,Uses Computer Vision Algorithms,Uses NLP Algorithms,Uses Pretrained Models,Uses Cloud Compute,Uses Auto ML tools,Uses Business Intel tools,Uses Data Storage tools,Uses Data Products,Has Managed ML tools,Has Served ML Models
2,0,0,8,1,1,1,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
col_list = [
    platform_ds,
    study_ds,
    pgm_lang,
    ide,
    notebooks,
    data_viz,
    ml_framework,
    ml_algo,
    comp_vision,
    nlp_method,
    pretrained_model,
    cloud_compute,
    automate_ml,
    business_intel,
    data_storage,
    db_prod,
    managed_ml,
    serve_ml,
]
student_final_df = drop_cols(col_list, student_df_feat)
student_dict = gen_dict(student_final_df)

In [29]:
multivariate_cols = [0, 1, 2, 3, 4, 11, 12, 13, 14, 15, 16]
ohe_cols = [9, 10]

In [30]:
student_final = encode_and_bind(ohe_cols, student_dict, student_final_df)

In [31]:
student_final = drop_cols_index(ohe_cols, student_final)
student_final

Unnamed: 0,For how many years have you been writing code and/or programming?_0,For how many years have you been writing code and/or programming?_1,For how many years have you been writing code and/or programming?_2,For how many years have you been writing code and/or programming?_3,For how many years have you been writing code and/or programming?_4,For how many years have you been writing code and/or programming?_5,For how many years have you been writing code and/or programming?_6,For how many years have you been writing code and/or programming?_7,For how many years have you used machine learning methods?_0,For how many years have you used machine learning methods?_3,For how many years have you used machine learning methods?_4,For how many years have you used machine learning methods?_5,For how many years have you used machine learning methods?_6,For how many years have you used machine learning methods?_7,For how many years have you used machine learning methods?_8,For how many years have you used machine learning methods?_9
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [25]:
student_final

In [27]:
pca = PCA(n_components=3, random_state=42)
pca.fit_transform(s)
pca_by_column = ColumnTransformer(
    transformers=[
        (
            "platform_ds",
            pca,
            np.where(student_df.columns.str.contains("data science courses"))[0],
        ),
        (
            "study_ds",
            pca,
            np.where(student_df.columns.str.contains("data science courses"))[0],
        ),
        (
            "pgm_lang",
            pca,
            np.where(student_df.columns.str.contains("programming languages"))[0],
        ),
        # ('ide', pca, np.where(student_df.columns.str.contains('IDE\'s'))[0]),
        # ('notebooks', pca, np.where(student_df.columns.str.contains('hosted notebook'))[0]),
        # ('data viz', pca, np.where(student_df.columns.str.contains('data visualization'))[0]),
        # ('ml framework', pca, np.where(student_df.columns.str.contains('machine learning frameworks'))[0]),
        # ('ml algo', pca, np.where(student_df.columns.str.contains('ML algorithms'))[0]),
        # ('computer vision', pca, np.where(student_df.columns.str.contains('computer vision'))[0]),
        # ('nlp_method', pca, np.where(student_df.columns.str.contains('(NLP) methods'))[0]),
        # ('pretrained_model', pca, np.where(student_df.columns.str.contains('pre-trained model'))[0]),
        # ('cloud_compute', pca, np.where(student_df.columns.str.contains('cloud computing platforms'))[0]),
        # ('automated', pca, np.where(student_df.columns.str.contains('automated'))[0]),
        # ('business intel', pca, np.where(student_df.columns.str.contains('business intelligence'))[0]),
        # ('data_storage', pca, np.where(student_df.columns.str.contains('data storage'))[0]),
        # ('dp_prod', pca, np.where(student_df.columns.str.contains('data products'))[0]),
        # ('managed_ml', pca, np.where(student_df.columns.str.contains('managed machine learning'))[0]),
        # ('serve ml', pca, np.where(student_df.columns.str.contains('serve your machine'))[0]),
    ],
    remainder="passthrough",
)

prepData = Pipeline(steps=[("scaler", StandardScaler()), ("pca", pca_by_column)])

model = prepData.fit(student_df)
red_data = prepData.fit_transform(student_df)

In [None]:
np.where(student_df.columns.str.contains("data science courses"))[0]

array([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [None]:
student_df.shape

(11961, 186)

In [31]:
red_data

array([[ 4.91613367,  0.99721668, -0.88125764, ...,  0.        ,
         0.        ,  0.        ],
       [-1.47687467, -1.46559041,  2.29429647, ...,  0.        ,
         0.        ,  0.        ],
       [ 3.5705821 ,  1.17535212, -0.88125764, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-2.07022671,  2.33556772,  0.1772604 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.79243993, -0.1350453 ,  3.88207352, ...,  0.        ,
         0.        ,  0.        ],
       [-1.47687467, -1.46559041, -0.88125764, ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
model.named_steps["pca"].named_transformers_["platform_ds"].n_components

3

In [None]:
obj = model.named_steps["pca"].named_transformers_["platform_ds"]
feat_df = create_importance_dataframe(obj, student_df)

ValueError: DataFrame constructor not properly called!

### Non- Negative Matrix Factoization

In [44]:
nmf = NMF(n_components=3, init="random")
x = nmf.fit(student_final)
nmf_feats = nmf.transform(student_final)

In [45]:
nmf_featdf = create_importance_dataframe("NMF", x, student_final)

In [46]:
nmf_featdf["NMF Component1"].sort_values(ascending=False)[:30]

For how many years have you been writing code and/or programming?_0     9.880235e-01
For how many years have you used machine learning methods?_4            1.608721e-53
For how many years have you been writing code and/or programming?_7     1.340829e-68
For how many years have you used machine learning methods?_3           3.562225e-269
For how many years have you been writing code and/or programming?_1     0.000000e+00
For how many years have you been writing code and/or programming?_2     0.000000e+00
For how many years have you been writing code and/or programming?_3     0.000000e+00
For how many years have you been writing code and/or programming?_4     0.000000e+00
For how many years have you been writing code and/or programming?_5     0.000000e+00
For how many years have you been writing code and/or programming?_6     0.000000e+00
For how many years have you used machine learning methods?_0            0.000000e+00
For how many years have you used machine learning methods?_5     

In [47]:
nmf_featdf["NMF Component2"].sort_values(ascending=False)[:30]

For how many years have you used machine learning methods?_3           1.233520e+00
For how many years have you been writing code and/or programming?_2    6.995739e-03
For how many years have you been writing code and/or programming?_4    9.147270e-22
For how many years have you used machine learning methods?_6           2.797561e-53
For how many years have you been writing code and/or programming?_6    5.194361e-68
For how many years have you used machine learning methods?_5           1.056713e-77
For how many years have you been writing code and/or programming?_0    0.000000e+00
For how many years have you been writing code and/or programming?_1    0.000000e+00
For how many years have you been writing code and/or programming?_3    0.000000e+00
For how many years have you been writing code and/or programming?_5    0.000000e+00
For how many years have you been writing code and/or programming?_7    0.000000e+00
For how many years have you used machine learning methods?_0           0.000

In [48]:
nmf_featdf["NMF Component3"].sort_values(ascending=False)[:30]

For how many years have you used machine learning methods?_0            9.503088e-01
For how many years have you been writing code and/or programming?_1     8.983053e-09
For how many years have you been writing code and/or programming?_4     4.639142e-47
For how many years have you been writing code and/or programming?_3     5.441628e-55
For how many years have you used machine learning methods?_6            8.107837e-80
For how many years have you been writing code and/or programming?_5     1.458624e-83
For how many years have you used machine learning methods?_7            6.998600e-95
For how many years have you been writing code and/or programming?_6     5.165963e-95
For how many years have you used machine learning methods?_8           4.370284e-138
For how many years have you used machine learning methods?_9           6.485816e-223
For how many years have you been writing code and/or programming?_0     0.000000e+00
For how many years have you been writing code and/or programming?

##### test_df.shape