In [1]:
import pandas as pd
import numpy as np

In [15]:
def missing_values_analysis(df):
    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
    n_miss = df[na_columns_].isnull().sum().sort_values(ascending=False)
    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df).sort_values(by="Ratio", ascending=False)
    return missing_df


def check_df(dataframe, head=5, tail=5):
    print("*" * 70)
    print(" Shape ".center(70, "*"))
    print("*" * 70)
    print(dataframe.shape)

    print("*" * 70)
    print(" Types ".center(70, "*"))
    print("*" * 70)
    print(dataframe.dtypes)

    print("*" * 70)
    print(" Head ".center(70, "*"))
    print("*" * 70)
    print(dataframe.head(head))

    print("*" * 70)
    print(" Tail ".center(70, "*"))
    print("*" * 70)
    print(dataframe.tail(tail))

    print("*" * 70)
    print(" NA ".center(70, "*"))
    print("*" * 70)
    print(missing_values_analysis(dataframe))

    print("*" * 70)
    print(" Quantiles ".center(70, "*"))
    print("*" * 70)
    print(dataframe.describe([.01, .05, .1, .5, .9, .95, .99]).T)

    print("*" * 70)
    print(" Duplicate Rows ".center(70, "*"))
    print("*" * 70)
    print(dataframe.duplicated().sum())

    print("*" * 70)
    print(" Uniques ".center(70, "*"))
    print("*" * 70)
    print(dataframe.nunique())

In [2]:
train = pd.read_csv("./train_user_lang_work_skill_education.csv")

In [3]:
test = pd.read_csv("./test_user_lang_work_skill_education.csv")

In [None]:
def impute_and_scaled_model():
    from sklearn.model_selection import KFold
    from catboost import CatBoostClassifier
    import lightgbm as lgb
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
    import re

    

    train = pd.read_csv("./train_user_lang_work_skill.csv")



    X_train_user_id = train["user_id"]

    y = train["moved_after_2019"]
    X = train.drop(columns=["moved_after_2019", "user_id"])
    X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    auc_errs = []
    ap_errs = []
    prec_errs = []
    recall_errs = []
    f1_errs = []
    acc_scores = []

    models = []

    kf = KFold(n_splits=10)

    for split_train, split_val in list(kf.split(X)):
        
        split_train = X.index[split_train]
        split_val = X.index[split_val]
        
        X_train, y_train = X.loc[split_train], y.loc[split_train]
        X_val, y_val = X.loc[split_val], y.loc[split_val]
        print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
        print("Positive Count in Val Split:", y_val.sum())

        model = lgb.LGBMClassifier()
        

        model.fit(X_train, y_train, eval_metric = 'accuracy',
                    eval_set = [(X_val, y_val), (X_train, y_train)],
                    eval_names = ['valid', 'train'],
                    early_stopping_rounds = 100, verbose = 200)
        
        preds = model.predict(X_val)
        pred_probas = model.predict_proba(X_val)[:,1]

        print("Positive Count in Predictions:", preds.sum())
        
        acc = accuracy_score(y_val, preds)
        print("Accuracy: ", acc)
        acc_scores.append(acc)
        
        f1_err = f1_score(y_val, preds)
        print("Fold F1: ", f1_err)
        f1_errs.append(f1_err)

        prec_err = precision_score(y_val, preds)
        print("Fold Precision: ", prec_err)
        prec_errs.append(prec_err)

        recall_err = recall_score(y_val, preds)
        print("Fold Recall: ", recall_err)
        recall_errs.append(recall_err)
        
        auc_err = roc_auc_score(y_val, pred_probas)
        print("Fold AUC: ", auc_err)
        auc_errs.append(auc_err)

        ap_err = average_precision_score(y_val, pred_probas)
        print("Fold AP: ", ap_err)
        ap_errs.append(ap_err)
        
        models.append(model)
        
        print("\n", "*"*70, "\n")

    import plotly.express as px
    importance = [model.feature_importances_ for model in models]

    f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                            pd.Series(np.mean(importance,axis=0),name="Importance")],
                            axis=1).sort_values(by='Importance',
                                                ascending=True)

    fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
    fig.update_layout(
        title_text="First 20 Important Features - CatBoost Average of Folds"
    )
    fig.show()

    return models


        

        

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost
import lightgbm as lgb

In [5]:
train.shape

(53019, 346)

In [8]:
data1 = train.drop(columns=["user_id", "moved_after_2019"])
data2 = test.drop(columns="user_id")

In [11]:
imp_mean = IterativeImputer(
    estimator=xgboost.XGBRegressor(
        n_estimators=5,
        random_state=1,
        tree_method='gpu_hist',
    ),
    missing_values=np.nan,
    max_iter=5,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=2,
    random_state=1
)
imp_mean.fit(data1)
imp_mean_train = imp_mean.transform(data1)
imp_mean_test = imp_mean.transform(data2)

[IterativeImputer] Completing matrix with shape (53019, 344)
[IterativeImputer] Ending imputation round 1/5, elapsed time 338.92
[IterativeImputer] Change: 74.21400869475927, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 2/5, elapsed time 707.93
[IterativeImputer] Change: 38.85702480375767, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 3/5, elapsed time 1069.07
[IterativeImputer] Change: 7.567001298069954, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 4/5, elapsed time 1405.20
[IterativeImputer] Change: 9.054042220115662, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 5/5, elapsed time 1763.23
[IterativeImputer] Change: 4.778707787394524, scaled tolerance: 0.729 




[IterativeImputer] Completing matrix with shape (53019, 344)
[IterativeImputer] Ending imputation round 1/5, elapsed time 48.33
[IterativeImputer] Ending imputation round 2/5, elapsed time 75.88
[IterativeImputer] Ending imputation round 3/5, elapsed time 100.71
[IterativeImputer] Ending imputation round 4/5, elapsed time 130.76
[IterativeImputer] Ending imputation round 5/5, elapsed time 157.02
[IterativeImputer] Completing matrix with shape (13255, 344)
[IterativeImputer] Ending imputation round 1/5, elapsed time 9.40
[IterativeImputer] Ending imputation round 2/5, elapsed time 18.69
[IterativeImputer] Ending imputation round 3/5, elapsed time 30.36
[IterativeImputer] Ending imputation round 4/5, elapsed time 41.21
[IterativeImputer] Ending imputation round 5/5, elapsed time 50.52


In [16]:
check_df(data1)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(53019, 344)
**********************************************************************
******************************* Types ********************************
**********************************************************************
industry_-1                           int64
industry_Accounting                   int64
industry_Airlines/Aviation            int64
industry_Apparel & Fashion            int64
industry_Architecture & Planning      int64
                                     ...   
school_name_count_norm              float64
degree_count                        float64
degree_count_norm                   float64
fields_of_study_count               float64
fields_of_study_count_norm          float64
Length: 344, dtype: object
**************************************************

In [23]:
df_iter_imputed_train = pd.DataFrame(imp_mean_train, columns=data1.columns)

In [24]:
check_df(df_iter_imputed_train)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(53019, 344)
**********************************************************************
******************************* Types ********************************
**********************************************************************
industry_-1                         float64
industry_Accounting                 float64
industry_Airlines/Aviation          float64
industry_Apparel & Fashion          float64
industry_Architecture & Planning    float64
                                     ...   
school_name_count_norm              float64
degree_count                        float64
degree_count_norm                   float64
fields_of_study_count               float64
fields_of_study_count_norm          float64
Length: 344, dtype: object
**************************************************

In [25]:
df_iter_imputed_test = pd.DataFrame(imp_mean_test, columns=data2.columns)

In [26]:
check_df(df_iter_imputed_test)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(13255, 344)
**********************************************************************
******************************* Types ********************************
**********************************************************************
industry_-1                         float64
industry_Accounting                 float64
industry_Airlines/Aviation          float64
industry_Apparel & Fashion          float64
industry_Architecture & Planning    float64
                                     ...   
school_name_count_norm              float64
degree_count                        float64
degree_count_norm                   float64
fields_of_study_count               float64
fields_of_study_count_norm          float64
Length: 344, dtype: object
**************************************************

In [27]:
df_iter_imputed_train.head()

Unnamed: 0,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,...,company_id_count,company_id_count_norm,skill_count,skill_count_norm,school_name_count,school_name_count_norm,degree_count,degree_count_norm,fields_of_study_count,fields_of_study_count_norm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.945804,0.736451,8.396971,0.365086,0.834533,0.417267,4.0,2.0,0.807414,0.403707
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.090652,0.515109,19.977311,0.4077,0.43793,0.43793,2.0,2.0,0.403675,0.403675
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.625,0.15625,8.619874,0.41047,0.752864,0.376432,5.0,2.5,0.787145,0.393573
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.825863,0.456466,9.867496,0.328917,1.25035,0.416783,6.0,3.0,1.300354,0.433451
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.132618,0.377539,19.166329,0.399299,0.762746,0.381373,0.0,0.085146,0.810272,0.405136


In [31]:
df_iter_imputed_test.shape


(13255, 344)

In [33]:
type(df_iter_imputed_train)

pandas.core.frame.DataFrame

In [35]:
df_iter_imputed_train.to_csv("imputed_train.csv", index=False)
df_iter_imputed_test.to_csv("imputed_test.csv", index=False)


In [30]:
data2.columns

Index(['industry_-1', 'industry_Accounting', 'industry_Airlines/Aviation',
       'industry_Apparel & Fashion', 'industry_Architecture & Planning',
       'industry_Automotive', 'industry_Aviation & Aerospace',
       'industry_Banking', 'industry_Biotechnology',
       'industry_Broadcast Media',
       ...
       'company_id_count', 'company_id_count_norm', 'skill_count',
       'skill_count_norm', 'school_name_count', 'school_name_count_norm',
       'degree_count', 'degree_count_norm', 'fields_of_study_count',
       'fields_of_study_count_norm'],
      dtype='object', length=344)

In [17]:
imp_mean_train.shape

(53019, 344)

In [18]:
imp_mean_test.shape

(13255, 344)

In [19]:
type(imp_mean_test)

numpy.ndarray