In [1]:
import pandas as pd
import numpy as np

In [8]:
def missing_values_analysis(df):
    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
    n_miss = df[na_columns_].isnull().sum().sort_values(ascending=False)
    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df).sort_values(by="Ratio", ascending=False)
    return missing_df


def check_df(dataframe, head=5, tail=5):
    print("*" * 70)
    print(" Shape ".center(70, "*"))
    print("*" * 70)
    print(dataframe.shape)

    print("*" * 70)
    print(" Types ".center(70, "*"))
    print("*" * 70)
    print(dataframe.dtypes)

    print("*" * 70)
    print(" Head ".center(70, "*"))
    print("*" * 70)
    print(dataframe.head(head))

    print("*" * 70)
    print(" Tail ".center(70, "*"))
    print("*" * 70)
    print(dataframe.tail(tail))

    print("*" * 70)
    print(" NA ".center(70, "*"))
    print("*" * 70)
    print(missing_values_analysis(dataframe))

    print("*" * 70)
    print(" Quantiles ".center(70, "*"))
    print("*" * 70)
    print(dataframe.describe([.01, .05, .1, .5, .9, .95, .99]).T)

    print("*" * 70)
    print(" Duplicate Rows ".center(70, "*"))
    print("*" * 70)
    print(dataframe.duplicated().sum())

    print("*" * 70)
    print(" Uniques ".center(70, "*"))
    print("*" * 70)
    print(dataframe.nunique())

In [10]:
train = pd.read_csv("./train_user_lang_work_skill_education.csv")

In [11]:
test = pd.read_csv("./test_user_lang_work_skill_education.csv")

In [64]:
def impute_and_scaled_model(train):
    from sklearn.model_selection import KFold
    from catboost import CatBoostClassifier
    import lightgbm as lgb
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
    import re
    import xgboost as xgb
    from sklearn.ensemble import AdaBoostClassifier
    from catboost import CatBoostClassifier




    X_train_user_id = train["user_id"]

    y = train["moved_after_2019"]
    X = train.drop(columns=["moved_after_2019", "user_id"])

    auc_errs = []
    ap_errs = []
    prec_errs = []
    recall_errs = []
    f1_errs = []
    acc_scores = []

    models = []

    kf = KFold(n_splits=10)

    for split_train, split_val in list(kf.split(X)):
        
        split_train = X.index[split_train]
        split_val = X.index[split_val]
        
        X_train, y_train = X.loc[split_train], y.loc[split_train]
        X_val, y_val = X.loc[split_val], y.loc[split_val]
        print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
        print("Positive Count in Val Split:", y_val.sum())

        model = CatBoostClassifier(
            **{'learning_rate': 0.01,
            'depth': 2,
            'iterations': 100000,
            'use_best_model': True,
            'eval_metric': "F1:use_weights=False",
            'scale_pos_weight': 5,
            'random_state': 986,
            'allow_writing_files': False,
            'thread_count': 12
            },
            early_stopping_rounds=2500,
        )
            

        model.fit(
            X_train,
            y_train,
            eval_set=(X_val, y_val),
            verbose=1500,
        )
        
        preds = model.predict(X_val)
        pred_probas = model.predict_proba(X_val)[:,1]

        print("Positive Count in Predictions:", preds.sum())
        
        acc = accuracy_score(y_val, preds)
        print("Accuracy: ", acc)
        acc_scores.append(acc)
        
        f1_err = f1_score(y_val, preds)
        print("Fold F1: ", f1_err)
        f1_errs.append(f1_err)

        prec_err = precision_score(y_val, preds)
        print("Fold Precision: ", prec_err)
        prec_errs.append(prec_err)

        recall_err = recall_score(y_val, preds)
        print("Fold Recall: ", recall_err)
        recall_errs.append(recall_err)
        
        auc_err = roc_auc_score(y_val, pred_probas)
        print("Fold AUC: ", auc_err)
        auc_errs.append(auc_err)

        ap_err = average_precision_score(y_val, pred_probas)
        print("Fold AP: ", ap_err)
        ap_errs.append(ap_err)
        
        models.append(model)
        
        print("\n", "*"*70, "\n")

    import plotly.express as px
    importance = [model.feature_importances_ for model in models]

    f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                            pd.Series(np.mean(importance,axis=0),name="Importance")],
                            axis=1).sort_values(by='Importance',
                                                ascending=True)

    fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
    fig.update_layout(
        title_text="First 20 Important Features - CatBoost Average of Folds"
    )
    fig.show()

    return models


        

        

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost
import lightgbm as lgb

In [5]:
train.shape

(53019, 346)

In [8]:
data1 = train.drop(columns=["user_id", "moved_after_2019"])
data2 = test.drop(columns="user_id")

In [11]:
imp_mean = IterativeImputer(
    estimator=xgboost.XGBRegressor(
        n_estimators=5,
        random_state=1,
        tree_method='gpu_hist',
    ),
    missing_values=np.nan,
    max_iter=5,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=2,
    random_state=1
)
imp_mean.fit(data1)
imp_mean_train = imp_mean.transform(data1)
imp_mean_test = imp_mean.transform(data2)

[IterativeImputer] Completing matrix with shape (53019, 344)
[IterativeImputer] Ending imputation round 1/5, elapsed time 338.92
[IterativeImputer] Change: 74.21400869475927, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 2/5, elapsed time 707.93
[IterativeImputer] Change: 38.85702480375767, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 3/5, elapsed time 1069.07
[IterativeImputer] Change: 7.567001298069954, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 4/5, elapsed time 1405.20
[IterativeImputer] Change: 9.054042220115662, scaled tolerance: 0.729 
[IterativeImputer] Ending imputation round 5/5, elapsed time 1763.23
[IterativeImputer] Change: 4.778707787394524, scaled tolerance: 0.729 




[IterativeImputer] Completing matrix with shape (53019, 344)
[IterativeImputer] Ending imputation round 1/5, elapsed time 48.33
[IterativeImputer] Ending imputation round 2/5, elapsed time 75.88
[IterativeImputer] Ending imputation round 3/5, elapsed time 100.71
[IterativeImputer] Ending imputation round 4/5, elapsed time 130.76
[IterativeImputer] Ending imputation round 5/5, elapsed time 157.02
[IterativeImputer] Completing matrix with shape (13255, 344)
[IterativeImputer] Ending imputation round 1/5, elapsed time 9.40
[IterativeImputer] Ending imputation round 2/5, elapsed time 18.69
[IterativeImputer] Ending imputation round 3/5, elapsed time 30.36
[IterativeImputer] Ending imputation round 4/5, elapsed time 41.21
[IterativeImputer] Ending imputation round 5/5, elapsed time 50.52


In [16]:
check_df(data1)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(53019, 344)
**********************************************************************
******************************* Types ********************************
**********************************************************************
industry_-1                           int64
industry_Accounting                   int64
industry_Airlines/Aviation            int64
industry_Apparel & Fashion            int64
industry_Architecture & Planning      int64
                                     ...   
school_name_count_norm              float64
degree_count                        float64
degree_count_norm                   float64
fields_of_study_count               float64
fields_of_study_count_norm          float64
Length: 344, dtype: object
**************************************************

In [23]:
df_iter_imputed_train = pd.DataFrame(imp_mean_train, columns=data1.columns)

In [24]:
check_df(df_iter_imputed_train)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(53019, 344)
**********************************************************************
******************************* Types ********************************
**********************************************************************
industry_-1                         float64
industry_Accounting                 float64
industry_Airlines/Aviation          float64
industry_Apparel & Fashion          float64
industry_Architecture & Planning    float64
                                     ...   
school_name_count_norm              float64
degree_count                        float64
degree_count_norm                   float64
fields_of_study_count               float64
fields_of_study_count_norm          float64
Length: 344, dtype: object
**************************************************

In [25]:
df_iter_imputed_test = pd.DataFrame(imp_mean_test, columns=data2.columns)

In [26]:
check_df(df_iter_imputed_test)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(13255, 344)
**********************************************************************
******************************* Types ********************************
**********************************************************************
industry_-1                         float64
industry_Accounting                 float64
industry_Airlines/Aviation          float64
industry_Apparel & Fashion          float64
industry_Architecture & Planning    float64
                                     ...   
school_name_count_norm              float64
degree_count                        float64
degree_count_norm                   float64
fields_of_study_count               float64
fields_of_study_count_norm          float64
Length: 344, dtype: object
**************************************************

In [27]:
df_iter_imputed_train.head()

Unnamed: 0,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,...,company_id_count,company_id_count_norm,skill_count,skill_count_norm,school_name_count,school_name_count_norm,degree_count,degree_count_norm,fields_of_study_count,fields_of_study_count_norm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.945804,0.736451,8.396971,0.365086,0.834533,0.417267,4.0,2.0,0.807414,0.403707
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.090652,0.515109,19.977311,0.4077,0.43793,0.43793,2.0,2.0,0.403675,0.403675
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.625,0.15625,8.619874,0.41047,0.752864,0.376432,5.0,2.5,0.787145,0.393573
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.825863,0.456466,9.867496,0.328917,1.25035,0.416783,6.0,3.0,1.300354,0.433451
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.132618,0.377539,19.166329,0.399299,0.762746,0.381373,0.0,0.085146,0.810272,0.405136


In [31]:
df_iter_imputed_test.shape


(13255, 344)

In [33]:
type(df_iter_imputed_train)

pandas.core.frame.DataFrame

In [35]:
df_iter_imputed_train.to_csv("imputed_train.csv", index=False)
df_iter_imputed_test.to_csv("imputed_test.csv", index=False)


In [4]:
train_imputed = pd.read_csv("./imputed_train.csv")
test_imputed = pd.read_csv("./imputed_test.csv")

In [5]:
train_imputed.shape

(53019, 344)

In [6]:
test_imputed.shape

(13255, 344)

In [14]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [15]:
train[train.profiency_proficiency_count.isna()==True]


Unnamed: 0,user_id,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands,user_location_nevsehir,user_location_ordu,user_location_philippines,user_location_poland,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia,user_location_sweden,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom,user_location_united states,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,profiency_proficiency_count,profiency_proficiency_mean,profiency_proficiency_max,profiency_proficiency_min,profiency_proficiency_sum,language_albanian_count,language_albanian_count_norm,language_almanca_count,language_almanca_count_norm,language_arapca_count,language_arapca_count_norm,language_armenian_count,language_armenian_count_norm,language_arnavutca_count,language_arnavutca_count_norm,language_azerice_count,language_azerice_count_norm,language_bosnakca_count,language_bosnakca_count_norm,language_bulgarca_count,language_bulgarca_count_norm,language_cince_count,language_cince_count_norm,language_czech_count,language_czech_count_norm,language_dutch_count,language_dutch_count_norm,language_farsca_count,language_farsca_count_norm,language_fince_count,language_fince_count_norm,language_fransizca_count,language_fransizca_count_norm,language_greek_count,language_greek_count_norm,language_hebrew_count,language_hebrew_count_norm,language_ingilizce_count,language_ingilizce_count_norm,language_isaret dilleri_count,language_isaret dilleri_count_norm,language_ispanyolca_count,language_ispanyolca_count_norm,language_italyanca_count,language_italyanca_count_norm,language_japonca_count,language_japonca_count_norm,language_kazakca_count,language_kazakca_count_norm,language_korece_count,language_korece_count_norm,language_kurtce_count,language_kurtce_count_norm,language_latince_count,language_latince_count_norm,language_lehce_count,language_lehce_count_norm,language_other_count,language_other_count_norm,language_persian_count,language_persian_count_norm,language_polish_count,language_polish_count_norm,language_portekizce_count,language_portekizce_count_norm,language_rusca_count,language_rusca_count_norm,language_sirpca_count,language_sirpca_count_norm,language_swedish_count,language_swedish_count_norm,language_turkce_count,language_turkce_count_norm,language_urdu_count,language_urdu_count_norm,language_uzbek_count,language_uzbek_count_norm,language_count,user_work_start_year_count,user_work_start_month_count,user_work_start_month_mean,user_work_start_month_max,user_work_start_month_min,user_work_start_month_sum,user_worked_company_count_count,user_worked_company_count_mean,user_worked_company_count_max,user_worked_company_count_min,user_worked_company_count_sum,user_changed_work_count_count,user_changed_work_count_mean,user_changed_work_count_max,user_changed_work_count_min,user_changed_work_count_sum,work_location_adana_count,work_location_adana_count_norm,work_location_ankara_count,work_location_ankara_count_norm,work_location_antalya_count,work_location_antalya_count_norm,work_location_area_count,work_location_area_count_norm,work_location_bursa_count,work_location_bursa_count_norm,work_location_elazig_count,work_location_elazig_count_norm,work_location_eskisehir_count,work_location_eskisehir_count_norm,work_location_gebze_count,work_location_gebze_count_norm,work_location_germany_count,work_location_germany_count_norm,work_location_istanbul_count,work_location_istanbul_count_norm,work_location_izmir_count,work_location_izmir_count_norm,work_location_kayseri_count,work_location_kayseri_count_norm,work_location_kocaeli_count,work_location_kocaeli_count_norm,work_location_konya_count,work_location_konya_count_norm,work_location_manisa_count,work_location_manisa_count_norm,work_location_mersin_count,work_location_mersin_count_norm,work_location_mugla_count,work_location_mugla_count_norm,work_location_sakarya_count,work_location_sakarya_count_norm,work_location_tekirdag_count,work_location_tekirdag_count_norm,work_location_trabzon_count,work_location_trabzon_count_norm,work_location_turkey_count,work_location_turkey_count_norm,work_location_turkiye_count,work_location_turkiye_count_norm,work_location_united kingdom_count,work_location_united kingdom_count_norm,work_location_united states_count,work_location_united states_count_norm,company_id_count,company_id_count_norm,skill_count,skill_count_norm,school_name_count,school_name_count_norm,degree_count,degree_count_norm,fields_of_study_count,fields_of_study_count_norm
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,4.0,4.250000,8.0,1.0,17.0,4.0,4.0,4.0,4.0,16.0,4.0,4.0,4.0,4.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625000,0.156250,8.619874,0.410470,0.752864,0.376432,5.0,2.5,0.787145,0.393573
5,47498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,10.000000,10.0,10.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.166667,14.500080,0.467745,0.386371,0.386371,3.0,3.0,0.391145,0.391145
10,65923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,11.500000,12.0,11.0,23.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125000,0.062500,1.221627,0.152703,1.536660,0.384165,14.0,3.5,1.536778,0.384194
12,33971,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,5.000000,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,1.000000,4.159028,0.462114,0.990261,0.330087,6.0,3.0,1.024783,0.341594
14,40150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,7.000000,7.0,7.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500000,0.500000,5.527332,0.425179,0.780224,0.390112,0.0,,0.771625,0.385812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53009,41192,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,7.000000,7.0,7.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.558559,0.558559,1.457962,0.485987,0.393535,0.393535,3.0,3.0,0.403675,0.403675
53011,855,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.000000,8.0,6.0,21.0,3.0,3.0,3.0,3.0,9.0,3.0,3.0,3.0,3.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.789671,0.596557,8.732455,0.485136,0.426426,0.426426,3.0,3.0,0.403739,0.403739
53012,14909,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,5.666667,11.0,2.0,17.0,3.0,3.0,3.0,3.0,9.0,3.0,3.0,3.0,3.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.425474,0.475158,0.250454,0.250454,0.776940,0.388470,4.0,2.0,0.768753,0.384377
53013,20367,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.646957,0.455870,0.373806,0.373806,4.0,4.0,0.403739,0.403739


In [18]:
train_imputed["user_id"] = train["user_id"]
test_imputed["user_id"] = test["user_id"]

In [20]:
train_imputed[train_imputed.user_id == 4880]

Unnamed: 0,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands,user_location_nevsehir,user_location_ordu,user_location_philippines,user_location_poland,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia,user_location_sweden,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom,user_location_united states,user_location_van,user_location_yalova,user_location_zonguldak,profiency_proficiency_count,profiency_proficiency_mean,profiency_proficiency_max,profiency_proficiency_min,profiency_proficiency_sum,language_albanian_count,language_albanian_count_norm,language_almanca_count,language_almanca_count_norm,language_arapca_count,language_arapca_count_norm,language_armenian_count,language_armenian_count_norm,language_arnavutca_count,language_arnavutca_count_norm,language_azerice_count,language_azerice_count_norm,language_bosnakca_count,language_bosnakca_count_norm,language_bulgarca_count,language_bulgarca_count_norm,language_cince_count,language_cince_count_norm,language_czech_count,language_czech_count_norm,language_dutch_count,language_dutch_count_norm,language_farsca_count,language_farsca_count_norm,language_fince_count,language_fince_count_norm,language_fransizca_count,language_fransizca_count_norm,language_greek_count,language_greek_count_norm,language_hebrew_count,language_hebrew_count_norm,language_ingilizce_count,language_ingilizce_count_norm,language_isaret dilleri_count,language_isaret dilleri_count_norm,language_ispanyolca_count,language_ispanyolca_count_norm,language_italyanca_count,language_italyanca_count_norm,language_japonca_count,language_japonca_count_norm,language_kazakca_count,language_kazakca_count_norm,language_korece_count,language_korece_count_norm,language_kurtce_count,language_kurtce_count_norm,language_latince_count,language_latince_count_norm,language_lehce_count,language_lehce_count_norm,language_other_count,language_other_count_norm,language_persian_count,language_persian_count_norm,language_polish_count,language_polish_count_norm,language_portekizce_count,language_portekizce_count_norm,language_rusca_count,language_rusca_count_norm,language_sirpca_count,language_sirpca_count_norm,language_swedish_count,language_swedish_count_norm,language_turkce_count,language_turkce_count_norm,language_urdu_count,language_urdu_count_norm,language_uzbek_count,language_uzbek_count_norm,language_count,user_work_start_year_count,user_work_start_month_count,user_work_start_month_mean,user_work_start_month_max,user_work_start_month_min,user_work_start_month_sum,user_worked_company_count_count,user_worked_company_count_mean,user_worked_company_count_max,user_worked_company_count_min,user_worked_company_count_sum,user_changed_work_count_count,user_changed_work_count_mean,user_changed_work_count_max,user_changed_work_count_min,user_changed_work_count_sum,work_location_adana_count,work_location_adana_count_norm,work_location_ankara_count,work_location_ankara_count_norm,work_location_antalya_count,work_location_antalya_count_norm,work_location_area_count,work_location_area_count_norm,work_location_bursa_count,work_location_bursa_count_norm,work_location_elazig_count,work_location_elazig_count_norm,work_location_eskisehir_count,work_location_eskisehir_count_norm,work_location_gebze_count,work_location_gebze_count_norm,work_location_germany_count,work_location_germany_count_norm,work_location_istanbul_count,work_location_istanbul_count_norm,work_location_izmir_count,work_location_izmir_count_norm,work_location_kayseri_count,work_location_kayseri_count_norm,work_location_kocaeli_count,work_location_kocaeli_count_norm,work_location_konya_count,work_location_konya_count_norm,work_location_manisa_count,work_location_manisa_count_norm,work_location_mersin_count,work_location_mersin_count_norm,work_location_mugla_count,work_location_mugla_count_norm,work_location_sakarya_count,work_location_sakarya_count_norm,work_location_tekirdag_count,work_location_tekirdag_count_norm,work_location_trabzon_count,work_location_trabzon_count_norm,work_location_turkey_count,work_location_turkey_count_norm,work_location_turkiye_count,work_location_turkiye_count_norm,work_location_united kingdom_count,work_location_united kingdom_count_norm,work_location_united states_count,work_location_united states_count_norm,company_id_count,company_id_count_norm,skill_count,skill_count_norm,school_name_count,school_name_count_norm,degree_count,degree_count_norm,fields_of_study_count,fields_of_study_count_norm,user_id
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.915944,0.086339,0.086339,0.134359,0.086602,0.084041,0.084041,0.916102,0.084042,0.915733,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084046,0.084041,0.084041,0.084041,0.084041,0.084041,0.084069,0.084041,0.913434,0.084041,0.084063,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.91653,0.085463,0.084041,0.084041,0.915833,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084041,0.084052,0.084041,0.084041,0.084041,0.91512,0.084041,0.084041,0.084041,0.084075,0.084041,0.084059,0.084041,0.915842,0.084041,0.084041,0.084041,0.084041,0.084041,0.915954,0.08405,0.084047,0.084041,0.084041,0.084041,0.915944,4.0,4.0,4.25,8.0,1.0,17.0,4.0,4.0,4.0,4.0,16.0,4.0,4.0,4.0,4.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.15625,8.619874,0.41047,0.752864,0.376432,5.0,2.5,0.787145,0.393573,4880


In [21]:
train_imputed["moved_after_2019"] = train["moved_after_2019"]


In [33]:
models = impute_and_scaled_model(train_imputed)

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2070
[0]	validation_0-logloss:0.53609	validation_1-logloss:0.53097
[99]	validation_0-logloss:0.23891	validation_1-logloss:0.17709
Positive Count in Predictions: 1937
Accuracy:  0.8896642776310826
Fold F1:  0.8540054903918144
Fold Precision:  0.8833247289623128
Fold Recall:  0.8265700483091788
Fold AUC:  0.9621862593868082
Fold AP:  0.9466992570418715

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2063




[0]	validation_0-logloss:0.53367	validation_1-logloss:0.53166
[99]	validation_0-logloss:0.24046	validation_1-logloss:0.17677
Positive Count in Predictions: 2005
Accuracy:  0.8924933987174651
Fold F1:  0.859882005899705
Fold Precision:  0.8723192019950124
Fold Recall:  0.8477944740668929
Fold AUC:  0.9619248982760847
Fold AP:  0.9467910759349235

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 1989




[0]	validation_0-logloss:0.53428	validation_1-logloss:0.53196
[99]	validation_0-logloss:0.23068	validation_1-logloss:0.17486
Positive Count in Predictions: 1930
Accuracy:  0.8941908713692946
Fold F1:  0.8568512375606021
Fold Precision:  0.8699481865284974
Fold Recall:  0.8441427853192559
Fold AUC:  0.9638740510173901
Fold AP:  0.946334496808511

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 1990




[0]	validation_0-logloss:0.53328	validation_1-logloss:0.53165
[99]	validation_0-logloss:0.23362	validation_1-logloss:0.17782
Positive Count in Predictions: 1895
Accuracy:  0.8919275745001886
Fold F1:  0.8525096525096525
Fold Precision:  0.8738786279683377
Fold Recall:  0.8321608040201005
Fold AUC:  0.9629075328332485
Fold AP:  0.94551127588204

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2061




[0]	validation_0-logloss:0.53571	validation_1-logloss:0.53173
[99]	validation_0-logloss:0.23693	validation_1-logloss:0.17329
Positive Count in Predictions: 1986
Accuracy:  0.8941908713692946
Fold F1:  0.8613787991104522
Fold Precision:  0.877643504531722
Fold Recall:  0.8457059679767104
Fold AUC:  0.9632238029816006
Fold AP:  0.9469357772494043

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2025




[0]	validation_0-logloss:0.53454	validation_1-logloss:0.53163
[99]	validation_0-logloss:0.24320	validation_1-logloss:0.17364
Positive Count in Predictions: 1939
Accuracy:  0.8928706148623161
Fold F1:  0.856710393541877
Fold Precision:  0.8757091284167097
Fold Recall:  0.8385185185185186
Fold AUC:  0.9608768935755
Fold AP:  0.9436642862376754

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2095




[0]	validation_0-logloss:0.53275	validation_1-logloss:0.53234
[99]	validation_0-logloss:0.23210	validation_1-logloss:0.17634
Positive Count in Predictions: 1981
Accuracy:  0.8962655601659751
Fold F1:  0.865063788027478
Fold Precision:  0.889954568399798
Fold Recall:  0.841527446300716
Fold AUC:  0.9647334701164592
Fold AP:  0.951152179431564

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2069




[0]	validation_0-logloss:0.53222	validation_1-logloss:0.53217
[99]	validation_0-logloss:0.23084	validation_1-logloss:0.17651
Positive Count in Predictions: 1999
Accuracy:  0.8921161825726142
Fold F1:  0.8593903638151426
Fold Precision:  0.8744372186093047
Fold Recall:  0.8448525857902368
Fold AUC:  0.9643641417194032
Fold AP:  0.9503264274358333

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2069




[0]	validation_0-logloss:0.53039	validation_1-logloss:0.53231
[99]	validation_0-logloss:0.21986	validation_1-logloss:0.18053
Positive Count in Predictions: 1976
Accuracy:  0.9017351942663145
Fold F1:  0.8711990111248455
Fold Precision:  0.8917004048582996
Fold Recall:  0.8516191396810053
Fold AUC:  0.968417167271359
Fold AP:  0.954824430128313

 ********************************************************************** 

Train shape: (47718, 344) | Val Shape: (5301, 344)
Positive Count in Val Split: 2092




[0]	validation_0-logloss:0.53591	validation_1-logloss:0.53182
[99]	validation_0-logloss:0.24186	validation_1-logloss:0.17715
Positive Count in Predictions: 1990
Accuracy:  0.8853046594982079
Fold F1:  0.8510534051935325
Fold Precision:  0.8728643216080402
Fold Recall:  0.8303059273422562
Fold AUC:  0.9616591303021438
Fold AP:  0.9458703213223887

 ********************************************************************** 



In [34]:

test_imputed.drop(columns=["user_id"],inplace=True)


In [35]:
model_preds = [model.predict(test_imputed) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 0])]

In [36]:
sample_submission = pd.DataFrame()
df = pd.read_csv("./data/test_users.csv")
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [37]:
sample_submission.moved_after_2019.value_counts()


0    10149
1     3106
Name: moved_after_2019, dtype: int64

In [38]:
sample_submission.to_csv('submission.csv',index=False)


In [39]:
models = impute_and_scaled_model(train)

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2070



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53308	validation_1-logloss:0.52850
[99]	validation_0-logloss:0.23749	validation_1-logloss:0.17050
Positive Count in Predictions: 1962
Accuracy:  0.8902301018483592
Fold F1:  0.855654761904762
Fold Precision:  0.8792048929663608
Fold Recall:  0.8333333333333334
Fold AUC:  0.9626493219974173
Fold AP:  0.9473829959941733

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2063



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53231	validation_1-logloss:0.52917
[99]	validation_0-logloss:0.24418	validation_1-logloss:0.17066
Positive Count in Predictions: 2010
Accuracy:  0.8874009807619766
Fold F1:  0.8534249938620182
Fold Precision:  0.8646766169154229
Fold Recall:  0.842462433349491
Fold AUC:  0.9608331685886546
Fold AP:  0.9454568523396797

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 1989



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53256	validation_1-logloss:0.52939
[99]	validation_0-logloss:0.23156	validation_1-logloss:0.17597
Positive Count in Predictions: 1935
Accuracy:  0.8940022632968692
Fold F1:  0.8567787971457697
Fold Precision:  0.868733850129199
Fold Recall:  0.845148315736551
Fold AUC:  0.9635448938373248
Fold AP:  0.9460741625845037

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 1990



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53131	validation_1-logloss:0.52924
[99]	validation_0-logloss:0.23658	validation_1-logloss:0.17125
Positive Count in Predictions: 1896
Accuracy:  0.8943794794417201
Fold F1:  0.8558929490478642
Fold Precision:  0.8771097046413502
Fold Recall:  0.835678391959799
Fold AUC:  0.9618070728036316
Fold AP:  0.9443493408185366

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2061



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53278	validation_1-logloss:0.52961
[99]	validation_0-logloss:0.23380	validation_1-logloss:0.17312
Positive Count in Predictions: 1985
Accuracy:  0.8909845341380611
Fold F1:  0.8571428571428572
Fold Precision:  0.873551637279597
Fold Recall:  0.8413391557496361
Fold AUC:  0.9641654618971718
Fold AP:  0.9481895537606182

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2025



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53233	validation_1-logloss:0.52930
[99]	validation_0-logloss:0.23961	validation_1-logloss:0.17426
Positive Count in Predictions: 1949
Accuracy:  0.8940022632968692
Fold F1:  0.8585807750377453
Fold Precision:  0.8753206772703951
Fold Recall:  0.8424691358024692
Fold AUC:  0.961974705862408
Fold AP:  0.945287982052777

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2095



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53136	validation_1-logloss:0.52979
[99]	validation_0-logloss:0.23298	validation_1-logloss:0.17282
Positive Count in Predictions: 1971
Accuracy:  0.8947566955865711
Fold F1:  0.8627643876045252
Fold Precision:  0.8899036022323693
Fold Recall:  0.8372315035799522
Fold AUC:  0.9647111442526156
Fold AP:  0.9501398866525843

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2069



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.52938	validation_1-logloss:0.52959
[99]	validation_0-logloss:0.22910	validation_1-logloss:0.17260
Positive Count in Predictions: 1991
Accuracy:  0.8898528857035081
Fold F1:  0.8561576354679803
Fold Precision:  0.8729281767955801
Fold Recall:  0.8400193330111165
Fold AUC:  0.9650711151927239
Fold AP:  0.9507458673595238

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2069



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.52807	validation_1-logloss:0.52974
[99]	validation_0-logloss:0.22209	validation_1-logloss:0.17551
Positive Count in Predictions: 1982
Accuracy:  0.8972086005281026
Fold F1:  0.8654653172056282
Fold Precision:  0.884460141271443
Fold Recall:  0.847269212179797
Fold AUC:  0.9677704412731383
Fold AP:  0.9538914629936839

 ********************************************************************** 

Train shape: (47718, 344) | Val Shape: (5301, 344)
Positive Count in Val Split: 2092



`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.



[0]	validation_0-logloss:0.53243	validation_1-logloss:0.52925
[99]	validation_0-logloss:0.24698	validation_1-logloss:0.17419
Positive Count in Predictions: 2002
Accuracy:  0.8811544991511036
Fold F1:  0.8461162677088422
Fold Precision:  0.8651348651348651
Fold Recall:  0.8279158699808795
Fold AUC:  0.9602538450950868
Fold AP:  0.944181706050698

 ********************************************************************** 



In [40]:

test.drop(columns=["user_id"],inplace=True)


In [41]:
model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([0, 1, 0, ..., 1, 0, 1]),
 array([0, 1, 0, ..., 1, 0, 0])]

In [42]:
sample_submission = pd.DataFrame()
df = pd.read_csv("./data/test_users.csv")
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [43]:
sample_submission.moved_after_2019.value_counts()


0    10168
1     3087
Name: moved_after_2019, dtype: int64

In [44]:
sample_submission.to_csv('submission2.csv',index=False)


In [57]:
models = impute_and_scaled_model(train_imputed)

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2070
Positive Count in Predictions: 1939
Accuracy:  0.8760844964164466
Fold F1:  0.8361187328510851
Fold Precision:  0.8643630737493554
Fold Recall:  0.8096618357487922
Fold AUC:  0.9523777921270388
Fold AP:  0.9335121417710979

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2063
Positive Count in Predictions: 2001
Accuracy:  0.8849490758204451
Fold F1:  0.8499015748031497
Fold Precision:  0.8630684657671165
Fold Recall:  0.8371303926320892
Fold AUC:  0.9542545057607261
Fold AP:  0.9364785100634457

 ********************************************************************** 

Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 1989
Positive Count in Predictions: 1897
Accuracy:  0.8781591852131271
Fold F1:  0.833762223365929
Fold Precision:  0.8539799683711122
Fold Recall:  0.814

In [58]:
model_preds = [model.predict(test_imputed) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64)]

In [59]:
sample_submission = pd.DataFrame()
df = pd.read_csv("./data/test_users.csv")
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,1
4,37165,1


In [60]:
sample_submission.moved_after_2019.value_counts()


0    10040
1     3215
Name: moved_after_2019, dtype: int64

In [61]:
sample_submission.to_csv('submission3.csv',index=False)


In [65]:
models = impute_and_scaled_model(train)


Train shape: (47717, 344) | Val Shape: (5302, 344)
Positive Count in Val Split: 2070
0:	learn: 0.7397760	test: 0.7377232	best: 0.7377232 (0)	total: 188ms	remaining: 5h 12m 39s
1500:	learn: 0.8036762	test: 0.7992723	best: 0.7994339 (1476)	total: 15.3s	remaining: 16m 47s
3000:	learn: 0.8120065	test: 0.8071107	best: 0.8071107 (2998)	total: 30.8s	remaining: 16m 35s
4500:	learn: 0.8152730	test: 0.8106339	best: 0.8106339 (4452)	total: 47.3s	remaining: 16m 43s
6000:	learn: 0.8167475	test: 0.8114754	best: 0.8122056 (5758)	total: 1m 3s	remaining: 16m 37s
7500:	learn: 0.8179539	test: 0.8127310	best: 0.8131417 (6662)	total: 1m 19s	remaining: 16m 17s
9000:	learn: 0.8189472	test: 0.8145709	best: 0.8146472 (8854)	total: 1m 34s	remaining: 15m 57s
10500:	learn: 0.8207237	test: 0.8163097	best: 0.8163097 (10480)	total: 1m 50s	remaining: 15m 40s
12000:	learn: 0.8213216	test: 0.8177137	best: 0.8178822 (11638)	total: 2m 5s	remaining: 15m 18s
13500:	learn: 0.8220926	test: 0.8189691	best: 0.8193069 (13184)	t

In [66]:
model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 1], dtype=int64)]

In [67]:
sample_submission = pd.DataFrame()
df = pd.read_csv("./data/test_users.csv")
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,1
4,37165,1


In [68]:
sample_submission.moved_after_2019.value_counts()


0    7296
1    5959
Name: moved_after_2019, dtype: int64

In [69]:
sample_submission.to_csv('submission4.csv',index=False)


In [12]:
check_df(train)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(53019, 346)
**********************************************************************
******************************* Types ********************************
**********************************************************************
user_id                         int64
industry_-1                     int64
industry_Accounting             int64
industry_Airlines/Aviation      int64
industry_Apparel & Fashion      int64
                               ...   
school_name_count_norm        float64
degree_count                  float64
degree_count_norm             float64
fields_of_study_count         float64
fields_of_study_count_norm    float64
Length: 346, dtype: object
**********************************************************************
******************************** Head *******

## feature selection

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [70]:
train.shape

(53019, 346)

In [76]:
import xgboost as xgb

y = train["moved_after_2019"]
X = train.drop(["user_id", "moved_after_2019"], axis=1)


# Create a RandomForestClassifier object
classifier = xgb.XGBClassifier()

# Use RFE to select the most important features
rfe = RFE(classifier, n_features_to_select=10)
rfe = rfe.fit(X, y)

# Print the selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features)

KeyboardInterrupt: 

In [77]:
import shap

y = train["moved_after_2019"]
X = train.drop(["user_id", "moved_after_2019"], axis=1)

model = xgb.XGBClassifier()
model.fit(X, y)

# use the SHAP library to calculate the SHAP values for each feature
explainer = shap.Explainer(model.predict_proba, X)
shap_values = explainer(X)

# calculate the mean absolute SHAP value for each feature
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

# sort the features by mean absolute SHAP value
sorted_idx = np.argsort(-mean_abs_shap)

# print the top 10 most important features
print("Top 10 most important features:")
for idx in sorted_idx[:30]:
    print(f"{X.columns[idx]}: {mean_abs_shap[idx]}")

Permutation explainer:   2%|▏         | 946/53019 [05:54<5:30:32,  2.63it/s] 


KeyboardInterrupt: 

In [17]:
imp_mean_train.shape

(53019, 344)

In [18]:
imp_mean_test.shape

(13255, 344)

In [19]:
type(imp_mean_test)

numpy.ndarray