In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
def missing_values_analysis(df):
    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
    n_miss = df[na_columns_].isnull().sum().sort_values(ascending=False)
    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df).sort_values(by="Ratio", ascending=False)
    return missing_df


def check_df(dataframe, head=5, tail=5):
    print("*" * 70)
    print(" Shape ".center(70, "*"))
    print("*" * 70)
    print(dataframe.shape)

    print("*" * 70)
    print(" Types ".center(70, "*"))
    print("*" * 70)
    print(dataframe.dtypes)

    print("*" * 70)
    print(" Head ".center(70, "*"))
    print("*" * 70)
    print(dataframe.head(head))

    print("*" * 70)
    print(" Tail ".center(70, "*"))
    print("*" * 70)
    print(dataframe.tail(tail))

    print("*" * 70)
    print(" NA ".center(70, "*"))
    print("*" * 70)
    print(missing_values_analysis(dataframe))

    print("*" * 70)
    print(" Quantiles ".center(70, "*"))
    print("*" * 70)
    print(dataframe.describe([.01, .05, .1, .5, .9, .95, .99]).T)

    print("*" * 70)
    print(" Duplicate Rows ".center(70, "*"))
    print("*" * 70)
    print(dataframe.duplicated().sum())

    print("*" * 70)
    print(" Uniques ".center(70, "*"))
    print("*" * 70)
    print(dataframe.nunique())

In [3]:
def cat_plots(dataframe, cat_col):
    print("".center(100, "#"))
    print(dataframe[cat_col].value_counts())
    print("".center(100, "#"))

    plt.figure(figsize=(15, 10))
    sns.set_style("whitegrid")
    plt.suptitle(cat_col.capitalize(), size=16)
    plt.subplot(1, 3, 1)
    plt.title("Percentages")
    plt.pie(dataframe[cat_col].value_counts().values.tolist(),
            labels=dataframe[cat_col].value_counts().keys().tolist(),
            labeldistance=1.1,
            wedgeprops={'linewidth': 3, 'edgecolor': 'white'},
            colors=colors,
            autopct='%1.0f%%')

    

    plt.subplot(1, 3, 2)
    plt.title("Countplot")
    sns.countplot(data=dataframe, x=cat_col, palette=colors)
    plt.tight_layout(pad=3)

In [37]:
train = pd.read_csv("../data/train_users.csv")

In [38]:
test = pd.read_csv("../data/test_users.csv")

In [39]:
check_df(train)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(53019, 4)
**********************************************************************
******************************* Types ********************************
**********************************************************************
user_id              int64
industry            object
location            object
moved_after_2019     int64
dtype: object
**********************************************************************
******************************** Head ********************************
**********************************************************************
   user_id                             industry                    location  \
0     1301  Information Technology and Services  Istanbul, Istanbul, Turkey   
1     6950                             Internet  Istanbul, Istanbul, T

In [41]:
province={
"1.0" : "Adana",
"2.0" : "Adıyaman",
"3.0" : "Afyonkarahisar",
"4.0" : "Ağrı",
"5.0" : "Amasya",
"6.0" : "Ankara",
"7.0" : "Antalya",
"8.0" : "Artvin",
"9.0" : "Aydın",
"10.0" : "Balıkesir",
"11.0" : "Bilecik",
"12.0" : "Bingöl",
"13.0" : "Bitlis",
"14.0" : "Bolu",
"15.0" : "Burdur",
"16.0" : "Bursa",
"17.0" : "Çanakkale",
"18.0" : "Çankırı",
"19.0" : "Çorum",
"20.0" : "Denizli",
"21.0" : "Diyarbakır",
"22.0" : "Edirne",
"23.0" : "Elâzığ",
"24.0" : "Erzincan",
"25.0" : "Erzurum",
"26.0" : "Eskişehir",
"27.0" : "Gaziantep",
"28.0" : "Giresun",
"29.0" : "Gümüşhane",
"30.0" : "Hakkâri",
"31.0" : "Hatay",
"32.0" : "Isparta",
"33.0" : "Mersin",
"34.0" : "İstanbul",
"35.0" : "İzmir",
"36.0" : "Kars",
"37.0" : "Kastamonu",
"38.0" : "Kayseri",
"39.0" : "Kırklareli",
"40.0" : "Kırşehir",
"41.0" : "Kocaeli",
"42.0" : "Konya",
"43.0" : "Kütahya",
"44.0" : "Malatya",
"45.0" : "Manisa",
"46.0" : "Kahramanmaraş",
"47.0" : "Mardin",
"48.0" : "Muğla",
"49.0" : "Muş",
"50.0" : "Nevşehir",
"51.0" : "Niğde",
"52.0" : "Ordu",
"53.0" : "Rize",
"54.0" : "Sakarya",
"55.0" : "Samsun",
"56.0" : "Siirt",
"57.0" : "Sinop",
"58.0" : "Sivas",
"59.0" : "Tekirdağ",
"60.0" : "Tokat",
"61.0" : "Trabzon",
"62.0" : "Tunceli",
"63.0" : "Şanlıurfa",
"64.0" : "Uşak",
"65.0" : "Van",
"66.0" : "Yozgat",
"67.0" : "Zonguldak",
"68.0" : "Aksaray",
"69.0" : "Bayburt",
"70.0" : "Karaman",
"71.0" : "Kırıkkale",
"72.0" : "Batman",
"73.0" : "Şırnak",
"74.0" : "Bartın",
"75.0" : "Ardahan",
"76.0" : "Iğdır",
"77.0" : "Yalova",
"78.0" : "Karabük",
"79.0" : "Kilis",
"80.0" : "Osmaniye",
"81.0" : "Düzce"
}


In [42]:
import swifter
from unidecode import unidecode

def check_string(string_to_check):
    string_to_check = unidecode(str(string_to_check).lower())
    if "," in string_to_check:
        sub = string_to_check.split(", ")
    else:
        sub = string_to_check.split()

    sub_dict = dict((v,unidecode(k.lower())) for v, k in province.items())

    for val in sub:
        if val in sub_dict.values():
            return val
        else:
            continue
        
    if "turkey" in sub:
        return "turkey"
    else:
        return (val + " yurtdisi")


#df['result'] = df['string_to_check'].apply(check_string)

In [43]:
train["user_location"] = train["location"].swifter.apply(check_string)
test["user_location"] = test["location"].swifter.apply(check_string)


Pandas Apply:   0%|          | 0/53019 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/13255 [00:00<?, ?it/s]

In [45]:
train["user_location"] = train["user_location"].replace(['kingdom yurtdisi','states yurtdisi'],['united kingdom yurtdisi', 'united states yurtdisi'])
test["user_location"] = test["user_location"].replace(['kingdom yurtdisi','states yurtdisi'],['united kingdom yurtdisi', 'united states yurtdisi'])


In [47]:
train.loc[train['user_location'].str.contains('yurtdisi'), 'is_yurtdisi'] = 1
train.loc[~train['user_location'].str.contains('yurtdisi'), 'is_yurtdisi'] = 0

test.loc[test['user_location'].str.contains('yurtdisi'), 'is_yurtdisi'] = 1
test.loc[~test['user_location'].str.contains('yurtdisi'), 'is_yurtdisi'] = 0

In [52]:
train.drop(columns="location", inplace=True)
test.drop(columns="location", inplace=True)

In [53]:
app_train = pd.get_dummies(train)
app_test = pd.get_dummies(test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (53019, 238)
Testing Features shape:  (13255, 194)


In [54]:
labels = app_train["moved_after_2019"]

app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

app_train['moved_after_2019'] = labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

Training Features shape:  (53019, 195)
Testing Features shape:  (13255, 194)


In [55]:
correlations = app_train.corr()['moved_after_2019'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

Most Positive Correlations:
 industry_Fine Art                               0.009466
industry_Financial Services                     0.010671
user_location_philippines yurtdisi              0.012220
industry_Think Tanks                            0.012306
industry_Higher Education                       0.012504
industry_Restaurants                            0.013197
user_location_ankara                            0.015855
industry_Entertainment                          0.017716
industry_Aviation & Aerospace                   0.019422
industry_Political Organization                 0.020450
industry_Defense & Space                        0.029694
industry_Information Technology and Services    0.036407
industry_Internet                               0.037546
user_location_istanbul                          0.068572
moved_after_2019                                1.000000
Name: moved_after_2019, dtype: float64

Most Negative Correlations:
 user_location_turkey                           

In [61]:
app_train["is_yurtdisi"] = app_train["is_yurtdisi"].astype(np.uint8)


In [64]:
app_test["is_yurtdisi"] = app_test["is_yurtdisi"].astype(np.uint8)


In [66]:
app_train.to_csv("train_user_revise.csv", index=False)
app_test.to_csv("test_user_revise.csv", index=False)

# Language

In [72]:
train = pd.read_csv("./train_user_revise.csv")
test = pd.read_csv("./test_user_revise.csv")

In [73]:
languages = pd.read_csv("../data/languages.csv")

In [74]:
check_df(languages)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(76062, 3)
**********************************************************************
******************************* Types ********************************
**********************************************************************
user_id         int64
language       object
proficiency    object
dtype: object
**********************************************************************
******************************** Head ********************************
**********************************************************************
   user_id   language          proficiency
0        8  İngilizce    full_professional
1        8     Türkçe  native_or_bilingual
2        8  Fransızca           elementary
3       10  ingilizce                  NaN
4       11    Turkish  native_or_bilingual
************

In [78]:
from unidecode import unidecode
import swifter

def fix_language(lang):
    lang = unidecode(str(lang).lower())
    return lang

languages["language"] = languages["language"].swifter.apply(fix_language)

Pandas Apply:   0%|          | 0/76062 [00:00<?, ?it/s]

In [79]:
languages["language"] = languages["language"].replace(["english","turkish", "german", "spanish", "french", "russian", "arabic", "italian", "japanese"],["ingilizce", "turkce", "almanca", "ispanyolca", "fransizca", "rusca", "arapca", "italyanca", "japonca"])


In [80]:
languages.loc[languages['language'].str.contains('turk'), 'language'] = 'turkce'
languages.loc[languages['language'].str.contains('isp'), 'language'] = 'ispanyolca'
languages.loc[languages['language'].str.contains('ing'), 'language'] = 'ingilizce'
languages.loc[languages['language'].str.contains('alm'), 'language'] = 'almanca'
languages.loc[languages['language'].str.contains('kore'), 'language'] = 'korece'
languages.loc[languages['language'].str.contains('deut'), 'language'] = 'almanca'
languages.loc[languages['language'].str.contains('cin'), 'language'] = 'cince'
languages.loc[languages['language'].str.contains('chin'), 'language'] = 'cince'
languages.loc[languages['language'].str.contains('eng'), 'language'] = 'ingilizce'
languages.loc[languages['language'].str.contains('kurd'), 'language'] = 'kurtce'
languages.loc[languages['language'].str.contains('azer'), 'language'] = 'azerice'
languages.loc[languages['language'].str.contains('pers'), 'language'] = 'persian'


In [81]:
languages.loc[languages['language'].str.contains('bulg'), 'language'] = 'bulgarca'
languages.loc[languages['language'].str.contains('leh'), 'language'] = 'lehce'
languages.loc[languages['language'].str.contains('greek'), 'language'] = 'greek'
languages.loc[languages['language'].str.contains('pol'), 'language'] = 'polish'
languages.loc[languages['language'].str.contains('espa'), 'language'] = 'ispanyolca'
languages.loc[languages['language'].str.contains('fars'), 'language'] = 'farsca'
languages.loc[languages['language'].str.contains('port'), 'language'] = 'portekizce'
languages.loc[languages['language'].str.contains('lati'), 'language'] = 'latince'
languages.loc[languages['language'].str.contains('ger'), 'language'] = 'almanca'
languages.loc[languages['language'].str.contains('fran'), 'language'] = 'fransizca'
languages.loc[languages['language'].str.contains('kazak'), 'language'] = 'kazakca'
languages.loc[languages['language'].str.contains('rus'), 'language'] = 'rusca'
languages.loc[languages['language'].str.contains('bos'), 'language'] = 'bosnakca'
languages.loc[languages['language'].str.contains('yun'), 'language'] = 'greek'
languages.loc[languages['language'].str.contains('isvec'), 'language'] = 'swedish'
languages.loc[languages['language'].str.contains('sig'), 'language'] = 'isaret dilleri'



In [82]:
languages.loc[languages['language'].str.contains('serb'), 'language'] = 'sirpca'
languages.loc[languages['language'].str.contains('fin'), 'language'] = 'fince'
languages.loc[languages['language'].str.contains('fele'), 'language'] = 'dutch'
languages.loc[languages['language'].str.contains('cek'), 'language'] = 'czech'
languages.loc[languages['language'].str.contains('hung'), 'language'] = 'macarca'
languages.loc[languages['language'].str.contains('fre'), 'language'] = 'fransizca'
languages.loc[languages['language'].str.contains('ara'), 'language'] = 'arapca'


In [84]:
languages.loc[languages['language'].str.contains('isa'), 'language'] = 'isaret dilleri'
languages.loc[languages['language'].str.contains('ita'), 'language'] = 'italyanca'


In [86]:
mask = languages.language.map(languages.language.value_counts()) < 17
languages.language =  languages.language.mask(mask, 'other')

In [104]:
prof_dict = [{'col': 'proficiency', 
            'mapping': 
            {'nan': np.nan,
            'elementary':1,
            'limited_working':2,
            'professional_working':3,
            'full_professional': 4,
            'native_or_bilingual': 5}}]

In [105]:
from category_encoders import OrdinalEncoder
oe = OrdinalEncoder(mapping=prof_dict)

In [106]:
languages_encode = languages.copy()

In [107]:
languages_encode = oe.fit_transform(languages_encode)

In [108]:
languages_encode["proficiency"].replace(-1, 1, inplace = True)

In [4]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated for 
            all numeric columns. Each instance of the grouping variable will have 
            the statistics (mean, min, max, sum; currently supported) calculated. 
            The columns are also renamed to keep track of features created.
    
    """
    # Remove id variables other than grouping variable
    for col in df:
        if col != group_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    group_ids = df[group_var]
    numeric_df = df.select_dtypes('number')
    numeric_df[group_var] = group_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

    # Need to create new column names
    columns = [group_var]

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1][:-1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))

    agg.columns = columns
    return agg

In [113]:
language_agg = agg_numeric(languages_encode, group_var = 'user_id', df_name = 'language')
language_agg.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,user_id,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum
0,8,3,3.333333,5.0,1.0,10.0
1,10,1,1.0,1.0,1.0,1.0
2,11,3,4.333333,5.0,3.0,13.0
3,12,2,4.0,5.0,3.0,8.0
4,13,2,3.0,4.0,2.0,6.0


In [117]:
train = train.merge(language_agg, on = 'user_id', how = 'left')
train.head()


Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum
0,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,4.0,5.0,3.0,8.0
1,6950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,1.666667,3.0,1.0,5.0
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,
3,26046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0,1.0
4,11005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,4.0,5.0,3.0,8.0


In [5]:
def target_corrs(df):

    # List of correlations
    corrs = []

    # Iterate through the columns 
    for col in df.columns:
        # Skip the target column
        if (col != 'moved_after_2019') & (col != "user_id"):
            # Calculate correlation with the target
            corr = df['moved_after_2019'].corr(df[col])

            # Append the list as a tuple
            corrs.append((col, corr))
            
    # Sort by absolute magnitude of correlations
    corrs = sorted(corrs, key = lambda x: abs(x[1]), reverse = True)
    
    return corrs

In [121]:
target_corrs(train)

[('user_location_turkey', -0.06876162368272187),
 ('user_location_istanbul', 0.06857212998160997),
 ('language_proficiency_max', 0.05718929016362843),
 ('language_proficiency_sum', 0.050054250825388454),
 ('language_proficiency_count', 0.043201836440100345),
 ('language_proficiency_mean', 0.04296523013627315),
 ('industry_Internet', 0.03754552322489718),
 ('industry_Information Technology and Services', 0.03640684481617477),
 ('industry_Defense & Space', 0.02969372907846926),
 ('industry_Telecommunications', -0.02909732592573648),
 ('industry_-1', -0.02556157348315457),
 ('industry_Textiles', -0.021869904948957852),
 ('industry_Political Organization', 0.020450320978415933),
 ('industry_Computer Networking', -0.0196152740724031),
 ('industry_Aviation & Aerospace', 0.019421748957494912),
 ('industry_Food Production', -0.018770875157902978),
 ('industry_Entertainment', 0.017716193884278118),
 ('industry_Health, Wellness and Fitness', -0.016394703651856808),
 ('user_location_ankara', 0.01

In [6]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_var`.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['count', 'count_norm']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    return categorical

In [149]:
categorical_grouped = count_categorical(languages_encode, "user_id", "language")
categorical_grouped.shape

(37290, 72)

In [150]:
categorical_grouped.head()

Unnamed: 0_level_0,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
8,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0
10,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
11,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0
12,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0
13,0,0.0,1,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [156]:
train = train.merge(categorical_grouped, left_on = 'user_id', right_index = True, how = 'left')
train.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm
0,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0
1,6950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,1.666667,3.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,26046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0


In [157]:
test.shape

(13255, 194)

In [158]:
train.shape

(53019, 272)

In [159]:
test = test.merge(language_agg, on="user_id", how="left")
test.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0


In [160]:
test = test.merge(categorical_grouped, on="user_id", how="left")
test.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0


In [161]:
test.shape

(13255, 271)

In [163]:
train.to_csv("train_user_lang.csv", index=False)
test.to_csv("test_user_lang.csv", index=False)

# Work_exp

In [9]:
train[train.language_language_lehce_count.isna()==True]

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,47498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,65923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12,33971,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
14,40150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53009,41192,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
53011,855,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
53012,14909,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
53013,20367,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
train = pd.read_csv("./train_user_lang.csv")

In [8]:
test = pd.read_csv("./test_user_lang.csv")


In [9]:
work_exp = pd.read_csv("../data/work_experiences.csv")
check_df(work_exp)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(187019, 4)
**********************************************************************
******************************* Types ********************************
**********************************************************************
user_id              int64
company_id           int64
location            object
start_year_month     int64
dtype: object
**********************************************************************
******************************** Head ********************************
**********************************************************************
   user_id  company_id          location  start_year_month
0    53442        2651  Istanbul, Turkey            201509
1    34558         815  Istanbul, Turkey            201210
2    63761       26354               NaN          

In [11]:
work_exp.head()

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,2651,"Istanbul, Turkey",201509
1,34558,815,"Istanbul, Turkey",201210
2,63761,26354,,200010
3,10738,89,,201610
4,8711,3113,"Istanbul, Turkey",201801


In [18]:
work_exp[work_exp["user_id"]==63012]

Unnamed: 0,user_id,company_id,location,start_year_month
58037,63012,2158,,201107
72197,63012,1343,"Istanbul, Turkey",201804
170031,63012,26,,201310


In [19]:
work_exp.start_year_month.value_counts()

201807    4053
201907    3925
201707    3912
201806    3811
201706    2948
201809    2905
201906    2845
201909    2820
201606    2763
201607    2710
201506    2652
201801    2500
201901    2392
201808    2378
201709    2350
201708    2325
201507    2284
201908    2274
201910    2180
201810    2123
201406    2091
201811    2076
201701    2005
201802    1882
201710    1872
201608    1863
201508    1846
201911    1832
201306    1824
201307    1806
201407    1802
201805    1802
201609    1775
201902    1770
201803    1706
201912    1645
201601    1632
201904    1622
201804    1616
201509    1603
201812    1580
201409    1561
201905    1542
201206    1540
201702    1472
201408    1454
201711    1454
201705    1441
201903    1436
201501    1407
201207    1398
201610    1378
201308    1287
201611    1276
201703    1257
201704    1241
201401    1232
201309    1230
201603    1211
201712    1184
201106    1153
201107    1135
201510    1131
201301    1121
201602    1107
201605    1094
201502    

In [158]:
z=work_exp.copy()

In [159]:
z['start_date'] = pd.to_datetime(z['start_year_month'].astype(str), format='%Y%m')

In [160]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date
0,53442,2651,"Istanbul, Turkey",201509,2015-09-01
1,34558,815,"Istanbul, Turkey",201210,2012-10-01
2,63761,26354,,200010,2000-10-01
3,10738,89,,201610,2016-10-01
4,8711,3113,"Istanbul, Turkey",201801,2018-01-01


In [161]:
z.shape

(187019, 5)

In [162]:
z = z[z['start_date'].dt.year != 2019]

In [163]:
z.shape

(160736, 5)

In [164]:
z = z.sort_values(by=['user_id', 'start_date'], ascending=[True, True])


In [165]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date
147720,0,0,Serbest Çalışmalar,200509,2005-09-01
174454,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01
760,2,9,"Elazig, Turkey",201612,2016-12-01
19762,2,7,"Elazig, Turkey",201706,2017-06-01
180157,2,10,"Mersin, Turkey",201806,2018-06-01


In [166]:
z['start_year_month_shifted'] = z.groupby('user_id')['start_date'].shift(-1)

z['job_duration'] = (z['start_year_month_shifted'] - z['start_date']).dt.days / 365
z['job_duration'].fillna(0, inplace=True)

z['job_duration'] = np.where((z['job_duration'] == 0) & (z['start_date'] != z['start_year_month_shifted']), (pd.to_datetime('2018-12-31') - z['start_date']).dt.days / 365, z['job_duration'])

In [167]:
z['job_count'] = np.where(z['job_duration'] > 0, 1, 0)
z['job_count'] = z.groupby('user_id')['job_count'].cumsum()

In [168]:
z['job_count'] = z.groupby(['user_id'])['job_count'].transform(lambda x: x.max())

In [169]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count
147720,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1
174454,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1
760,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3
19762,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3
180157,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3


In [157]:
z[z["user_id"]==10]


Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count
4573,10,27,"Elazig, Turkey",201308,2013-08-01,2018-05-01,4.750685,2
43293,10,26,"Istanbul, Turkey",201805,2018-05-01,2018-05-01,0.0,2
133573,10,26,istanbul,201805,2018-05-01,NaT,0.668493,2


In [171]:
z['min_start_year'] = z.groupby('user_id')['start_date'].transform('min').dt.year

In [172]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year
147720,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005
174454,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005
760,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016
19762,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016
180157,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016


In [173]:
z['working_years'] = (pd.to_datetime('2018-12-31').year - z['min_start_year'])

In [201]:
df_sum = z.groupby('user_id')['job_duration'].sum()
df_sum = df_sum.reset_index()
df_sum = df_sum.rename(columns={'job_duration': 'total_job_duration'})

In [202]:
df_sum

Unnamed: 0,user_id,total_job_duration
0,0,13.339726
1,2,2.082192
2,5,1.583562
3,7,2.501370
4,10,5.419178
...,...,...
52997,66269,14.758904
52998,66270,4.167123
52999,66271,17.175342
53000,66272,6.504110


In [203]:
z = z.merge(df_sum, on="user_id", how="left")

In [211]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year3
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064


In [212]:
z['avg_work_year'] = z['total_job_duration'] / z['job_count']


In [214]:
z.drop(columns="avg_work_year3", inplace=True)

In [215]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064


In [218]:
z[z["user_id"]==100]

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year
122,100,243,Ankara,200612,2006-12-01,2010-01-01,3.087671,6,2006,12,12.090411,2.015068
123,100,245,"Ankara, Türkiye",201001,2010-01-01,2010-01-01,0.0,6,2006,12,12.090411,2.015068
124,100,258,"Ankara, Türkiye",201001,2010-01-01,2013-03-01,3.164384,6,2006,12,12.090411,2.015068
125,100,257,"Ankara, Türkiye",201303,2013-03-01,2015-08-01,2.419178,6,2006,12,12.090411,2.015068
126,100,231,Ankara,201508,2015-08-01,2017-07-01,1.917808,6,2006,12,12.090411,2.015068
127,100,253,"Ankara, Türkiye",201707,2017-07-01,2018-01-01,0.50411,6,2006,12,12.090411,2.015068
128,100,260,Ankara,201801,2018-01-01,NaT,0.99726,6,2006,12,12.090411,2.015068


In [219]:
z['quit_job_2018'] = (z['start_date'].dt.year >= 2018).astype(int)

z['max_start_year'] = z.groupby('user_id')['start_date'].transform('max').dt.year

In [222]:
z[z["user_id"]==66273]

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year
160728,66273,9918,"İstanbul, Türkiye",201406,2014-06-01,2015-05-01,0.915068,7,2014,4,4.586301,0.655186,0,2018
160729,66273,13446,,201505,2015-05-01,2016-01-01,0.671233,7,2014,4,4.586301,0.655186,0,2018
160730,66273,13445,,201601,2016-01-01,2016-01-01,0.0,7,2014,4,4.586301,0.655186,0,2018
160731,66273,1900,,201601,2016-01-01,2016-12-01,0.917808,7,2014,4,4.586301,0.655186,0,2018
160732,66273,2696,,201612,2016-12-01,2017-08-01,0.665753,7,2014,4,4.586301,0.655186,0,2018
160733,66273,1509,,201708,2017-08-01,2018-09-01,1.084932,7,2014,4,4.586301,0.655186,0,2018
160734,66273,2412,,201809,2018-09-01,2018-10-01,0.082192,7,2014,4,4.586301,0.655186,1,2018
160735,66273,3843,,201810,2018-10-01,NaT,0.249315,7,2014,4,4.586301,0.655186,1,2018


In [224]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018


In [227]:
z.drop(columns="duration_of_employment", inplace=True)

In [228]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018


In [233]:
bins = [0, 1, 2, 5, 10, np.inf]
labels = ['Less than 1 year','1-2 years', '2-5 years', '5-10 years', 'More than 10 years']
z['job_tenure_groups'] = pd.cut(z['job_duration'], bins=bins, labels=labels)

In [242]:
z['year_of_job_start'] = z['start_date'].dt.year
z['month_of_job_start'] = z['start_date'].dt.month

In [247]:

# Duration of employment

# Year and Month of Job Start
z['year_of_job_start'] = z['start_date'].dt.year
z['month_of_job_start'] = z['start_date'].dt.month


# Seasonal trend - maybe start_date -> shifted?
z['quit_job_in_2018'] = [1 if x.year == 2018 else 0 for x in z['start_year_month_shifted']]
seasonal_trend = z.groupby('month_of_job_start')['quit_job_in_2018'].mean()

# Moving Average
z['moving_average'] = z.groupby(['year_of_job_start', 'month_of_job_start'])['quit_job_in_2018'].transform('mean')

# Time since the last job quitting trend change
z['quit_job_in_2018_change'] = z['quit_job_in_2018'].diff()
z['time_since_last_change'] = z.groupby(['user_id'])['quit_job_in_2018_change'].apply(lambda x: x.where(x!=0).count())

In [248]:
z['seasonal_trend'] = z['month_of_job_start'].map(seasonal_trend)

In [249]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075


In [251]:


# create a categorical column for the quarter in which the user started their job
z['start_quarter'] = z['start_date'].dt.quarter
z['start_quarter'] = 'Q' + z['start_quarter'].astype(str)


# create a categorical column for the time of year in which the user started their job
z['start_season'] = np.where(z['start_date'].dt.month.isin([12, 1, 2]), 'winter',
                              np.where(z['start_date'].dt.month.isin([3, 4, 5]), 'spring',
                                       np.where(z['start_date'].dt.month.isin([6, 7, 8]), 'summer', 'fall')))

# create a polynomial feature to capture non-linear relationships between the start date and the target
z['start_date_squared'] = z['start_date'].dt.year ** 2


In [252]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324


In [253]:
z[z["user_id"]==49764]


Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared
118348,49764,3511,,201403,2014-03-01,2015-02-01,0.923288,3,2014,4,4.838356,1.612785,0,2016,Less than 1 year,2014,3,0,0.081186,0.0,,0.150295,Q1,spring,4056196
118349,49764,3298,"Istanbul, Turkey",201502,2015-02-01,2016-03-01,1.079452,3,2014,4,4.838356,1.612785,0,2016,1-2 years,2015,2,0,0.132093,0.0,,0.159553,Q1,winter,4060225
118350,49764,23615,,201603,2016-03-01,NaT,2.835616,3,2014,4,4.838356,1.612785,0,2016,2-5 years,2016,3,0,0.206441,0.0,,0.150295,Q1,spring,4064256


In [254]:
z.dtypes

user_id                              int64
company_id                           int64
location                            object
start_year_month                     int64
start_date                  datetime64[ns]
start_year_month_shifted    datetime64[ns]
job_duration                       float64
job_count                            int32
min_start_year                       int64
working_years                        int64
total_job_duration                 float64
avg_work_year                      float64
quit_job_2018                        int32
max_start_year                       int64
job_tenure_groups                 category
year_of_job_start                    int64
month_of_job_start                   int64
quit_job_in_2018                     int64
moving_average                     float64
quit_job_in_2018_change            float64
time_since_last_change             float64
seasonal_trend                     float64
start_quarter                       object
start_seaso

In [255]:
z.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324


In [256]:
z.min_start_year.value_counts()

2013    13865
2015    13644
2014    13617
2012    12999
2011    12048
2016    11897
2017    10810
2010    10630
2009     8454
2007     7622
2008     7609
2018     6988
2006     6327
2005     4449
2004     3823
2003     2318
2000     2302
2002     2143
2001     1636
1998     1499
1999     1433
1997     1321
1995      773
1996      519
1994      489
1990      323
1993      316
1981      208
1986      195
1989      120
1992      113
1991       97
1988       79
1979       16
1987       13
1985       13
1984        8
1961        8
1983        5
1982        4
1968        2
1976        1
Name: min_start_year, dtype: int64

In [281]:
t = z.copy()

In [282]:
province={
"1.0" : "Adana",
"2.0" : "Adıyaman",
"3.0" : "Afyonkarahisar",
"4.0" : "Ağrı",
"5.0" : "Amasya",
"6.0" : "Ankara",
"7.0" : "Antalya",
"8.0" : "Artvin",
"9.0" : "Aydın",
"10.0" : "Balıkesir",
"11.0" : "Bilecik",
"12.0" : "Bingöl",
"13.0" : "Bitlis",
"14.0" : "Bolu",
"15.0" : "Burdur",
"16.0" : "Bursa",
"17.0" : "Çanakkale",
"18.0" : "Çankırı",
"19.0" : "Çorum",
"20.0" : "Denizli",
"21.0" : "Diyarbakır",
"22.0" : "Edirne",
"23.0" : "Elâzığ",
"24.0" : "Erzincan",
"25.0" : "Erzurum",
"26.0" : "Eskişehir",
"27.0" : "Gaziantep",
"28.0" : "Giresun",
"29.0" : "Gümüşhane",
"30.0" : "Hakkâri",
"31.0" : "Hatay",
"32.0" : "Isparta",
"33.0" : "Mersin",
"34.0" : "İstanbul",
"35.0" : "İzmir",
"36.0" : "Kars",
"37.0" : "Kastamonu",
"38.0" : "Kayseri",
"39.0" : "Kırklareli",
"40.0" : "Kırşehir",
"41.0" : "Kocaeli",
"42.0" : "Konya",
"43.0" : "Kütahya",
"44.0" : "Malatya",
"45.0" : "Manisa",
"46.0" : "Kahramanmaraş",
"47.0" : "Mardin",
"48.0" : "Muğla",
"49.0" : "Muş",
"50.0" : "Nevşehir",
"51.0" : "Niğde",
"52.0" : "Ordu",
"53.0" : "Rize",
"54.0" : "Sakarya",
"55.0" : "Samsun",
"56.0" : "Siirt",
"57.0" : "Sinop",
"58.0" : "Sivas",
"59.0" : "Tekirdağ",
"60.0" : "Tokat",
"61.0" : "Trabzon",
"62.0" : "Tunceli",
"63.0" : "Şanlıurfa",
"64.0" : "Uşak",
"65.0" : "Van",
"66.0" : "Yozgat",
"67.0" : "Zonguldak",
"68.0" : "Aksaray",
"69.0" : "Bayburt",
"70.0" : "Karaman",
"71.0" : "Kırıkkale",
"72.0" : "Batman",
"73.0" : "Şırnak",
"74.0" : "Bartın",
"75.0" : "Ardahan",
"76.0" : "Iğdır",
"77.0" : "Yalova",
"78.0" : "Karabük",
"79.0" : "Kilis",
"80.0" : "Osmaniye",
"81.0" : "Düzce"
}


In [283]:
import swifter
from unidecode import unidecode

def location(string_to_check):
    global val
    string_to_check = unidecode(str(string_to_check).lower())
    if ", " in string_to_check:
        sub = string_to_check.split(", ")
    elif "," in string_to_check:
        sub = string_to_check.split(",")
    elif "/" in string_to_check:
        sub = string_to_check.split("/")
    elif "-" in string_to_check:
        sub = string_to_check.split("-")
    else:
        sub = string_to_check.split()

    sub_dict = dict((v,unidecode(k.lower())) for v, k in province.items())

    for val in sub:
        if val in sub_dict.values():
            return val
        else:
            continue
    return val


#df['result'] = df['string_to_check'].apply(check_string)

In [284]:
t["job_location"] = t["location"].swifter.apply(location)


Pandas Apply:   0%|          | 0/160736 [00:00<?, ?it/s]

In [285]:
t.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared,job_location
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025,calismalar
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025,developer
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256,elazig
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289,elazig
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324,mersin


In [286]:
t.job_location.value_counts()[:100]

istanbul              55079
nan                   52988
ankara                17569
izmir                  6141
kocaeli                3834
turkey                 1804
bursa                  1327
eskisehir              1277
sakarya                1002
turkiye                 884
manisa                  871
antalya                 851
adana                   627
kayseri                 622
konya                   587
germany                 451
tekirdag                377
gebze                   344
trabzon                 316
mersin                  279
mugla                   273
united states           239
area                    205
elazig                  203
 istanbul               194
teknokent               176
united kingdom          163
spain                   163
denizli                 161
gaziantep               159
balikesir               159
samsun                  159
edirne                  157
isparta                 154
maslak                  144
stajyer             

In [287]:
mask = t.job_location.map(t.job_location.value_counts()) < 203
t.job_location =  t.job_location.mask(mask, "nan")

In [288]:
t.loc[t['job_location'].str.contains('turk'), 'job_location'] = 'turkiye'


In [289]:
t.job_location.replace("nan", np.nan, inplace=True)

In [290]:
t.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared,job_location
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025,
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025,
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256,elazig
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289,elazig
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324,mersin


In [291]:
t.job_location.value_counts()

istanbul         55079
ankara           17569
izmir             6141
kocaeli           3834
turkiye           2688
bursa             1327
eskisehir         1277
sakarya           1002
manisa             871
antalya            851
adana              627
kayseri            622
konya              587
germany            451
tekirdag           377
gebze              344
trabzon            316
mersin             279
mugla              273
united states      239
area               205
elazig             203
Name: job_location, dtype: int64

In [293]:
t = pd.merge(t,train[['user_id','moved_after_2019']],on='user_id', how='left')
t.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared,job_location,moved_after_2019
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025,,0.0
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025,,0.0
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256,elazig,
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289,elazig,
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324,mersin,


In [294]:
Mean_encoded_subject = t.groupby(['company_id'])['moved_after_2019'].mean().to_dict()
  
t['company_id2'] =  t['company_id'].map(Mean_encoded_subject)

In [295]:
t.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared,job_location,moved_after_2019,company_id2
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025,,0.0,0.0
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,NaT,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025,,0.0,0.0
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256,elazig,,1.0
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289,elazig,,1.0
4,2,10,"Mersin, Turkey",201806,2018-06-01,NaT,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324,mersin,,0.333333


In [296]:
t.company_id2.value_counts().shape


(574,)

In [300]:
t.to_csv("work_ex.csv", index=False)

## Agg work_ex then finish other datasets!

In [4]:
work_exp = pd.read_csv("./work_ex.csv")
check_df(work_exp)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(160736, 28)
**********************************************************************
******************************* Types ********************************
**********************************************************************
user_id                       int64
company_id                    int64
location                     object
start_year_month              int64
start_date                   object
start_year_month_shifted     object
job_duration                float64
job_count                     int64
min_start_year                int64
working_years                 int64
total_job_duration          float64
avg_work_year               float64
quit_job_2018                 int64
max_start_year                int64
job_tenure_groups            object
year_of_job_start    

In [7]:
work_exp.dtypes

user_id                       int64
company_id                    int64
location                     object
start_year_month              int64
start_date                   object
start_year_month_shifted     object
job_duration                float64
job_count                     int64
min_start_year                int64
working_years                 int64
total_job_duration          float64
avg_work_year               float64
quit_job_2018                 int64
max_start_year                int64
job_tenure_groups            object
year_of_job_start             int64
month_of_job_start            int64
quit_job_in_2018              int64
moving_average              float64
quit_job_in_2018_change     float64
time_since_last_change      float64
seasonal_trend              float64
start_quarter                object
start_season                 object
start_date_squared            int64
job_location                 object
moved_after_2019            float64
company_id2                 

In [5]:
work_exp.head()

Unnamed: 0,user_id,company_id,location,start_year_month,start_date,start_year_month_shifted,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared,job_location,moved_after_2019,company_id2
0,0,0,Serbest Çalışmalar,200509,2005-09-01,2005-09-01,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025,,0.0,0.0
1,0,0,Visual Studio Asp.Net Developer,200509,2005-09-01,,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025,,0.0,0.0
2,2,9,"Elazig, Turkey",201612,2016-12-01,2017-06-01,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256,elazig,,1.0
3,2,7,"Elazig, Turkey",201706,2017-06-01,2018-06-01,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289,elazig,,1.0
4,2,10,"Mersin, Turkey",201806,2018-06-01,,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324,mersin,,0.333333


In [8]:
work_exp.drop(columns=["company_id", "location", "start_year_month", "start_date", "start_year_month_shifted"], inplace=True)

In [20]:
feature_aggs = {
    'job_duration' : ['max', 'min', 'std'],
    'job_count' : ['mean'],
    'min_start_year' : ['min'],
    'working_years': ['mean'],
    'total_job_duration' : ['min'],
    'avg_work_year' : ['min'],
    'quit_job_2018' : ['mean', 'sum'],
    'max_start_year': ['min'],
    'year_of_job_start': ['mean', 'sum'],
    'month_of_job_start':['mean', 'max', 'min', 'sum'],
    'quit_job_in_2018':['mean', 'sum'],
    'moving_average':['mean', 'max', 'min', 'sum'],
    'quit_job_in_2018_change':['mean', 'sum'],
    'time_since_last_change':['mean', 'sum'],
    'seasonal_trend':['mean', 'max', 'min', 'sum'],
    'start_date_squared':['mean', 'max', 'min', 'sum'],
    'company_id2':['mean', 'sum']}

In [19]:
work_exp.dtypes

user_id                      int64
job_duration               float64
job_count                    int64
min_start_year               int64
working_years                int64
total_job_duration         float64
avg_work_year              float64
quit_job_2018                int64
max_start_year               int64
job_tenure_groups           object
year_of_job_start            int64
month_of_job_start           int64
quit_job_in_2018             int64
moving_average             float64
quit_job_in_2018_change    float64
time_since_last_change     float64
seasonal_trend             float64
start_quarter               object
start_season                object
start_date_squared           int64
job_location                object
moved_after_2019           float64
company_id2                float64
dtype: object

In [22]:
feature_df = work_exp.drop(labels=["moved_after_2019"], axis=1).\
                drop_duplicates().reset_index(drop=True).groupby('user_id').agg(feature_aggs)
feature_df.columns = ['work_' + '_'.join(col) for col in feature_df.columns]
feature_df.reset_index(inplace=True)

In [23]:
feature_df.head()

Unnamed: 0,user_id,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum
0,0,13.339726,0.0,9.432611,1.0,2005,13.0,13.339726,13.339726,0.0,0,2005,2005.0,4010,9.0,9,9,18,0.0,0,0.011321,0.011321,0.011321,0.022642,0.0,0.0,0.0,0.0,0.140378,0.140378,0.140378,0.280756,4020025.0,4020025,4020025,8040050,0.0,0.0
1,2,1.0,0.49863,0.26833,3.0,2016,2.0,2.082192,0.694064,0.333333,1,2018,2017.0,6051,8.0,12,6,24,0.333333,1,0.296476,0.379919,0.234584,0.889427,0.0,0.0,2.0,2.0,0.130525,0.14075,0.110077,0.391576,4068290.0,4072324,4064256,12204869,0.777778,2.333333
2,5,1.583562,1.583562,,1.0,2017,1.0,1.583562,1.583562,0.0,0,2017,2017.0,2017,6.0,6,6,6,0.0,0,0.379919,0.379919,0.379919,0.379919,0.0,0.0,0.0,0.0,0.14075,0.14075,0.14075,0.14075,4068289.0,4068289,4068289,4068289,0.666667,0.666667
3,7,1.334247,0.082192,0.640305,4.0,2016,2.0,2.50137,0.625342,0.25,1,2018,2017.0,8068,8.5,12,7,34,0.25,1,0.274641,0.436094,0.028481,1.098563,0.0,0.0,2.0,2.0,0.144913,0.164128,0.110077,0.579651,4068290.0,4072324,4064256,16273158,0.25885,1.035398
4,10,4.750685,0.0,2.571646,2.0,2013,5.0,5.419178,2.709589,0.666667,2,2018,2016.333333,6049,6.0,8,5,18,0.666667,2,0.13278,0.177026,0.044289,0.39834,0.0,0.0,2.0,6.0,0.134272,0.141319,0.130749,0.402817,4065606.0,4072324,4052169,12196817,0.394827,1.18448


In [30]:
work_exp[:15]

Unnamed: 0,user_id,job_duration,job_count,min_start_year,working_years,total_job_duration,avg_work_year,quit_job_2018,max_start_year,job_tenure_groups,year_of_job_start,month_of_job_start,quit_job_in_2018,moving_average,quit_job_in_2018_change,time_since_last_change,seasonal_trend,start_quarter,start_season,start_date_squared,job_location,moved_after_2019,company_id2
0,0,0.0,1,2005,13,13.339726,13.339726,0,2005,,2005,9,0,0.011321,,0.0,0.140378,Q3,fall,4020025,,0.0,0.0
1,0,13.339726,1,2005,13,13.339726,13.339726,0,2005,More than 10 years,2005,9,0,0.011321,0.0,,0.140378,Q3,fall,4020025,,0.0,0.0
2,2,0.49863,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2016,12,0,0.274924,0.0,2.0,0.110077,Q4,winter,4064256,elazig,,1.0
3,2,1.0,3,2016,2,2.082192,0.694064,0,2018,Less than 1 year,2017,6,1,0.379919,1.0,,0.14075,Q2,summer,4068289,elazig,,1.0
4,2,0.583562,3,2016,2,2.082192,0.694064,1,2018,Less than 1 year,2018,6,0,0.234584,-1.0,,0.14075,Q2,summer,4072324,mersin,,0.333333
5,5,1.583562,1,2017,1,1.583562,1.583562,0,2017,1-2 years,2017,6,0,0.379919,0.0,0.0,0.14075,Q2,summer,4068289,istanbul,0.0,0.666667
6,7,1.0,4,2016,2,2.50137,0.625342,0,2018,Less than 1 year,2016,7,0,0.210332,0.0,,0.164128,Q3,summer,4064256,istanbul,0.0,0.345133
7,7,0.084932,4,2016,2,2.50137,0.625342,0,2018,Less than 1 year,2017,7,0,0.436094,0.0,2.0,0.164128,Q3,summer,4068289,istanbul,0.0,0.345133
8,7,1.334247,4,2016,2,2.50137,0.625342,0,2018,1-2 years,2017,8,1,0.423656,1.0,,0.141319,Q3,summer,4068289,,0.0,0.345133
9,7,0.082192,4,2016,2,2.50137,0.625342,1,2018,Less than 1 year,2018,12,0,0.028481,-1.0,,0.110077,Q4,winter,4072324,elazig,0.0,0.0


In [24]:
feature_df.user_id.value_counts()

0        1
44301    1
44290    1
44291    1
44292    1
        ..
22367    1
22369    1
22370    1
22371    1
66273    1
Name: user_id, Length: 53002, dtype: int64

In [37]:
categorical = pd.get_dummies(work_exp.select_dtypes('object'))
categorical['user_id'] = work_exp['user_id']
categorical.head()

Unnamed: 0,job_tenure_groups_1-2 years,job_tenure_groups_2-5 years,job_tenure_groups_5-10 years,job_tenure_groups_Less than 1 year,job_tenure_groups_More than 10 years,start_quarter_Q1,start_quarter_Q2,start_quarter_Q3,start_quarter_Q4,start_season_fall,start_season_spring,start_season_summer,start_season_winter,job_location_adana,job_location_ankara,job_location_antalya,job_location_area,job_location_bursa,job_location_elazig,job_location_eskisehir,job_location_gebze,job_location_germany,job_location_istanbul,job_location_izmir,job_location_kayseri,job_location_kocaeli,job_location_konya,job_location_manisa,job_location_mersin,job_location_mugla,job_location_sakarya,job_location_tekirdag,job_location_trabzon,job_location_turkiye,job_location_united states,user_id
0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
4,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2


In [39]:
categorical_grouped = categorical.groupby('user_id').agg(['sum', 'mean'])
categorical_grouped.head()

Unnamed: 0_level_0,job_tenure_groups_1-2 years,job_tenure_groups_1-2 years,job_tenure_groups_2-5 years,job_tenure_groups_2-5 years,job_tenure_groups_5-10 years,job_tenure_groups_5-10 years,job_tenure_groups_Less than 1 year,job_tenure_groups_Less than 1 year,job_tenure_groups_More than 10 years,job_tenure_groups_More than 10 years,start_quarter_Q1,start_quarter_Q1,start_quarter_Q2,start_quarter_Q2,start_quarter_Q3,start_quarter_Q3,start_quarter_Q4,start_quarter_Q4,start_season_fall,start_season_fall,start_season_spring,start_season_spring,start_season_summer,start_season_summer,start_season_winter,start_season_winter,job_location_adana,job_location_adana,job_location_ankara,job_location_ankara,job_location_antalya,job_location_antalya,job_location_area,job_location_area,job_location_bursa,job_location_bursa,job_location_elazig,job_location_elazig,job_location_eskisehir,job_location_eskisehir,job_location_gebze,job_location_gebze,job_location_germany,job_location_germany,job_location_istanbul,job_location_istanbul,job_location_izmir,job_location_izmir,job_location_kayseri,job_location_kayseri,job_location_kocaeli,job_location_kocaeli,job_location_konya,job_location_konya,job_location_manisa,job_location_manisa,job_location_mersin,job_location_mersin,job_location_mugla,job_location_mugla,job_location_sakarya,job_location_sakarya,job_location_tekirdag,job_location_tekirdag,job_location_trabzon,job_location_trabzon,job_location_turkiye,job_location_turkiye,job_location_united states,job_location_united states
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2
0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0,2,1.0,0,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,0,0.0,0,0.0,0,0.0,3,1.0,0,0.0,0,0.0,2,0.666667,0,0.0,1,0.333333,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,2,0.666667,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
5,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
7,1,0.25,0,0.0,0,0.0,3,0.75,0,0.0,0,0.0,0,0.0,3,0.75,1,0.25,0,0.0,0,0.0,3,0.75,1,0.25,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25,0,0.0,0,0.0,0,0.0,2,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
10,0,0.0,1,0.333333,0,0.0,1,0.333333,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,2,0.666667,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [40]:
group_var = 'SK_ID_CURR'

# Need to create new column names
columns = []

# Iterate through the variables names
for var in categorical_grouped.columns.levels[0]:
    # Skip the grouping variable
    if var != group_var:
        # Iterate through the stat names
        for stat in ['count', 'count_norm']:
            # Make a new column name for the variable and stat
            columns.append('%s_%s' % (var, stat))

#  Rename the columns
categorical_grouped.columns = columns

categorical_grouped.head()

Unnamed: 0_level_0,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.5,0,0.0,0,0.0,2,1.0,0,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,0,0.0,0,0.0,0,0.0,3,1.0,0,0.0,0,0.0,2,0.666667,0,0.0,1,0.333333,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,2,0.666667,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
5,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
7,1,0.25,0,0.0,0,0.0,3,0.75,0,0.0,0,0.0,0,0.0,3,0.75,1,0.25,0,0.0,0,0.0,3,0.75,1,0.25,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25,0,0.0,0,0.0,0,0.0,2,0.5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
10,0,0.0,1,0.333333,0,0.0,1,0.333333,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,2,0.666667,1,0.333333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.333333,0,0.0,0,0.0,0,0.0,2,0.666667,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [41]:
categorical_grouped.shape

(53002, 70)

In [42]:
feature_df.shape

(53002, 38)

In [88]:
train = pd.read_csv("./train_user_lang.csv")

In [89]:
test = pd.read_csv("./test_user_lang.csv")


In [90]:
train = train.merge(feature_df, on="user_id",how="left")

In [91]:
test = test.merge(feature_df, on="user_id",how="left")

In [92]:
train = train.merge(categorical_grouped, on="user_id",how="left")

In [93]:
test = test.merge(categorical_grouped, on="user_id",how="left")

In [94]:
test.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.005479,0.0,5.570471,2.0,2001.0,17.0,18.008219,9.00411,0.0,0.0,2012.0,2004.666667,6014.0,1.0,1.0,1.0,3.0,0.0,0.0,0.009724,0.029172,0.0,0.029172,0.0,0.0,0.0,0.0,0.131695,0.131695,0.131695,0.395086,4018715.0,4048144.0,4004001.0,12056146.0,0.133333,0.4,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,1.0,0.333333,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.583562,1.001573,2.0,2016.0,2.0,2.583562,1.291781,0.5,1.0,2018.0,2017.0,4034.0,6.0,6.0,6.0,12.0,0.5,1.0,0.205602,0.234584,0.17662,0.411204,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.281499,4068290.0,4072324.0,4064256.0,8136580.0,0.35474,0.35474,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,5.509589,1.912329,2.543647,2.0,2011.0,7.0,7.421918,3.710959,0.0,0.0,2017.0,2014.0,4028.0,5.0,8.0,2.0,10.0,0.0,0.0,0.163551,0.28125,0.045852,0.327102,0.0,0.0,,0.0,0.150436,0.159553,0.141319,0.300872,4056205.0,4068289.0,4044121.0,8112410.0,0.423077,0.846154,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.583562,2.583562,,1.0,2016.0,2.0,2.583562,2.583562,0.0,0.0,2016.0,2016.0,2016.0,6.0,6.0,6.0,6.0,0.0,0.0,0.17662,0.17662,0.17662,0.17662,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.14075,4064256.0,4064256.0,4064256.0,4064256.0,0.5,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
train.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.416438,0.419178,1.053681,3.0,2015.0,3.0,3.671233,1.223744,0.333333,1.0,2018.0,2016.0,6048.0,6.0,10.0,3.0,18.0,0.333333,1.0,0.188044,0.295428,0.104247,0.564131,0.0,0.0,0.666667,2.0,0.140932,0.150295,0.130749,0.422795,4064258.0,4072324.0,4060225.0,12192774.0,0.784913,2.35474,0.0,0.0,1.0,0.333333,0.0,0.0,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,1.666667,3.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.912329,0.49863,0.569431,6.0,2013.0,5.0,5.753425,0.958904,0.0,0.0,2017.0,2015.166667,12091.0,5.0,8.0,2.0,30.0,0.0,0.0,0.157705,0.28125,0.059347,0.946232,0.0,0.0,0.5,2.0,0.144526,0.159553,0.124661,0.867154,4060898.0,4068289.0,4052169.0,24365391.0,0.365522,2.193131,3.0,0.5,0.0,0.0,0.0,0.0,3.0,0.5,0.0,0.0,2.0,0.333333,2.0,0.333333,2.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.166667,3.0,0.5,2.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.668493,0.747945,0.393513,4.0,2014.0,4.0,5.0,1.25,0.0,0.0,2017.0,2015.5,8062.0,4.25,8.0,1.0,17.0,0.0,0.0,0.181361,0.297016,0.077922,0.725446,0.0,0.0,2.0,8.0,0.138515,0.150295,0.130749,0.554058,4062242.0,4068289.0,4056196.0,16248966.0,0.09375,0.375,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.252055,1.082192,1.35696,4.0,2009.0,9.0,9.336986,2.334247,0.0,0.0,2014.0,2011.25,8045.0,10.25,12.0,9.0,41.0,0.0,0.0,0.041784,0.08548,0.014599,0.167136,0.0,0.0,0.0,0.0,0.133489,0.141751,0.110077,0.533957,4045130.0,4056196.0,4036081.0,16180521.0,0.35474,1.41896,2.0,0.5,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,3.0,0.75,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,6.671233,0.915068,3.110291,3.0,2009.0,9.0,9.336986,3.112329,0.0,0.0,2017.0,2012.0,6036.0,7.0,9.0,4.0,21.0,0.0,0.0,0.102134,0.265915,0.017964,0.306401,0.0,0.0,0.666667,2.0,0.135452,0.141319,0.124661,0.406357,4048157.0,4068289.0,4036081.0,12144470.0,0.299816,0.899447,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
train.to_csv("train_user_lang_work_revise.csv", index=False)
test.to_csv("test_user_lang_work_revise.csv", index=False)

## do company_id target(mean) encode with library

## Skills

In [17]:
skills = pd.read_csv("../data/skills.csv")

In [3]:
def count_skills(skill_data):
    skill_data = skill_data.copy()
    return skill_data.groupby("user_id").agg({"skill": "nunique"}).reset_index().rename(columns={"skill": "skill_count_raw"})

In [4]:
skills_count = count_skills(skills)

In [5]:
skills_count

Unnamed: 0,user_id,skill_count_raw
0,1,2
1,2,6
2,3,3
3,5,11
4,6,6
...,...,...
62397,66269,31
62398,66270,8
62399,66271,47
62400,66272,36


In [9]:
skills.skill.value_counts()[:200]

Java                                 24360
JavaScript                           24072
SQL                                  23590
C#                                   23422
HTML                                 16872
CSS                                  15109
C++                                  14762
Microsoft SQL Server                 14593
C                                    14121
Microsoft Office                     13854
Python                               13353
MySQL                                13320
Git                                  12651
Linux                                11552
Software Development                 10978
jQuery                               10497
ASP.NET                              10178
HTML5                                 9380
.NET                                  8963
ASP.NET MVC                           8806
PHP                                   8444
Yazılım Geliştirme                    8243
OOP (Nesne Yönelimli Programlama)     8196
Visual Stud

In [35]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
test = pd.read_csv("./test_user_lang_work_revise.csv")

In [36]:
skills_train_data = pd.merge(skills,train[['user_id','moved_after_2019']],on='user_id', how='right')
skills_test_data = pd.merge(skills,test[['user_id']],on='user_id', how='right')


In [37]:
skills_test_data

Unnamed: 0,user_id,skill
0,17449,Bioinformatics
1,17449,Machine Learning
2,17449,Artificial Intelligence (AI)
3,17449,Android Development
4,17449,iOS development
...,...,...
282638,16036,Perl
282639,16036,Embedded Software
282640,16036,Simulink
282641,16036,Swift


In [38]:
target_train = skills_train_data[['moved_after_2019']]
train_skills = skills_train_data.drop('moved_after_2019', axis = 1)
test_skills = skills_test_data.copy()

In [39]:
def encode_test(skills_train):
    from sklearn.model_selection import KFold
    from catboost import CatBoostClassifier
    import lightgbm as lgb
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
    import re

    

    train = pd.read_csv("./train_user_lang_work_revise.csv")
    train = train.merge(skills_train, on="user_id", how="left")
    train = train.merge(skills_count, on="user_id", how="left")



    X_train_user_id = train["user_id"]

    y = train["moved_after_2019"]
    X = train.drop(columns=["user_id", "moved_after_2019", "work_company_id2_mean","work_company_id2_sum"])
    X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    auc_errs = []
    ap_errs = []
    prec_errs = []
    recall_errs = []
    f1_errs = []
    acc_scores = []

    models = []

    kf = KFold(n_splits=10)

    for split_train, split_val in list(kf.split(X)):
        
        split_train = X.index[split_train]
        split_val = X.index[split_val]
        
        X_train, y_train = X.loc[split_train], y.loc[split_train]
        X_val, y_val = X.loc[split_val], y.loc[split_val]
        print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
        print("Positive Count in Val Split:", y_val.sum())

        model = lgb.LGBMClassifier()
        

        model.fit(X_train, y_train, eval_metric = 'accuracy',
                    eval_set = [(X_val, y_val), (X_train, y_train)],
                    eval_names = ['valid', 'train'],
                    early_stopping_rounds = 100, verbose = 200)
        
        preds = model.predict(X_val)
        pred_probas = model.predict_proba(X_val)[:,1]

        print("Positive Count in Predictions:", preds.sum())
        
        acc = accuracy_score(y_val, preds)
        print("Accuracy: ", acc)
        acc_scores.append(acc)
        
        f1_err = f1_score(y_val, preds)
        print("Fold F1: ", f1_err)
        f1_errs.append(f1_err)

        prec_err = precision_score(y_val, preds)
        print("Fold Precision: ", prec_err)
        prec_errs.append(prec_err)

        recall_err = recall_score(y_val, preds)
        print("Fold Recall: ", recall_err)
        recall_errs.append(recall_err)
        
        auc_err = roc_auc_score(y_val, pred_probas)
        print("Fold AUC: ", auc_err)
        auc_errs.append(auc_err)

        ap_err = average_precision_score(y_val, pred_probas)
        print("Fold AP: ", ap_err)
        ap_errs.append(ap_err)
        
        models.append(model)
        
        print("\n", "*"*70, "\n")

    import plotly.express as px
    importance = [model.feature_importances_ for model in models]

    f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                            pd.Series(np.mean(importance,axis=0),name="Importance")],
                            axis=1).sort_values(by='Importance',
                                                ascending=True)

    fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
    fig.update_layout(
        title_text="First 20 Important Features - CatBoost Average of Folds"
    )
    fig.show()

    return models


        

        

In [40]:
def cat_agg(categorical_grouped):
    group_var = 'user_id'

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in categorical_grouped.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in ['count', 'count_norm']:
                # Make a new column name for the variable and stat
                columns.append('%s_%s' % (var, stat))

    #  Rename the columns
    categorical_grouped.columns = columns
    print(categorical_grouped.head())

In [40]:
import category_encoders as ce
cbe_encoder = ce.cat_boost.CatBoostEncoder()
cbe_encoder.fit(train_skills, target_train)
skills_catboost_train = cbe_encoder.transform(train_skills)
skills_catboost_test = cbe_encoder.transform(test_skills)


In [41]:
skills_catboost_train_agg = skills_catboost_train.groupby('user_id').agg(['sum', 'mean'])

In [42]:
skills_catboost_test_agg = skills_catboost_test.groupby('user_id').agg(['sum', 'mean'])

In [43]:
cat_agg(skills_catboost_test_agg)


         skill_count  skill_count_norm
user_id                               
1           0.736579          0.368290
2           2.576923          0.429487
4           0.250454          0.250454
11         16.082303          0.459494
14          4.656453          0.465645


In [44]:
cat_agg(skills_catboost_train_agg)


         skill_count  skill_count_norm
user_id                               
0           0.250454          0.250454
3           1.459549          0.486516
5           5.307578          0.482507
6           2.460933          0.410155
7           2.928670          0.418381


In [45]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_catboost_test_agg, on="user_id", how="left")
test.drop(columns=["user_id"],inplace=True)

models = encode_test(skills_catboost_train_agg)
model_preds = [model.predict(test) for model in models]
model_preds

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1752
Accuracy:  0.8208223311957752
Fold F1:  0.7514390371533229
Fold Precision:  0.819634703196347
Fold Recall:  0.693719806763285
Fold AUC:  0.9076666158463672
Fold AP:  0.8773957523420316

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1743
Accuracy:  0.8234628442097321
Fold F1:  0.7540725170782975
Fold Precision:  0.8232931726907631
Fold Recall:  0.6955889481337858
Fold AUC:  0.9022063864465688
Fold AP:  0.8722981636426326

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1727
Accuracy:  0.8268577895133912
Fold F1:  0.7529601722282023
Fold Precision:  0.8100752750434279
Fold Recall:  0.7033685268979387
Fold AUC:  0.9056746151524299
Fold AP:  0.8731835730380374

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1716
Accuracy:  0.8276122218030931
Fold F1:  0.7533729087965461
Fold Precision:  0.8135198135198135
Fold Recall:  0.7015075376884422
Fold AUC:  0.9057322390940207
Fold AP:  0.871169808284765

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1758
Accuracy:  0.8206337231233497
Fold F1:  0.7509819324430479
Fold Precision:  0.8156996587030717
Fold Recall:  0.6957787481804949
Fold AUC:  0.9075474336351282
Fold AP:  0.876916892622669

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1780
Accuracy:  0.8213881554130517
Fold F1:  0.7511169513797634
Fold Precision:  0.8028089887640449
Fold Recall:  0.705679012345679
Fold AUC:  0.9022301337040428
Fold AP:  0.870250200045088

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1765
Accuracy:  0.8253489249339871
Fold F1:  0.7601036269430052
Fold Precision:  0.8311614730878187
Fold Recall:  0.7002386634844868
Fold AUC:  0.9073907539667478
Fold AP:  0.8823577608381425

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1755
Accuracy:  0.8332704639758581
Fold F1:  0.7688284518828452
Fold Precision:  0.8376068376068376
Fold Recall:  0.7104881585306911
Fold AUC:  0.9127045480265813
Fold AP:  0.8862193084966272

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1762
Accuracy:  0.8293096944549226
Fold F1:  0.7637692508483426
Fold Precision:  0.8303064699205448
Fold Recall:  0.7071048815853069
Fold AUC:  0.911227886896802
Fold AP:  0.8839024180274394

 ********************************************************************** 

Train shape: (47718, 380) | Val Shape: (5301, 380)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1741
Accuracy:  0.8251273344651953
Fold F1:  0.758152882859379
Fold Precision:  0.8345778288340034
Fold Recall:  0.6945506692160612
Fold AUC:  0.9036719741978076
Fold AP:  0.8748559963327756

 ********************************************************************** 



[array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64)]

In [32]:
df = pd.read_csv("../data/test_users.csv")

In [47]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,1
3,55082,1
4,37165,1


In [48]:
sample_submission.moved_after_2019.value_counts()

1    10810
0     2445
Name: moved_after_2019, dtype: int64

In [32]:
sample_submission.to_csv('submission_skills_catboost_encode.csv',index=False)


In [98]:
js_enc = ce.james_stein.JamesSteinEncoder()
js_enc.fit(train_skills, target_train)
skills_js_train = js_enc.transform(train_skills)
skills_js_test = js_enc.transform(test_skills)



In [99]:
skills_js_test

Unnamed: 0,user_id,skill
0,17449,0.388180
1,17449,0.442282
2,17449,0.458244
3,17449,0.408829
4,17449,0.353964
...,...,...
282638,16036,0.407164
282639,16036,0.381352
282640,16036,0.452682
282641,16036,0.435389


In [100]:
skills_js_train_agg = skills_js_train.groupby('user_id').agg(['sum', 'mean'])

In [101]:
skills_js_test_agg = skills_js_test.groupby('user_id').agg(['sum', 'mean'])

In [102]:
cat_agg(skills_js_train_agg)


         skill_count  skill_count_norm
user_id                               
0           0.320234          0.320234
3           1.343833          0.447944
5           4.905428          0.445948
6           2.458637          0.409773
7           2.895970          0.413710


In [103]:
cat_agg(skills_js_test_agg)

         skill_count  skill_count_norm
user_id                               
1           0.776445          0.388222
2           2.518453          0.419742
4           0.320234          0.320234
11         15.307604          0.437360
14          4.373484          0.437348


In [104]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_js_test_agg, on="user_id", how="left")
test.drop(columns=["user_id", "work_company_id2_mean","work_company_id2_sum"],inplace=True)

models = encode_test(skills_js_train_agg)
model_preds = [model.predict(test) for model in models]
model_preds

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1666
Accuracy:  0.8170501697472652
Fold F1:  0.7403640256959314
Fold Precision:  0.8301320528211285
Fold Recall:  0.6681159420289855
Fold AUC:  0.9016998642799063
Fold AP:  0.8755175002327718

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1671
Accuracy:  0.8140324405884571
Fold F1:  0.7359400107123728
Fold Precision:  0.822262118491921
Fold Recall:  0.666020358700921
Fold AUC:  0.8896411688795829
Fold AP:  0.8623468258957777

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1619
Accuracy:  0.8264805733685402
Fold F1:  0.7450110864745012
Fold Precision:  0.830142063001853
Fold Recall:  0.6757164404223228
Fold AUC:  0.9054379528092709
Fold AP:  0.8739751203244582

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1648
Accuracy:  0.8294983025273481
Fold F1:  0.7515118196811436
Fold Precision:  0.8294902912621359
Fold Recall:  0.6869346733668342
Fold AUC:  0.9025567905954897
Fold AP:  0.8742754364976753

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1667
Accuracy:  0.8219539796303281
Fold F1:  0.7467811158798283
Fold Precision:  0.8350329934013198
Fold Recall:  0.6754002911208151
Fold AUC:  0.9030947642716344
Fold AP:  0.8761826804206337

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1654
Accuracy:  0.8161071293851376
Fold F1:  0.734982332155477
Fold Precision:  0.8174123337363967
Fold Recall:  0.6676543209876543
Fold AUC:  0.9023342638742903
Fold AP:  0.8698096455912946

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1698
Accuracy:  0.8179932101093926
Fold F1:  0.745583970471922
Fold Precision:  0.8327444051825678
Fold Recall:  0.6749403341288783
Fold AUC:  0.9001317970162227
Fold AP:  0.8754385172215007

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1696
Accuracy:  0.8228970199924557
Fold F1:  0.750597609561753
Fold Precision:  0.8331367924528302
Fold Recall:  0.6829386176897052
Fold AUC:  0.9045141803570209
Fold AP:  0.8783136916330745

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1651
Accuracy:  0.8208223311957752
Fold F1:  0.7446236559139784
Fold Precision:  0.838885523924894
Fold Recall:  0.6694055099081682
Fold AUC:  0.9008340762111124
Fold AP:  0.8758746720128651

 ********************************************************************** 

Train shape: (47718, 378) | Val Shape: (5301, 378)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1650
Accuracy:  0.8181475193359743
Fold F1:  0.7423837520042759
Fold Precision:  0.8418181818181818
Fold Recall:  0.6639579349904398
Fold AUC:  0.9014992489455148
Fold AP:  0.8766852818687806

 ********************************************************************** 



[array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64)]

In [105]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,1
1,33967,1
2,2110,1
3,55082,1
4,37165,1


In [106]:
sample_submission.moved_after_2019.value_counts()

1    12444
0      811
Name: moved_after_2019, dtype: int64

In [61]:
sample_submission.to_csv('submission_skills_js_encode.csv',index=False)


In [72]:
target_enc = ce.target_encoder.TargetEncoder()
target_enc.fit(train_skills, target_train)
skills_target_train = target_enc.transform(train_skills)
skills_target_test = target_enc.transform(test_skills)


In [73]:
skills_target_train_agg = skills_target_train.groupby('user_id').agg(['sum', 'mean'])

In [74]:
skills_target_test_agg = skills_target_test.groupby('user_id').agg(['sum', 'mean'])

In [75]:
cat_agg(skills_target_train_agg)


         skill_count  skill_count_norm
user_id                               
0           0.250402          0.250402
3           1.460252          0.486751
5           5.292196          0.481109
6           2.460852          0.410142
7           2.928474          0.418353


In [76]:
cat_agg(skills_target_test_agg)

         skill_count  skill_count_norm
user_id                               
1           0.736436          0.368218
2           2.576957          0.429493
4           0.250402          0.250402
11         15.598652          0.445676
14          4.674605          0.467461


In [82]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_target_test_agg, on="user_id", how="left")
test.drop(columns=["user_id"],inplace=True)

models = encode_test(skills_target_train_agg)
model_preds = [model.predict(test) for model in models]
model_preds

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1742
Accuracy:  0.8098830629950962
Fold F1:  0.7355718782791186
Fold Precision:  0.8048220436280138
Fold Recall:  0.6772946859903382
Fold AUC:  0.8948450279810589
Fold AP:  0.8593997008472214

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1727
Accuracy:  0.8049792531120332
Fold F1:  0.7271767810026385
Fold Precision:  0.7979154603358425
Fold Recall:  0.6679592825981581
Fold AUC:  0.891287293718087
Fold AP:  0.8562761449330267

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1736
Accuracy:  0.8161071293851376
Fold F1:  0.738255033557047
Fold Precision:  0.7920506912442397
Fold Recall:  0.6913021618903972
Fold AUC:  0.8937898253251317
Fold AP:  0.8563828635750328

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1726
Accuracy:  0.8091286307053942
Fold F1:  0.7276641550053821
Fold Precision:  0.7833140208574739
Fold Recall:  0.6793969849246231
Fold AUC:  0.8939178834996238
Fold AP:  0.8535286548989808

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1759
Accuracy:  0.8091286307053942
Fold F1:  0.7350785340314137
Fold Precision:  0.7981807845366685
Fold Recall:  0.6812227074235808
Fold AUC:  0.895922961222366
Fold AP:  0.86009808948793

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1766
Accuracy:  0.8074311580535647
Fold F1:  0.7306779213927723
Fold Precision:  0.7842582106455266
Fold Recall:  0.6839506172839506
Fold AUC:  0.8886905744112539
Fold AP:  0.8510063832330541

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1735
Accuracy:  0.8159185213127121
Fold F1:  0.7451697127937337
Fold Precision:  0.822478386167147
Fold Recall:  0.681145584725537
Fold AUC:  0.8972111275082177
Fold AP:  0.8696052538996346

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1780
Accuracy:  0.8187476423990947
Fold F1:  0.7503247596778383
Fold Precision:  0.8112359550561797
Fold Recall:  0.6979217013049782
Fold AUC:  0.9010141458978571
Fold AP:  0.8696783322701852

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1720
Accuracy:  0.8161071293851376
Fold F1:  0.7426761678543151
Fold Precision:  0.8180232558139535
Fold Recall:  0.680038666022233
Fold AUC:  0.9011537765225306
Fold AP:  0.8697796415959813

 ********************************************************************** 

Train shape: (47718, 380) | Val Shape: (5301, 380)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1758
Accuracy:  0.8072061875117902
Fold F1:  0.7345454545454546
Fold Precision:  0.8043230944254836
Fold Recall:  0.6759082217973231
Fold AUC:  0.8915212622005391
Fold AP:  0.8582810049162124

 ********************************************************************** 



[array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([0, 1, 1, ..., 1, 1, 1], dtype=int64)]

In [83]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,1
3,55082,1
4,37165,1


In [84]:
sample_submission.moved_after_2019.value_counts()

1    9906
0    3349
Name: moved_after_2019, dtype: int64

In [85]:
sample_submission.to_csv('submission_skills_target_encode.csv',index=False)


In [86]:
count_enc = ce.CountEncoder()
count_enc.fit(train_skills, target_train)
skills_countencode_train = count_enc.transform(train_skills)
skills_countencode_test = count_enc.transform(test_skills)


In [87]:
skills_countencode_train

Unnamed: 0,user_id,skill
0,1301,766
1,1301,349
2,1301,9
3,1301,329
4,1301,185
...,...,...
1119667,248,1438
1119668,248,12145
1119669,248,13513
1119670,248,1


In [88]:
skills_countencode_test

Unnamed: 0,user_id,skill
0,17449,60.0
1,17449,2129.0
2,17449,567.0
3,17449,2420.0
4,17449,371.0
...,...,...
282638,16036,292.0
282639,16036,757.0
282640,16036,1027.0
282641,16036,1097.0


In [89]:
skills_countencode_train_agg = skills_countencode_train.groupby('user_id').agg(['sum', 'mean'])

In [90]:
skills_countencode_test_agg = skills_countencode_test.groupby('user_id').agg(['sum', 'mean'])

In [91]:
cat_agg(skills_countencode_train_agg)


         skill_count  skill_count_norm
user_id                               
0               3107       3107.000000
3               5805       1935.000000
5              10396        945.090909
6               9504       1584.000000
7              80157      11451.000000


In [92]:
cat_agg(skills_countencode_test_agg)

         skill_count  skill_count_norm
user_id                               
1             3463.0       1731.500000
2            38670.0       6445.000000
4             3107.0       3107.000000
11          205990.0       5885.428571
14           35572.0       3557.200000


In [93]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_countencode_test_agg, on="user_id", how="left")
test.drop(columns=["user_id"],inplace=True)

models = encode_test(skills_countencode_train_agg)
model_preds = [model.predict(test) for model in models]
model_preds

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1663
Accuracy:  0.7904564315352697
Fold F1:  0.7023841414412001
Fold Precision:  0.788334335538184
Fold Recall:  0.6333333333333333
Fold AUC:  0.876961813029129
Fold AP:  0.8377910894190997

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1641
Accuracy:  0.7913994718973972
Fold F1:  0.7014038876889849
Fold Precision:  0.7915904936014625
Fold Recall:  0.6296655356277266
Fold AUC:  0.8778094080909515
Fold AP:  0.8388396577802841

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1626
Accuracy:  0.7998868351565447
Fold F1:  0.7065006915629323
Fold Precision:  0.7853628536285363
Fold Recall:  0.6420311714429362
Fold AUC:  0.8776135482248654
Fold AP:  0.8357433215326825

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1630
Accuracy:  0.7974349302150132
Fold F1:  0.7033149171270718
Fold Precision:  0.7809815950920246
Fold Recall:  0.6396984924623116
Fold AUC:  0.8755884646663268
Fold AP:  0.8297748763459433

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1683
Accuracy:  0.7917766880422482
Fold F1:  0.7051282051282052
Fold Precision:  0.7843137254901961
Fold Recall:  0.6404657933042213
Fold AUC:  0.8766630422529391
Fold AP:  0.838675445591725

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1703
Accuracy:  0.7819690682761222
Fold F1:  0.6899141630901288
Fold Precision:  0.7551379917792131
Fold Recall:  0.6350617283950617
Fold AUC:  0.8691885456812728
Fold AP:  0.8243906474232134

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1649
Accuracy:  0.7902678234628442
Fold F1:  0.7029914529914529
Fold Precision:  0.7980594299575501
Fold Recall:  0.6281622911694511
Fold AUC:  0.8763723894553457
Fold AP:  0.8434857952637553

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1678
Accuracy:  0.7897019992455677
Fold F1:  0.7024286095543101
Fold Precision:  0.7842669845053635
Fold Recall:  0.6360560657322378
Fold AUC:  0.8820578683725722
Fold AP:  0.8461433328053293

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1665
Accuracy:  0.7940399849113542
Fold F1:  0.7075522228173541
Fold Precision:  0.7933933933933934
Fold Recall:  0.638472692121798
Fold AUC:  0.8819005222992649
Fold AP:  0.8450017425241713

 ********************************************************************** 

Train shape: (47718, 380) | Val Shape: (5301, 380)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1670
Accuracy:  0.7936238445576307
Fold F1:  0.709197235513025
Fold Precision:  0.7988023952095809
Fold Recall:  0.6376673040152964
Fold AUC:  0.8783273113917776
Fold AP:  0.8427419991171192

 ********************************************************************** 



[array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64)]

In [94]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,1


In [95]:
sample_submission.moved_after_2019.value_counts()

0    9603
1    3652
Name: moved_after_2019, dtype: int64

In [96]:
sample_submission.to_csv('submission_skills_count_encode.csv',index=False)


In [18]:
used_skills = skills['skill'].value_counts().iloc[:140].index # we only took most common 20 skills, you can increase it

skills = skills[skills['skill'].isin(used_skills)]
skills['experience'] = True
skills = skills.drop_duplicates(['user_id', 'skill'])
skills = pd.pivot(skills, index='user_id', columns='skill', values='experience')
skills = skills.fillna(0).astype(int)
skills.head()

skill,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Metotları,Algorithms,Amazon Web Services (AWS),Analitik Beceriler,Android,Android Development,Android Uygulama Geliştirme,Angular,AngularJS,Araştırma,Arduino,AutoCAD,Bootstrap,Business Analysis,C,C (Programming Language),C#,C++,CSS,Cascading Style Sheets (CSS),Data Analysis,Databases,Design Patterns,Django,Docker,Eclipse,Ekip Çalışması,Embedded Systems,Engineering,English,Entity Framework,Flutter,Git,HTML,HTML5,Hibernate,Integration,JIRA,JSON,JSP,Java,Java Enterprise Edition,JavaScript,Jenkins,Kubernetes,LINQ,Leadership,Liderlik,Linux,MVC,Machine Learning,Makine Öğrenimi,Management,Matlab,Maven,Microservices,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobile Applications,MongoDB,MySQL,Mühendislik,Node.js,OOP,OOP (Nesne Yönelimli Programlama),Object Oriented Design,Object-Oriented Programming (OOP),Oracle,Oracle SQL Developer,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Programlama,Programming,Proje Yönetimi,Project Management,Project Planning,Python,Python (Programming Language),RESTful WebServices,React Native,React.js,Redis,Research,SOAP,SQL,Scrum,Software Design,Software Development,Software Engineering,Software Project Management,SolidWorks,Sorun Çözme,Spring Boot,Spring Framework,T-SQL,TCP/IP,TFS,Team Management,Teamwork,Telecommunications,Tomcat,TypeScript,UML,Unity,Veri Analizi,Visual Basic,Visual Studio,Vue.js,WCF,Web Applications,Web Development,Web Geliştirme,Web Services,Web Tasarımı,Web Uygulamaları,Windows,Windows Server,WordPress,XML,Yazılım,Yazılım Geliştirme,Yönetim,jQuery,İletişim,İngilizce
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [19]:
skills.rename(columns={"C#": "Csharp","C++": "Cplusplus"}, inplace=True)
skills.head()


skill,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Metotları,Algorithms,Amazon Web Services (AWS),Analitik Beceriler,Android,Android Development,Android Uygulama Geliştirme,Angular,AngularJS,Araştırma,Arduino,AutoCAD,Bootstrap,Business Analysis,C,C (Programming Language),Csharp,Cplusplus,CSS,Cascading Style Sheets (CSS),Data Analysis,Databases,Design Patterns,Django,Docker,Eclipse,Ekip Çalışması,Embedded Systems,Engineering,English,Entity Framework,Flutter,Git,HTML,HTML5,Hibernate,Integration,JIRA,JSON,JSP,Java,Java Enterprise Edition,JavaScript,Jenkins,Kubernetes,LINQ,Leadership,Liderlik,Linux,MVC,Machine Learning,Makine Öğrenimi,Management,Matlab,Maven,Microservices,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobile Applications,MongoDB,MySQL,Mühendislik,Node.js,OOP,OOP (Nesne Yönelimli Programlama),Object Oriented Design,Object-Oriented Programming (OOP),Oracle,Oracle SQL Developer,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Programlama,Programming,Proje Yönetimi,Project Management,Project Planning,Python,Python (Programming Language),RESTful WebServices,React Native,React.js,Redis,Research,SOAP,SQL,Scrum,Software Design,Software Development,Software Engineering,Software Project Management,SolidWorks,Sorun Çözme,Spring Boot,Spring Framework,T-SQL,TCP/IP,TFS,Team Management,Teamwork,Telecommunications,Tomcat,TypeScript,UML,Unity,Veri Analizi,Visual Basic,Visual Studio,Vue.js,WCF,Web Applications,Web Development,Web Geliştirme,Web Services,Web Tasarımı,Web Uygulamaları,Windows,Windows Server,WordPress,XML,Yazılım,Yazılım Geliştirme,Yönetim,jQuery,İletişim,İngilizce
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [20]:
train = pd.read_csv("./train_user_lang_work_ed_revise.csv")
test = pd.read_csv("./test_user_lang_work_ed_revise.csv")

In [21]:
train.shape

(53019, 480)

In [22]:
test.shape

(13255, 479)

In [23]:
train = train.merge(skills, on="user_id", how="left")
train = train.merge(skills_count, on="user_id", how="left")
train.shape

(53019, 621)

In [24]:
test = test.merge(skills, on="user_id", how="left")
test = test.merge(skills_count, on="user_id", how="left")
test.shape

(13255, 620)

In [25]:
train.to_csv("train_user_lang_work_ed_skill_revise.csv", index=False)
test.to_csv("test_user_lang_work_ed_skill_revise.csv", index=False)

In [27]:
print("done")

done


In [101]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re



train = pd.read_csv("./train_user_lang_work_revise.csv")
train = train.merge(skills, on="user_id", how="left")
train = train.merge(skills_count, on="user_id", how="left")



X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()


    

    

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1650
Accuracy:  0.7913994718973972
Fold F1:  0.7026881720430107
Fold Precision:  0.7921212121212121
Fold Recall:  0.6314009661835749
Fold AUC:  0.877164944755345
Fold AP:  0.8384734126073782

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1655
Accuracy:  0.7868728781591852
Fold F1:  0.6960731576116191
Fold Precision:  0.7818731117824773
Fold Recall:  0.6272418807561804
Fold AUC:  0.8771376838000634
Fold AP:  0.8394677195462042

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1654
Accuracy:  0.7987551867219918
Fold F1:  0.7071095251166621
Fold Precision:  0.7787182587666264
Fold Recall:  0.6475615887380594
Fold AUC:  0.8778491482811362
Fold AP:  0.8372716386670165

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1619
Accuracy:  0.7953602414183327
Fold F1:  0.6993627043502355
Fold Precision:  0.7794935145151328
Fold Recall:  0.6341708542713568
Fold AUC:  0.8765321929696793
Fold AP:  0.8309512647623181

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1679
Accuracy:  0.7978121463598642
Fold F1:  0.7133689839572194
Fold Precision:  0.7945205479452054
Fold Recall:  0.6472586123241145
Fold AUC:  0.8773201075916421
Fold AP:  0.8397506171079114

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1722
Accuracy:  0.7866842700867597
Fold F1:  0.6981585268214572
Fold Precision:  0.759581881533101
Fold Recall:  0.6459259259259259
Fold AUC:  0.8714988490677638
Fold AP:  0.8276691245912008

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1640
Accuracy:  0.7930969445492267
Fold F1:  0.7062918340026774
Fold Precision:  0.8042682926829269
Fold Recall:  0.6295942720763723
Fold AUC:  0.8777437184321588
Fold AP:  0.8457230235161879

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1687
Accuracy:  0.7978121463598642
Fold F1:  0.7145899893503729
Fold Precision:  0.7954949614700652
Fold Recall:  0.6486225229579508
Fold AUC:  0.8837291004424077
Fold AP:  0.8468061850937031

 ********************************************************************** 

Train shape: (47717, 398) | Val Shape: (5302, 398)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1676
Accuracy:  0.7972463221425877
Fold F1:  0.712950600801068
Fold Precision:  0.7965393794749404
Fold Recall:  0.6452392460125664
Fold AUC:  0.8838290394922947
Fold AP:  0.8479604248909354

 ********************************************************************** 

Train shape: (47718, 398) | Val Shape: (5301, 398)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1675
Accuracy:  0.7949443501226183
Fold F1:  0.7114414653570481
Fold Precision:  0.8
Fold Recall:  0.6405353728489483
Fold AUC:  0.8793869357632424
Fold AP:  0.8447974413735947

 ********************************************************************** 



In [21]:
skill_normalize = skills['skill'].value_counts(normalize=True)

# Replace the skill column with the frequency values
skills['skill'] = skills['skill'].map(skill_normalize)

In [22]:
skills

Unnamed: 0,user_id,skill
0,1,0.002591
1,1,0.000507
2,2,0.004067
3,2,0.017419
4,2,0.000225
...,...,...
1398438,66273,0.000139
1398439,66273,0.000166
1398440,66273,0.002606
1398441,66273,0.000333


In [23]:
def add_noise(x, noise_level):
    return x * (1 + noise_level * np.random.randn())

# Add noise to the encoded values
skills['skill'] = skills['skill'].apply(lambda x: add_noise(x, 0.1))

In [24]:
skills

Unnamed: 0,user_id,skill
0,1,0.002846
1,1,0.000487
2,2,0.003809
3,2,0.017856
4,2,0.000265
...,...,...
1398438,66273,0.000152
1398439,66273,0.000148
1398440,66273,0.003073
1398441,66273,0.000351


In [25]:
skills_freq_test = skills.groupby('user_id').agg(['sum', 'mean'])

In [26]:
cat_agg(skills_freq_test)


         skill_count  skill_count_norm
user_id                               
1           0.003333          0.001667
2           0.034320          0.005720
3           0.005305          0.001768
5           0.009665          0.000879
6           0.009043          0.001507


In [127]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_freq_test, on="user_id", how="left")
test.drop(columns=["user_id"],inplace=True)

models = encode_test(skills_freq_test)
model_preds = [model.predict(test) for model in models]
model_preds

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1652
Accuracy:  0.7902678234628442
Fold F1:  0.7012358946802795
Fold Precision:  0.7899515738498789
Fold Recall:  0.6304347826086957
Fold AUC:  0.8772876608313005
Fold AP:  0.8389534139914324

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1655
Accuracy:  0.7906450396076952
Fold F1:  0.7014523937600862
Fold Precision:  0.7879154078549849
Fold Recall:  0.6320891904992729
Fold AUC:  0.8780769155366379
Fold AP:  0.8393526424519306

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1618
Accuracy:  0.7957374575631837
Fold F1:  0.6997504851677294
Fold Precision:  0.7799752781211372
Fold Recall:  0.6344896933132227
Fold AUC:  0.8767339746814543
Fold AP:  0.835031562240228

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1633
Accuracy:  0.7964918898528857
Fold F1:  0.7021805133866961
Fold Precision:  0.7789344764237599
Fold Recall:  0.6391959798994975
Fold AUC:  0.8749742067827058
Fold AP:  0.8287917579915661

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1683
Accuracy:  0.7944172010562052
Fold F1:  0.7088675213675213
Fold Precision:  0.7884729649435532
Fold Recall:  0.6438622028141678
Fold AUC:  0.8771870926557941
Fold AP:  0.8395295657161232

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1701
Accuracy:  0.7827235005658242
Fold F1:  0.6908212560386473
Fold Precision:  0.7566137566137566
Fold Recall:  0.6355555555555555
Fold AUC:  0.8686260016501091
Fold AP:  0.8238491587614551

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1641
Accuracy:  0.7895133911731422
Fold F1:  0.7012847965738758
Fold Precision:  0.7982937233394272
Fold Recall:  0.6252983293556086
Fold AUC:  0.8764040177624572
Fold AP:  0.8443680967403056

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1696
Accuracy:  0.7949830252734817
Fold F1:  0.7112881806108897
Fold Precision:  0.7895047169811321
Fold Recall:  0.6471725471242146
Fold AUC:  0.8820366397337032
Fold AP:  0.8459984981301084

 ********************************************************************** 

Train shape: (47717, 380) | Val Shape: (5302, 380)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1667
Accuracy:  0.7966804979253111
Fold F1:  0.711456102783726
Fold Precision:  0.7972405518896221
Fold Recall:  0.6423392943450943
Fold AUC:  0.8817821203134603
Fold AP:  0.8453013051177896

 ********************************************************************** 

Train shape: (47718, 380) | Val Shape: (5301, 380)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1674
Accuracy:  0.7932465572533485
Fold F1:  0.7089750398300585
Fold Precision:  0.7974910394265233
Fold Recall:  0.6381453154875717
Fold AUC:  0.8786477980488671
Fold AP:  0.8427487883136399

 ********************************************************************** 



[array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 1, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64)]

In [128]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0


In [129]:
sample_submission.moved_after_2019.value_counts()

0    10054
1     3201
Name: moved_after_2019, dtype: int64

In [130]:
sample_submission.to_csv('submission_freq_encode.csv',index=False)


In [10]:
work = pd.read_csv("../data/work_experiences.csv")
work

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,2651,"Istanbul, Turkey",201509
1,34558,815,"Istanbul, Turkey",201210
2,63761,26354,,200010
3,10738,89,,201610
4,8711,3113,"Istanbul, Turkey",201801
...,...,...,...,...
187014,22180,15065,Gebze,201205
187015,55822,25076,stajer,201506
187016,13750,1607,,201901
187017,3679,1414,"Ankara, Turkey",201807


In [11]:
work_normalize = work['company_id'].value_counts(normalize=True)

# Replace the skill column with the frequency values
work['company_id'] = work['company_id'].map(work_normalize)

In [12]:
work

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,0.000428,"Istanbul, Turkey",201509
1,34558,0.000577,"Istanbul, Turkey",201210
2,63761,0.000005,,200010
3,10738,0.008668,,201610
4,8711,0.000914,"Istanbul, Turkey",201801
...,...,...,...,...
187014,22180,0.000075,Gebze,201205
187015,55822,0.000005,stajer,201506
187016,13750,0.007037,,201901
187017,3679,0.001069,"Ankara, Turkey",201807


In [13]:
def add_noise(x, noise_level):
    return x * (1 + noise_level * np.random.randn())

# Add noise to the encoded values
work['company_id'] = work['company_id'].apply(lambda x: add_noise(x, 0.1))

In [14]:
work

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,0.000493,"Istanbul, Turkey",201509
1,34558,0.000569,"Istanbul, Turkey",201210
2,63761,0.000005,,200010
3,10738,0.009712,,201610
4,8711,0.000904,"Istanbul, Turkey",201801
...,...,...,...,...
187014,22180,0.000066,Gebze,201205
187015,55822,0.000006,stajer,201506
187016,13750,0.006794,,201901
187017,3679,0.001123,"Ankara, Turkey",201807


In [15]:
work.drop(columns=["location", "start_year_month"], inplace=True)

In [16]:
work_freq_test = work.groupby('user_id').agg(['sum', 'mean'])

In [19]:
cat_agg(work_freq_test)


         company_id_count  company_id_count_norm
user_id                                         
0                0.000053               0.000026
2                0.000060               0.000020
5                0.000015               0.000015
7                0.002689               0.000672
10               0.024308               0.008103


In [35]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_freq_test, on="user_id", how="left")
test.drop(columns=["user_id","work_company_id2_mean","work_company_id2_sum"],inplace=True)

In [36]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re



train = pd.read_csv("./train_user_lang_work_revise.csv")
train = train.merge(skills_freq_test, on="user_id", how="left")
train = train.merge(skills_count, on="user_id", how="left")



X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019", "work_company_id2_mean","work_company_id2_sum"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()



Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1183
Accuracy:  0.6889852885703508
Fold F1:  0.49308330771595454
Fold Precision:  0.6779374471682164
Fold Recall:  0.38743961352657
Fold AUC:  0.7247666003013344
Fold AP:  0.6326024622425557

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1164
Accuracy:  0.6938890984534138
Fold F1:  0.4970560892469786
Fold Precision:  0.6890034364261168
Fold Recall:  0.3887542413960252
Fold AUC:  0.730579520647609
Fold AP:  0.6482998247373172

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1158
Accuracy:  0.6961523953225198
Fold F1:  0.4880838894184938
Fold Precision:  0.6632124352331606
Fold Recall:  0.3861236802413273
Fold AUC:  0.7333473251692033
Fold AP:  0.6397127487488862

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1164
Accuracy:  0.6937004903809884
Fold F1:  0.48509828788839576
Fold Precision:  0.6572164948453608
Fold Recall:  0.3844221105527638
Fold AUC:  0.7258394933605224
Fold AP:  0.6195296820646117

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1186
Accuracy:  0.6840814786872879
Fold F1:  0.48413920542038796
Fold Precision:  0.6627318718381113
Fold Recall:  0.3813682678311499
Fold AUC:  0.7201775947755745
Fold AP:  0.6304795970570974

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1205
Accuracy:  0.6861561674839683
Fold F1:  0.48482972136222907
Fold Precision:  0.649792531120332
Fold Recall:  0.38666666666666666
Fold AUC:  0.7202194117624898
Fold AP:  0.6171451404896475

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1246
Accuracy:  0.6840814786872879
Fold F1:  0.49865309787488776
Fold Precision:  0.6685393258426966
Fold Recall:  0.39761336515513124
Fold AUC:  0.7269806576157615
Fold AP:  0.6430207995508965

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1199
Accuracy:  0.6846473029045643
Fold F1:  0.48837209302325585
Fold Precision:  0.6655546288573811
Fold Recall:  0.38569357177380376
Fold AUC:  0.7381490749770111
Fold AP:  0.6433678801148843

 ********************************************************************** 

Train shape: (47717, 378) | Val Shape: (5302, 378)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1167
Accuracy:  0.6820067898906074
Fold F1:  0.4789864029666254
Fold Precision:  0.6640959725792631
Fold Recall:  0.37457709038182696
Fold AUC:  0.7292251980355436
Fold AP:  0.6360659260206414

 ********************************************************************** 

Train shape: (47718, 378) | Val Shape: (5301, 378)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1165
Accuracy:  0.6817581588379551
Fold F1:  0.48203868590727666
Fold Precision:  0.6738197424892703
Fold Recall:  0.37523900573613767
Fold AUC:  0.7201456586905733
Fold AP:  0.6403016629248502

 ********************************************************************** 



In [37]:
    
model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 1], dtype=int64)]

In [38]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [39]:
sample_submission.moved_after_2019.value_counts()

0    11356
1     1899
Name: moved_after_2019, dtype: int64

In [40]:
sample_submission.to_csv('submission_freq_encode_compid_dropped.csv',index=False)


In [2]:
work_company = pd.read_csv("../data/work_experiences.csv",  usecols = ['user_id','company_id'])

In [3]:
work_company.head()

Unnamed: 0,user_id,company_id
0,53442,2651
1,34558,815
2,63761,26354
3,10738,89
4,8711,3113


In [4]:
work_company.groupby("user_id").company_id.nunique().value_counts()

1     14868
2     14547
3     12075
4      7646
5      4069
6      2047
7       942
8       426
9       186
10      101
11       64
14       30
13       23
12       21
16       14
15       10
19        3
23        2
18        2
17        1
24        1
20        1
Name: company_id, dtype: int64

In [5]:
df_new = pd.DataFrame()
df_new['company(1st)'] = work_company.groupby("user_id")['company_id'].nth(-1).astype(str)
df_new['company(2nd)'] = work_company.groupby("user_id")['company_id'].nth(-2).astype(str)
df_new['company(3rd)'] = work_company.groupby("user_id")['company_id'].nth(-3).astype(str)
df_new['company(4th)'] = work_company.groupby("user_id")['company_id'].nth(-4).astype(str)

work_company = df_new
work_company.head()

Unnamed: 0_level_0,company(1st),company(2nd),company(3rd),company(4th)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0.0,,
2,10,7.0,9.0,
5,15,,,
7,21,21.0,21.0,20.0
10,26,26.0,27.0,


In [6]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
train.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.416438,0.419178,1.053681,3.0,2015.0,3.0,3.671233,1.223744,0.333333,1.0,2018.0,2016.0,6048.0,6.0,10.0,3.0,18.0,0.333333,1.0,0.188044,0.295428,0.104247,0.564131,0.0,0.0,0.666667,2.0,0.140932,0.150295,0.130749,0.422795,4064258.0,4072324.0,4060225.0,12192774.0,0.784913,2.35474,0.0,0.0,1.0,0.333333,0.0,0.0,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,1.666667,3.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.912329,0.49863,0.569431,6.0,2013.0,5.0,5.753425,0.958904,0.0,0.0,2017.0,2015.166667,12091.0,5.0,8.0,2.0,30.0,0.0,0.0,0.157705,0.28125,0.059347,0.946232,0.0,0.0,0.5,2.0,0.144526,0.159553,0.124661,0.867154,4060898.0,4068289.0,4052169.0,24365391.0,0.365522,2.193131,3.0,0.5,0.0,0.0,0.0,0.0,3.0,0.5,0.0,0.0,2.0,0.333333,2.0,0.333333,2.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.166667,3.0,0.5,2.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.668493,0.747945,0.393513,4.0,2014.0,4.0,5.0,1.25,0.0,0.0,2017.0,2015.5,8062.0,4.25,8.0,1.0,17.0,0.0,0.0,0.181361,0.297016,0.077922,0.725446,0.0,0.0,2.0,8.0,0.138515,0.150295,0.130749,0.554058,4062242.0,4068289.0,4056196.0,16248966.0,0.09375,0.375,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.252055,1.082192,1.35696,4.0,2009.0,9.0,9.336986,2.334247,0.0,0.0,2014.0,2011.25,8045.0,10.25,12.0,9.0,41.0,0.0,0.0,0.041784,0.08548,0.014599,0.167136,0.0,0.0,0.0,0.0,0.133489,0.141751,0.110077,0.533957,4045130.0,4056196.0,4036081.0,16180521.0,0.35474,1.41896,2.0,0.5,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,3.0,0.75,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,6.671233,0.915068,3.110291,3.0,2009.0,9.0,9.336986,3.112329,0.0,0.0,2017.0,2012.0,6036.0,7.0,9.0,4.0,21.0,0.0,0.0,0.102134,0.265915,0.017964,0.306401,0.0,0.0,0.666667,2.0,0.135452,0.141319,0.124661,0.406357,4048157.0,4068289.0,4036081.0,12144470.0,0.299816,0.899447,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.005479,0.0,5.570471,2.0,2001.0,17.0,18.008219,9.00411,0.0,0.0,2012.0,2004.666667,6014.0,1.0,1.0,1.0,3.0,0.0,0.0,0.009724,0.029172,0.0,0.029172,0.0,0.0,0.0,0.0,0.131695,0.131695,0.131695,0.395086,4018715.0,4048144.0,4004001.0,12056146.0,0.133333,0.4,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,1.0,0.333333,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.583562,1.001573,2.0,2016.0,2.0,2.583562,1.291781,0.5,1.0,2018.0,2017.0,4034.0,6.0,6.0,6.0,12.0,0.5,1.0,0.205602,0.234584,0.17662,0.411204,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.281499,4068290.0,4072324.0,4064256.0,8136580.0,0.35474,0.35474,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,5.509589,1.912329,2.543647,2.0,2011.0,7.0,7.421918,3.710959,0.0,0.0,2017.0,2014.0,4028.0,5.0,8.0,2.0,10.0,0.0,0.0,0.163551,0.28125,0.045852,0.327102,0.0,0.0,,0.0,0.150436,0.159553,0.141319,0.300872,4056205.0,4068289.0,4044121.0,8112410.0,0.423077,0.846154,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.583562,2.583562,,1.0,2016.0,2.0,2.583562,2.583562,0.0,0.0,2016.0,2016.0,2016.0,6.0,6.0,6.0,6.0,0.0,0.0,0.17662,0.17662,0.17662,0.17662,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.14075,4064256.0,4064256.0,4064256.0,4064256.0,0.5,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
train.shape

(53019, 379)

In [9]:
test.shape

(13255, 378)

In [10]:
train.drop(columns=["work_company_id2_mean", "work_company_id2_sum"], inplace=True)
test.drop(columns=["work_company_id2_mean", "work_company_id2_sum"], inplace=True)

In [11]:
train.shape

(53019, 377)

In [12]:
test.shape

(13255, 376)

In [13]:
cat_cols = work_company.columns

for col in cat_cols:
    work_company[col] = work_company[col].factorize()[0]

work_company[cat_cols] = work_company[cat_cols].astype('category')

In [14]:
work_company.head()

Unnamed: 0_level_0,company(1st),company(2nd),company(3rd),company(4th)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,-1,-1
2,1,1,0,-1
5,2,-1,-1,-1
7,3,2,1,0
10,4,3,2,-1


In [15]:
work_company["company(1st)"].value_counts()

131      906
320      734
4        698
7        639
345      475
        ... 
7325       1
7324       1
7323       1
7322       1
12982      1
Name: company(1st), Length: 12983, dtype: int64

In [16]:
work_company.dtypes

company(1st)    category
company(2nd)    category
company(3rd)    category
company(4th)    category
dtype: object

In [17]:
train = train.merge(work_company, on="user_id", how="left")

In [18]:
train.shape


(53019, 381)

In [19]:
test = test.merge(work_company, on="user_id", how="left")

In [20]:
test.shape


(13255, 380)

In [39]:
train.dtypes

user_id                                              int64
is_yurtdisi                                          int64
industry_-1                                          int64
industry_Accounting                                  int64
industry_Airlines/Aviation                           int64
industry_Apparel & Fashion                           int64
industry_Architecture & Planning                     int64
industry_Automotive                                  int64
industry_Aviation & Aerospace                        int64
industry_Banking                                     int64
industry_Biotechnology                               int64
industry_Broadcast Media                             int64
industry_Building Materials                          int64
industry_Capital Markets                             int64
industry_Chemicals                                   int64
industry_Civic & Social Organization                 int64
industry_Civil Engineering                           int

In [21]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re






X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()



Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2070




Positive Count in Predictions: 1874
Accuracy:  0.9147491512636741
Fold F1:  0.885395537525355
Fold Precision:  0.9316969050160085
Fold Recall:  0.8434782608695652
Fold AUC:  0.9778269538910412
Fold AP:  0.9701324038858476

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2063




Positive Count in Predictions: 1909
Accuracy:  0.9204073934364391
Fold F1:  0.8937562940584088
Fold Precision:  0.929806181246726
Fold Recall:  0.8603974793989336
Fold AUC:  0.9807375184018933
Fold AP:  0.9739506631501856

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 1989




Positive Count in Predictions: 1807
Accuracy:  0.9155035835533761
Fold F1:  0.8819810326659642
Fold Precision:  0.9263973436635307
Fold Recall:  0.8416289592760181
Fold AUC:  0.9784999204043611
Fold AP:  0.9694071187109767

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 1990




Positive Count in Predictions: 1803
Accuracy:  0.9179554884949076
Fold F1:  0.8853150540469287
Fold Precision:  0.9312257348863006
Fold Recall:  0.8437185929648241
Fold AUC:  0.9801774573349841
Fold AP:  0.9713118562564065

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2061




Positive Count in Predictions: 1865
Accuracy:  0.9124858543945681
Fold F1:  0.8818135506877229
Fold Precision:  0.9281501340482574
Fold Recall:  0.8398835516739447
Fold AUC:  0.976183963922936
Fold AP:  0.9676222396512898

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2025




Positive Count in Predictions: 1863
Accuracy:  0.9117314221048661
Fold F1:  0.8796296296296297
Fold Precision:  0.9178743961352657
Fold Recall:  0.8444444444444444
Fold AUC:  0.9779782019838983
Fold AP:  0.9690830803605925

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2095




Positive Count in Predictions: 1880
Accuracy:  0.9209732176537155
Fold F1:  0.8945911949685534
Fold Precision:  0.9457446808510638
Fold Recall:  0.8486873508353222
Fold AUC:  0.9809304378176319
Fold AP:  0.9745519028412348

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2069




Positive Count in Predictions: 1905
Accuracy:  0.9192757450018861
Fold F1:  0.8922999496728736
Fold Precision:  0.9307086614173228
Fold Recall:  0.8569357177380377
Fold AUC:  0.9807339637441758
Fold AP:  0.9739934506155519

 ********************************************************************** 

Train shape: (47717, 379) | Val Shape: (5302, 379)
Positive Count in Val Split: 2069




Positive Count in Predictions: 1872
Accuracy:  0.9122972463221426
Fold F1:  0.8820096422227861
Fold Precision:  0.9284188034188035
Fold Recall:  0.8400193330111165
Fold AUC:  0.9783800515377532
Fold AP:  0.9704226749215756

 ********************************************************************** 

Train shape: (47718, 379) | Val Shape: (5301, 379)
Positive Count in Val Split: 2092




Positive Count in Predictions: 1933
Accuracy:  0.9058668175815884
Fold F1:  0.8760248447204968
Fold Precision:  0.9120538023797207
Fold Recall:  0.8427342256214149
Fold AUC:  0.975880008842244
Fold AP:  0.967848024242992

 ********************************************************************** 



In [22]:
z = test.copy()

In [23]:
z.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm,company(1st),company(2nd),company(3rd),company(4th)
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.005479,0.0,5.570471,2.0,2001.0,17.0,18.008219,9.00411,0.0,0.0,2012.0,2004.666667,6014.0,1.0,1.0,1.0,3.0,0.0,0.0,0.009724,0.029172,0.0,0.029172,0.0,0.0,0.0,0.0,0.131695,0.131695,0.131695,0.395086,4018715.0,4048144.0,4004001.0,12056146.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,1.0,0.333333,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,79.0,4586.0,3419.0,-1.0
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.583562,1.001573,2.0,2016.0,2.0,2.583562,1.291781,0.5,1.0,2018.0,2017.0,4034.0,6.0,6.0,6.0,12.0,0.5,1.0,0.205602,0.234584,0.17662,0.411204,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.281499,4068290.0,4072324.0,4064256.0,8136580.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7458.0,-1.0,-1.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,5.509589,1.912329,2.543647,2.0,2011.0,7.0,7.421918,3.710959,0.0,0.0,2017.0,2014.0,4028.0,5.0,8.0,2.0,10.0,0.0,0.0,0.163551,0.28125,0.045852,0.327102,0.0,0.0,,0.0,0.150436,0.159553,0.141319,0.300872,4056205.0,4068289.0,4044121.0,8112410.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4541.0,2209.0,-1.0,-1.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.583562,2.583562,,1.0,2016.0,2.0,2.583562,2.583562,0.0,0.0,2016.0,2016.0,2016.0,6.0,6.0,6.0,6.0,0.0,0.0,0.17662,0.17662,0.17662,0.17662,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.14075,4064256.0,4064256.0,4064256.0,4064256.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9284.0,-1.0,-1.0,-1.0


In [24]:
z = z.drop(columns=["user_id"])
z.head()

Unnamed: 0,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm,company(1st),company(2nd),company(3rd),company(4th)
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.005479,0.0,5.570471,2.0,2001.0,17.0,18.008219,9.00411,0.0,0.0,2012.0,2004.666667,6014.0,1.0,1.0,1.0,3.0,0.0,0.0,0.009724,0.029172,0.0,0.029172,0.0,0.0,0.0,0.0,0.131695,0.131695,0.131695,0.395086,4018715.0,4048144.0,4004001.0,12056146.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,1.0,0.333333,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,79.0,4586.0,3419.0,-1.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.583562,1.001573,2.0,2016.0,2.0,2.583562,1.291781,0.5,1.0,2018.0,2017.0,4034.0,6.0,6.0,6.0,12.0,0.5,1.0,0.205602,0.234584,0.17662,0.411204,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.281499,4068290.0,4072324.0,4064256.0,8136580.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7458.0,-1.0,-1.0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,5.509589,1.912329,2.543647,2.0,2011.0,7.0,7.421918,3.710959,0.0,0.0,2017.0,2014.0,4028.0,5.0,8.0,2.0,10.0,0.0,0.0,0.163551,0.28125,0.045852,0.327102,0.0,0.0,,0.0,0.150436,0.159553,0.141319,0.300872,4056205.0,4068289.0,4044121.0,8112410.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4541.0,2209.0,-1.0,-1.0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.583562,2.583562,,1.0,2016.0,2.0,2.583562,2.583562,0.0,0.0,2016.0,2016.0,2016.0,6.0,6.0,6.0,6.0,0.0,0.0,0.17662,0.17662,0.17662,0.17662,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.14075,4064256.0,4064256.0,4064256.0,4064256.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9284.0,-1.0,-1.0,-1.0


In [25]:
    
model_preds = [model.predict(z) for model in models]
model_preds

[array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64)]

In [26]:
df = pd.read_csv("../data/test_users.csv")

In [27]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0


In [29]:
sample_submission.moved_after_2019.value_counts()

0    12401
1      854
Name: moved_after_2019, dtype: int64

In [30]:
sample_submission.to_csv("submission_first_4_comp.csv", index=False)

In [2]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
train.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.416438,0.419178,1.053681,3.0,2015.0,3.0,3.671233,1.223744,0.333333,1.0,2018.0,2016.0,6048.0,6.0,10.0,3.0,18.0,0.333333,1.0,0.188044,0.295428,0.104247,0.564131,0.0,0.0,0.666667,2.0,0.140932,0.150295,0.130749,0.422795,4064258.0,4072324.0,4060225.0,12192774.0,0.784913,2.35474,0.0,0.0,1.0,0.333333,0.0,0.0,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,1.666667,3.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.912329,0.49863,0.569431,6.0,2013.0,5.0,5.753425,0.958904,0.0,0.0,2017.0,2015.166667,12091.0,5.0,8.0,2.0,30.0,0.0,0.0,0.157705,0.28125,0.059347,0.946232,0.0,0.0,0.5,2.0,0.144526,0.159553,0.124661,0.867154,4060898.0,4068289.0,4052169.0,24365391.0,0.365522,2.193131,3.0,0.5,0.0,0.0,0.0,0.0,3.0,0.5,0.0,0.0,2.0,0.333333,2.0,0.333333,2.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.166667,3.0,0.5,2.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.668493,0.747945,0.393513,4.0,2014.0,4.0,5.0,1.25,0.0,0.0,2017.0,2015.5,8062.0,4.25,8.0,1.0,17.0,0.0,0.0,0.181361,0.297016,0.077922,0.725446,0.0,0.0,2.0,8.0,0.138515,0.150295,0.130749,0.554058,4062242.0,4068289.0,4056196.0,16248966.0,0.09375,0.375,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.252055,1.082192,1.35696,4.0,2009.0,9.0,9.336986,2.334247,0.0,0.0,2014.0,2011.25,8045.0,10.25,12.0,9.0,41.0,0.0,0.0,0.041784,0.08548,0.014599,0.167136,0.0,0.0,0.0,0.0,0.133489,0.141751,0.110077,0.533957,4045130.0,4056196.0,4036081.0,16180521.0,0.35474,1.41896,2.0,0.5,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,3.0,0.75,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,6.671233,0.915068,3.110291,3.0,2009.0,9.0,9.336986,3.112329,0.0,0.0,2017.0,2012.0,6036.0,7.0,9.0,4.0,21.0,0.0,0.0,0.102134,0.265915,0.017964,0.306401,0.0,0.0,0.666667,2.0,0.135452,0.141319,0.124661,0.406357,4048157.0,4068289.0,4036081.0,12144470.0,0.299816,0.899447,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.005479,0.0,5.570471,2.0,2001.0,17.0,18.008219,9.00411,0.0,0.0,2012.0,2004.666667,6014.0,1.0,1.0,1.0,3.0,0.0,0.0,0.009724,0.029172,0.0,0.029172,0.0,0.0,0.0,0.0,0.131695,0.131695,0.131695,0.395086,4018715.0,4048144.0,4004001.0,12056146.0,0.133333,0.4,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,1.0,0.333333,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.583562,1.001573,2.0,2016.0,2.0,2.583562,1.291781,0.5,1.0,2018.0,2017.0,4034.0,6.0,6.0,6.0,12.0,0.5,1.0,0.205602,0.234584,0.17662,0.411204,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.281499,4068290.0,4072324.0,4064256.0,8136580.0,0.35474,0.35474,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,5.509589,1.912329,2.543647,2.0,2011.0,7.0,7.421918,3.710959,0.0,0.0,2017.0,2014.0,4028.0,5.0,8.0,2.0,10.0,0.0,0.0,0.163551,0.28125,0.045852,0.327102,0.0,0.0,,0.0,0.150436,0.159553,0.141319,0.300872,4056205.0,4068289.0,4044121.0,8112410.0,0.423077,0.846154,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.583562,2.583562,,1.0,2016.0,2.0,2.583562,2.583562,0.0,0.0,2016.0,2016.0,2016.0,6.0,6.0,6.0,6.0,0.0,0.0,0.17662,0.17662,0.17662,0.17662,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.14075,4064256.0,4064256.0,4064256.0,4064256.0,0.5,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
train.shape

(53019, 379)

In [5]:
test.shape

(13255, 378)

In [6]:
train.drop(columns=["work_company_id2_mean", "work_company_id2_sum"], inplace=True)
test.drop(columns=["work_company_id2_mean", "work_company_id2_sum"], inplace=True)

In [7]:
train.shape

(53019, 377)

In [8]:
test.shape

(13255, 376)

In [9]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re






X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()



Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2070




Positive Count in Predictions: 1076
Accuracy:  0.6793662768766503
Fold F1:  0.459631277813096
Fold Precision:  0.6719330855018587
Fold Recall:  0.3492753623188406
Fold AUC:  0.7134309083082221
Fold AP:  0.6182453018405127

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2063




Positive Count in Predictions: 1057
Accuracy:  0.6838928706148624
Fold F1:  0.4628205128205128
Fold Precision:  0.6830652790917692
Fold Recall:  0.34997576345128456
Fold AUC:  0.7179906576672423
Fold AP:  0.630913076238624

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 1989




Positive Count in Predictions: 1043
Accuracy:  0.6948321388155413
Fold F1:  0.4663588390501319
Fold Precision:  0.6778523489932886
Fold Recall:  0.35545500251382606
Fold AUC:  0.724913298420516
Fold AP:  0.6286816419493539

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 1990




Positive Count in Predictions: 998
Accuracy:  0.6823840060354583
Fold F1:  0.4364123159303882
Fold Precision:  0.6533066132264529
Fold Recall:  0.32763819095477387
Fold AUC:  0.714606547228898
Fold AP:  0.6031395076362658

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2061




Positive Count in Predictions: 1091
Accuracy:  0.6816295737457563
Fold F1:  0.46446700507614214
Fold Precision:  0.6709440879926672
Fold Recall:  0.3551673944687045
Fold AUC:  0.7104429524614948
Fold AP:  0.6167179243362286

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2025




Positive Count in Predictions: 1055
Accuracy:  0.6801207091663524
Fold F1:  0.4493506493506494
Fold Precision:  0.6559241706161137
Fold Recall:  0.3417283950617284
Fold AUC:  0.7035576803535302
Fold AP:  0.5998306390914883

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2095




Positive Count in Predictions: 1081
Accuracy:  0.6778574122972463
Fold F1:  0.4622166246851386
Fold Precision:  0.6790009250693803
Fold Recall:  0.35035799522673033
Fold AUC:  0.7137422389715814
Fold AP:  0.6311078426239743

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2069




Positive Count in Predictions: 1072
Accuracy:  0.6803093172387779
Fold F1:  0.4603629417382999
Fold Precision:  0.6744402985074627
Fold Recall:  0.34944417593040117
Fold AUC:  0.7306428076698774
Fold AP:  0.6333502795523138

 ********************************************************************** 

Train shape: (47717, 375) | Val Shape: (5302, 375)
Positive Count in Val Split: 2069




Positive Count in Predictions: 1026
Accuracy:  0.6705016974726519
Fold F1:  0.4355411954765751
Fold Precision:  0.6569200779727096
Fold Recall:  0.32576123731271145
Fold AUC:  0.7168927043297603
Fold AP:  0.6228935459722793

 ********************************************************************** 

Train shape: (47718, 375) | Val Shape: (5301, 375)
Positive Count in Val Split: 2092




Positive Count in Predictions: 1040
Accuracy:  0.6642142991888323
Fold F1:  0.43167305236270753
Fold Precision:  0.65
Fold Recall:  0.3231357552581262
Fold AUC:  0.7046910964442142
Fold AP:  0.6180363597345875

 ********************************************************************** 



In [10]:
test = test.drop(columns="user_id")

In [11]:
    
model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64)]

In [14]:
df = pd.read_csv("../data/test_users.csv")

In [15]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [16]:
sample_submission.moved_after_2019.value_counts()

0    10676
1     2579
Name: moved_after_2019, dtype: int64

In [17]:
sample_submission.to_csv("submission_skill_company_id_dropped.csv", index=False)

In [168]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
train.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,moved_after_2019,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.416438,0.419178,1.053681,3.0,2015.0,3.0,3.671233,1.223744,0.333333,1.0,2018.0,2016.0,6048.0,6.0,10.0,3.0,18.0,0.333333,1.0,0.188044,0.295428,0.104247,0.564131,0.0,0.0,0.666667,2.0,0.140932,0.150295,0.130749,0.422795,4064258.0,4072324.0,4060225.0,12192774.0,0.784913,2.35474,0.0,0.0,1.0,0.333333,0.0,0.0,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6950,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0,1.666667,3.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.912329,0.49863,0.569431,6.0,2013.0,5.0,5.753425,0.958904,0.0,0.0,2017.0,2015.166667,12091.0,5.0,8.0,2.0,30.0,0.0,0.0,0.157705,0.28125,0.059347,0.946232,0.0,0.0,0.5,2.0,0.144526,0.159553,0.124661,0.867154,4060898.0,4068289.0,4052169.0,24365391.0,0.365522,2.193131,3.0,0.5,0.0,0.0,0.0,0.0,3.0,0.5,0.0,0.0,2.0,0.333333,2.0,0.333333,2.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.166667,3.0,0.5,2.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.668493,0.747945,0.393513,4.0,2014.0,4.0,5.0,1.25,0.0,0.0,2017.0,2015.5,8062.0,4.25,8.0,1.0,17.0,0.0,0.0,0.181361,0.297016,0.077922,0.725446,0.0,0.0,2.0,8.0,0.138515,0.150295,0.130749,0.554058,4062242.0,4068289.0,4056196.0,16248966.0,0.09375,0.375,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.5,1.0,0.25,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.252055,1.082192,1.35696,4.0,2009.0,9.0,9.336986,2.334247,0.0,0.0,2014.0,2011.25,8045.0,10.25,12.0,9.0,41.0,0.0,0.0,0.041784,0.08548,0.014599,0.167136,0.0,0.0,0.0,0.0,0.133489,0.141751,0.110077,0.533957,4045130.0,4056196.0,4036081.0,16180521.0,0.35474,1.41896,2.0,0.5,2.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,3.0,0.75,3.0,0.75,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,4.0,5.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,6.671233,0.915068,3.110291,3.0,2009.0,9.0,9.336986,3.112329,0.0,0.0,2017.0,2012.0,6036.0,7.0,9.0,4.0,21.0,0.0,0.0,0.102134,0.265915,0.017964,0.306401,0.0,0.0,0.666667,2.0,0.135452,0.141319,0.124661,0.406357,4048157.0,4068289.0,4036081.0,12144470.0,0.299816,0.899447,1.0,0.333333,0.0,0.0,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,0.333333,2.0,0.666667,0.0,0.0,1.0,0.333333,1.0,0.333333,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [169]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test.head()

Unnamed: 0,user_id,is_yurtdisi,industry_-1,industry_Accounting,industry_Airlines/Aviation,industry_Apparel & Fashion,industry_Architecture & Planning,industry_Automotive,industry_Aviation & Aerospace,industry_Banking,industry_Biotechnology,industry_Broadcast Media,industry_Building Materials,industry_Capital Markets,industry_Chemicals,industry_Civic & Social Organization,industry_Civil Engineering,industry_Commercial Real Estate,industry_Computer & Network Security,industry_Computer Games,industry_Computer Hardware,industry_Computer Networking,industry_Computer Software,industry_Construction,industry_Consumer Electronics,industry_Consumer Goods,industry_Consumer Services,industry_Cosmetics,industry_Defense & Space,industry_Design,industry_E-Learning,industry_Education Management,industry_Electrical/Electronic Manufacturing,industry_Entertainment,industry_Environmental Services,industry_Events Services,industry_Executive Office,industry_Facilities Services,industry_Farming,industry_Financial Services,industry_Fine Art,industry_Fishery,industry_Food & Beverages,industry_Food Production,industry_Furniture,industry_Gambling & Casinos,"industry_Glass, Ceramics & Concrete",industry_Government Administration,industry_Government Relations,industry_Graphic Design,"industry_Health, Wellness and Fitness",industry_Higher Education,industry_Hospital & Health Care,industry_Hospitality,industry_Human Resources,industry_Import and Export,industry_Industrial Automation,industry_Information Services,industry_Information Technology and Services,industry_Insurance,industry_International Affairs,industry_International Trade and Development,industry_Internet,industry_Investment Banking,industry_Investment Management,industry_Law Practice,industry_Legal Services,industry_Legislative Office,"industry_Leisure, Travel & Tourism",industry_Logistics and Supply Chain,industry_Luxury Goods & Jewelry,industry_Machinery,industry_Management Consulting,industry_Maritime,industry_Market Research,industry_Marketing and Advertising,industry_Mechanical or Industrial Engineering,industry_Media Production,industry_Medical Devices,industry_Medical Practice,industry_Military,industry_Mining & Metals,industry_Mobile Games,industry_Motion Pictures and Film,industry_Music,industry_Nanotechnology,industry_Nonprofit Organization Management,industry_Oil & Energy,industry_Online Media,industry_Outsourcing/Offshoring,industry_Package/Freight Delivery,industry_Packaging and Containers,industry_Paper & Forest Products,industry_Performing Arts,industry_Pharmaceuticals,industry_Photography,industry_Plastics,industry_Political Organization,industry_Primary/Secondary Education,industry_Printing,industry_Professional Training & Coaching,industry_Program Development,industry_Public Policy,industry_Public Relations and Communications,industry_Public Safety,industry_Publishing,industry_Railroad Manufacture,industry_Real Estate,industry_Recreational Facilities and Services,industry_Renewables & Environment,industry_Research,industry_Restaurants,industry_Retail,industry_Security and Investigations,industry_Semiconductors,industry_Shipbuilding,industry_Sporting Goods,industry_Sports,industry_Staffing and Recruiting,industry_Telecommunications,industry_Textiles,industry_Think Tanks,industry_Tobacco,industry_Translation and Localization,industry_Transportation/Trucking/Railroad,industry_Utilities,industry_Veterinary,industry_Wholesale,industry_Wireless,industry_Writing and Editing,user_location_adana,user_location_adiyaman,user_location_afyonkarahisar,user_location_aksaray,user_location_ankara,user_location_antalya,user_location_ardahan,user_location_aydin,user_location_balikesir,user_location_bilecik,user_location_bitlis,user_location_bolu,user_location_bursa,user_location_canakkale,user_location_corum,user_location_denizli,user_location_diyarbakir,user_location_duzce,user_location_edirne,user_location_elazig,user_location_erzincan,user_location_erzurum,user_location_eskisehir,user_location_gaziantep,user_location_germany yurtdisi,user_location_giresun,user_location_hatay,user_location_isparta,user_location_istanbul,user_location_izmir,user_location_kahramanmaras,user_location_kayseri,user_location_kirikkale,user_location_kirklareli,user_location_kocaeli,user_location_konya,user_location_kutahya,user_location_malatya,user_location_manisa,user_location_mersin,user_location_mugla,user_location_netherlands yurtdisi,user_location_nevsehir,user_location_ordu,user_location_philippines yurtdisi,user_location_poland yurtdisi,user_location_rize,user_location_sakarya,user_location_samsun,user_location_sanliurfa,user_location_sinop,user_location_sirnak,user_location_sivas,user_location_somalia yurtdisi,user_location_sweden yurtdisi,user_location_tekirdag,user_location_tokat,user_location_trabzon,user_location_turkey,user_location_united kingdom yurtdisi,user_location_united states yurtdisi,user_location_van,user_location_yalova,user_location_zonguldak,language_proficiency_count,language_proficiency_mean,language_proficiency_max,language_proficiency_min,language_proficiency_sum,language_language_albanian_count,language_language_albanian_count_norm,language_language_almanca_count,language_language_almanca_count_norm,language_language_arapca_count,language_language_arapca_count_norm,language_language_armenian_count,language_language_armenian_count_norm,language_language_arnavutca_count,language_language_arnavutca_count_norm,language_language_azerice_count,language_language_azerice_count_norm,language_language_bosnakca_count,language_language_bosnakca_count_norm,language_language_bulgarca_count,language_language_bulgarca_count_norm,language_language_cince_count,language_language_cince_count_norm,language_language_czech_count,language_language_czech_count_norm,language_language_dutch_count,language_language_dutch_count_norm,language_language_farsca_count,language_language_farsca_count_norm,language_language_fince_count,language_language_fince_count_norm,language_language_fransizca_count,language_language_fransizca_count_norm,language_language_greek_count,language_language_greek_count_norm,language_language_hebrew_count,language_language_hebrew_count_norm,language_language_ingilizce_count,language_language_ingilizce_count_norm,language_language_isaret dilleri_count,language_language_isaret dilleri_count_norm,language_language_ispanyolca_count,language_language_ispanyolca_count_norm,language_language_italyanca_count,language_language_italyanca_count_norm,language_language_japonca_count,language_language_japonca_count_norm,language_language_kazakca_count,language_language_kazakca_count_norm,language_language_korece_count,language_language_korece_count_norm,language_language_kurtce_count,language_language_kurtce_count_norm,language_language_latince_count,language_language_latince_count_norm,language_language_lehce_count,language_language_lehce_count_norm,language_language_other_count,language_language_other_count_norm,language_language_persian_count,language_language_persian_count_norm,language_language_polish_count,language_language_polish_count_norm,language_language_portekizce_count,language_language_portekizce_count_norm,language_language_rusca_count,language_language_rusca_count_norm,language_language_sirpca_count,language_language_sirpca_count_norm,language_language_swedish_count,language_language_swedish_count_norm,language_language_turkce_count,language_language_turkce_count_norm,language_language_urdu_count,language_language_urdu_count_norm,language_language_uzbek_count,language_language_uzbek_count_norm,work_job_duration_max,work_job_duration_min,work_job_duration_std,work_job_count_mean,work_min_start_year_min,work_working_years_mean,work_total_job_duration_min,work_avg_work_year_min,work_quit_job_2018_mean,work_quit_job_2018_sum,work_max_start_year_min,work_year_of_job_start_mean,work_year_of_job_start_sum,work_month_of_job_start_mean,work_month_of_job_start_max,work_month_of_job_start_min,work_month_of_job_start_sum,work_quit_job_in_2018_mean,work_quit_job_in_2018_sum,work_moving_average_mean,work_moving_average_max,work_moving_average_min,work_moving_average_sum,work_quit_job_in_2018_change_mean,work_quit_job_in_2018_change_sum,work_time_since_last_change_mean,work_time_since_last_change_sum,work_seasonal_trend_mean,work_seasonal_trend_max,work_seasonal_trend_min,work_seasonal_trend_sum,work_start_date_squared_mean,work_start_date_squared_max,work_start_date_squared_min,work_start_date_squared_sum,work_company_id2_mean,work_company_id2_sum,job_tenure_groups_1-2 years_count,job_tenure_groups_1-2 years_count_norm,job_tenure_groups_2-5 years_count,job_tenure_groups_2-5 years_count_norm,job_tenure_groups_5-10 years_count,job_tenure_groups_5-10 years_count_norm,job_tenure_groups_Less than 1 year_count,job_tenure_groups_Less than 1 year_count_norm,job_tenure_groups_More than 10 years_count,job_tenure_groups_More than 10 years_count_norm,start_quarter_Q1_count,start_quarter_Q1_count_norm,start_quarter_Q2_count,start_quarter_Q2_count_norm,start_quarter_Q3_count,start_quarter_Q3_count_norm,start_quarter_Q4_count,start_quarter_Q4_count_norm,start_season_fall_count,start_season_fall_count_norm,start_season_spring_count,start_season_spring_count_norm,start_season_summer_count,start_season_summer_count_norm,start_season_winter_count,start_season_winter_count_norm,job_location_adana_count,job_location_adana_count_norm,job_location_ankara_count,job_location_ankara_count_norm,job_location_antalya_count,job_location_antalya_count_norm,job_location_area_count,job_location_area_count_norm,job_location_bursa_count,job_location_bursa_count_norm,job_location_elazig_count,job_location_elazig_count_norm,job_location_eskisehir_count,job_location_eskisehir_count_norm,job_location_gebze_count,job_location_gebze_count_norm,job_location_germany_count,job_location_germany_count_norm,job_location_istanbul_count,job_location_istanbul_count_norm,job_location_izmir_count,job_location_izmir_count_norm,job_location_kayseri_count,job_location_kayseri_count_norm,job_location_kocaeli_count,job_location_kocaeli_count_norm,job_location_konya_count,job_location_konya_count_norm,job_location_manisa_count,job_location_manisa_count_norm,job_location_mersin_count,job_location_mersin_count_norm,job_location_mugla_count,job_location_mugla_count_norm,job_location_sakarya_count,job_location_sakarya_count_norm,job_location_tekirdag_count,job_location_tekirdag_count_norm,job_location_trabzon_count,job_location_trabzon_count_norm,job_location_turkiye_count,job_location_turkiye_count_norm,job_location_united states_count,job_location_united states_count_norm
0,17449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.005479,0.0,5.570471,2.0,2001.0,17.0,18.008219,9.00411,0.0,0.0,2012.0,2004.666667,6014.0,1.0,1.0,1.0,3.0,0.0,0.0,0.009724,0.029172,0.0,0.029172,0.0,0.0,0.0,0.0,0.131695,0.131695,0.131695,0.395086,4018715.0,4048144.0,4004001.0,12056146.0,0.133333,0.4,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,1.0,0.333333,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0
1,33967,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,3.5,5.0,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,2.0,0.583562,1.001573,2.0,2016.0,2.0,2.583562,1.291781,0.5,1.0,2018.0,2017.0,4034.0,6.0,6.0,6.0,12.0,0.5,1.0,0.205602,0.234584,0.17662,0.411204,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.281499,4068290.0,4072324.0,4064256.0,8136580.0,0.35474,0.35474,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2110,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3.0,2.666667,5.0,1.0,8.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,55082,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,5.509589,1.912329,2.543647,2.0,2011.0,7.0,7.421918,3.710959,0.0,0.0,2017.0,2014.0,4028.0,5.0,8.0,2.0,10.0,0.0,0.0,0.163551,0.28125,0.045852,0.327102,0.0,0.0,,0.0,0.150436,0.159553,0.141319,0.300872,4056205.0,4068289.0,4044121.0,8112410.0,0.423077,0.846154,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,4.5,5.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,2.583562,2.583562,,1.0,2016.0,2.0,2.583562,2.583562,0.0,0.0,2016.0,2016.0,2016.0,6.0,6.0,6.0,6.0,0.0,0.0,0.17662,0.17662,0.17662,0.17662,0.0,0.0,,0.0,0.14075,0.14075,0.14075,0.14075,4064256.0,4064256.0,4064256.0,4064256.0,0.5,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [170]:
train.shape

(53019, 379)

In [171]:
test.shape

(13255, 378)

In [172]:
train.drop(columns=["work_company_id2_mean", "work_company_id2_sum"], inplace=True)
test.drop(columns=["work_company_id2_mean", "work_company_id2_sum"], inplace=True)

In [173]:
train.shape

(53019, 377)

In [174]:
test.shape

(13255, 376)

In [175]:
df_skill = pd.read_csv("../data/skills.csv")
df_skill

Unnamed: 0,user_id,skill
0,1,Mühendislik
1,1,Eğitim
2,2,Android
3,2,Java
4,2,3D Studio Max
...,...,...
1398438,66273,CI/CD
1398439,66273,Terraform
1398440,66273,MongoDB
1398441,66273,Go (Programming Language)


In [176]:
used_skills = df_skill['skill'].value_counts().iloc[:140].index # we only took most common 20 skills, you can increase it

df_skill = df_skill[df_skill['skill'].isin(used_skills)]
df_skill['experience'] = True
df_skill = df_skill.drop_duplicates(['user_id', 'skill'])
df_skill = pd.pivot(df_skill, index='user_id', columns='skill', values='experience')
df_skill = df_skill.fillna(0).astype(int)
df_skill.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



skill,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Metotları,Algorithms,Algoritmalar,Amazon Web Services (AWS),Analitik Beceriler,Android,Android Development,Android Geliştirme,Android Uygulama Geliştirme,Angular,AngularJS,Araştırma,Arduino,AutoCAD,Back-End Web Development,Bash,Bootstrap,Business Analysis,Business Intelligence,C,C (Programming Language),C#,C++,CRM,CSS,CSS3,Cascading Style Sheets (CSS),Cloud Computing,Communication,Computer Science,Data Analysis,Data Structures,Database Design,Databases,Deep Learning,Design Patterns,DevOps,Django,Docker,Eclipse,Ekip Liderliği,Ekip Çalışması,Embedded Systems,Engineering,English,Entity Framework,Firebase,Flutter,Front-end Development,Git,Github,HTML,HTML 5,HTML5,Hibernate,IIS,IP,Image Processing,Integration,JIRA,JSF,JSON,JSP,JUnit,Java,Java Enterprise Edition,JavaScript,Jenkins,Kotlin,Kubernetes,LINQ,Laravel,Leadership,Liderlik,Linux,MVC,Machine Learning,Makine Öğrenimi,Management,Matlab,Maven,Microservices,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobil Uygulama Geliştirme,Mobil Uygulamalar,Mobile Applications,MongoDB,MySQL,Mühendislik,Network Security,Networking,NoSQL,Node.js,OOP,OOP (Nesne Yönelimli Programlama),Object Oriented Design,Object-Oriented Programming (OOP),Objective-C,Oracle,Oracle SQL Developer,Oracle Veritabanı,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Product Development,Product Management,Programlama,Programming,Proje Planlama,Proje Yönetimi,Project Management,Project Planning,Python,Python (Programming Language),REST (Sunumsal Durum Aktarımı),RESTful WebServices,RabbitMQ,React Native,React.js,Redis,Requirements Analysis,Research,Routing,SAP,SOA,SOAP,SQL,Scrum,Selenium,Simulink,Software Design,Software Development,Software Engineering,Software Project Management,SolidWorks,Sorun Çözme,Sosyal Medya,Spring,Spring Boot,Spring Framework,Strategic Planning,Subversion,Swift,T-SQL,TCP/IP,TFS,Team Leadership,Team Management,Teamwork,Telecommunications,Testing,Tomcat,TypeScript,UML,Ubuntu,Unity,Unix,VMware,Veri Analizi,Veritabanları,Visual Basic,Visual Studio,Vue.js,WCF,WPF,Web Applications,Web Design,Web Development,Web Geliştirme,Web Services,Web Tasarımı,Web Uygulamaları,Windows,Windows Server,WordPress,XML,Yapay Zeka,Yazılım,Yazılım Geliştirme,Yönetim,iOS,jQuery,İletişim,İngilizce,İş Analizi
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [177]:
df_skill.rename(columns={"C#": "Csharp","C++": "Cplusplus"}, inplace=True)


In [178]:
train = train.merge(df_skill, on="user_id", how="left")
test = test.merge(df_skill, on="user_id", how="left")

train = train.merge(skills_count, on="user_id", how="left")
test = test.merge(skills_count, on="user_id", how="left")

In [179]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re






X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()



Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1203
Accuracy:  0.6935118823085628
Fold F1:  0.5035135960892148
Fold Precision:  0.684954280964256
Fold Recall:  0.39806763285024155
Fold AUC:  0.7346827617544364
Fold AP:  0.6469267856830337

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1215
Accuracy:  0.6970954356846473
Fold F1:  0.5100671140939598
Fold Precision:  0.6880658436213992
Fold Recall:  0.40523509452254
Fold AUC:  0.7382969046806994
Fold AP:  0.6575538116401256

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1241
Accuracy:  0.7069030554507733
Fold F1:  0.5188854489164086
Fold Precision:  0.6752618855761483
Fold Recall:  0.4213172448466566
Fold AUC:  0.7444558867917828
Fold AP:  0.6451412373881136

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1214
Accuracy:  0.6959637872500943
Fold F1:  0.49687890137328344
Fold Precision:  0.6556836902800659
Fold Recall:  0.4
Fold AUC:  0.7397734445172723
Fold AP:  0.6336160924437342

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1240
Accuracy:  0.6863447755563938
Fold F1:  0.4962132687064525
Fold Precision:  0.660483870967742
Fold Recall:  0.39737991266375544
Fold AUC:  0.7328509913841952
Fold AP:  0.6446338263171818

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1259
Accuracy:  0.6880422482082234
Fold F1:  0.4963459196102314
Fold Precision:  0.647339158061954
Fold Recall:  0.4024691358024691
Fold AUC:  0.7310680877194966
Fold AP:  0.6297416573341852

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1233
Accuracy:  0.6955865711052434
Fold F1:  0.5150240384615385
Fold Precision:  0.6950527169505272
Fold Recall:  0.4090692124105012
Fold AUC:  0.7415247671970547
Fold AP:  0.6608504516631674

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1256
Accuracy:  0.7010562052055828
Fold F1:  0.5233082706766917
Fold Precision:  0.6926751592356688
Fold Recall:  0.42049299178347027
Fold AUC:  0.7564797056454874
Fold AP:  0.6643872877282573

 ********************************************************************** 

Train shape: (47717, 576) | Val Shape: (5302, 576)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1211
Accuracy:  0.6952093549603923
Fold F1:  0.5073170731707317
Fold Precision:  0.6870355078447564
Fold Recall:  0.40212663122281295
Fold AUC:  0.743183476584288
Fold AP:  0.6592762244404207

 ********************************************************************** 

Train shape: (47718, 576) | Val Shape: (5301, 576)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1231
Accuracy:  0.6900584795321637
Fold F1:  0.5055672585013542
Fold Precision:  0.6823720552396426
Fold Recall:  0.40152963671128106
Fold AUC:  0.7332762271741702
Fold AP:  0.6515945410866276

 ********************************************************************** 



In [180]:
test = test.drop(columns="user_id")

In [181]:
    
model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64)]

In [182]:
df = pd.read_csv("../data/test_users.csv")

In [183]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [184]:
sample_submission.moved_after_2019.value_counts()

0    10161
1     3094
Name: moved_after_2019, dtype: int64

In [185]:
sample_submission.to_csv("submission_skill_first140.csv", index=False)

In [36]:
skills = pd.read_csv("../data/skills.csv")

In [37]:
def count_skills(skill_data):
    skill_data = skill_data.copy()
    return skill_data.groupby("user_id").agg({"skill": "nunique"}).reset_index().rename(columns={"skill": "skill_count_raw"})

In [38]:
skills_count = count_skills(skills)

In [39]:
skills_count

Unnamed: 0,user_id,skill_count_raw
0,1,2
1,2,6
2,3,3
3,5,11
4,6,6
...,...,...
62397,66269,31
62398,66270,8
62399,66271,47
62400,66272,36


In [32]:
train.shape

(53019, 427)

In [33]:
test.shape

(13255, 425)

In [34]:
set(train.columns) - set(test.columns)

{'moved_after_2019', 'user_id'}

In [40]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re


train = train.merge(skills_count, on="user_id", how="left")



X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()



Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1214
Accuracy:  0.6937004903809884
Fold F1:  0.5054811205846528
Fold Precision:  0.6836902800658978
Fold Recall:  0.40096618357487923
Fold AUC:  0.7305205044961017
Fold AP:  0.6397956593210228

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1188
Accuracy:  0.6923802338740098
Fold F1:  0.49830821285758237
Fold Precision:  0.6818181818181818
Fold Recall:  0.3926320891904993
Fold AUC:  0.7326729628316551
Fold AP:  0.6504211977028385

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1206
Accuracy:  0.7003017729158808
Fold F1:  0.502660406885759
Fold Precision:  0.6658374792703151
Fold Recall:  0.40372046254399196
Fold AUC:  0.7385281742004811
Fold AP:  0.6426639228347422

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1218
Accuracy:  0.6963410033949453
Fold F1:  0.4981296758104738
Fold Precision:  0.6559934318555009
Fold Recall:  0.4015075376884422
Fold AUC:  0.737104074114534
Fold AP:  0.6281989956822364

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1216
Accuracy:  0.6874764239909468
Fold F1:  0.4943545926151968
Fold Precision:  0.6661184210526315
Fold Recall:  0.3930131004366812
Fold AUC:  0.7297700151548698
Fold AP:  0.6342459908827605

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1206
Accuracy:  0.6882308562806488
Fold F1:  0.48839368616527384
Fold Precision:  0.654228855721393
Fold Recall:  0.3896296296296296
Fold AUC:  0.726693113620181
Fold AP:  0.6234650118522448

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1265
Accuracy:  0.6937004903809884
Fold F1:  0.5166666666666666
Fold Precision:  0.6861660079051384
Fold Recall:  0.4143198090692124
Fold AUC:  0.7349219673848897
Fold AP:  0.653803487319361

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1237
Accuracy:  0.6952093549603923
Fold F1:  0.5111917725347853
Fold Precision:  0.683104284559418
Fold Recall:  0.4084098598356694
Fold AUC:  0.7536870184032863
Fold AP:  0.6604495291152335

 ********************************************************************** 

Train shape: (47717, 426) | Val Shape: (5302, 426)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1216
Accuracy:  0.6882308562806488
Fold F1:  0.4968036529680366
Fold Precision:  0.6710526315789473
Fold Recall:  0.3943934267762204
Fold AUC:  0.7407601228091709
Fold AP:  0.6514199974566153

 ********************************************************************** 

Train shape: (47718, 426) | Val Shape: (5301, 426)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1198
Accuracy:  0.6845878136200717
Fold F1:  0.4917933130699088
Fold Precision:  0.6752921535893155
Fold Recall:  0.3867112810707457
Fold AUC:  0.7294145975676678
Fold AP:  0.6460882861254078

 ********************************************************************** 



In [43]:
test["user_id"] = df["user_id"]
test = test.merge(skills_count, on="user_id", how="left")
test.drop(columns="user_id", inplace=True)

In [44]:
    
model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64)]

In [None]:
df = pd.read_csv("../data/test_users.csv")

In [45]:
sample_submission = pd.DataFrame()
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [46]:
sample_submission.moved_after_2019.value_counts()

0    10194
1     3061
Name: moved_after_2019, dtype: int64

In [47]:
skills = pd.read_csv("../data/skills.csv")

In [48]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
test = pd.read_csv("./test_user_lang_work_revise.csv")

In [49]:
skills_train_data = pd.merge(skills,train[['user_id','moved_after_2019']],on='user_id', how='right')
skills_test_data = pd.merge(skills,test[['user_id']],on='user_id', how='right')


In [50]:
skills_test_data

Unnamed: 0,user_id,skill
0,17449,Bioinformatics
1,17449,Machine Learning
2,17449,Artificial Intelligence (AI)
3,17449,Android Development
4,17449,iOS development
...,...,...
282638,16036,Perl
282639,16036,Embedded Software
282640,16036,Simulink
282641,16036,Swift


In [51]:
target_train = skills_train_data[['moved_after_2019']]
train_skills = skills_train_data.drop('moved_after_2019', axis = 1)
test_skills = skills_test_data.copy()

In [52]:
import category_encoders as ce
cbe_encoder = ce.cat_boost.CatBoostEncoder(sigma=0.5)
cbe_encoder.fit(train_skills, target_train)
skills_catboost_train = cbe_encoder.transform(train_skills)
skills_catboost_test = cbe_encoder.transform(test_skills)


In [54]:
skills_catboost_train_agg = skills_catboost_train.groupby('user_id').agg(['sum', 'mean'])

In [55]:
skills_catboost_test_agg = skills_catboost_test.groupby('user_id').agg(['sum', 'mean'])

In [58]:
cat_agg(skills_catboost_test_agg)


         skill_count  skill_count_norm
user_id                               
1           0.736579          0.368290
2           2.576923          0.429487
4           0.250454          0.250454
11         16.082303          0.459494
14          4.656453          0.465645


In [59]:
cat_agg(skills_catboost_train_agg)


         skill_count  skill_count_norm
user_id                               
0           0.250454          0.250454
3           1.459549          0.486516
5           5.307578          0.482507
6           2.460933          0.410155
7           2.928670          0.418381


In [62]:

from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re


train = train.merge(skills_count, on="user_id", how="left")

train = train.merge(skills_catboost_train_agg, on="user_id", how="left")


X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["user_id", "moved_after_2019"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier(reg_lambda=0.5)
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()



Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1756
Accuracy:  0.8264805733685402
Fold F1:  0.7595399895452171
Fold Precision:  0.8274487471526196
Fold Recall:  0.7019323671497585
Fold AUC:  0.9074627367628066
Fold AP:  0.8779303536859856

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1746
Accuracy:  0.8225198038476047
Fold F1:  0.7529535311105277
Fold Precision:  0.8213058419243986
Fold Recall:  0.6951042171594765
Fold AUC:  0.9018528276547176
Fold AP:  0.871684151696441

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1736
Accuracy:  0.8240286684270087
Fold F1:  0.7495302013422819
Fold Precision:  0.804147465437788
Fold Recall:  0.7018602312719959
Fold AUC:  0.9057495822556811
Fold AP:  0.873199425989633

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1708
Accuracy:  0.8268577895133912
Fold F1:  0.7517577068685777
Fold Precision:  0.8138173302107728
Fold Recall:  0.6984924623115578
Fold AUC:  0.9061828617726314
Fold AP:  0.870885633457884

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1759
Accuracy:  0.8230856280648812
Fold F1:  0.7544502617801049
Fold Precision:  0.8192154633314384
Fold Recall:  0.6991751576904416
Fold AUC:  0.9080411683097791
Fold AP:  0.8779113467063462

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1775
Accuracy:  0.8230856280648812
Fold F1:  0.7531578947368421
Fold Precision:  0.8061971830985916
Fold Recall:  0.7066666666666667
Fold AUC:  0.9028626755124568
Fold AP:  0.87175873172165

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1760
Accuracy:  0.8281780460203697
Fold F1:  0.7636835278858625
Fold Precision:  0.8363636363636363
Fold Recall:  0.7026252983293556
Fold AUC:  0.9075707004293264
Fold AP:  0.8833051968768512

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1756
Accuracy:  0.8323274236137307
Fold F1:  0.7675816993464052
Fold Precision:  0.835990888382688
Fold Recall:  0.7095215079748671
Fold AUC:  0.912570972048909
Fold AP:  0.8861655022095353

 ********************************************************************** 

Train shape: (47717, 383) | Val Shape: (5302, 383)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1773
Accuracy:  0.8313843832516031
Fold F1:  0.7673086933888599
Fold Precision:  0.8313592780597857
Fold Recall:  0.7124214596423393
Fold AUC:  0.912194537452626
Fold AP:  0.8844727292789846

 ********************************************************************** 

Train shape: (47718, 383) | Val Shape: (5301, 383)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1748
Accuracy:  0.8249386908130542
Fold F1:  0.7583333333333333
Fold Precision:  0.8329519450800915
Fold Recall:  0.6959847036328872
Fold AUC:  0.903487487688486
Fold AP:  0.8749520575819016

 ********************************************************************** 



# skill and company_id check tomorrow encode

In [84]:
education = pd.read_csv("../data/education.csv")
education.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,Yüksek Lisans,Yazılım Mühendisliği,,
4,1,Fırat Üniversitesi,Lisans,Yazılım Mühendisliği,,


In [40]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
test = pd.read_csv("./test_user_lang_work_revise.csv")

In [41]:
education_train_data = pd.merge(education,train[['user_id']],on='user_id', how='right')
education_test_data = pd.merge(education,test[['user_id']],on='user_id', how='right')


In [42]:
from unidecode import unidecode

def column_clean(string):
    string = unidecode(str(string).lower())
    if ("bac" in string) | ("lisans derecesi" in string) | ("bs" in string) | ("muhendislik" in string) | ("fak" in string) | ("engin" in string) | ("b." in string) | ("lic" in string):
        return "lisans"
    elif ("ön" in string) | ("ass" in string) | ("on lisans" in string):
        return "onlisans"
    elif ("mast" in string) | ("ms" in string) | ("mba" in string) | ("bus" in string) | ("m." in string):
        return "yuksek lisans"
    elif ("doc" in string) | ("dok" in string) | ("ph" in string):
        return "doktora"
    elif ("high" in string) | ("lise" in string):
        return "lise"
    else:
        return string
    #df.loc[df[col].str.contains(contain), col] = replace
    #string.str.contains('Executive MBA (Mini)'), 'language'] = 'turkce'
    #education_train_data.loc[education_train_data['degree'].str.contains('Executive MBA (Mini)'), 'degree'] = 'test'




In [43]:
import swifter
education_train_data["degree"] = education_train_data["degree"].swifter.apply(column_clean)
education_test_data["degree"] = education_test_data["degree"].swifter.apply(column_clean)


Pandas Apply:   0%|          | 0/114092 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/28486 [00:00<?, ?it/s]

In [44]:
education_test_data.head()


Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,17449,Katedra Biocybernetics&Polytechnic,doktora,,,
1,17449,Anadolu Üniversitesi,7325,Business/Managerial Economics,,
2,17449,Atatürk Üniversitesi,,,,
3,17449,Ege University,,,,
4,17449,Gebze Technical University,,,,


In [45]:
education_train_data.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,1301,Bahçeşehir Üniversitesi,lisans,Software Engineering,,
1,1301,Firat University,onlisans,Computer Programming,,
2,6950,"Koç University, College of Engineering, Istanbul",department,Computer Engineering,,
3,4880,Marmara University,lisans,Bilgisayar Öğretmenliği,,
4,4880,ITO Vakfı Süleyman Taştekin Anadolu Meslek Lisesi,2,Bilgisayar Yazılımı Teknolojisi/Teknisyen,,


In [46]:
mask = education_train_data.degree.map(education_train_data.degree.value_counts()) < 2068
education_train_data.degree =  education_train_data.degree.mask(mask, 'other')
education_train_data

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,1301,Bahçeşehir Üniversitesi,lisans,Software Engineering,,
1,1301,Firat University,onlisans,Computer Programming,,
2,6950,"Koç University, College of Engineering, Istanbul",other,Computer Engineering,,
3,4880,Marmara University,lisans,Bilgisayar Öğretmenliği,,
4,4880,ITO Vakfı Süleyman Taştekin Anadolu Meslek Lisesi,other,Bilgisayar Yazılımı Teknolojisi/Teknisyen,,
...,...,...,...,...,...,...
114087,12165,Sakarya University,lisans,Computer Science and Engineering,,
114088,12165,Anadolu University,lisans,Management Information Systems and Services,201909.0,
114089,12165,Florya Tevfik Ercan Anadolu Lisesi,lise,[],,
114090,248,Fırat Üniversitesi,lisans,Yazılım Mühendisliği,,


In [47]:
mask = education_test_data.degree.map(education_test_data.degree.value_counts()) < 516
education_test_data.degree =  education_test_data.degree.mask(mask, 'other')
education_test_data

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,17449,Katedra Biocybernetics&Polytechnic,doktora,,,
1,17449,Anadolu Üniversitesi,other,Business/Managerial Economics,,
2,17449,Atatürk Üniversitesi,,,,
3,17449,Ege University,,,,
4,17449,Gebze Technical University,,,,
...,...,...,...,...,...,...
28481,7029,Koç Üniversitesi,lisans,Electrical & Electronics Engineering,,
28482,7029,Çapa Fen Lisesi,lise,,,
28483,56130,Selçuk Üniversitesi,lisans,Computer Engineering,,
28484,16036,Dokuz Eylül Üniversitesi,lisans,Bilgisayar Mühendisliği,,


In [48]:
education_test_data.degree.value_counts()

lisans           11735
nan               7023
yuksek lisans     4046
other             2980
lise              1147
onlisans          1039
doktora            516
Name: degree, dtype: int64

In [49]:
education_train_data.degree.value_counts()


lisans           46460
nan              28822
yuksek lisans    16088
other            11939
lise              4502
onlisans          4213
doktora           2068
Name: degree, dtype: int64

In [50]:
education.school_name.value_counts()[:500]

Anadolu Üniversitesi                                                      6562
İstanbul Üniversitesi                                                     4891
Sakarya Üniversitesi                                                      4310
Kocaeli Üniversitesi                                                      3836
İstanbul Teknik Üniversitesi                                              3835
Istanbul Technical University                                             3791
Yıldız Teknik Üniversitesi                                                3773
Marmara Üniversitesi                                                      3571
Hacettepe Üniversitesi                                                    2992
Ege Üniversitesi                                                          2846
Gazi Üniversitesi                                                         2803
Bahçeşehir Üniversitesi                                                   2499
Istanbul University                                 

In [61]:
def school_clean(string):
    string = unidecode(str(string).lower())
    if ("anadolu un" in string):
        return "anadolu universitesi"
    elif ("istanbul un" in string):
        return "istanbul universitesi"
    elif ("sakarya un" in string):
        return "sakarya universitesi"
    elif ("kocaeli un" in string):
        return "kocaeli universitesi"
    elif ("istanbul te" in string):
        return "istanbul teknik universitesi"
    elif ("marm" in string):
        return "marmara universitesi"
    elif ("hace" in string):
        return "hacettepe universitesi"
    elif ("ege un" in string):
        return "ege universitesi"
    elif ("osmang" in string):
        return "eskisehir osmangazi universitesi"
    elif ("gazi un" in string):
        return "gazi universitesi"
    elif ("bahcesehir un" in string):
        return "bahcesehir universitesi"
    elif ("eylul" in string):
        return "dokuz eylul universitesi"
    elif ("yildiz te" in string):
        return "yildiz teknik universitesi"
    elif ("east te" in string) | ("odt" in string) | ("orta d" in string):
        return "orta dogu teknik universitesi"
    elif ("selcuk un" in string):
        return "selcuk universitesi"
    elif ("bogazi" in string):
        return "bogazici universitesi"
    elif ("firat" in string):
        return "firat universitesi"
    elif ("erciy" in string):
        return "erciyes universitesi"
    elif ("cukurova un" in string):
        return "cukurova universitesi"
    elif ("irel un" in string):
        return "suleyman demirel universitesi"
    elif ("koc un" in string):
        return "koc universitesi"
    elif ("fen li" in string):
        return "fen lisesi"
    elif ("anadolu li" in string):
        return "fen lisesi"
    elif ("meslek li" in string):
        return "meslek lisesi"
    elif ("gebze" in string):
        return "gebze teknik universitesi"
    elif ("izmir institute" in string):
        return "izmir yuksek teknoloji enstitusu"
    elif ("high" in string) | ("lise" in string):
        return "lise"
    else:
        return string

In [62]:
def test_clean(string):
    string = unidecode(str(string).lower())
    string = string.replace("university", "universitesi")
    string = string.replace("technical", "teknik")
    return string

In [64]:
education["school_name"] = education["school_name"].swifter.apply(test_clean)

Pandas Apply:   0%|          | 0/142575 [00:00<?, ?it/s]

In [65]:
education.school_name.value_counts()[:100]

anadolu universitesi                                               7984
istanbul teknik universitesi                                       7730
istanbul universitesi                                              7304
yildiz teknik universitesi                                         6092
sakarya universitesi                                               5278
marmara universitesi                                               4985
kocaeli universitesi                                               4859
hacettepe universitesi                                             4113
ege universitesi                                                   3832
gazi universitesi                                                  3532
bahcesehir universitesi                                            3494
bogazici universitesi                                              3198
dokuz eylul universitesi                                           3058
bilkent universitesi                                            

In [66]:
education["school_name"] = education["school_name"].swifter.apply(school_clean)


Pandas Apply:   0%|          | 0/142575 [00:00<?, ?it/s]

In [67]:
education.school_name.value_counts()[:100]

fen lisesi                                       8846
anadolu universitesi                             8073
istanbul teknik universitesi                     7950
lise                                             7936
istanbul universitesi                            7625
yildiz teknik universitesi                       6845
sakarya universitesi                             5326
marmara universitesi                             5081
kocaeli universitesi                             4878
hacettepe universitesi                           4145
ege universitesi                                 3884
bahcesehir universitesi                          3578
gazi universitesi                                3566
bogazici universitesi                            3494
dokuz eylul universitesi                         3181
bilkent universitesi                             2702
ankara universitesi                              2320
beykent universitesi                             2186
selcuk universitesi         

In [69]:
used_schools = education['school_name'].value_counts().iloc[:70].index # we only took most common 20 skills, you can increase it

df_school = education[education['school_name'].isin(used_schools)]
df_school['experience'] = True
df_school = df_school.drop_duplicates(['user_id', 'school_name'])
df_school = pd.pivot(df_school, index='user_id', columns='school_name', values='experience')
df_school = df_school.fillna(0).astype(int)
df_school.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


school_name,ahmet yesevi universitesi,akdeniz universitesi,anadolu universitesi,ankara universitesi,ataturk universitesi,atilim universitesi,bahcesehir universitesi,balikesir universitesi,baskent universitesi,beykent universitesi,bilge adam,bilkent universitesi,bilkent universitesi / bilkent universitesi,bogazici universitesi,cankaya universitesi,celal bayar universitesi,cukurova universitesi,dogus universitesi,dokuz eylul universitesi,dumlupinar universitesi,duzce universitesi,ege universitesi,erciyes universitesi,eskisehir osmangazi universitesi,eskisehir teknik universitesi,fen lisesi,firat universitesi,galatasaray universitesi,gazi universitesi,gaziantep universitesi,gebze teknik universitesi,hacettepe universitesi,isik universitesi,istanbul aydin universitesi,istanbul bilgi universitesi,istanbul kultur universitesi,istanbul teknik universitesi,istanbul ticaret universitesi,istanbul universitesi,izmir ekonomi universitesi,izmir yuksek teknoloji enstitusu,kadir has universitesi,karabuk universitesi,karadeniz teknik universitesi,kirikkale universitesi,koc universitesi,kocaeli universitesi,lise,maltepe universitesi,marmara universitesi,mersin universitesi,meslek lisesi,mugla sitki kocman universitesi,namik kemal universitesi,okan universitesi,ondokuz mayis universitesi,orta dogu teknik universitesi,ozyegin universitesi,pamukkale universitesi,sabanci universitesi,sakarya universitesi,selcuk universitesi,suleyman demirel universitesi,ted ankara koleji,tobb ekonomi ve teknoloji universitesi,trakya universitesi,udacity,uludag universitesi,yeditepe universitesi,yildiz teknik universitesi
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
education_train_data["school_name"] = education_train_data["school_name"].swifter.apply(test_clean)
education_test_data["school_name"] = education_test_data["school_name"].swifter.apply(test_clean)


Pandas Apply:   0%|          | 0/114092 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/28486 [00:00<?, ?it/s]

In [17]:
education_train_data["school_name"] = education_train_data["school_name"].swifter.apply(school_clean)
education_test_data["school_name"] = education_test_data["school_name"].swifter.apply(school_clean)


Pandas Apply:   0%|          | 0/114092 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/28486 [00:00<?, ?it/s]

In [20]:
education_train_data.school_name.value_counts().shape

(3869,)

In [24]:
education_train_data.school_name.value_counts()[:200]

fen lisesi                                                     7054
anadolu universitesi                                           6440
lise                                                           6324
istanbul teknik universitesi                                   6307
istanbul universitesi                                          6051
yildiz teknik universitesi                                     5495
sakarya universitesi                                           4284
gazi universitesi                                              4223
marmara universitesi                                           4055
kocaeli universitesi                                           3961
hacettepe universitesi                                         3344
ege universitesi                                               3102
bahcesehir universitesi                                        2840
bogazici universitesi                                          2785
dokuz eylul universitesi                        

In [87]:
def field_clean(string):
    string = unidecode(str(string).lower())
    if ("ogret" in string) | ("teach" in string):
        return "ogretmenlik"
    elif ("electric" in string) | ("electron" in string) | ("elektronik m" in string):
        return "elektrik ve elektronik muhendisligi"
    elif ("soft" in string) | ("yaz" in string):
        return "yazilim muhendisligi"
    elif ("compu" in string) | ("bilgisayar m" in string) | ("bilgisayar" in string):
        return "bilgisayar muhendisligi"
    elif ("isle" in string) | ("busines" in string) | ("mba" in string):
        return "isletme"
    elif ("chem" in string):
        return "kimya muhendisligi"
    elif ("mechan" in string) | ("makine" in string):
        return "makine muhendisligi"
    elif ("mechat" in string) | ("mekat" in string):
        return "mekatronik muhendisligi"
    elif ("indust" in string) | ("endustri m" in string):
        return "endustri muhendisligi"
    elif ("management i" in string) | ("yonetim bil" in string):
        return "yönetim bilisim sistemleri"
    elif ("haber" in string) | ("commu" in string):
        return "elektronik ve haberlesme muhendisligi"
    elif ("data" in string) | ("veri" in string):
        return "veri"
    elif ("math" in string) | ("matem" in string):
        return "matematik"
    elif ("program" in string):
        return "programcilik"
    elif ("web" in string):
        return "web"
    elif ("stati" in string):
        return "istatistik"
    elif ("phys" in string) | ("fizi" in string):
        return "fizik"
    elif ("insaa" in string) | ("civi" in string):
        return "insaat muhendisligi"
    elif ("econo" in string) | ("ekon" in string):
        return "ekonomi"
    elif ("bilgi t" in string) | ("inform" in string):
        return "bilgi teknolojisi"
    elif ("fen" in string) | ("bilim" in string) | ("science" in string):
        return "fen bilimleri"
    else:
        return string


In [85]:
education.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,Yüksek Lisans,Yazılım Mühendisliği,,
4,1,Fırat Üniversitesi,Lisans,Yazılım Mühendisliği,,


In [86]:
education.fields_of_study.value_counts()[:100]

Bilgisayar Mühendisliği                                     14698
Computer Engineering                                        12662
Elektrik ve Elektronik Mühendisliği                          3562
Electrical and Electronics Engineering                       3201
Computer Science                                             3114
Bilgisayar Programlama                                       1844
Computer Programming                                         1539
Yönetim Bilişim Sistemleri                                   1315
Mechanical Engineering                                       1280
Industrial Engineering                                       1225
İşletme                                                      1179
İşletme ve Yönetim, Genel                                    1174
Business Administration and Management, General              1012
Software Engineering                                          859
Makine Mühendisliği                                           853
Elektronik

In [88]:
education["fields_of_study"] = education["fields_of_study"].swifter.apply(field_clean)


Pandas Apply:   0%|          | 0/142575 [00:00<?, ?it/s]

In [90]:
education.fields_of_study.replace("nan", np.nan, inplace=True)


In [92]:
education.fields_of_study.value_counts()[:100]

bilgisayar muhendisligi                                 43432
elektrik ve elektronik muhendisligi                     12459
isletme                                                  7467
matematik                                                4194
yazilim muhendisligi                                     4040
yönetim bilisim sistemleri                               3789
fen bilimleri                                            3652
endustri muhendisligi                                    2688
makine muhendisligi                                      2623
bilgi teknolojisi                                        2448
elektronik ve haberlesme muhendisligi                    2024
ogretmenlik                                              1833
fizik                                                    1684
kimya muhendisligi                                       1428
mekatronik muhendisligi                                  1288
ekonomi                                                  1177
programc

In [93]:
used_fields = education['fields_of_study'].value_counts().iloc[:29].index # we only took most common 20 skills, you can increase it

df_fields = education[education['fields_of_study'].isin(used_fields)]
df_fields['experience'] = True
df_fields = df_fields.drop_duplicates(['user_id', 'fields_of_study'])
df_fields = pd.pivot(df_fields, index='user_id', columns='fields_of_study', values='experience')
df_fields = df_fields.fillna(0).astype(int)
df_fields.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


fields_of_study,bilgi teknolojisi,bilgisayar muhendisligi,bilisim sistemleri muhendisligi,bilisim teknolojileri,control and automation engineering,ekonomi,elektrik ve elektronik muhendisligi,elektronik ve haberlesme muhendisligi,endustri muhendisligi,engineering management,fen bilimleri,fizik,iktisat,insaat muhendisligi,isletme,istatistik,kimya,kimya muhendisligi,makine muhendisligi,matematik,mekatronik muhendisligi,ogretmenlik,programcilik,sayisal,sosyoloji,veri,web,yazilim muhendisligi,yönetim bilisim sistemleri
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [94]:
df_fields

fields_of_study,bilgi teknolojisi,bilgisayar muhendisligi,bilisim sistemleri muhendisligi,bilisim teknolojileri,control and automation engineering,ekonomi,elektrik ve elektronik muhendisligi,elektronik ve haberlesme muhendisligi,endustri muhendisligi,engineering management,fen bilimleri,fizik,iktisat,insaat muhendisligi,isletme,istatistik,kimya,kimya muhendisligi,makine muhendisligi,matematik,mekatronik muhendisligi,ogretmenlik,programcilik,sayisal,sosyoloji,veri,web,yazilim muhendisligi,yönetim bilisim sistemleri
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66269,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66270,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
66271,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66272,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [95]:
df_school

school_name,ahmet yesevi universitesi,akdeniz universitesi,anadolu universitesi,ankara universitesi,ataturk universitesi,atilim universitesi,bahcesehir universitesi,balikesir universitesi,baskent universitesi,beykent universitesi,bilge adam,bilkent universitesi,bilkent universitesi / bilkent universitesi,bogazici universitesi,cankaya universitesi,celal bayar universitesi,cukurova universitesi,dogus universitesi,dokuz eylul universitesi,dumlupinar universitesi,duzce universitesi,ege universitesi,erciyes universitesi,eskisehir osmangazi universitesi,eskisehir teknik universitesi,fen lisesi,firat universitesi,galatasaray universitesi,gazi universitesi,gaziantep universitesi,gebze teknik universitesi,hacettepe universitesi,isik universitesi,istanbul aydin universitesi,istanbul bilgi universitesi,istanbul kultur universitesi,istanbul teknik universitesi,istanbul ticaret universitesi,istanbul universitesi,izmir ekonomi universitesi,izmir yuksek teknoloji enstitusu,kadir has universitesi,karabuk universitesi,karadeniz teknik universitesi,kirikkale universitesi,koc universitesi,kocaeli universitesi,lise,maltepe universitesi,marmara universitesi,mersin universitesi,meslek lisesi,mugla sitki kocman universitesi,namik kemal universitesi,okan universitesi,ondokuz mayis universitesi,orta dogu teknik universitesi,ozyegin universitesi,pamukkale universitesi,sabanci universitesi,sakarya universitesi,selcuk universitesi,suleyman demirel universitesi,ted ankara koleji,tobb ekonomi ve teknoloji universitesi,trakya universitesi,udacity,uludag universitesi,yeditepe universitesi,yildiz teknik universitesi
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66269,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66270,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66271,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66272,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [99]:
from unidecode import unidecode

def degree_clean(string):
    string = unidecode(str(string).lower())
    if ("bac" in string) | ("lisans derecesi" in string) | ("bs" in string) | ("muhendislik" in string) | ("fak" in string) | ("engin" in string) | ("b." in string) | ("lic" in string):
        return "lisans"
    elif ("ön" in string) | ("ass" in string) | ("on lisans" in string):
        return "onlisans"
    elif ("mast" in string) | ("ms" in string) | ("mba" in string) | ("bus" in string) | ("m." in string):
        return "yuksek lisans"
    elif ("doc" in string) | ("dok" in string) | ("ph" in string):
        return "doktora"
    elif ("high" in string) | ("lise" in string):
        return "lise"
    else:
        return string
    #df.loc[df[col].str.contains(contain), col] = replace
    #string.str.contains('Executive MBA (Mini)'), 'language'] = 'turkce'
    #education_train_data.loc[education_train_data['degree'].str.contains('Executive MBA (Mini)'), 'degree'] = 'test'




In [100]:
import swifter
education["degree"] = education["degree"].swifter.apply(degree_clean)


Pandas Apply:   0%|          | 0/142575 [00:00<?, ?it/s]

In [101]:
education.degree.value_counts()

lisans                              58195
nan                                 35842
yuksek lisans                       20134
lise                                 5649
onlisans                             5252
                                    ...  
beginner                                1
4.5/5.0                                 1
2.97 / 4.0                              1
yazilim uzmani - teknik ogretmen        1
vocational art                          1
Name: degree, Length: 3476, dtype: int64

In [102]:
education["degree"].replace("nan", np.nan, inplace=True)

lisans                              58195
yuksek lisans                       20134
lise                                 5649
onlisans                             5252
doktora                              2584
                                    ...  
beginner                                1
4.5/5.0                                 1
2.97 / 4.0                              1
yazilim uzmani - teknik ogretmen        1
vocational art                          1
Name: degree, Length: 3475, dtype: int64

In [103]:
education.degree.value_counts()[:10]


lisans               58195
yuksek lisans        20134
lise                  5649
onlisans              5252
doktora               2584
undergraduate          357
3                      208
yuksekokul mezunu      204
4                      182
ogrenci                136
Name: degree, dtype: int64

In [104]:
mask = education.degree.map(education.degree.value_counts()) < 2584
education.degree =  education.degree.mask(mask, 'other')
education

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,yuksek lisans,yazilim muhendisligi,,
4,1,Fırat Üniversitesi,lisans,yazilim muhendisligi,,
...,...,...,...,...,...,...
142570,66272,Halmstad University,,bilgisayar muhendisligi,,
142571,66272,Bogazici University,,kimya muhendisligi,,
142572,66273,Istanbul Technical University,lisans,elektrik ve elektronik muhendisligi,,
142573,66273,Bogazici University,yuksek lisans,elektrik ve elektronik muhendisligi,201509.0,


In [105]:
education.degree.value_counts()

lisans           58195
yuksek lisans    20134
other            14919
lise              5649
onlisans          5252
doktora           2584
Name: degree, dtype: int64

In [106]:
degree_dict = [{'col': 'degree', 'mapping': {'lise':1,
            'onlisans':2,
            'other':3,
            'lisans': 4,
            'yuksek lisans': 5,
            'doktora': 6}}]

In [107]:
from category_encoders import OrdinalEncoder
oe = OrdinalEncoder(mapping=degree_dict)

In [108]:
education.degree = oe.fit_transform(education.degree)

In [109]:
education.degree.value_counts()

 4.0    58195
-1.0    35842
 5.0    20134
 3.0    14919
 1.0     5649
 2.0     5252
 6.0     2584
Name: degree, dtype: int64

In [110]:
education.degree.replace(-1, np.nan, inplace=True)


In [111]:
education.head()

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,5.0,yazilim muhendisligi,,
4,1,Fırat Üniversitesi,4.0,yazilim muhendisligi,,


In [128]:
t = education[["user_id", "degree"]].copy()

In [129]:
t.head()

Unnamed: 0,user_id,degree
0,0,
1,0,
2,0,
3,1,5.0
4,1,4.0


In [130]:
t_agg = t.groupby('user_id').agg(['sum', 'mean', 'count', 'max'])
t_agg


Unnamed: 0_level_0,degree,degree,degree,degree
Unnamed: 0_level_1,sum,mean,count,max
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.0,,0,
1,9.0,4.5,2,5.0
2,4.0,2.0,2,3.0
3,4.0,4.0,1,4.0
4,4.0,4.0,1,4.0
...,...,...,...,...
66269,9.0,4.5,2,5.0
66270,18.0,4.5,4,5.0
66271,15.0,5.0,3,6.0
66272,4.0,4.0,1,4.0


In [131]:
def cat_agg(categorical_grouped):
    group_var = 'user_id'

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in categorical_grouped.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in ['sum', 'mean','count', 'max']:
                # Make a new column name for the variable and stat
                columns.append('%s_%s' % (var, stat))

    #  Rename the columns
    categorical_grouped.columns = columns
    print(categorical_grouped.head())

In [132]:
cat_agg(t_agg)

         degree_sum  degree_mean  degree_count  degree_max
user_id                                                   
0               0.0          NaN             0         NaN
1               9.0          4.5             2         5.0
2               4.0          2.0             2         3.0
3               4.0          4.0             1         4.0
4               4.0          4.0             1         4.0


In [133]:
t_agg.head()

Unnamed: 0_level_0,degree_sum,degree_mean,degree_count,degree_max
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,,0,
1,9.0,4.5,2,5.0
2,4.0,2.0,2,3.0
3,4.0,4.0,1,4.0
4,4.0,4.0,1,4.0


In [134]:
t_agg = t_agg.merge(df_fields, on="user_id", how="left")

In [135]:
t_agg.head()

Unnamed: 0_level_0,degree_sum,degree_mean,degree_count,degree_max,bilgi teknolojisi,bilgisayar muhendisligi,bilisim sistemleri muhendisligi,bilisim teknolojileri,control and automation engineering,ekonomi,elektrik ve elektronik muhendisligi,elektronik ve haberlesme muhendisligi,endustri muhendisligi,engineering management,fen bilimleri,fizik,iktisat,insaat muhendisligi,isletme,istatistik,kimya,kimya muhendisligi,makine muhendisligi,matematik,mekatronik muhendisligi,ogretmenlik,programcilik,sayisal,sosyoloji,veri,web,yazilim muhendisligi,yönetim bilisim sistemleri
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,0.0,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,9.0,4.5,2,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4.0,2.0,2,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,4.0,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4.0,4.0,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [137]:
t_agg = t_agg.merge(df_school, on="user_id", how="left")
t_agg


Unnamed: 0_level_0,degree_sum,degree_mean,degree_count,degree_max,bilgi teknolojisi,bilgisayar muhendisligi,bilisim sistemleri muhendisligi,bilisim teknolojileri,control and automation engineering,ekonomi,elektrik ve elektronik muhendisligi,elektronik ve haberlesme muhendisligi,endustri muhendisligi,engineering management,fen bilimleri,fizik,iktisat,insaat muhendisligi,isletme,istatistik,kimya,kimya muhendisligi,makine muhendisligi,matematik,mekatronik muhendisligi,ogretmenlik,programcilik,sayisal,sosyoloji,veri,web,yazilim muhendisligi,yönetim bilisim sistemleri,ahmet yesevi universitesi,akdeniz universitesi,anadolu universitesi,ankara universitesi,ataturk universitesi,atilim universitesi,bahcesehir universitesi,balikesir universitesi,baskent universitesi,beykent universitesi,bilge adam,bilkent universitesi,bilkent universitesi / bilkent universitesi,bogazici universitesi,cankaya universitesi,celal bayar universitesi,cukurova universitesi,dogus universitesi,dokuz eylul universitesi,dumlupinar universitesi,duzce universitesi,ege universitesi,erciyes universitesi,eskisehir osmangazi universitesi,eskisehir teknik universitesi,fen lisesi,firat universitesi,galatasaray universitesi,gazi universitesi,gaziantep universitesi,gebze teknik universitesi,hacettepe universitesi,isik universitesi,istanbul aydin universitesi,istanbul bilgi universitesi,istanbul kultur universitesi,istanbul teknik universitesi,istanbul ticaret universitesi,istanbul universitesi,izmir ekonomi universitesi,izmir yuksek teknoloji enstitusu,kadir has universitesi,karabuk universitesi,karadeniz teknik universitesi,kirikkale universitesi,koc universitesi,kocaeli universitesi,lise,maltepe universitesi,marmara universitesi,mersin universitesi,meslek lisesi,mugla sitki kocman universitesi,namik kemal universitesi,okan universitesi,ondokuz mayis universitesi,orta dogu teknik universitesi,ozyegin universitesi,pamukkale universitesi,sabanci universitesi,sakarya universitesi,selcuk universitesi,suleyman demirel universitesi,ted ankara koleji,tobb ekonomi ve teknoloji universitesi,trakya universitesi,udacity,uludag universitesi,yeditepe universitesi,yildiz teknik universitesi
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1
0,0.0,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.0,4.5,2,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,2.0,2,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,4.0,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,4.0,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66269,9.0,4.5,2,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66270,18.0,4.5,4,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66271,15.0,5.0,3,6.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66272,4.0,4.0,1,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [150]:
train = pd.read_csv("./train_user_lang_work_revise.csv")
test = pd.read_csv("./test_user_lang_work_revise.csv")

In [151]:
train.shape

(53019, 379)

In [152]:
train = train.merge(t_agg, on="user_id", how="left")
train.shape

(53019, 482)

In [153]:
test.shape

(13255, 378)

In [154]:
test = test.merge(t_agg, on="user_id", how="left")
test.shape

(13255, 481)

In [155]:
train.drop(columns=["work_company_id2_sum", "work_company_id2_mean"], inplace=True)
test.drop(columns=["work_company_id2_sum", "work_company_id2_mean"], inplace=True)

In [156]:
train.to_csv("train_user_lang_work_ed_revise.csv", index=False)
test.to_csv("test_user_lang_work_ed_revise.csv", index=False)

In [144]:
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
import re


X_train_user_id = train["user_id"]

y = train["moved_after_2019"]
X = train.drop(columns=["moved_after_2019", "user_id", "work_company_id2_sum", "work_company_id2_mean"])
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

auc_errs = []
ap_errs = []
prec_errs = []
recall_errs = []
f1_errs = []
acc_scores = []

models = []

kf = KFold(n_splits=10)

for split_train, split_val in list(kf.split(X)):
    
    split_train = X.index[split_train]
    split_val = X.index[split_val]
    
    X_train, y_train = X.loc[split_train], y.loc[split_train]
    X_val, y_val = X.loc[split_val], y.loc[split_val]
    print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
    print("Positive Count in Val Split:", y_val.sum())

    model = lgb.LGBMClassifier()
    

    model.fit(X_train, y_train, eval_metric = 'accuracy',
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 200)
    
    preds = model.predict(X_val)
    pred_probas = model.predict_proba(X_val)[:,1]

    print("Positive Count in Predictions:", preds.sum())
    
    acc = accuracy_score(y_val, preds)
    print("Accuracy: ", acc)
    acc_scores.append(acc)
    
    f1_err = f1_score(y_val, preds)
    print("Fold F1: ", f1_err)
    f1_errs.append(f1_err)

    prec_err = precision_score(y_val, preds)
    print("Fold Precision: ", prec_err)
    prec_errs.append(prec_err)

    recall_err = recall_score(y_val, preds)
    print("Fold Recall: ", recall_err)
    recall_errs.append(recall_err)
    
    auc_err = roc_auc_score(y_val, pred_probas)
    print("Fold AUC: ", auc_err)
    auc_errs.append(auc_err)

    ap_err = average_precision_score(y_val, pred_probas)
    print("Fold AP: ", ap_err)
    ap_errs.append(ap_err)
    
    models.append(model)
    
    print("\n", "*"*70, "\n")

import plotly.express as px
importance = [model.feature_importances_ for model in models]

f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                        pd.Series(np.mean(importance,axis=0),name="Importance")],
                        axis=1).sort_values(by='Importance',
                                            ascending=True)

fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
fig.update_layout(
    title_text="First 20 Important Features - CatBoost Average of Folds"
)
fig.show()

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1151
Accuracy:  0.6840814786872879
Fold F1:  0.4799751629928594
Fold Precision:  0.6715899218071243
Fold Recall:  0.37342995169082127
Fold AUC:  0.7222575722246138
Fold AP:  0.6278317588694139

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1093
Accuracy:  0.6876650320633723
Fold F1:  0.47528517110266155
Fold Precision:  0.6861848124428179
Fold Recall:  0.36354823073194376
Fold AUC:  0.7321823205040005
Fold AP:  0.6501270045156424

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1085
Accuracy:  0.7046397585816673
Fold F1:  0.49056603773584906
Fold Precision:  0.6949308755760368
Fold Recall:  0.3790849673202614
Fold AUC:  0.7394636695607915
Fold AP:  0.6452388081246553

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1058
Accuracy:  0.6793662768766503
Fold F1:  0.44225721784776906
Fold Precision:  0.6370510396975425
Fold Recall:  0.33869346733668343
Fold AUC:  0.7289094931177628
Fold AP:  0.6167847540863489

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1092
Accuracy:  0.6848359109769898
Fold F1:  0.47002854424357754
Fold Precision:  0.6785714285714286
Fold Recall:  0.35953420669577874
Fold AUC:  0.7208843629378022
Fold AP:  0.6269844174887603

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1102
Accuracy:  0.6829498302527348
Fold F1:  0.46242404860889025
Fold Precision:  0.6560798548094374
Fold Recall:  0.35703703703703704
Fold AUC:  0.7203173634421727
Fold AP:  0.6198428102299307

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1157
Accuracy:  0.6831384383251603
Fold F1:  0.4833948339483395
Fold Precision:  0.6793431287813311
Fold Recall:  0.37517899761336515
Fold AUC:  0.7251043920183547
Fold AP:  0.6462155537823424

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1157
Accuracy:  0.6850245190494153
Fold F1:  0.4823310601363919
Fold Precision:  0.672428694900605
Fold Recall:  0.3760270662155631
Fold AUC:  0.7440649883384509
Fold AP:  0.6517036046169516

 ********************************************************************** 

Train shape: (47717, 478) | Val Shape: (5302, 478)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1080
Accuracy:  0.6769143719351188
Fold F1:  0.456017783423309
Fold Precision:  0.6648148148148149
Fold Recall:  0.347027549540841
Fold AUC:  0.7296724196776325
Fold AP:  0.6384959648469829

 ********************************************************************** 

Train shape: (47718, 478) | Val Shape: (5301, 478)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1093
Accuracy:  0.6734578381437465
Fold F1:  0.45651491365777086
Fold Precision:  0.6651418115279049
Fold Recall:  0.3475143403441683
Fold AUC:  0.7219015204012138
Fold AP:  0.6364083645828944

 ********************************************************************** 



In [145]:
test.drop(columns=["user_id", "work_company_id2_sum", "work_company_id2_mean"],inplace=True)

model_preds = [model.predict(test) for model in models]
model_preds

[array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64)]

In [146]:
sample_submission = pd.DataFrame()
df = pd.read_csv("../data/test_users.csv")
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [147]:
sample_submission.moved_after_2019.value_counts()

0    10512
1     2743
Name: moved_after_2019, dtype: int64

In [148]:
sample_submission.to_csv("submission_no_skill_comp_id.csv", index=False)

In [23]:
education_train_data["fields_of_study"] = education_train_data["fields_of_study"].swifter.apply(field_clean)
education_test_data["fields_of_study"] = education_test_data["fields_of_study"].swifter.apply(field_clean)


Pandas Apply:   0%|          | 0/114092 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/28486 [00:00<?, ?it/s]

In [24]:
education_test_data.head()


Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,17449,katedra biocybernetics&polytechnic,doktora,,,
1,17449,anadolu universitesi,other,isletme,,
2,17449,ataturk universitesi,,,,
3,17449,ege universitesi,,,,
4,17449,gebze teknik universitesi,,,,


In [25]:
education_test_data[education_test_data["school_name"]=="nan"]


Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
1553,48352,,,,,
16471,24525,,,,,


In [None]:
education_test_data.school_name.replace("nan", np.nan, inplace=True)
education_test_data.fields_of_study.replace("nan", np.nan, inplace=True)

In [None]:
education_train_data.school_name.replace("nan", np.nan, inplace=True)
education_train_data.fields_of_study.replace("nan", np.nan, inplace=True)

In [26]:
education_train_data.degree.unique()        


array(['lisans', 'onlisans', 'other', 'nan', 'yuksek lisans', 'lise',
       'doktora'], dtype=object)

In [27]:
degree_dict = [{'col': 'degree', 'mapping': {'lise':1,
            'onlisans':2,
            'other':3,
            'lisans': 4,
            'yuksek lisans': 5,
            'doktora': 6,
            'nan': np.nan}}]

In [28]:
from category_encoders import OrdinalEncoder
oe = OrdinalEncoder(mapping=degree_dict)

In [29]:
education_train_data.degree = oe.fit_transform(education_train_data.degree)

In [31]:
education_test_data.degree = oe.transform(education_test_data.degree)

In [32]:
education_test_data.degree.replace(-1, np.nan, inplace=True)

In [34]:
education_train_data.degree.replace(-1, np.nan, inplace=True)


In [38]:
education_train_data = pd.merge(education_train_data,train[['user_id','moved_after_2019']],on='user_id', how='left')


In [40]:
target_train = education_train_data[['moved_after_2019']]
train_skills = education_train_data.drop(['moved_after_2019', 'start_year_month', 'end_year_month'], axis = 1)
test_skills = education_test_data.drop(['start_year_month', 'end_year_month'], axis=1)

In [55]:
def encode_test(skills_train):
    from sklearn.model_selection import KFold
    from catboost import CatBoostClassifier
    import lightgbm as lgb
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, recall_score, precision_score, accuracy_score
    import re

    

    train = pd.read_csv("./train_user_lang_work_revise.csv")
    train = train.merge(skills_train, on="user_id", how="left")



    X_train_user_id = train["user_id"]

    y = train["moved_after_2019"]
    X = train.drop(columns=["moved_after_2019", "user_id", "work_company_id2_sum", "work_company_id2_mean"])
    X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    auc_errs = []
    ap_errs = []
    prec_errs = []
    recall_errs = []
    f1_errs = []
    acc_scores = []

    models = []

    kf = KFold(n_splits=10)

    for split_train, split_val in list(kf.split(X)):
        
        split_train = X.index[split_train]
        split_val = X.index[split_val]
        
        X_train, y_train = X.loc[split_train], y.loc[split_train]
        X_val, y_val = X.loc[split_val], y.loc[split_val]
        print("Train shape:", X_train.shape, "|", "Val Shape:", X_val.shape)
        print("Positive Count in Val Split:", y_val.sum())

        model = lgb.LGBMClassifier()
        

        model.fit(X_train, y_train, eval_metric = 'accuracy',
                    eval_set = [(X_val, y_val), (X_train, y_train)],
                    eval_names = ['valid', 'train'],
                    early_stopping_rounds = 100, verbose = 200)
        
        preds = model.predict(X_val)
        pred_probas = model.predict_proba(X_val)[:,1]

        print("Positive Count in Predictions:", preds.sum())
        
        acc = accuracy_score(y_val, preds)
        print("Accuracy: ", acc)
        acc_scores.append(acc)
        
        f1_err = f1_score(y_val, preds)
        print("Fold F1: ", f1_err)
        f1_errs.append(f1_err)

        prec_err = precision_score(y_val, preds)
        print("Fold Precision: ", prec_err)
        prec_errs.append(prec_err)

        recall_err = recall_score(y_val, preds)
        print("Fold Recall: ", recall_err)
        recall_errs.append(recall_err)
        
        auc_err = roc_auc_score(y_val, pred_probas)
        print("Fold AUC: ", auc_err)
        auc_errs.append(auc_err)

        ap_err = average_precision_score(y_val, pred_probas)
        print("Fold AP: ", ap_err)
        ap_errs.append(ap_err)
        
        models.append(model)
        
        print("\n", "*"*70, "\n")

    import plotly.express as px
    importance = [model.feature_importances_ for model in models]

    f_importance = pd.concat([pd.Series(X.columns.to_list(),name='Feature'),
                            pd.Series(np.mean(importance,axis=0),name="Importance")],
                            axis=1).sort_values(by='Importance',
                                                ascending=True)

    fig = px.bar(f_importance.tail(20),x='Importance',y='Feature')
    fig.update_layout(
        title_text="First 20 Important Features - CatBoost Average of Folds"
    )
    fig.show()

    return models


        

        

In [44]:
import category_encoders as ce
cbe_encoder = ce.cat_boost.CatBoostEncoder()
cbe_encoder.fit(train_skills, target_train)
education_catboost_train = cbe_encoder.transform(train_skills)
education_catboost_test = cbe_encoder.transform(test_skills)


#choc['company_location'] = labelencoder.fit_transform(choc['company_location'])

In [45]:
education_catboost_train_agg = education_catboost_train.groupby('user_id').agg(['sum', 'mean'])


In [46]:
education_catboost_train_agg

Unnamed: 0_level_0,school_name,school_name,degree,degree,fields_of_study,fields_of_study
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,1.327849,0.442616,0.0,,1.124416,0.374805
3,0.460124,0.460124,4.0,4.0,0.413637,0.413637
5,1.791610,0.447903,14.0,3.5,1.615717,0.403929
6,0.591450,0.295725,4.0,4.0,0.788443,0.394221
7,0.460124,0.460124,4.0,4.0,0.413512,0.413512
...,...,...,...,...,...,...
66266,1.500433,0.375108,18.0,4.5,1.500591,0.375148
66270,1.500433,0.375108,18.0,4.5,1.500591,0.375148
66271,1.219187,0.406396,15.0,5.0,1.220920,0.406973
66272,0.974837,0.324946,4.0,4.0,1.155124,0.385041


In [47]:
education_catboost_test_agg = education_catboost_test.groupby('user_id').agg(['sum', 'mean'])

In [48]:
education_catboost_test_agg

Unnamed: 0_level_0,school_name,school_name,degree,degree,fields_of_study,fields_of_study
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,0.920247,0.460124,9.0,4.500000,0.827275,0.413637
2,0.881016,0.440508,4.0,2.000000,0.807492,0.403746
4,0.460124,0.460124,4.0,4.000000,0.413637,0.413637
11,1.058619,0.529309,8.0,4.000000,0.827149,0.413575
14,0.708003,0.354002,9.0,4.500000,0.812634,0.406317
...,...,...,...,...,...,...
66254,1.623735,0.405934,13.0,3.250000,2.039412,0.509853
66261,1.219187,0.406396,15.0,5.000000,1.220920,0.406973
66267,1.198149,0.399383,10.0,3.333333,1.586311,0.528770
66268,0.835774,0.417887,0.0,,0.788443,0.394221


In [49]:
def cat_agg(categorical_grouped):
    group_var = 'user_id'

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in categorical_grouped.columns.levels[0]:
        # Skip the grouping variable
        if var != group_var:
            # Iterate through the stat names
            for stat in ['count', 'count_norm']:
                # Make a new column name for the variable and stat
                columns.append('%s_%s' % (var, stat))

    #  Rename the columns
    categorical_grouped.columns = columns
    print(categorical_grouped.head())

In [50]:
cat_agg(education_catboost_train_agg)


         school_name_count  school_name_count_norm  degree_count  \
user_id                                                            
0                 1.327849                0.442616           0.0   
3                 0.460124                0.460124           4.0   
5                 1.791610                0.447903          14.0   
6                 0.591450                0.295725           4.0   
7                 0.460124                0.460124           4.0   

         degree_count_norm  fields_of_study_count  fields_of_study_count_norm  
user_id                                                                        
0                      NaN               1.124416                    0.374805  
3                      4.0               0.413637                    0.413637  
5                      3.5               1.615717                    0.403929  
6                      4.0               0.788443                    0.394221  
7                      4.0               0.

In [51]:
cat_agg(education_catboost_test_agg)

         school_name_count  school_name_count_norm  degree_count  \
user_id                                                            
1                 0.920247                0.460124           9.0   
2                 0.881016                0.440508           4.0   
4                 0.460124                0.460124           4.0   
11                1.058619                0.529309           8.0   
14                0.708003                0.354002           9.0   

         degree_count_norm  fields_of_study_count  fields_of_study_count_norm  
user_id                                                                        
1                      4.5               0.827275                    0.413637  
2                      2.0               0.807492                    0.403746  
4                      4.0               0.413637                    0.413637  
11                     4.0               0.827149                    0.413575  
14                     4.5               0.

In [56]:
test = pd.read_csv("./test_user_lang_work_revise.csv")
test = test.merge(education_catboost_test_agg, on="user_id", how="left")
test.drop(columns=["user_id","work_company_id2_sum", "work_company_id2_mean"],inplace=True)

models = encode_test(education_catboost_train_agg)
model_preds = [model.predict(test) for model in models]
model_preds

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2070



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1335
Accuracy:  0.7116182572614108
Fold F1:  0.5509544787077827
Fold Precision:  0.702621722846442
Fold Recall:  0.45314009661835747
Fold AUC:  0.7665508262782801
Fold AP:  0.6951119830310345

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2063



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1296
Accuracy:  0.7127499056959637
Fold F1:  0.546591247395058
Fold Precision:  0.7083333333333334
Fold Recall:  0.4449830344158992
Fold AUC:  0.7720174341523875
Fold AP:  0.7022850974406515

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 1989



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1285
Accuracy:  0.7367031308940023
Fold F1:  0.5736102626756262
Fold Precision:  0.7307392996108949
Fold Recall:  0.4720965309200603
Fold AUC:  0.792263577050779
Fold AP:  0.7201817787058257

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 1990



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1250
Accuracy:  0.7084119200301773
Fold F1:  0.5228395061728395
Fold Precision:  0.6776
Fold Recall:  0.42562814070351757
Fold AUC:  0.7645127054353895
Fold AP:  0.6828541029263444

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2061



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1288
Accuracy:  0.7161448509996228
Fold F1:  0.5506121230217976
Fold Precision:  0.7158385093167702
Fold Recall:  0.44735565259582727
Fold AUC:  0.7711592929084701
Fold AP:  0.7010768537621745

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2025



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1279
Accuracy:  0.7102980007544323
Fold F1:  0.5351089588377724
Fold Precision:  0.691164972634871
Fold Recall:  0.4365432098765432
Fold AUC:  0.7669673029758475
Fold AP:  0.6896918016368122

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2095



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1319
Accuracy:  0.7099207846095813
Fold F1:  0.549502050380785
Fold Precision:  0.711144806671721
Fold Recall:  0.4477326968973747
Fold AUC:  0.7730363844603056
Fold AP:  0.709973358266276

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1349
Accuracy:  0.7163334590720483
Fold F1:  0.5599765944997074
Fold Precision:  0.7094143810229799
Fold Recall:  0.4625422909618173
Fold AUC:  0.783333410573686
Fold AP:  0.7165515476958417

 ********************************************************************** 

Train shape: (47717, 381) | Val Shape: (5302, 381)
Positive Count in Val Split: 2069



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1295
Accuracy:  0.7110524330441342
Fold F1:  0.544589774078478
Fold Precision:  0.7073359073359073
Fold Recall:  0.44272595456742386
Fold AUC:  0.7727431602297298
Fold AP:  0.7013480068660523

 ********************************************************************** 

Train shape: (47718, 381) | Val Shape: (5301, 381)
Positive Count in Val Split: 2092



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



Positive Count in Predictions: 1265
Accuracy:  0.7085455574419921
Fold F1:  0.5397676496872207
Fold Precision:  0.7162055335968379
Fold Recall:  0.43307839388145314
Fold AUC:  0.7699055506531285
Fold AP:  0.7031937407782576

 ********************************************************************** 



[array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64),
 array([0, 1, 0, ..., 1, 0, 0], dtype=int64)]

In [57]:
sample_submission = pd.DataFrame()
df = pd.read_csv("../data/test_users.csv")
sample_submission['user_id'] = df["user_id"]
sample_submission['moved_after_2019'] = (np.mean(model_preds,axis=0)>=0.5).astype(int)

sample_submission.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0


In [58]:
sample_submission.moved_after_2019.value_counts()

0    10093
1     3162
Name: moved_after_2019, dtype: int64

In [146]:
check_df(skills)

**********************************************************************
******************************* Shape ********************************
**********************************************************************
(1398443, 2)
**********************************************************************
******************************* Types ********************************
**********************************************************************
user_id     int64
skill      object
dtype: object
**********************************************************************
******************************** Head ********************************
**********************************************************************
   user_id          skill
0        1    Mühendislik
1        1         Eğitim
2        2        Android
3        2           Java
4        2  3D Studio Max
**********************************************************************
******************************** Tail ********************************


In [164]:
skills.skill.value_counts()[:200]

Java                                 24360
JavaScript                           24072
SQL                                  23590
C#                                   23422
HTML                                 16872
CSS                                  15109
C++                                  14762
Microsoft SQL Server                 14593
C                                    14121
Microsoft Office                     13854
Python                               13353
MySQL                                13320
Git                                  12651
Linux                                11552
Software Development                 10978
jQuery                               10497
ASP.NET                              10178
HTML5                                 9380
.NET                                  8963
ASP.NET MVC                           8806
PHP                                   8444
Yazılım Geliştirme                    8243
OOP (Nesne Yönelimli Programlama)     8196
Visual Stud

In [149]:
value_counts = skills["skill"].value_counts()

# Sütunun %95'lik dilimindeki değerleri seç
selected_values = value_counts[value_counts.cumsum() <= 0.95 * value_counts.sum()].index.tolist()

# Sonuçları görüntüle
print(selected_values)

['Java', 'JavaScript', 'SQL', 'C#', 'HTML', 'CSS', 'C++', 'Microsoft SQL Server', 'C', 'Microsoft Office', 'Python', 'MySQL', 'Git', 'Linux', 'Software Development', 'jQuery', 'ASP.NET', 'HTML5', '.NET', 'ASP.NET MVC', 'PHP', 'Yazılım Geliştirme', 'OOP (Nesne Yönelimli Programlama)', 'Visual Studio', 'Matlab', 'XML', 'React.js', 'Microsoft Excel', 'İngilizce', 'Scrum', 'JSON', 'Android', 'PL/SQL', 'Web Services', 'Project Management', 'Spring Framework', 'JIRA', 'PostgreSQL', 'Proje Yönetimi', 'Software Engineering', 'Web Geliştirme', 'OOP', 'Hibernate', 'Web Development', 'Object-Oriented Programming (OOP)', 'Eclipse', 'AJAX', 'Node.js', 'T-SQL', 'Photoshop', 'Ekip Çalışması', 'English', 'Docker', 'MongoDB', 'Mühendislik', 'Agile Methodologies', 'Microsoft Word', 'AutoCAD', 'Entity Framework', 'Programming', 'Programlama', 'Bootstrap', 'Design Patterns', 'Araştırma', 'Object Oriented Design', 'Maven', 'Oracle', 'Python (Programming Language)', 'Android Development', 'SOAP', 'Software 

In [150]:
len(selected_values)

12045

In [11]:
loc_info = pd.concat([train['location'], test['location']]).unique()

In [14]:
def clean_cities(sehir_data):
    sehir_data = sehir_data.copy()
    sehir_data["location2"] = ""
    sehir_data.loc[sehir_data["location"].str.contains("Greater"), "location2"] = sehir_data["location"].str.replace("Greater ", "").str.cat(sehir_data[",Turkey"], sep=", ")
    sehir_data["location2"] = sehir_data["location2"].fillna(sehir_data["location"])
    sehir_data["comma_count"] = sehir_data["location2"].str.count(",")
    sehir_data["location2"] = sehir_data.apply(lambda row: ", ".join([","] * (2 - row["comma_count"])) + row["location2"] if row["comma_count"] < 2 else row["location2"], axis=1)
    sehir_data["location2"] = sehir_data["location2"].str.replace(" ", "")
    sehir_data = sehir_data.join(sehir_data["location2"].str.split(",", expand=True).add_prefix("loc"))
    sehir_data.fillna("", inplace=True)
    return sehir_data.filter(regex="^loc.*")

In [16]:
loc_info

array(['Istanbul, Istanbul, Turkey', 'Turkey', 'Istanbul, Turkey',
       'Ankara, Ankara, Turkey', 'Ankara, Turkey',
       'Gebze, Kocaeli, Turkey', 'Fatih, Istanbul, Turkey',
       'Kartal, Istanbul, Turkey', 'İzmir, Turkey',
       'Hendek, Sakarya, Turkey', 'Üsküdar, Istanbul, Turkey',
       'Gaziemir, İzmir, Turkey', 'Amasya, Turkey', 'Samsun, Turkey',
       'Bursa, Turkey', 'Buca, İzmir, Turkey',
       'Gaziantep, Gaziantep, Turkey', 'Urla, İzmir, Turkey',
       'Maltepe, Istanbul, Turkey', 'Umraniye, Istanbul, Turkey',
       'Ortahisar, Trabzon, Turkey', 'Kadikoy, Istanbul, Turkey',
       'Tuzla, Istanbul, Turkey', 'Greater Istanbul',
       'Beylikduzu, Istanbul, Turkey', 'İzmir, İzmir, Turkey',
       'Karatay, Konya, Turkey', 'Atasehir, Istanbul, Turkey',
       'Kagithane, Istanbul, Turkey', 'Kayseri, Turkey',
       'Corlu, Tekirdağ, Turkey', 'Pendik, Istanbul, Turkey',
       'Balıkesir, Turkey', 'Gümüşhane, Turkey',
       'Etimesgut, Ankara, Turkey', 'Çorlu, Teki

In [15]:
cleaned = clean_cities(loc_info)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices