In [1]:
import numpy as np
import pandas as pd

import catboost as ctb

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score


In [2]:
def evaluate_model(model, X_train, y_train, X_test,  y_test):
    model = ctb.CatBoostClassifier(cat_features = cat_feats)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average = 'binary')
    rec = recall_score(y_test, y_pred, average='binary')
    return {'f1': [f1], 'roc_auc': [roc], 'precision': [prec], 'recall': [rec]}
    

Problem 1 
take a dataset for the binary classification 

In [3]:
df = pd.read_csv('aug_train.csv')
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [4]:
df.shape

(19158, 14)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [6]:
df = df.drop(columns=['enrollee_id'])

In [7]:
df['target'] = df['target'].astype(int)

In [8]:
df['target'].value_counts()

0    14381
1     4777
Name: target, dtype: int64

Problem2 - feature engineering 

In [9]:
for col in df.select_dtypes('object').columns:
    df[col] = df[col].fillna(df[col].value_counts().index[0])

In [10]:
df.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1,36,1
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,5,50-99,Pvt Ltd,never,83,0
3,city_115,0.789,Male,No relevent experience,no_enrollment,Graduate,Business Degree,<1,50-99,Pvt Ltd,never,52,1
4,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


In [27]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=42)


Problem3 Use any classificator

In [28]:
cat_feats =  ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level',
             'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

In [29]:
metrics = pd.DataFrame(evaluate_model(ctb.CatBoostClassifier(cat_features=cat_feats),
                                      X_train, y_train, X_test, y_test))

In [30]:
metrics

Unnamed: 0,f1,roc_auc,precision,recall
0,0.949807,0.952206,1.0,0.904412


Problem4 - split  the dataset on P (positives) и U (unlabeled) using only part of Positives

In [15]:
def create_unlabeled(df, pos_frac = 0.2):
    sdf = df.copy()
    pos_mask = (df['target'] == 1)
    pos_ind = df[pos_mask].sample(frac=pos_frac).index
    unlab_ind = df[~df.index.isin(pos_ind)].index
    
    df.loc[pos_ind, 'is_labeled'] =1
    df.loc[unlab_ind, 'is_labeled'] = 0
    df['is_labeled'] = df['is_labeled'].astype(int)
    return df

In [16]:
rns_df = create_unlabeled(df, pos_frac=0.2)

In [17]:
rns_df.head(3)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,is_labeled
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1,36,1,0
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0,0
2,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,5,50-99,Pvt Ltd,never,83,0,0


In [18]:
x_data = rns_df.iloc[:,:-2].values # just the X 
y_labeled = rns_df.iloc[:,-1].values # new class (just the P & U)
y_positive = rns_df.iloc[:,-2].values # original class

Problem 5
Apply Random Negative Samplings for building a crassifier in new conditions

In [19]:
def get_rns_samples(rns_df):
    rns_df = rns_df.sample(frac=1)
    
    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_sample = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_sample = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]
    
    return train_sample, test_sample

In [20]:
train_sample, test_sample = get_rns_samples(rns_df)

In [21]:
evaluate_metrics = evaluate_model(ctb.CatBoostClassifier(cat_features=cat_feats),
                             train_sample.iloc[:, :-2],
                             train_sample['is_labeled'],
                             test_sample.iloc[:, :-2],
                             test_sample['target'])

In [22]:
metrics = metrics.append(pd.DataFrame(evaluate_metrics))


  metrics = metrics.append(pd.DataFrame(evaluate_metrics))


Problem 6 compare the model quality with solution in Problem 4 (creaate metric tab)

In [23]:
metrics.index = ['normal', "RNS"]

In [24]:
metrics

Unnamed: 0,f1,roc_auc,precision,recall
normal,0.499113,0.666604,0.571042,0.443277
RNS,0.523486,0.726774,0.403079,0.746471


RNS predicted better than the model in Problem 4. Recall is increased, but precision in decreased, and more observations are in Positive class

Problem 7 Experiment with P.How incressing/decreasing P would affect model quiality

In [25]:
rns_metrics = pd.DataFrame(columns=['f1', 'roc-auc', 'precision', 'recall'])

fracs = np.linspace(0.1, 0.9, 9)
for frac in fracs:
    train_samples, test_samples = get_rns_samples(create_unlabeled(df, pos_frac= frac))
    frac_metrics = evaluate_model(ctb.CatBoostClassifier(cat_features= cat_feats), 
                                  train_sample.iloc[:, :-2],
                             train_sample['is_labeled'],
                             test_sample.iloc[:, :-2],
                             test_sample['target'])
    rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))

  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))
  rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))


In [26]:
rns_metrics.index = fracs
rns_metrics

Unnamed: 0,f1,roc-auc,precision,recall,roc_auc
0.1,0.523486,,0.403079,0.746471,0.726774
0.2,0.523486,,0.403079,0.746471,0.726774
0.3,0.523486,,0.403079,0.746471,0.726774
0.4,0.523486,,0.403079,0.746471,0.726774
0.5,0.523486,,0.403079,0.746471,0.726774
0.6,0.523486,,0.403079,0.746471,0.726774
0.7,0.523486,,0.403079,0.746471,0.726774
0.8,0.523486,,0.403079,0.746471,0.726774
0.9,0.523486,,0.403079,0.746471,0.726774
