## Active Learning

Download the titanic dataset here: https://drive.google.com/file/d/0Bz9_0VdXvv9bbVhpOEMwUDJ2elU/view?usp=sharing

In this exercise, we will simulate active learning. We will hold out a small sample of observations that is used only for testing, and we will measure how the quality of the model improves when active learning is used to choose which observations get labeled.

In [121]:
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

In [122]:
# Load the raw passenger table from the local CSV file.
# NOTE(review): the next cell reads the same file again, so this cell looks redundant — confirm before removing.
df = pd.read_csv('titanic_dataset.csv')

In [123]:
# Read the raw passenger table
df = pd.read_csv('titanic_dataset.csv')

# Discard the columns that are not used as features
df = df.drop(columns=['Cabin', 'Name', 'Ticket'])

# HELD-OUT TEST SAMPLE
# It must be used for evaluation only, never for fitting.
test_df = df.sample(n=100, random_state=42)
# Remove the test passengers from the remaining data
df = df[~df['PassengerId'].isin(test_df['PassengerId'].tolist())]

# INITIAL TRAINING SAMPLE: the first model is fitted on START_DF only
start_df = df.sample(n=100, random_state=42)
# Remove the initial training passengers too; what is left in DF is the pool
df = df[~df['PassengerId'].isin(start_df['PassengerId'].tolist())]

# Preprocessing: rows with any missing value are dropped from both samples
start_df = start_df.dropna()
test_df = test_df.dropna()

In [91]:
# Display the initial training sample for a visual sanity check.
start_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
288,289,1,2,male,42.0,0,0,13.0000,S
416,417,1,2,female,34.0,1,1,32.5000,S
329,330,1,1,female,16.0,0,1,57.9792,C
587,588,1,1,male,60.0,1,1,79.2000,C
686,687,0,3,male,14.0,4,1,39.6875,S
...,...,...,...,...,...,...,...,...,...
438,439,0,1,male,64.0,1,4,263.0000,S
687,688,0,3,male,19.0,0,0,10.1708,S
10,11,1,3,female,4.0,1,1,16.7000,S
173,174,0,3,male,21.0,0,0,7.9250,S


In [124]:
# for categorical data
# One shared encoder instance: it is fitted on start_df below and then reused
# (transform only) for the other data sets.
ohe = OneHotEncoder()

def encoder(model, data, columns, fit=False):
    """One-hot encode ``columns`` of ``data`` and return them next to PassengerId.

    Parameters
    ----------
    model : encoder object
        Must provide ``fit_transform`` / ``transform`` and either
        ``get_feature_names_out`` or the older ``get_feature_names``
        (e.g. ``sklearn.preprocessing.OneHotEncoder``).
    data : pandas.DataFrame
        Frame containing ``PassengerId`` and the columns to encode.
    columns : list of str
        Names of the categorical columns to encode.
    fit : bool, default False
        If True the model is fitted on ``data`` before transforming;
        otherwise the already fitted model is applied as is.

    Returns
    -------
    pandas.DataFrame
        ``PassengerId`` followed by one flat, string-named column per encoded
        category, indexed like ``data``.
    """
    features = data.loc[:, columns]
    matrix = model.fit_transform(features) if fit else model.transform(features)
    # The encoder may return a sparse matrix; densify only when that is the case.
    if hasattr(matrix, 'toarray'):
        matrix = matrix.toarray()

    # Ask the model that was passed in (not a global encoder) for the names.
    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases, so prefer the new method when it exists.
    if hasattr(model, 'get_feature_names_out'):
        names = model.get_feature_names_out(columns)
    else:
        names = model.get_feature_names(columns)

    # Pass the names as a flat list: wrapping them in another list would build
    # a MultiIndex and leave tuple-like column labels after the merge.
    encoded = pd.DataFrame(matrix, index=data.index, columns=list(names))
    return pd.merge(data.PassengerId, encoded, left_index=True, right_index=True, how='left')

# Fit the one-hot encoder on the initial training sample, then apply the
# fitted encoder (transform only) to the held-out test sample.
encoded_train = encoder(model=ohe, data=start_df, columns=['Sex', 'Embarked'], fit=True)
encoded_test = encoder(model=ohe, data=test_df, columns=['Sex', 'Embarked'], fit=False)

# Scaler instance for the numeric feature columns
sc = StandardScaler()

def scaler(model, data, columns, fit=False):
    """Scale ``columns`` of ``data`` with ``model`` and return them next to PassengerId.

    Parameters
    ----------
    model : scaler object
        Must provide ``fit_transform`` and ``transform``.
    data : pandas.DataFrame
        Frame containing ``PassengerId`` and the columns to scale.
    columns : list of str
        Names of the columns to scale.
    fit : bool, default False
        If True the model is fitted on ``data`` while transforming;
        otherwise the model is applied without refitting.

    Returns
    -------
    pandas.DataFrame
        ``PassengerId`` followed by the scaled columns, indexed like ``data``.
    """
    # Choose the bound method once so the frame is built in a single place.
    apply_model = model.fit_transform if fit else model.transform
    values = apply_model(data.loc[:, columns])
    scaled = pd.DataFrame(values, index=data.index, columns=columns)
    return pd.merge(
        data.PassengerId,
        scaled,
        left_index=True,
        right_index=True,
        how='left',
    )

# Standardise the numeric columns: the scaler is fitted on the initial training
# sample and then applied without refitting to the test sample.
scaled_train = scaler(model=sc, data=start_df, columns=['Pclass', 'Age', 'SibSp', 'Parch'], fit=True)
scaled_test = scaler(model=sc, data=test_df, columns=['Pclass', 'Age', 'SibSp', 'Parch'], fit=False)

# Feature matrices: scaled and encoded columns joined on the row index,
# with the PassengerId helper column removed from both sides.
X = pd.merge(
    scaled_train.drop(columns=['PassengerId']),
    encoded_train.drop(columns=['PassengerId']),
    left_index=True,
    right_index=True,
)
X_test = pd.merge(
    scaled_test.drop(columns=['PassengerId']),
    encoded_test.drop(columns=['PassengerId']),
    left_index=True,
    right_index=True,
)

# Targets for the training and the test sample
y = start_df['Survived']
y_test = test_df['Survived']

In [68]:
from sklearn import svm

# Baseline model: a support-vector classifier with default settings, fitted on
# the initial training features X and labels y only.
clf = svm.SVC()
clf.fit(X,y)

SVC()

### Tasks

1. fit the first model only on the **start_df** using **SVM** and evaluate accuracy, precision and recall on test_df
2. in each iteration, add 10 observations from **df** to your training set (choose the observations using an active learning approach), refit the model and evaluate it on test_df again
3. the goal is to converge to the optimal solution as fast as possible by choosing **right** observations in each iteration
4. plot a graph for each evaluation metric, with the iteration number on the x-axis and the metric value of that iteration's model on the y-axis

In [75]:
# Check the shape of the decision-function output on the training features.
clf.decision_function(X).shape

(79,)

In [56]:
from sklearn import metrics

# Empty histories for accuracy, precision and recall (three separate lists)
acc_hist, pre_hist, rec_hist = [], [], []

In [185]:
# evaluation metrics history (reset before running the loop)
acc_hist, pre_hist, rec_hist = [], [], []
iteration_nb = []
i = 0  # iteration counter
# training data used by the loop; starts as the initial sample (same objects, no copy)
X_i, y_i = X, y

In [186]:
# Active-learning loop: fit on the current training data, evaluate on the test
# sample, then move the most uncertain pool candidates into the training data.
N_ITERATIONS = 1   # number of fit / evaluate / query rounds
N_CANDIDATES = 10  # pool rows drawn at random and scored in each round
N_QUERIES = 2      # most uncertain candidates added to the training data per round

for k in range(N_ITERATIONS):
    # fit model to current data
    clf = svm.SVC()
    clf.fit(X_i, y_i)

    # evaluate on the held-out test sample; predict once and reuse the result
    y_pred = clf.predict(X_test)
    acc_hist.append(metrics.accuracy_score(y_test, y_pred))
    pre_hist.append(metrics.precision_score(y_test, y_pred))
    rec_hist.append(metrics.recall_score(y_test, y_pred))
    i += 1
    iteration_nb.append(i)

    # next candidate observations from the pool
    next_df = df.sample(n=min(N_CANDIDATES, len(df)), random_state=42)

    print(i, len(df), len(next_df))
    next_df = next_df.dropna()

    # Start every round with an empty selection so that a round without usable
    # candidates neither fails nor re-drops the rows of a previous round.
    indy_to_drop = df.index[:0]
    if len(next_df) > 0:
        encoded_next = encoder(ohe, next_df, ['Sex', 'Embarked'])
        scaled_next = scaler(sc, next_df, ['Pclass', 'Age', 'SibSp', 'Parch'])

        X_next = pd.merge(
            scaled_next.drop(columns=['PassengerId']),
            encoded_next.drop(columns=['PassengerId']),
            left_index=True,
            right_index=True,
        )
        y_next = next_df.Survived

        # Uncertainty sampling: the smaller the absolute decision value, the
        # less certain the current model is about that candidate.
        decision = np.abs(clf.decision_function(X_next))
        # argsort + slice also works when fewer than N_QUERIES + 1 candidates
        # are left (argpartition with a fixed kth would raise there).
        picked = np.argsort(decision)[:N_QUERIES]
        indy_to_drop = X_next.iloc[picked].index

        X_i = pd.concat([X_i, X_next.iloc[picked]], ignore_index=True)
        # select labels by index label so they always match the chosen rows
        y_i = pd.concat([y_i, y_next.loc[indy_to_drop]], ignore_index=True)

    # drop the selected observations from the global pool; the Index is passed
    # directly (wrapping it in a list would match nothing)
    df = df[~df.index.isin(indy_to_drop)]

1 691 10


In [191]:
# Show the pool without the rows whose index labels are in indy_to_drop
# (the result is only displayed, df itself is not reassigned here).
df[~df.index.isin(indy_to_drop)]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
882,883,0,3,female,22.0,0,0,10.5167,S
883,884,0,2,male,28.0,0,0,10.5000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S


In [188]:
# Inspect the index labels selected in the last loop iteration.
indy_to_drop

Int64Index([339, 419], dtype='int64')

Int64Index([339, 419], dtype='int64')