# Pseudo-Labelling : A Semi-Supervised learning technique

We will use the [Big Mart Sales](https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/Sales) practice problem from the Analytics Vidhya (AV) DataHack platform.

Start by importing the basic libraries.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder 

Load data into memory. 

In [2]:
# Read the train and test splits from CSV; the paths are relative to the working directory.
train = pd.read_csv('../data/big_mart_sales/Train.csv')
test = pd.read_csv('../data/big_mart_sales/Test.csv')

Take care of some preprocessing: fill missing values, label-encode the categorical features, convert the establishment year into the number of years since the outlet was established, and drop the unique identifier columns. 

In [3]:
# Combine train and test so every preprocessing step is applied identically to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# The original row indices are kept on purpose (no ignore_index): later cells attach
# test identifiers to predictions by index, which relies on the test rows keeping
# their own index.
n_train = train.shape[0]
combi = pd.concat([train, test], sort=False)

# fill missing item weights with the mean weight over train + test
# (assign the result back instead of calling fillna(inplace=True) on a column
# selection, which may act on a temporary copy and leave combi unchanged)
combi['Item_Weight'] = combi['Item_Weight'].fillna(combi['Item_Weight'].mean())

# reduce fat content to only two categories
combi['Item_Fat_Content'] = combi['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

# convert the establishment year into years established, relative to 2018
combi['Outlet_Establishment_Year'] = 2018 - combi['Outlet_Establishment_Year']

# outlets with a missing size are treated as 'Small'
combi['Outlet_Size'] = combi['Outlet_Size'].fillna('Small')

# label-encode the categorical variables
col = ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Fat_Content']
le = LabelEncoder()
for i in col:
    combi[i] = le.fit_transform(combi[i].astype('str'))
    combi[i] = combi[i].astype('int')

# split the combined frame back into train and test by position
train = combi.iloc[:n_train].copy()
test = combi.iloc[n_train:].copy()

# remove the identifier columns and Item_Type from the model-building frames
dropped_columns = ['Outlet_Identifier', 'Item_Type', 'Item_Identifier']
training = train.drop(dropped_columns, axis=1)
testing = test.drop(dropped_columns, axis=1)

# set up the arrays for the sklearn models
target = 'Item_Outlet_Sales'
y_train = training[target]
training.drop(target, axis=1, inplace=True)
# the combined frame carries the target column for the test rows too, so drop it there as well
testing.drop(target, axis=1, inplace=True)
features = training.columns

X_train, X_test = training, testing

# Model exploration

We will benchmark the PL algorithm with some standard sklearn models. First we import the packages. 

In [4]:
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

  from numpy.core.umath_tests import inner1d


Now we make a model factory to chug through the models with our data set, using k-fold cross-validation to score the performance of each model. 

In [9]:
# Candidate regressors, each built with its library-default hyper-parameters
# (XGBoost is limited to a single job).
model_factory = [
    XGBRegressor(n_jobs=1),
    Ridge(),
    ElasticNet(),
    KNeighborsRegressor(),
    BayesianRidge(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
]

# Number of cross-validation folds; also reused by later cells.
num_folds = 5

for model in model_factory:
    # Same seed for every model so the comparison is repeatable.
    model.random_state = 42

    # cross_val_score returns one negated MSE per fold.
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=num_folds,
        scoring='neg_mean_squared_error',
    )

    # The headline figure is the square root of the mean fold MSE.
    # NOTE: the +/- term is twice the standard deviation of the fold MSE scores,
    # so it is in squared units, unlike the RMSE printed next to it.
    rmse = np.sqrt(scores.mean()*-1)
    score_description = " %0.2f (+/- %0.2f)" % (rmse, scores.std() * 2)

    print('{model:25} CV-{num_folds} RMSE: {score}'.format(
        model=model.__class__.__name__,
        num_folds=num_folds,
        score=score_description
    ))

XGBRegressor              CV-5 RMSE:  1083.15 (+/- 35120.40)
Ridge                     CV-5 RMSE:  1206.14 (+/- 86014.33)
ElasticNet                CV-5 RMSE:  1259.05 (+/- 115278.93)
KNeighborsRegressor       CV-5 RMSE:  1227.93 (+/- 65213.30)
BayesianRidge             CV-5 RMSE:  1206.26 (+/- 85051.90)
ExtraTreesRegressor       CV-5 RMSE:  1204.44 (+/- 75170.43)
RandomForestRegressor     CV-5 RMSE:  1187.15 (+/- 52115.88)
GradientBoostingRegressor CV-5 RMSE:  1086.53 (+/- 35982.21)


We observe that XGBoost performs the best and so we will use this model to test our PL algorithm. 

In [10]:
## baseline: plain XGBoost fitted on the labelled data only
model = XGBRegressor()
model.fit(X_train, y_train)
pred = model.predict(X_test)

## build the submission frame: predictions plus the two identifier columns
## (identifiers are attached by index from the test frame)
sub = pd.DataFrame({'Item_Outlet_Sales': pred})
for id_column in ('Item_Identifier', 'Outlet_Identifier'):
    sub[id_column] = test[id_column]
# writing the file is left disabled:
#sub.to_csv('bigmart-xgb.csv', index='False')

# Cross-validate the same model; num_folds comes from the benchmarking cell.
scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=num_folds,
    scoring='neg_mean_squared_error',
    n_jobs=8,
)

# Print results: RMSE from the mean fold MSE.
# NOTE: the +/- term is twice the standard deviation of the fold MSE scores (squared units).
rmse = np.sqrt(scores.mean()*-1)
score_description = "RMSE: %0.4f (+/- %0.4f)" % (rmse, scores.std() * 2)

print('{model:25} CV-{num_folds} {score_cv}'.format(
    model=model.__class__.__name__,
    num_folds=num_folds,
    score_cv=score_description))

XGBRegressor              CV-5 RMSE: 1083.1475 (+/- 35120.4032)


## PseudoLabeler Class

In [11]:
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics.scorer import make_scorer

class PseudoLabeler(BaseEstimator, RegressorMixin):
    '''
    Sci-kit learn wrapper for creating pseudo-labeled estimators.

    The wrapped model is fitted on the labelled data, used to predict targets
    for the unlabelled data, and then refitted on the labelled data plus a
    random sample of those pseudo-labelled rows.
    '''

    def __init__(self, model, unlabled_data, features, target, sample_rate=0.2, random_state=42):
        '''
        @model - the regressor model to build the model with; its random_state
                 attribute is overwritten with @random_state
        @unlabled_data - DataFrame of unlabelled rows; must contain the feature columns
        @features - list of feature names
        @target - name of the y label column
        @sample_rate - fraction (0.0 to 1.0) of the unlabelled rows used as pseudo-labelled data
        @random_state - seed for the wrapped model, the pseudo-label sampling and the shuffle

        Raises ValueError when @sample_rate is outside [0.0, 1.0].
        '''
        # Explicit check instead of assert: asserts are stripped under -O, and the
        # original only enforced the upper bound although the message names both.
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError('Sample_rate should be between 0.0 and 1.0.')

        self.sample_rate = sample_rate
        self.random_state = random_state
        self.model = model
        self.model.random_state = random_state
        self.unlabled_data = unlabled_data
        self.features = features
        self.target = target

    def get_params(self, deep=True):
        '''
        Return the constructor parameters as a dict.

        @deep - accepted for compatibility with the scikit-learn estimator API;
                it is not used, and the wrapped model's own parameters are not expanded
        '''
        return {
            "sample_rate": self.sample_rate,
            "random_state": self.random_state,
            "model": self.model,
            "unlabled_data": self.unlabled_data,
            "features": self.features,
            "target": self.target
        }

    def set_params(self, **parameters):
        '''
        Set each given parameter as an attribute of the same name and return self.
        '''
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        '''
        Fit the data using pseudo labeling.

        @X - DataFrame of labelled rows; must contain the feature columns
        @y - target values for the rows of X, in the same order

        Returns self.
        '''
        num_of_samples = int(len(self.unlabled_data) * self.sample_rate)

        # Train the model on X and y and create the pseudo-labels.
        # All unlabelled rows are predicted (not just the sampled ones) so that a
        # sample_rate of 0 never asks the model to predict on an empty frame.
        self.model.fit(X, y)
        pseudo_data = self.unlabled_data[self.features].copy()
        pseudo_labels = self.model.predict(pseudo_data)
        pseudo_data[self.target] = pseudo_labels

        # Take a subset of the pseudo-labelled rows; seeded so repeated fits agree.
        sampled_pseudo_data = pseudo_data.sample(
            n=num_of_samples, random_state=self.random_state
        )

        # Attach y positionally under the target name, so the result does not
        # depend on y being a Series that happens to be named like the target.
        labelled_data = X[self.features].copy()
        labelled_data[self.target] = np.asarray(y)

        # Append the sampled pseudo-labelled rows to the training rows and shuffle.
        augmented_train = shuffle(
            pd.concat([sampled_pseudo_data, labelled_data]),
            random_state=self.random_state,
        )

        # Fit the model again with the augmented data set
        self.model.fit(augmented_train[self.features], augmented_train[self.target])
        return self

    def predict(self, X):
        '''
        Returns the predicted values of the wrapped model for X.
        '''
        return self.model.predict(X)

    def get_model_name(self):
        '''
        Returns the class name of the wrapped model.
        '''
        return self.model.__class__.__name__

### Testing out PseudoLabeler 
As of now, `cross_val_score` is not working with this estimator, so that call is commented out in the cell below.

In [None]:
# Wrap XGBoost in the pseudo-labeler, using the test features as the unlabelled
# data and 30% of them as pseudo-labelled training rows.
model = PseudoLabeler(
    XGBRegressor(n_jobs=1),
    X_test,
    features,
    target,
    sample_rate=0.3
)

model.fit(X_train, y_train)
# Predict on the test features: the submission frame below pairs every prediction
# with a test-set identifier, so the predictions must be for the test rows
# (predicting on X_train produced a frame of the wrong length with mismatched ids).
pred = model.predict(X_test)
#scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error', n_jobs=8)
display(pred)

# Build the submission frame; identifiers are attached by index from the test frame.
sub = pd.DataFrame(data=pred, columns=['Item_Outlet_Sales'])
sub['Item_Identifier'] = test['Item_Identifier']
sub['Outlet_Identifier'] = test['Outlet_Identifier']
#sub.to_csv('pseudo-labelling.csv', index='False')

# Comparing XGBoost with and without pseudo-labelling

In [None]:
# Plain XGBoost versus the same model wrapped in the pseudo-labeler.
model_factory = [
    XGBRegressor(n_jobs=1),

    PseudoLabeler(
        XGBRegressor(n_jobs=1),
        X_test,
        features,
        target,
        sample_rate=0.3
    ),
]

for model in model_factory:
    # Seed through random_state, as the benchmarking cell does. PseudoLabeler has
    # no `seed` parameter; its constructor passes random_state on to the wrapped model.
    model.random_state = 42
    num_folds = 8

    # cross_val_score returns one negated MSE per fold.
    scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error', n_jobs=8)

    # The reported value is the square root of the mean fold MSE, so label it RMSE.
    # NOTE: the +/- term is twice the standard deviation of the fold MSE scores (squared units).
    score_description = "RMSE: %0.4f (+/- %0.4f)" % (np.sqrt(scores.mean()*-1), scores.std() * 2)

    print('{model:25} CV-{num_folds} {score_cv}'.format(
        model=model.__class__.__name__,
        num_folds=num_folds,
        score_cv=score_description
    ))

# Dependence of pseudo-labelling performance on the sampling rate

In [None]:
# Ten evenly spaced sample rates from 0 (no pseudo-labels) to 1 (all unlabelled rows).
sample_rates = np.linspace(0, 1, 10)

def pseudo_label_wrapper(model):
    '''Wrap a regressor in a PseudoLabeler that uses the test frame as unlabelled data.'''
    return PseudoLabeler(model, test, features, target)

# List of all models to test
model_factory = [
    RandomForestRegressor(n_jobs=1),
    XGBRegressor(),
]

# Apply the PseudoLabeler class to each model.
# Built as a list rather than a lazy map object, so the loop below can be
# re-run without silently iterating over an exhausted iterator.
model_factory = [pseudo_label_wrapper(model) for model in model_factory]

# Train each model with different sample rates
results = {}
num_folds = 5

for model in model_factory:
    model_name = model.get_model_name()
    print('%s' % model_name)

    results[model_name] = list()
    for sample_rate in sample_rates:
        model.sample_rate = sample_rate

        # Calculate the cross-validated RMSE (num_folds folds) and store it
        scores = cross_val_score(model, X_train, y_train, cv=num_folds, scoring='neg_mean_squared_error', n_jobs=8)
        results[model_name].append(np.sqrt(scores.mean()*-1))

In [None]:
plt.figure(figsize=(16, 18))

# One panel per model on a 3x3 grid; `i` is the 1-based subplot position.
i = 1
for model_name in results:
    performance = results[model_name]

    plt.subplot(3, 3, i)
    plt.plot(sample_rates, performance)
    plt.title(model_name)
    plt.xlabel('sample_rate')
    plt.ylabel('RMSE')

    i += 1

plt.show()