# Train Model

In [None]:
import ast
import operator
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import ParameterGrid

from src.data.progress_bar import progress_bar
from src.features.features_utils import convert_categoricals_to_numerical
from src.features.features_utils import convert_target_to_numerical

## Reading in the Data

First let's read in both sets of training and validation features and targets as well as the sample weights we created for covariate shift adaptation. We make sure to convert the categorical fields to a numerical form that is suitable for building machine learning models.

In [None]:
# Load the training features (original feature set) and encode the categorical
# fields numerically so they can be fed to the models below.
train_features = pd.read_csv('../data/processed/train-features.csv')
X_train = convert_categoricals_to_numerical(train_features)
X_train.head()

In [None]:
# Per-sample weights created for covariate shift adaptation (original feature set).
sample_weights = pd.read_csv('../models/train-features-sample-weights.csv')
sample_weights.head()

In [None]:
# Load the training features for the topics feature set and encode the
# categorical fields numerically.
train_features_topics = pd.read_csv('../data/processed/train-features-topics.csv')
X_train_topics = convert_categoricals_to_numerical(train_features_topics)
X_train_topics.head()

In [None]:
# Per-sample weights created for covariate shift adaptation (topics feature set).
sample_weights_topics = pd.read_csv('../models/train-features-topics-sample-weights.csv')
sample_weights_topics.head()

In [None]:
# Load the training target as a Series indexed by `full_name`.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis after the read gives the same Series on any version.
train_target = pd.read_csv(
    '../data/processed/train-target.csv', index_col='full_name').squeeze('columns')
y_train = convert_target_to_numerical(train_target)
y_train.head()

In [None]:
# Load the validation features (original feature set) and encode the
# categorical fields numerically, mirroring the training features.
validation_features = pd.read_csv('../data/processed/validation-features.csv')
X_validation = convert_categoricals_to_numerical(validation_features)
X_validation.head()

In [None]:
# Load the validation features for the topics feature set and encode the
# categorical fields numerically.
validation_features_topics = pd.read_csv('../data/processed/validation-features-topics.csv')
X_validation_topics = convert_categoricals_to_numerical(validation_features_topics)
X_validation_topics.head()

In [None]:
# Load the validation target as a Series indexed by `full_name`.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis after the read gives the same Series on any version.
validation_target = pd.read_csv(
    '../data/processed/validation-target.csv', index_col='full_name').squeeze('columns')
y_validation = convert_target_to_numerical(validation_target)
y_validation.head()

## Hyperparameter Selection

The hyperparameters of the models that we will be fitting are critical to their predictive performance. We will use an exhaustive grid search to select them in a principled manner. The optimal hyperparameter values will be chosen according to the set of values that maximize the Matthews Correlation Coefficient (MCC) on the validation set. The function below will be used to accomplish this task.

In [None]:
def evaluate_classifier(
    X_train, y_train, X_validation, y_validation, clf=LogisticRegression(),
    param_grid=ParameterGrid(dict(C=np.logspace(-5, 15, 21, base=2.0))),
    score_func=matthews_corrcoef, greater_score_is_better=True, solver='lbfgs',
    sample_weight=None, max_iter=1000, random_state=None, progress_bar=None):
    """Exhaustively fit one classifier per parameter set and pick the best on validation.

    Parameters
    ----------
    X_train, y_train : training features and target used to fit every candidate.
    X_validation, y_validation : held-out features and target used for selection.
    clf : estimator instance
        Only its type is inspected (``LogisticRegression`` or ``SVC``); a fresh
        estimator is constructed for every parameter set.
    param_grid : iterable of dict
        Parameter sets to try. Recognised keys are ``penalty``, ``C`` and
        ``class_weight`` for logistic regression, and ``C``, ``kernel``,
        ``gamma`` and ``class_weight`` for SVC; missing keys use the defaults
        written in the constructor calls below.
    score_func : callable
        Called as ``score_func(y_true=..., y_pred=...)``.
    greater_score_is_better : bool
        Whether the best parameter set is the one with the highest (True) or
        lowest (False) validation score.
    solver : str
        Passed to ``LogisticRegression`` only.
    sample_weight : array-like or None
        Forwarded to ``classifier.fit``.
    max_iter : int
        Passed to the estimator constructor.
    random_state : int or None
        Passed to the estimator constructor.
    progress_bar : object or None
        Optional object exposing ``start()``, ``update(i)`` and ``finish()``.
        (The parameter shadows the imported ``progress_bar`` name inside this
        function.)

    Returns
    -------
    dict
        ``best_classifier`` (fitted estimator), ``best_params`` (the winning
        parameter dict as yielded by ``param_grid``), ``best_score`` (its
        validation score), and ``train_scores`` / ``validation_scores`` (dicts
        keyed by ``str(params)``). ``str(best_params)`` is always a valid key
        into both score dicts.

    Raises
    ------
    NotImplementedError
        If ``clf`` is neither a ``LogisticRegression`` nor an ``SVC``.
    """
    if progress_bar:
        progress_bar.start()

    train_scores = {}
    validation_scores = {}
    classifiers = {}
    # Keep each parameter dict next to its string key. Parsing the key back with
    # ast.literal_eval breaks as soon as a value has no literal repr (NumPy >= 2
    # scalars print as ``np.float64(...)``), and a re-parsed dict is not
    # guaranteed to stringify to the same key again.
    params_by_key = {}
    for num_iters, params in enumerate(param_grid, start=1):
        if progress_bar:
            progress_bar.update(num_iters)

        key = str(params)
        params_by_key[key] = params

        # fit the model to training set
        if isinstance(clf, LogisticRegression):
            classifier = LogisticRegression(
                penalty=params.get('penalty', 'l2'), C=params.get('C', 1.0), solver=solver,
                random_state=random_state, class_weight=params.get('class_weight'),
                max_iter=max_iter)
        elif isinstance(clf, SVC):
            svc_kwargs = dict(
                C=params.get('C', 1.0), kernel=params.get('kernel', 'rbf'),
                random_state=random_state, class_weight=params.get('class_weight'),
                max_iter=max_iter)
            # Only forward gamma when the grid sets it. The old fallback value
            # 'auto_deprecated' was the library default in scikit-learn 0.20/0.21
            # and is rejected by later releases; deferring to the installed
            # library's own default works on both.
            if 'gamma' in params:
                svc_kwargs['gamma'] = params['gamma']
            classifier = SVC(**svc_kwargs)
        else:
            raise NotImplementedError(
                'evaluate_classifier supports LogisticRegression and SVC only, got '
                + type(clf).__name__)
        classifier.fit(X_train, y_train, sample_weight=sample_weight)
        classifiers[key] = classifier

        # predict on both sets and evaluate scores
        y_train_predict = classifier.predict(X_train)
        y_validation_predict = classifier.predict(X_validation)
        with warnings.catch_warnings():  # ignore runtime warnings caused by zero MCC
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            train_scores[key] = score_func(y_true=y_train, y_pred=y_train_predict)
            validation_scores[key] = score_func(y_true=y_validation,
                                                y_pred=y_validation_predict)

    if progress_bar:
        progress_bar.finish()

    # find the best scoring model (stable sort: ties go to the earliest parameter set)
    sorted_validation_scores = sorted(
        validation_scores.items(), key=operator.itemgetter(1), reverse=greater_score_is_better)
    best_key, best_score = sorted_validation_scores[0]
    best_params = dict(params_by_key[best_key])  # copy so callers cannot alter the grid's dict
    best_classifier = classifiers[best_key]

    # return results
    results = {'best_classifier': best_classifier, 'best_params': best_params,
               'best_score': best_score, 'train_scores': train_scores,
               'validation_scores': validation_scores}
    return results


def print_best_classifier(results, title=None):
    """Print a short summary of a grid-search result.

    `results` is the dict returned by `evaluate_classifier`; the training
    score is looked up in `results['train_scores']` under the string form of
    the best parameter set. Scores are rounded to three decimals. An optional
    `title` is printed first when it is truthy.
    """
    if title:
        print(title)

    chosen_params = results['best_params']
    print('Best params: ', chosen_params)

    # train_scores is keyed by str(params), not by the dict itself
    training_score = results['train_scores'][str(chosen_params)]
    print('Training score: ', round(training_score, 3))

    validation_score = results['best_score']
    print('Validation score: ', round(validation_score, 3))

It's now time to select the best parameters for the two feature sets with and without the sample weights.

### Logistic Regression

The hyperparameters to be selected for the logistic regression model are:
- The `penalty`, which specifies whether the $L1$ or $L2$ norm is used in the regularization. The former favors sparse solutions and naturally performs feature selection.
- `C`, the inverse of regularization strength. Smaller values specify stronger regularization.
- `class_weight`, the weights associated with the classes. It penalizes mistakes in samples of a class with its associated class_weight. So a higher value indicates more emphasis is put on a class.

Let's perform the grid search now.

In [None]:
# Regularization norm and inverse strength (C = 2^-5 ... 2^15, one value per power of two).
penalty = ['l1', 'l2']
Cs = np.logspace(-5, 15, 21, base=2.0)

# Class-weight candidates: complementary pairs {0: w, 1: 1 - w} for 21 evenly
# spaced w in [0, 1], followed by uniform weights and scikit-learn's 'balanced' mode.
class_weight = [{0: weight, 1: 1.0 - weight} for weight in np.linspace(0.0, 1.0, 21)]
class_weight.append({0: 1.0, 1: 1.0})
class_weight.append('balanced')

param_grid = ParameterGrid({'penalty': penalty, 'C': Cs, 'class_weight': class_weight})

# `clf` only tells evaluate_classifier which estimator type to build;
# `solver` is forwarded to every LogisticRegression it constructs.
clf = LogisticRegression()
solver = 'liblinear'
bar = progress_bar(
    len(param_grid), banner_text_begin='Running: ', banner_text_end=' param sets')

In [None]:
# Logistic regression grid search: original features, no sample weights.
logit_results = evaluate_classifier(
    X_train, y_train, X_validation, y_validation, clf=clf, param_grid=param_grid, solver=solver,
    random_state=0, progress_bar=bar)
print_best_classifier(logit_results, 'Logistic Regression')

In [None]:
# Logistic regression grid search: original features, fitted with the
# covariate-shift sample weights.
logit_results_weights = evaluate_classifier(
    X_train, y_train, X_validation, y_validation, clf=clf, param_grid=param_grid, solver=solver,
    sample_weight=sample_weights['weight'], random_state=1, progress_bar=bar)
print_best_classifier(logit_results_weights, 'Logistic Regression + sample weights')

In [None]:
# Logistic regression grid search: topics features, no sample weights.
logit_results_topics = evaluate_classifier(
    X_train_topics, y_train, X_validation_topics, y_validation, clf=clf, param_grid=param_grid,
    solver=solver, random_state=2, progress_bar=bar)
print_best_classifier(logit_results_topics, 'Logistic Regression (topics)')

In [None]:
# Logistic regression grid search: topics features, fitted with the
# topics-specific covariate-shift sample weights.
logit_results_topics_weights = evaluate_classifier(
    X_train_topics, y_train, X_validation_topics, y_validation, clf=clf, param_grid=param_grid,
    solver=solver, sample_weight=sample_weights_topics['weight'], random_state=3, progress_bar=bar)
print_best_classifier(logit_results_topics_weights, 'Logistic Regression (topics) + sample weights')

We can make the following observations about the results:
- Since none of the models selected uniform class weights, we can see that the choice of this hyperparameter is very important.
- Unsurprisingly, $L1$ regularization is chosen for the original features and $L2$ regularization for the topics features.
- Models fitted with the original features are overfitting, while those fitted with the topics features appear to be underfitting (their validation MCCs are higher than their training MCCs).
- Applying strong regularization does not improve performance for the original features.

### Support Vector Machine (SVM)

The hyperparameters to be selected for the support vector machine model are:
- The regularization parameter `C` of the error term. This parameter trades off correct classification of training examples against maximization of the separating hyperplane's margin. For larger values of `C`, a smaller margin will be accepted if the separating hyperplane is better at classifying training points correctly. Lower values of `C` encourage a larger margin at the cost of misclassifying more training points.
- `class_weight`, as defined above for logistic regression.

Note that the `kernel` parameter, which is used to specify the kernel type to be used in the algorithm, will be fixed as the *linear* kernel. The reason for not considering the *RBF* or *poly* kernels is interpretability of the model. For the *RBF* and *poly* kernels, the [separating hyperplane and the weights that define it exist in a transformed space](https://stackoverflow.com/questions/21260691/how-to-obtain-features-weights) that is not directly related to the input feature space. OK let's perform the grid search now.

In [None]:
# SVM grid: C = 2^-5 ... 2^10 (one value per power of two), linear kernel only,
# and the same class-weight candidates that were defined for logistic regression.
Cs = np.logspace(-5, 10, 16, base=2.0)
param_grid = ParameterGrid({'kernel': ['linear'], 'C': Cs, 'class_weight': class_weight})

# `clf` only tells evaluate_classifier which estimator type to build.
clf = SVC()
# NOTE(review): -1 is SVC's "no iteration limit" setting per the scikit-learn docs.
max_iter = -1
bar = progress_bar(
    len(param_grid), banner_text_begin='Running: ', banner_text_end=' param sets')

In [None]:
# SVM grid search: original features, no sample weights.
svm_results = evaluate_classifier(
    X_train, y_train, X_validation, y_validation, clf=clf, param_grid=param_grid, random_state=4,
    progress_bar=bar, max_iter=max_iter)
print_best_classifier(svm_results, 'SVM')

In [None]:
# SVM grid search: original features, fitted with the covariate-shift sample weights.
svm_results_weights = evaluate_classifier(
    X_train, y_train, X_validation, y_validation, clf=clf, param_grid=param_grid,
    sample_weight=sample_weights['weight'], random_state=5, progress_bar=bar, max_iter=max_iter)
print_best_classifier(svm_results_weights, 'SVM + sample weights')

In [None]:
# SVM grid search: topics features, no sample weights.
svm_results_topics = evaluate_classifier(
    X_train_topics, y_train, X_validation_topics, y_validation, clf=clf, param_grid=param_grid,
    random_state=6, progress_bar=bar, max_iter=max_iter)
print_best_classifier(svm_results_topics, 'SVM (topics)')

In [None]:
# SVM grid search: topics features, fitted with the topics-specific
# covariate-shift sample weights.
svm_results_topics_weights = evaluate_classifier(
    X_train_topics, y_train, X_validation_topics, y_validation, clf=clf, param_grid=param_grid,
    sample_weight=sample_weights_topics['weight'], random_state=7, progress_bar=bar, max_iter=max_iter)
print_best_classifier(svm_results_topics_weights, 'SVM (topics) + weights')

We can make the following observations about the results:
- Once again, since none of the models selected uniform class weights, we can see that the choice of this hyperparameter is very important. Interestingly, the balanced class weight is chosen for the topics features.
- Again we see overfitting and underfitting of the models. However, for the classifiers fitted with the sample weights, the effect is far less pronounced than for the logistic regression models.
- The sample-weighted classifiers seem to favor larger values of `C` (smaller margin), whereas the unweighted ones favor smaller values of `C` (larger margin).