In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")

np.random.seed(2)

Loading data and exploring features:

In [None]:
train_data = pd.read_csv('./stream_quality_data/train_data.csv')
test_data = pd.read_csv('./stream_quality_data/test_data.csv')

In [None]:
report = ProfileReport(train_data)
#report.to_file('./clf_report.html')

In [None]:
X_train, y_train = train_data.drop('stream_quality', axis=1), train_data['stream_quality']
X_test, y_test = test_data.drop('stream_quality', axis=1), test_data['stream_quality']

In [None]:
train_data.head()

## Feature preprocessing

There are categorical features that have to be encoded in order to use it in model

In [None]:
cat_columns = train_data.columns[train_data.dtypes == 'object']
print('Categorical features: ', cat_columns.tolist())

In [None]:
from category_encoders import OneHotEncoder, OrdinalEncoder

# mapping categories to labels
mapping = [{'col':'auto_bitrate_state', 'mapping':{'off':0, 'partial':1, 'full':2}},
          {'col':'auto_fec_state', 'mapping':{'off':0, 'partial':1}}]

encoder = OrdinalEncoder(mapping=mapping)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

## Feature selection

In [None]:
import seaborn as sns

sns.heatmap(X_train.assign(target=y_train).corr())

There is high positive correlation with fps_lags feature and negative correlation with fps_mean feature.

Automatic selection based on mutual information between features and target variable:

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

selector = SelectKBest(mutual_info_classif, k=8)

selector.fit(X_train_scaled, y_train)
selected_feats = X_train.columns[selector.get_support()]
print('Selected features with highest mutual inf: ', selected_feats.tolist())

Selection based on importance of feature on performance of model:

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE 

rfecv = RFECV(
    estimator=LogisticRegression(),
    step=1,
    cv=5,
    min_features_to_select=2
)
rfecv.fit(X_train_scaled, y_train)
imp_cols = X_train.columns[rfecv.support_]
print('Selected_features with rfecv: ',  imp_cols.tolist())

## Model

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

def get_clf_metrics(y_true, y_pred):
    """
    Calculate classification metrics
    :param y_true: ground truth labels
    :param y_pred: predicted labels
    :return: calculated metrics
    """
    metrics = {'accuracy':accuracy_score(y_true, y_pred), 
              'precision':precision_score(y_true, y_pred),
              'recall':recall_score(y_true, y_pred),
              'f1':f1_score(y_true, y_pred)}
    
    return pd.Series(metrics)

In [None]:
def fit_predict(model, X_train, y_train, X_test):
    """
    Function to fit model and get predictions
    :param model: model to use
    :param X_train: train data
    :param y_train: target
    :param X_test: test data
    :return: predicted labels for train and test data
    """
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)    
    return y_pred_train, y_pred_test

def print_metrics(y_train, y_pred_train, y_test, y_pred_test):
    """
    Function to get train and test metrics
    :param y_train: train labels
    :param y_pred_train: predicted labels
    :param y_test: test labels
    :param y_pred_test: predicted test labels
    :return: DataFrame with test and train metrics
    """
    train_metrics = get_clf_metrics(y_train, y_pred_train)
    test_metrics = get_clf_metrics(y_test, y_pred_test)
    return pd.concat([train_metrics, test_metrics], axis=1, keys=['train', 'test'])

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

def get_clf_metrics(y_true, y_pred):
    """
    Calculate classification metrics
    :param y_true: ground truth labels
    :param y_pred: predicted labels
    :return: calculated metrics
    """
    metrics = {'accuracy':accuracy_score(y_true, y_pred), 
              'precision':precision_score(y_true, y_pred),
              'recall':recall_score(y_true, y_pred),
              'f1':f1_score(y_true, y_pred)}
    
    return pd.Series(metrics)

Training simple logistic regression model:

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

logreg = Pipeline([
    ('preprocess', StandardScaler()), 
    ('model', LogisticRegression(penalty='none'))
])

y_pred_train, y_pred_test = fit_predict(logreg, X_train[imp_cols], y_train, X_test[imp_cols])
print_metrics(y_train, y_pred_train, y_test, y_pred_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

logreg_l2 = Pipeline([
    ('preprocess', StandardScaler()), 
    ('model', LogisticRegression())
])

search_tool = RandomizedSearchCV(logreg_l2, 
                                 {'model__C':loguniform(1e-5, 1)}, 
                                 scoring='f1', 
                                 n_jobs=-1)

search_tool.fit(X_train[imp_cols], y_train)
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

logreg_l2['model'].C = search_tool.best_params_['model__C']
    
y_pred_train, y_pred_test = fit_predict(logreg_l2, X_train[imp_cols], y_train, X_test[imp_cols])
print_metrics(y_train, y_pred_train, y_test, y_pred_test)

In [None]:
logreg_l1 = Pipeline([
    ('preprocess', StandardScaler()), 
    ('model', LogisticRegression(max_iter=200, penalty="l1", solver="saga"))
])

search_tool = RandomizedSearchCV(logreg_l1, 
                                 {'model__C':loguniform(1e-5, 1)},
                                 scoring='f1', 
                                 n_jobs=-1,
                                 n_iter=10)
search_tool.fit(X_train[imp_cols], y_train)
logreg_l1['model'].C = search_tool.best_params_['model__C']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_l1, X_train[imp_cols], y_train, X_test[imp_cols])
print_metrics(y_train, y_pred_train, y_test, y_pred_test)

In [None]:
logreg_poly = Pipeline([
    ('poly', PolynomialFeatures(degree=3, interaction_only=True)),
    ('preprocess', StandardScaler()), 
    ('model', LogisticRegression(max_iter=200, penalty="l2"))
])

search_tool = RandomizedSearchCV(logreg_poly, 
                                 {'model__C':loguniform(1e-5, 10)},
                                 scoring='f1', 
                                 n_jobs=-1,
                                 n_iter=10)
search_tool.fit(X_train[imp_cols], y_train)
logreg_poly['model'].C = search_tool.best_params_['model__C']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_poly, X_train[imp_cols], y_train, X_test[imp_cols])
print_metrics(y_train, y_pred_train, y_test, y_pred_test)

## Outlier detection

In [None]:
iqr = X_train[imp_cols].quantile(0.75) - X_train[imp_cols].quantile(0.25)
up_bound = X_train[imp_cols].quantile(0.75) + iqr * 1.5
low_bound = X_train[imp_cols].quantile(0.25) - iqr * 1.5
outliers_mask = ((X_train[imp_cols] > up_bound) | (X_train[imp_cols] < low_bound)).sum(axis=1) > 0
X_train_clean = X_train[~outliers_mask][imp_cols].copy()
y_train_clean = y_train[~outliers_mask].copy()

print('Percentage of outliers: ', 1 - X_train_clean.shape[0] / X_train.shape[0])

In [None]:
search_tool.fit(X_train_clean, y_train_clean)
logreg_poly['model'].C = search_tool.best_params_['model__C']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_poly, X_train_clean, y_train_clean, X_test[imp_cols])
print_metrics(y_train_clean, y_pred_train, y_test, y_pred_test)

Detecting outliers with interquantile range is not suitable in this case. There are many informative points dropped leading to poor performance. Perhaps the distribution of variables is too wide

In [None]:
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
outliers_mask = yhat == -1
X_train_clean = X_train[~outliers_mask][imp_cols]
y_train_clean = y_train[~outliers_mask]

vals, counts = np.unique(yhat, return_counts=True)
print('Percentage of outliers: ', counts[0] / train_data.shape[0])

In [None]:
search_tool.fit(X_train_clean, y_train_clean)
logreg_poly['model'].C = search_tool.best_params_['model__C']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_poly, X_train_clean, y_train_clean, X_test[imp_cols])
print_metrics(y_train_clean, y_pred_train, y_test, y_pred_test)

The metrics show improvement

## Class imbalance

The target labels data are highly imbalanced:

In [None]:
print('Class distribution: ')
print(f'Class 1: {(y_train == 1).sum() / (y_train).count():f}')
print(f'Class 0: {(y_train == 0).sum() / (y_train).count():f}')

Perform balancing of data by adding random observations from minority class

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

oversample_tool = RandomOverSampler(sampling_strategy=0.35)
X_over, y_over = oversample_tool.fit_resample(X_train_clean, y_train_clean)

In [None]:
print('Class distribution after oversampling:')
print(f'Class 1: {(y_over == 1).sum() / (y_over).count():f}')
print(f'Class 0: {(y_over == 0).sum() / (y_over).count():f}')

Evaluate performance on new data

In [None]:
search_tool.fit(X_over, y_over)
logreg_poly['model'].C = search_tool.best_params_['model__C']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_poly, X_over, y_over, X_test[imp_cols])
print_metrics(y_over, y_pred_train, y_test, y_pred_test)

In [None]:
from imblearn.pipeline import Pipeline as PipelineIm

logreg_over = PipelineIm([
    ('over', RandomOverSampler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('preprocess', StandardScaler()),
    ('model', LogisticRegression(max_iter=200, penalty="l2"))
])

search_tool = RandomizedSearchCV(logreg_over, {'model__C':uniform(loc=1e-8, scale=1), 
                                               'over__sampling_strategy':uniform(loc=0.1, scale=1)},
                                 scoring='f1')
search_tool.fit(X_train_clean, y_train_clean)

logreg_over['model'].C = search_tool.best_params_['model__C']
logreg_over['over'].sampling_strategy = search_tool.best_params_['over__sampling_strategy']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print("Best parameter for sampling: ", search_tool.best_params_['over__sampling_strategy'])
print("Best  cross validation f1 score: ",  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_over, X_train_clean, y_train_clean, X_test[imp_cols])
print_metrics(y_train_clean, y_pred_train, y_test, y_pred_test)

The performance of the model has been improved

Handling imbalanced data by randomly undersampling the majority class:

In [None]:
logreg_under = PipelineIm([
    ('over', RandomOverSampler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('preprocess', StandardScaler()),
    ('model', LogisticRegression(max_iter=200, penalty="l2"))
])

search_tool = RandomizedSearchCV(logreg_under, {'model__C':uniform(loc=1e-8, scale=1), 
                                               'over__sampling_strategy':uniform(loc=0.1, scale=1)},
                                 scoring='f1')
search_tool.fit(X_train_clean, y_train_clean)

logreg_under['model'].C = search_tool.best_params_['model__C']
logreg_under['over'].sampling_strategy = search_tool.best_params_['over__sampling_strategy']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print("Best parameter for sampling: ", search_tool.best_params_['over__sampling_strategy'])
print("Best  cross validation f1 score: ",  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_under, X_train_clean, y_train_clean, X_test[imp_cols])
print_metrics(y_train_clean, y_pred_train, y_test, y_pred_test)

In [63]:
print('Class distribution: ')
print(f'Class 1: {(y_train == 1).sum() / (y_train).count():f}')
print(f'Class 0: {(y_train == 0).sum() / (y_train).count():f}')

Class distribution: 
Class 1: 0.068460
Class 0: 0.931540


Perform balancing of data by adding random observations from minority class

In [60]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

oversample_tool = RandomOverSampler(sampling_strategy=0.35)
X_over, y_over = oversample_tool.fit_resample(X_train_clean, y_train_clean)

In [62]:
print('Class distribution after oversampling:')
print(f'Class 1: {(y_over == 1).sum() / (y_over).count():f}')
print(f'Class 0: {(y_over == 0).sum() / (y_over).count():f}')

Class distribution after oversampling:
Class 1: 0.259259
Class 0: 0.740741


Evaluate performance on new data

In [64]:
search_tool.fit(X_over, y_over)
logreg_poly['model'].C = search_tool.best_params_['model__C']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print('Best  cross validation f1 score: ',  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_poly, X_over, y_over, X_test[imp_cols])
print_metrics(y_over, y_pred_train, y_test, y_pred_test)

Best parameter for C:  2.35506005866699
Best  cross validation f1 score:  0.5760705664612442


Unnamed: 0,train,test
accuracy,0.82664,0.919715
precision,0.784687,0.384293
recall,0.456616,0.408755
f1,0.577298,0.396147


In [72]:
from imblearn.pipeline import Pipeline as PipelineIm

logreg_over = PipelineIm([
    ('over', RandomOverSampler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('preprocess', StandardScaler()),
    ('model', LogisticRegression(max_iter=200, penalty="l2"))
])

search_tool = RandomizedSearchCV(logreg_over, {'model__C':uniform(loc=1e-8, scale=1), 
                                               'over__sampling_strategy':uniform(loc=0.1, scale=1)},
                                 scoring='f1')
search_tool.fit(X_train_clean, y_train_clean)

logreg_over['model'].C = search_tool.best_params_['model__C']
logreg_over['over'].sampling_strategy = search_tool.best_params_['over__sampling_strategy']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print("Best parameter for sampling: ", search_tool.best_params_['over__sampling_strategy'])
print("Best  cross validation f1 score: ",  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_over, X_train_clean, y_train_clean, X_test[imp_cols])
print_metrics(y_train_clean, y_pred_train, y_test, y_pred_test)

Best parameter for C:  0.009342710156912187
Best parameter for sampling:  0.3543567312102637
Best  cross validation f1 score:  0.4394981800847999


Unnamed: 0,train,test
accuracy,0.924645,0.920816
precision,0.430719,0.39108
recall,0.46065,0.41124
f1,0.445182,0.400907


The performance of the model has been improved

Handling imbalanced data by randomly undersampling the majority class:

In [73]:
logreg_under = PipelineIm([
    ('over', RandomOverSampler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('preprocess', StandardScaler()),
    ('model', LogisticRegression(max_iter=200, penalty="l2"))
])

search_tool = RandomizedSearchCV(logreg_under, {'model__C':uniform(loc=1e-8, scale=1), 
                                               'over__sampling_strategy':uniform(loc=0.1, scale=1)},
                                 scoring='f1')
search_tool.fit(X_train_clean, y_train_clean)

logreg_under['model'].C = search_tool.best_params_['model__C']
logreg_under['over'].sampling_strategy = search_tool.best_params_['over__sampling_strategy']
print("Best parameter for C: ", search_tool.best_params_['model__C'])
print("Best parameter for sampling: ", search_tool.best_params_['over__sampling_strategy'])
print("Best  cross validation f1 score: ",  search_tool.best_score_)

y_pred_train, y_pred_test = fit_predict(logreg_under, X_train_clean, y_train_clean, X_test[imp_cols])
print_metrics(y_train_clean, y_pred_train, y_test, y_pred_test)

Best parameter for C:  0.6800031692422369
Best parameter for sampling:  0.3809611180929845
Best  cross validation f1 score:  0.4387663215992485


Unnamed: 0,train,test
accuracy,0.92011,0.916148
precision,0.409326,0.372177
recall,0.49046,0.438958
f1,0.446235,0.402818
