# Text Classification of StackOverflow using  Embeddings Centroids Vectors

In [1]:
# Standard-library helpers used to locate the directory above the notebook.
import sys
import os
from os import pardir, getcwd
from os.path import join, abspath
# Put the parent of the current working directory first on the import path
# (presumably so the project-level `definitions` and `app` modules imported
# below resolve -- confirm against the repository layout).
PARENT_DIRECTORY = abspath(join(getcwd(), pardir))
sys.path.insert(0, PARENT_DIRECTORY)

import warnings
import sklearn.exceptions
import talos as ta
# Silence scikit-learn's UndefinedMetricWarning and every FutureWarning so
# they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Project-local imports; these must come after the sys.path change above.
from definitions import  TALOS_DIR
from app.preprocessing import load_dataset, load_embeddings, preprocess_data
from app.models import load_model, find_best_model_over_scan_logs


Using TensorFlow backend.


## Preprocessing for the loaded Dataset
1. Remove *punctuation* characters
2. Remove *stopwords*
3. Remove *links*
4. Remove *Numbers*
5. Format into *lowercase*

In [2]:
# Load the dataset (the loader is asked to read its pickled copy).
data = load_dataset(load_from_pickle=True)

# Tag frequencies are computed once and reused for the label list and the
# printed class distribution.
tag_counts = data['tags'].value_counts()
Classes = list(tag_counts.index)
Nclasses = len(Classes)

# Sanity check: class distribution followed by the first post.
print(tag_counts)
print(data['post'][0])

sql              2000
ruby-on-rails    2000
android          2000
angularjs        2000
asp.net          2000
c                2000
c#               2000
c++              2000
css              2000
html             2000
ios              2000
iphone           2000
java             2000
javascript       2000
jquery           2000
mysql            2000
objective-c      2000
php              2000
python           2000
.net             2000
Name: tags, dtype: int64
causing behavior c# datetime type <pre><code>[test] public void sadness() { var datetime = datetime.utcnow; assert.that(datetime is.equalto(datetime.parse(datetime.tostring()))); } </code></pre> failed <pre><code> expected: - - : : . was: - - : : . </code></pre> wish know happening behind scenes tostring() etc cause behavior. edit seeing jon answer <pre><code>[test] public void newsadness() { var datetime = datetime.utcnow; assert.that(datetime is.equalto(datetime.parse(datetime.tostring( )))); } </code></pre> result <pre><code>e

## MLP classifier in Keras using not standardized **embeddings-centroids** features

### Hyperparameter tuning for the model

In [3]:
# Splits: 70% train / 30% test, then the train part is split again into
# 70% train / 30% train-dev.
embeddings = load_embeddings(data, 'post', minimized=False)

# Keyword options for the centroid representation of each post.
centroid_options = {
    'input_ins': 'as_centroids',
    'cv_split_full': 0.3,
    'cv_split_dev': 0.3,
    'embeddings': embeddings,
}
model_data_ftc = preprocess_data(data, 'tags', 'post', **centroid_options)


In [4]:
# Name of the scan log; it is joined with TALOS_DIR and handed to ta.Scan
# as `dataset_name`.
TALOS_FTC_LOG_FILENAME = 'talos_ftc_log_test'
# TALOS_FTC_LOG_FILENAME = 'talos_ftc_log'
talos_ftc_log_pathname = os.path.join(TALOS_DIR, TALOS_FTC_LOG_FILENAME)

# Production configuration: every key lists the candidate values that the
# scan may combine.
ftc_model_config = {
    'visualize_proccess': [False],
    'first_neuron': [128, 256],  # width of the first layer
    'activation': ['relu', 'tanh'],
    'dropout': [0.2, 0.4, 0.6],
    'number_of_hidden_layers': [2],
    'shapes': ['funnel'],
    'epochs': [5, 10, 15],
    'batch_size': [32, 64],
    'model_type': ['keras_embeddings_centroids_model'],
}

# Fit on the train split and validate on the train-dev split.
# grid_downsample=0.1 requests only a fraction of the full grid
# (see the Talos Scan documentation).
history_model_ftc = ta.Scan(
    model_data_ftc['x_train'],
    model_data_ftc['y_train'],
    x_val=model_data_ftc['x_train_dev'],
    y_val=model_data_ftc['y_train_dev'],
    model=load_model,
    params=ftc_model_config,
    grid_downsample=0.1,
    print_params=True,
    seed=123,
    dataset_name=talos_ftc_log_pathname,
)


  0%|          | 0/1 [00:00<?, ?it/s]

{'visualize_proccess': True, 'first_neuron': 256, 'activation': 'relu', 'dropout': 0.5, 'number_of_hidden_layers': 2, 'epochs': 10, 'batch_size': 32, 'model_type': 'keras_embeddings_centroids_model'}
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


KeyboardInterrupt: 

    Finds the best model configuration set for the embeddings-centroids model, after the Talos scan.

In [None]:
# Wrap the scan results and select the row with the highest validation F1.
report_ftc = ta.Reporting(history_model_ftc)
scan_results_ftc = report_ftc.data
best_model_idx = scan_results_ftc['val_f1'].idxmax()

# The winning row as a plain dict of parameter/metric name -> value.
best_model_params = scan_results_ftc.loc[best_model_idx].to_dict()
best_model_params


    Trains and returns the embeddings-centroids model with the best configuration set.

## MLP classifier in Keras using standardized **embeddings-centroids** features

In [None]:
# Train the model with the best configuration found by the scan:
# fit on the train split, validate on the train-dev split.
ftc_fit_inputs = [model_data_ftc[split_name]
                  for split_name in ('x_train', 'y_train',
                                     'x_train_dev', 'y_train_dev')]
model_ftc_history, model_ftc = load_model(*ftc_fit_inputs, best_model_params)

### Visualize Model History Scores

In [None]:
from app.visualization import plot_history_metrics
import matplotlib.pylab as plt

%matplotlib inline
plot_history_metrics(history_obj=model_ftc_history)

### Evaluate performance model

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
# Score the trained model on the held-out test split, using the batch size
# of the best configuration.
scoref_ftc = model_ftc.evaluate(
    model_data_ftc['x_test'],
    model_data_ftc['y_test'],
    batch_size=best_model_params['batch_size'],
    verbose=1,
)

# NOTE(review): indices 1 and 2 assume the metrics come back ordered as
# [loss, f1, accuracy] -- confirm against the model's compile() call.
print('\nTest f1: %.4f' % scoref_ftc[1])
print('\nTest accuracy: %.4f' % scoref_ftc[2])

### Visualize Prediction Performance of the model

In [None]:
import numpy as np
from app.visualization import (plot_prediction_metrics,
                               create_clf_report,
                               plot_roc_curve,
                               plot_precision_recall_curve,
                               plot_confusion_matrix)
import matplotlib.pylab as plt

# Per-class scores predicted for every sample of the test split.
prediction_val_ftc = model_ftc.predict(model_data_ftc['x_test'],
                                       batch_size=best_model_params['batch_size'])

# Reduce each row of scores / one-hot labels to the index of its largest entry,
# i.e. the predicted and the true class index.
# (Previously this read the undefined names `prediction_val` and `model_data_tf`.)
y_pred_processed_ftc = np.argmax(prediction_val_ftc, axis=1)
y_true_processed_ftc = np.argmax(model_data_ftc['y_test'], axis=1)

# To see the one-vs-all ROC curve of each class, uncomment the line below.
# plot_roc_curve(model_data_ftc['y_test'], prediction_val_ftc, Classes, 1)

# To see the one-vs-all precision-recall curve of each class, uncomment the line below.
# plot_precision_recall_curve(model_data_ftc['y_test'], prediction_val_ftc, Classes , 1)

# Classification report: compares true and predicted class indices, matching
# how the standardized-model cell of this notebook calls the same helper.
create_clf_report(y_true_processed_ftc, y_pred_processed_ftc, Classes)

# Confusion matrix over the same true / predicted class indices.
plot_confusion_matrix(y_true_processed_ftc, y_pred_processed_ftc, Classes)

    Loads the dataset using centroid embeddings, but now every word vector has been standardized.

In [None]:
from app.models import load_model

# NOTE(review): this call is argument-for-argument identical to the one that
# built `model_data_ftc`; nothing visible here requests standardization --
# confirm how preprocess_data is expected to standardize the word vectors.
model_data_sdr_ftc = preprocess_data(
    data, 'tags', 'post',
    input_ins='as_centroids',
    cv_split_full=0.3,
    cv_split_dev=0.3,
    embeddings=embeddings,
)

# Scan the same hyper-parameter space (`ftc_model_config`), this time with
# grid_downsample=1 and without a dataset_name.
history_model_sdr_ftc = ta.Scan(
    model_data_sdr_ftc['x_train'],
    model_data_sdr_ftc['y_train'],
    x_val=model_data_sdr_ftc['x_train_dev'],
    y_val=model_data_sdr_ftc['y_train_dev'],
    model=load_model,
    params=ftc_model_config,
    grid_downsample=1,
    print_params=True,
    seed=123,
)

    Finds the best model for the Standardized FastText Embeddings Centroids.

In [None]:
# Wrap the results of the standardized scan.
report_sdr_ftc = ta.Reporting(history_model_sdr_ftc)
scan_results_sdr_ftc = report_sdr_ftc.data

# Best configuration = the row with the highest validation F1, as a dict.
best_model_sdr_ftc_idx = scan_results_sdr_ftc['val_f1'].idxmax()
best_model_params_sdr_ftc = scan_results_sdr_ftc.loc[best_model_sdr_ftc_idx].to_dict()
best_model_params_sdr_ftc

    Trains and fits the MLP network on the standardized data using the best selected configuration.

In [None]:
# Fit on the train split and validate on the train-dev split of the
# standardized data, using the best configuration from the scan.
sdr_ftc_fit_inputs = [model_data_sdr_ftc[split_name]
                      for split_name in ('x_train', 'y_train',
                                         'x_train_dev', 'y_train_dev')]
model_sdr_ftc_history, model_sdr_ftc = load_model(*sdr_ftc_fit_inputs,
                                                  best_model_params_sdr_ftc)

 ### Visualize Model History Scores


In [None]:
from app.visualization import plot_history_metrics
%matplotlib inline
plot_history_metrics(model_sdr_ftc_history)

### Evaluate performance model

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
# Score the standardized model on its held-out test split, using the batch
# size of its best configuration.
score_sdr_ftc = model_sdr_ftc.evaluate(
    model_data_sdr_ftc['x_test'],
    model_data_sdr_ftc['y_test'],
    batch_size=best_model_params_sdr_ftc['batch_size'],
    verbose=1,
)

# NOTE(review): indices 1 and 2 assume the metrics come back ordered as
# [loss, f1, accuracy] -- confirm against the model's compile() call.
print('\nTest f1: %.4f' % score_sdr_ftc[1])
print('\nTest accuracy: %.4f' % score_sdr_ftc[2])

### Visualize Prediction Performance of standardized centroids MLP model

In [None]:
import numpy as np
from app.visualization import (plot_prediction_metrics,
                               create_clf_report,
                               plot_roc_curve,
                               plot_precision_recall_curve,
                               plot_confusion_matrix)
import matplotlib.pylab as plt

# Predict on the test split of the *standardized* data with the batch size of
# the standardized model's best configuration.
# (Previously this used `model_data_ftc['x_test']`, so predictions and ground
# truth came from different datasets, and the undefined name
# `best_model_params_ftc`.)
prediction_val_sdr_ftc = model_sdr_ftc.predict(
    model_data_sdr_ftc['x_test'],
    batch_size=best_model_params_sdr_ftc['batch_size'])

# Reduce each row of scores / one-hot labels to the index of its largest entry,
# i.e. the predicted and the true class index.
y_pred_processed_sdr_ftc = np.argmax(prediction_val_sdr_ftc, axis=1)
y_true_processed_sdr_ftc = np.argmax(model_data_sdr_ftc['y_test'], axis=1)

# Classification report over true vs. predicted class indices.
create_clf_report(y_true_processed_sdr_ftc, y_pred_processed_sdr_ftc, Classes)

# Confusion matrix over the same true / predicted class indices.
plot_confusion_matrix(y_true_processed_sdr_ftc, y_pred_processed_sdr_ftc, Classes)