### Sentence Classification Model Training
- This notebook explains how to train a sentence classification model using machine learning algorithms.
- We use the **Universal Sentence Encoder** pre-trained language model from **TensorFlow Hub** to obtain text representations for our text corpus.

**Import Libraries**

In [None]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

print(tf.__version__)

In [None]:
# read training data from drive
# Both workbooks are read the same way: the sheet at position 1 (sheet_name is
# zero-based, so this is the second sheet), keeping only the 'News' and
# 'Sub-Categories' columns.
ds1 = pd.read_excel('../../input/data_Categorization_set1.xlsx', sheet_name = 1, usecols = ['News', 'Sub-Categories'])
ds2 = pd.read_excel('../../input/data_Categorization_set2.xlsx', sheet_name = 1, usecols = ['News', 'Sub-Categories'])

# A row is only usable when it has both a label and a sentence. Dropping rows
# with a missing 'News' value matters: drop_duplicates would otherwise keep one
# NaN row, and that NaN (a float) would be handed to the sentence encoder.
ds1 = ds1.dropna(subset = ['News', 'Sub-Categories'])
ds2 = ds2.dropna(subset = ['News', 'Sub-Categories'])

# Build the combined frame without in-place mutation, and give it a clean
# 0..n-1 index so its row labels are unique and match row positions.
ds = (
    pd.concat([ds1, ds2], axis = 0)
    .drop_duplicates('News', keep = 'first')
    .reset_index(drop = True)
)
list_sentences = ds['News'].tolist()
print(len(list_sentences))

In [None]:
# Fetch the pre-trained Universal Sentence Encoder (large, version 5) from TF Hub
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(use_model_url)

In [None]:
# helper function
def embed_text(text, batch_size = None):
    """Encode sentences with the TF Hub model bound to the global ``embed``.

    Parameters
    ----------
    text : list of str
        Sentences to encode.
    batch_size : int or None, optional
        Number of sentences sent to the model per call. ``None`` (the default)
        encodes everything in a single call, which is the original behaviour;
        pass a positive integer to bound memory use on a large corpus.

    Returns
    -------
    list of list of float
        One embedding vector per input sentence, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is given but is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError('batch_size must be a positive integer or None')

    total = len(text)
    # Nothing to encode: skip the model call entirely.
    if total == 0:
        return []

    step = total if batch_size is None else batch_size
    vectors = []
    for start in range(0, total, step):
        batch = text[start:start + step]
        # Convert the returned tensor to plain nested Python lists in one go.
        vectors.extend(embed(batch).numpy().tolist())
    return vectors

In [None]:
# Encode every training sentence into its embedding vector
list_of_sentence_vectors = embed_text(text = list_sentences)

In [None]:
# convert list of sentence vectors into a dataframe
# Build the frame in a single call: one row per sentence, one column per
# embedding dimension. Growing a frame with DataFrame.append inside a loop is
# quadratic, labels every row with index 0, and DataFrame.append no longer
# exists in pandas 2.x.
embeddings_df = pd.DataFrame(list_of_sentence_vectors)

In [None]:
# Persist the sentence embeddings to a CSV file (row index is not written)
embeddings_csv_path = '../../output/sentence_embeddings_tfh.csv'
embeddings_df.to_csv(embeddings_csv_path, index = False)

**Model Training**

In [None]:
# import libraries 
from pprint import pprint
import logging
from time import time
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import seaborn as sns 
import matplotlib.pyplot as plt
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and only fall back to the vendored
# copy on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

print(sklearn.__version__)

In [None]:
# read the embeddings that were saved to disk
# Use the same location the embeddings were written to earlier in the notebook;
# the previous `data_path` prefix was never defined, so this cell raised a
# NameError on a fresh kernel.
embeddings_df = pd.read_csv('../../output/sentence_embeddings_tfh.csv')
embeddings_df.shape

In [None]:
# specify Xs and Ys
X = embeddings_df
# Rows of X and y are paired purely by position. Resetting the label index
# gives y unique 0..n-1 labels, so any later label-based alignment with X
# cannot silently mismatch.
y = ds['Sub-Categories'].reset_index(drop = True)

# Fail early if the embeddings loaded above do not cover the labelled sentences.
assert len(X) == len(y), f'features ({len(X)}) and labels ({len(y)}) differ in length'

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size = 0.3, random_state = 999)
train_x, test_x, train_y, test_y = split_parts
print(f'Training data size: {train_x.shape}')
print(f'Testing data size: {test_x.shape}')

In [None]:
# create pipeline for various models to train
# Seed the estimators that draw random numbers so repeated runs give the same
# scores; 999 is the value the train/test split already uses.
model_seed = 999

logit_ppl = Pipeline([('logit', LogisticRegression(multi_class='ovr', solver='liblinear', random_state = model_seed))])
rf_ppl = Pipeline([('rf', RandomForestClassifier(n_estimators = 50, random_state = model_seed))])
xgb_ppl = Pipeline([('xgb', xgb.XGBClassifier(objective = 'multi:softmax'))])
svm_ppl = Pipeline([('svm', LinearSVC(multi_class = 'ovr', random_state = model_seed))])

# parameters for Gridsearch
# Keys follow the '<step name>__<parameter>' convention so GridSearchCV can
# route each value to the matching pipeline step.
param_logitGv = {'logit__max_iter':[100, 500, 1000]}
param_rfGv = {'rf__min_samples_split': [10, 20, 50]}
param_svmGv = {'svm__max_iter': [100, 500, 1000]}
param_xgbGv = {'xgb__learning_rate': [.01, .05], 'xgb__n_estimators': [10, 50]}

# one 5-fold cross-validated grid search per model
logitGv = GridSearchCV(logit_ppl, param_logitGv, cv = 5)
rfGv = GridSearchCV(rf_ppl, param_rfGv, cv = 5)
svmGv = GridSearchCV(svm_ppl, param_svmGv, cv = 5)
xgbGv = GridSearchCV(xgb_ppl, param_xgbGv, cv = 5)

In [None]:
# train models 
def run_grid_search(pipeline, param_grid, grid_search, features, labels):
    """Fit one grid search and print a report of the run.

    Parameters
    ----------
    pipeline : sklearn Pipeline whose step names are listed in the report.
    param_grid : dict of the parameter grid being searched (printed only).
    grid_search : GridSearchCV object to fit.
    features, labels : training data passed to ``grid_search.fit``.

    Returns
    -------
    dict
        ``get_params()`` of the best estimator found by the search.
    """
    print('Performing Grid Search ...')
    print('Pipeline:', [name for name, _ in pipeline.steps])
    print('Parameters:')
    pprint(param_grid)
    # Time each search on its own clock. The copy-pasted version assigned the
    # start time to `to` instead of `t0` for the later models, so their
    # reported durations were measured from the start of the first search.
    started_at = time()
    grid_search.fit(features, labels)
    print('Done in %0.3fs' % (time() - started_at))
    print()

    print('Best score %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_params = grid_search.best_estimator_.get_params()
    for param_name in sorted(best_params.keys()):
        print('\t%s:%r' % (param_name, best_params[param_name]))
    return best_params


# (pipeline, parameter grid, grid search) for each model, in reporting order
grid_search_runs = [
    (logit_ppl, param_logitGv, logitGv),
    (rf_ppl, param_rfGv, rfGv),
    (svm_ppl, param_svmGv, svmGv),
    (xgb_ppl, param_xgbGv, xgbGv),
]

for run_number, (run_pipeline, run_params, run_search) in enumerate(grid_search_runs):
    # separator line between consecutive reports (not before the first one)
    if run_number > 0:
        print('-----' * 30)
    best_parameters = run_grid_search(run_pipeline, run_params, run_search, train_x, train_y)

In [None]:
# Fit the final LinearSVC with hard-coded settings (noted by the author as the
# best parameters from the grid search), then predict on the held-out test set
svm_settings = dict(
    C = 1.0, class_weight = None, dual = True, fit_intercept = True,
    intercept_scaling = 1, loss = 'squared_hinge', max_iter = 100,
    multi_class = 'ovr', penalty = 'l2', random_state = None, tol = 0.0001,
    verbose = 0,
)
svm_classifier = LinearSVC(**svm_settings)
svm_model = svm_classifier.fit(train_x, train_y)
print(svm_model)
print('----' * 10)
preds = svm_model.predict(test_x)

In [None]:
# plot confusion matrix
# The label order is fixed explicitly so that the matrix rows/columns and the
# tick labels line up.
target_labels = train_y.drop_duplicates().values
conf_matrix = confusion_matrix(test_y, preds, labels = target_labels)

fig, ax = plt.subplots(figsize = (8, 8))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(conf_matrix, annot = True, fmt = 'd', xticklabels = target_labels, yticklabels = target_labels, ax = ax)
# confusion_matrix puts the true classes on rows and the predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (test set)')
plt.show()

**Save & Load Models**

In [None]:
# Persist the fitted classifier, then load it back from the same file
svm_model_path = '../../input/sent_classifier_model/svm_model_wt_use.pkl'
joblib.dump(svm_model, svm_model_path) # save model to disc
svm_clf_model = joblib.load(svm_model_path) # read model from disc

### Sentence Classification Model Prediction

**Prepare Data**

In [None]:
# Load the dataset of sentences produced by the ABSA models
sample_data_path = '../../input/sample_data_.csv'
sample_data = pd.read_csv(sample_data_path, encoding = 'latin-1')

# distinct non-null doc_id values = number of articles; one row per sentence
article_count = sample_data['doc_id'].nunique()
sentence_count = len(sample_data)
print(f'Number of Articles: {article_count}')
print(f'Number of Sentences: {sentence_count}')

In [None]:
# Collect the sentences as plain strings and encode them
sentence_column = sample_data['sents'].astype('str')
sentence_list = sentence_column.tolist()
text_vectors = embed_text(sentence_list)
len(text_vectors)

**Text Embeddings**

In [None]:
# Generate text representations 
# Build the frame in a single call: one row per sentence, one column per
# embedding dimension. Appending row by row is quadratic, labels every row
# with index 0, and DataFrame.append no longer exists in pandas 2.x.
random_embeddings_df = pd.DataFrame(text_vectors)

**Prediction**

In [None]:
# predict & write results into a text file
random_preds = svm_clf_model.predict(random_embeddings_df)

# The file holds one "<sentence>\t<category>" line per sentence. A tab or line
# break inside a sentence would add an extra column or row, so those
# characters are replaced with spaces before writing.
field_breakers = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})

# Open the file once, in write mode: appending meant that re-running this cell
# stacked new rows underneath stale ones from earlier runs. The encoding is set
# explicitly so the result does not depend on the platform default.
with open('../../output/results_prediction_for_reports.txt', 'w', encoding = 'utf-8') as outfile:
    for sentence, pred in zip(sentence_list, random_preds):
        outfile.write(f'{sentence.translate(field_breakers)}\t{pred}\n')

In [None]:
# merge model predictions with main dataset
# Only the 'category' column is kept from the results file. Quote handling is
# switched off (quoting = 3 is csv.QUOTE_NONE) because the sentences are free
# text: with the default setting, a sentence starting with a double quote is
# treated as a quoted field that can run across following lines, which shifts
# every later row.
results_df = pd.read_table(
    '../../output/results_prediction_for_reports.txt',
    sep = '\t',
    header = None,
    names = ['sentence', 'category'],
    usecols = ['category'],
    quoting = 3,
)

# Rows are matched by index label, i.e. line k of the results file is attached
# to the row of sample_data labelled k.
sample_data = pd.merge(sample_data[['d_pol', 'doc_id', 'company_name', 'sents', 's_pol']], results_df, how = 'left', left_index=True, right_index=True)