In [43]:
from transformers import BertTokenizer
import tensorflow as tf
from keras.utils import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, top_k_accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from PIL import Image, ImageFont, ImageDraw
import numpy as np
from xsvmlib.xsvmc import xSVMC
from joblib import dump, load

In [4]:
RANDOM_STATE = 42

In [5]:
df_news = pd.read_csv("./data/df_total.csv")

In [6]:
sentences = df_news.news.values

In [7]:
df_news.Type.unique()

array(['Otra', 'Regulaciones', 'Alianzas', 'Macroeconomia', 'Innovacion',
       'Sostenibilidad', 'Reputacion'], dtype=object)

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

In [11]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  4433


In [12]:
token_ids = []
encoded_inputs = []
embed_vectors = []
token_to_text = []
for sentence in sentences:
  encoded_input = tokenizer(sentence, return_tensors='tf')
  tokens = encoded_input.input_ids[0]
  
  token_ids.append(tokens)
  encoded_inputs.append(encoded_input)

ready_tokens = pad_sequences(token_ids, padding="post", maxlen=max_len)

for i in range(len(ready_tokens)):
  text = tokenizer.convert_ids_to_tokens(ready_tokens[i])
  token_to_text.append(text) 

In [18]:
scaler = MinMaxScaler()
scaler.fit(ready_tokens)

In [45]:
clf = load("./xsvmc.joblib")

In [35]:
X = scaler.transform(ready_tokens)
y = df_news["Type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

In [21]:
param_grid = {
  'C': [1, 0.1, 10, 100], 
  'kernel': ["poly"],
  'gamma': [0.01, 0.1, 1],
  'degree': [3],
  'coef0': [0.1, 1, 10, 100],
}

In [22]:
# grid = GridSearchCV(xSVMC(), param_grid, cv=3, verbose=2)
# grid.fit(X_train,y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END ..C=1, coef0=0.1, degree=3, gamma=0.01, kernel=poly; total time=   2.4s
[CV] END ..C=1, coef0=0.1, degree=3, gamma=0.01, kernel=poly; total time=   2.1s
[CV] END ..C=1, coef0=0.1, degree=3, gamma=0.01, kernel=poly; total time=   2.0s
[CV] END ...C=1, coef0=0.1, degree=3, gamma=0.1, kernel=poly; total time=   1.8s
[CV] END ...C=1, coef0=0.1, degree=3, gamma=0.1, kernel=poly; total time=   1.7s
[CV] END ...C=1, coef0=0.1, degree=3, gamma=0.1, kernel=poly; total time=   1.6s
[CV] END .....C=1, coef0=0.1, degree=3, gamma=1, kernel=poly; total time=   1.7s
[CV] END .....C=1, coef0=0.1, degree=3, gamma=1, kernel=poly; total time=   1.6s
[CV] END .....C=1, coef0=0.1, degree=3, gamma=1, kernel=poly; total time=   1.6s
[CV] END ....C=1, coef0=1, degree=3, gamma=0.01, kernel=poly; total time=   1.6s
[CV] END ....C=1, coef0=1, degree=3, gamma=0.01, kernel=poly; total time=   1.7s
[CV] END ....C=1, coef0=1, degree=3, gamma=0.01

In [23]:
# grid.best_estimator_

In [36]:
#Best parameter
kernel = "poly"
gamma = 0.1
C =  1
k = 3
coef0 = 0.1
clf = xSVMC(kernel=kernel, C=C, gamma=gamma, k=k, coef0=coef0, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

In [None]:
SVs = clf.support_vectors_

In [37]:
confusion_matrix(y_test, clf.predict(X_test))

array([[38,  0, 33,  5,  3,  0,  2],
       [ 3, 26, 20,  0,  0,  0,  3],
       [16,  7, 74,  2,  4,  0,  5],
       [21,  1, 16,  1,  1,  0,  0],
       [18,  1, 11,  2,  2,  0,  1],
       [ 2,  0,  6,  1,  1,  0,  0],
       [ 3,  6, 18,  0,  0,  0, 13]], dtype=int64)

In [38]:
print(classification_report(y_test, clf.predict(X_test)))

                precision    recall  f1-score   support

      Alianzas       0.38      0.47      0.42        81
    Innovacion       0.63      0.50      0.56        52
 Macroeconomia       0.42      0.69      0.52       108
          Otra       0.09      0.03      0.04        40
  Regulaciones       0.18      0.06      0.09        35
    Reputacion       0.00      0.00      0.00        10
Sostenibilidad       0.54      0.33      0.41        40

      accuracy                           0.42       366
     macro avg       0.32      0.29      0.29       366
  weighted avg       0.38      0.42      0.38       366



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
dump(clf, "./xsvmc.joblib")

['./xsvmc.joblib']

In [None]:
colormap = LinearSegmentedColormap.from_list('custom', 
                                       [(0, '#162cd9'),
                                        (1,   '#f2271f')], N=256)

In [None]:
def draw_text(draw_obj, text, pos_x, pos_y, prob, font):
  color = colormap(prob)[:3]
  draw_obj.text((pos_x, pos_y), text, fill=(int(color[0] * 255), int(color[1] * 255), int(color[2] * 255), 255), font=font)

In [None]:
def create_img(l_text, prob, path, y_size):
  offset_y = 10
  offset_x = 10
  img_x = 800
  img_y = 600

  img = Image.new(mode="RGBA", size=(img_x, img_y), color = (255, 255, 255))
  txt = Image.new('RGBA', img.size, (255,255,255,0))
  draw = ImageDraw.Draw(txt)
  font = ImageFont.truetype("fonts\SpaceMono-Bold.ttf", 16)

  last_pos_x = 0
  combined = 0
  pos_y = 0
  for i in range(len(l_text)):
    palabra = l_text[i]
    if palabra.startswith("##"):
      palabra = palabra[2:]
      pos_x = (len(palabra)) * 10
    elif palabra not in [".", ","]:
      palabra = " " + palabra
      pos_x = (len(palabra)) * 10
      if last_pos_x + pos_x + offset_x >= img_x - 70:
        pos_y += 18
        last_pos_x = 0
    else:
      pos_x = (len(palabra)) * 10

    draw_text(draw, palabra, last_pos_x + offset_x, pos_y + offset_y, prob[i], font)
    last_pos_x += pos_x

  combined = Image.alpha_composite(img, txt)
  combined.save(path)

In [None]:
def render_text(text, idx, mu_misv, nu_misv, save_name):
  y_size = int(len(text) / 60 * 25)
  values_yes = mu_misv
  max_yes = max(values_yes)
  prob_yes = (values_yes / max_yes)
  create_img(text, prob_yes, "./out-%s.png" % save_name, y_size)

In [None]:
def preprocess_text(text):
  encoded_input = tokenizer(text, return_tensors='tf')
  tokens = encoded_input.input_ids[0]
  ready_tokens = pad_sequences(token_ids, padding="post", maxlen=max_len)[0]
  text_tokens = tokenizer.convert_ids_to_tokens(ready_tokens)
  return ready_tokens, text_tokens, len(tokens)

In [None]:
def contextualized_prediction(text):
  tokens, text_tokens, original_len = preprocess_text(text)
  clean_text = text_tokens[1:original_len-1]
  topK = clf.predict_with_context(tokens)
  print(len(topK))
  for i in range(len(topK)):
    pred = topK[i]
    mu_misv = SVs[pred.eval.mu_hat.misv_idx][1:original_len-1]
    nu_misv = SVs[pred.eval.nu_hat.misv_idx][1:original_len-1]
    render_text(clean_text, idx, mu_misv, nu_misv, "%d-favor" % i)