In [2]:
from transformers import BertTokenizer
import tensorflow as tf
from keras.utils import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, top_k_accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from PIL import Image, ImageFont, ImageDraw
import numpy as np
from xsvmlib.xsvmc import xSVMC
from joblib import dump, load
import base64
from io import BytesIO

In [3]:
# Seed passed to train_test_split and to xSVMC so that reruns are repeatable.
RANDOM_STATE = 42

In [4]:
# Load the labelled news dataset; later cells read its "news" and "Type" columns.
df_news = pd.read_csv("./data/df_total.csv")

In [5]:
# Raw article texts as a NumPy array, one entry per row of df_news.
sentences = df_news.news.values

In [6]:
# Show the distinct category labels found in the "Type" column.
df_news.Type.unique()

array(['Otra', 'Regulaciones', 'Alianzas', 'Macroeconomia', 'Innovacion',
       'Sostenibilidad', 'Reputacion'], dtype=object)

In [7]:
# Cased multilingual BERT tokenizer; do_lower_case=False keeps the original casing.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [8]:
# Fixed sequence length (in token ids) passed to pad_sequences as `maxlen`.
max_len = 4500

In [9]:
# Tokenize every article, keeping both the full encoder output and the
# flat list of token ids of each one.
encoded_inputs = [tokenizer(sentence, return_tensors='tf') for sentence in sentences]
token_ids = [encoded.input_ids[0] for encoded in encoded_inputs]

# Right-pad (with zeros) every id sequence to max_len so they form one matrix.
ready_tokens = pad_sequences(token_ids, padding="post", maxlen=max_len)

# Map every padded id row back to its token strings.
token_to_text = [tokenizer.convert_ids_to_tokens(id_row) for id_row in ready_tokens]

Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Min-max scale each column (token position) of the padded token-id matrix.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in a later cell, so test rows influence the scaling — consider
# fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(ready_tokens)

In [11]:
# Reuse a previously saved classifier when one exists. The file is written by
# the dump() cell further down, so on a fresh checkout it is not there yet; in
# that case leave `clf` unset (None) and let the training cell create it.
# NOTE: joblib files are pickle-based — only load files you trust.
try:
  clf = load("./xsvmc.joblib")
except FileNotFoundError:
  clf = None

In [12]:
# Scale the padded token ids, use the "Type" column as target and hold out
# 30% of the rows for testing (seeded for repeatability).
X = scaler.transform(ready_tokens)
y = df_news["Type"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

In [13]:
# Candidate hyperparameter values for the (currently disabled) grid search.
param_grid = dict(
  C=[1, 0.1, 10, 100],
  kernel=["poly"],
  gamma=[0.01, 0.1, 1],
  degree=[3],
  coef0=[0.1, 1, 10, 100],
)

In [14]:
# grid = GridSearchCV(xSVMC(), param_grid, cv=3, verbose=2)
# grid.fit(X_train,y_train)

In [15]:
# grid.best_estimator_

In [16]:
# Hyperparameters used for the final model (presumably the winners of the
# commented-out grid search above — TODO confirm).
kernel = "poly"
gamma = 0.1
C =  1
# Passed to xSVMC as `k`; presumably the number of top classes it reports — TODO confirm.
k = 3
coef0 = 0.1
# Train the classifier on the training split; this replaces any `clf` loaded earlier.
clf = xSVMC(kernel=kernel, C=C, gamma=gamma, k=k, coef0=coef0, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

In [17]:
# Support vectors of the fitted classifier; contextualized_prediction indexes
# this array with the `misv_idx` values of each prediction.
SVs = clf.support_vectors_

In [18]:
# Confusion matrix on the held-out test split (rows: true labels, columns: predictions).
confusion_matrix(y_test, clf.predict(X_test))

array([[41,  2, 28,  4,  5,  0,  1],
       [ 5, 27, 16,  0,  2,  0,  2],
       [20, 11, 61,  2,  8,  0,  6],
       [17,  1, 17,  2,  3,  0,  0],
       [18,  3, 11,  2,  0,  0,  1],
       [ 4,  1,  4,  0,  1,  0,  0],
       [ 2,  7, 17,  0,  2,  0, 12]], dtype=int64)

In [19]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, clf.predict(X_test)))

                precision    recall  f1-score   support

      Alianzas       0.38      0.51      0.44        81
    Innovacion       0.52      0.52      0.52        52
 Macroeconomia       0.40      0.56      0.47       108
          Otra       0.20      0.05      0.08        40
  Regulaciones       0.00      0.00      0.00        35
    Reputacion       0.00      0.00      0.00        10
Sostenibilidad       0.55      0.30      0.39        40

      accuracy                           0.39       366
     macro avg       0.29      0.28      0.27       366
  weighted avg       0.36      0.39      0.36       366



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Persist the fitted classifier to the same path the earlier load() cell reads.
dump(clf, "./xsvmc.joblib")

['./xsvmc.joblib']

In [21]:
# Two-stop gradient used to colour tokens: 0 -> blue (#162cd9), 1 -> red (#f2271f).
colormap = LinearSegmentedColormap.from_list('custom', 
                                       [(0, '#162cd9'),
                                        (1,   '#f2271f')], N=256)

In [22]:
def draw_text(draw_obj, text, pos_x, pos_y, prob, font):
  """Draw `text` at (pos_x, pos_y) in the colour that `colormap` gives for `prob`.

  Only the RGB part of the colormap result is used; each channel is scaled
  to a 0-255 integer and the text is drawn fully opaque (alpha 255).
  """
  red, green, blue = (int(channel * 255) for channel in colormap(prob)[:3])
  draw_obj.text((pos_x, pos_y), text, fill=(red, green, blue, 255), font=font)

In [23]:
def create_img(l_text, prob, y_size):
  """Render a token list as colour-coded text and return it as base64 PNG bytes.

  Tokens are laid out left to right on a fixed 800x600 white canvas and
  wrapped to a new line when a word would reach the right margin. There is
  no check against the canvas height.

  Args:
    l_text: sequence of token strings. A leading "##" marks a word-piece
      continuation: the prefix is removed and the piece is drawn right after
      the previous token. "." and "," are drawn without a leading space;
      every other token is preceded by a space.
    prob: sequence indexable like `l_text`; `prob[i]` is handed to
      `draw_text` to choose the colour of token `i`.
    y_size: accepted for compatibility with existing callers; currently
      unused (the canvas height is fixed).

  Returns:
    bytes: the PNG image, base64-encoded.
  """
  offset_y = 10
  offset_x = 10
  img_x = 800
  img_y = 600
  char_width = 10    # horizontal advance per character, in pixels
  line_height = 18   # vertical advance per wrapped line, in pixels
  right_margin = 70  # wrap when a word would come this close to the right edge

  img = Image.new(mode="RGBA", size=(img_x, img_y), color=(255, 255, 255))
  txt = Image.new('RGBA', img.size, (255, 255, 255, 0))
  draw = ImageDraw.Draw(txt)
  # Forward slash so the path resolves on every platform and the literal
  # contains no backslash escape sequence.
  font = ImageFont.truetype("fonts/SpaceMono-Bold.ttf", 16)

  last_pos_x = 0
  pos_y = 0
  for i, palabra in enumerate(l_text):
    starts_word = False
    if palabra.startswith("##"):
      # Word-piece continuation: glue to the previous token.
      palabra = palabra[2:]
    elif palabra not in [".", ","]:
      palabra = " " + palabra
      starts_word = True

    pos_x = len(palabra) * char_width
    # Only tokens that start a new word may trigger a line break, so words
    # and their trailing punctuation are never split across lines.
    if starts_word and last_pos_x + pos_x + offset_x >= img_x - right_margin:
      pos_y += line_height
      last_pos_x = 0

    draw_text(draw, palabra, last_pos_x + offset_x, pos_y + offset_y, prob[i], font)
    last_pos_x += pos_x

  combined = Image.alpha_composite(img, txt)
  buffered = BytesIO()
  combined.save(buffered, format="PNG")
  return base64.b64encode(buffered.getvalue())

In [24]:
def render_text(text, misv):
  """Render `text` coloured by the relative magnitude of `misv`.

  Args:
    text: sequence of token strings to draw.
    misv: numeric sequence aligned with `text`; each value is divided by the
      largest one before being used as the colour intensity of its token.

  Returns:
    Whatever `create_img` returns for the rendered tokens (base64-encoded
    PNG bytes).
  """
  y_size = int(len(text) / 60 * 25)
  values = np.asarray(misv, dtype=float)
  max_value = values.max() if values.size else 0.0
  if max_value == 0:
    # Empty or all-zero input: avoid dividing by zero and use zero intensity.
    prob = np.zeros_like(values)
  else:
    prob = values / max_value
  # Return the image so callers actually receive it.
  return create_img(text, prob, y_size)

In [25]:
def preprocess_text(text):
  """Tokenize `text` and pad the ids to `max_len`.

  Returns:
    A 3-tuple of (padded token-id array, token strings for the padded ids,
    number of token ids before padding).
  """
  input_ids = tokenizer(text, return_tensors='tf').input_ids[0]
  padded_batch = pad_sequences([input_ids], padding="post", maxlen=max_len)
  padded_ids = padded_batch[0]
  return padded_ids, tokenizer.convert_ids_to_tokens(padded_ids), len(input_ids)

In [26]:
def contextualized_prediction(text):
  """Classify `text` and render, per predicted class, two coloured views of it.

  For every prediction returned by `clf.predict_with_context`, the support
  vectors referenced by `mu_hat.misv_idx` and `nu_hat.misv_idx` are used to
  colour the tokens of `text` (presumably the evidence for and against the
  class, matching the 'favor' / 'contra' keys — TODO confirm against xsvmlib).

  Args:
    text: raw text to classify.

  Returns:
    list of dicts with keys 'clase' (class name), 'favor' and 'contra'
    (the `render_text` results for the two support vectors).
  """
  tokens, text_tokens, original_len = preprocess_text(text)
  # Drop the first and last real token (the first is '[CLS]'; the last is
  # presumably '[SEP]') as well as all padding.
  clean_text = text_tokens[1:original_len-1]
  # The classifier was trained on scaler-transformed features, so apply the
  # same scaling here instead of predicting on raw token ids.
  scaled_tokens = scaler.transform(tokens.reshape(1, -1))[0]
  topK = clf.predict_with_context(scaled_tokens)
  response = []
  for pred in topK:
    # Keep only the positions that line up with clean_text.
    mu_misv = SVs[pred.eval.mu_hat.misv_idx][1:original_len-1]
    nu_misv = SVs[pred.eval.nu_hat.misv_idx][1:original_len-1]
    response.append({
      'clase': pred.class_name,
      'favor': render_text(clean_text, mu_misv),
      'contra': render_text(clean_text, nu_misv)
    })
  return response

In [27]:
# Try preprocess_text on the article labelled 0 and display the returned tuple.
preprocess_text(df_news.news[0])

(array([  101, 13038, 10125, ...,     0,     0,     0]),
 ['[CLS]',
  'Durante',
  'el',
  'for',
  '##o',
  'La',
  'banca',
  'arti',
  '##cula',
  '##dor',
  'empresa',
  '##rial',
  'para',
  'el',
  'desarrollo',
  'sos',
  '##teni',
  '##ble',
  'el',
  'director',
  'de',
  'sos',
  '##teni',
  '##bilidad',
  'y',
  'clientes',
  'globale',
  '##s',
  'de',
  'BB',
  '##VA',
  'en',
  'Colombia',
  'Andrés',
  'García',
  'as',
  '##egu',
  '##ró',
  'que',
  'es',
  'importante',
  'entender',
  'que',
  'la',
  'sos',
  '##teni',
  '##bilidad',
  'no',
  'la',
  'podemos',
  'as',
  '##oci',
  '##ar',
  'a',
  'mayores',
  'costo',
  '##s',
  '.',
  'Yo',
  'c',
  '##reo',
  'que',
  'el',
  'no',
  'tener',
  'un',
  'concepto',
  'de',
  'negocio',
  'sos',
  '##teni',
  '##ble',
  'puede',
  'tener',
  'un',
  'mayor',
  'impacto',
  'de',
  'lo',
  'que',
  'ima',
  '##gina',
  '##mos',
  '.',
  'Para',
  'García',
  'el',
  'ret',
  '##o',
  'más',
  'importante',
  'es',