In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
import numpy as np

In [2]:
# Load the AG News train/test CSVs from the GitHub mirror (needs network access).
train = pd.read_csv('https://github.com/tknishh/Text-Classification-Ag-News/raw/master/data/train.csv')
# Shuffle all training rows. Keras `validation_split` (used when fitting) takes the
# *last* fraction of the data, so row order matters; the fixed seed makes the
# shuffle, and therefore the train/validation split, reproducible across runs.
train = train.sample(frac=1, random_state=42)
test = pd.read_csv('https://github.com/tknishh/Text-Classification-Ag-News/raw/master/data/test.csv')

# Categorías
* World
* Sports
* Business
* Science/Technology

In [3]:
# Peek at 10 random titles whose 'Class Index' is 1.
train.loc[train['Class Index'] == 1, 'Title'].sample(10)

13497     Khatami says IAEA must accept Iran #39;s right...
110891           Annan gets ovation at UN despite US attack
97892                   Bush Has a Plan to Create Jobs (AP)
105526       American Military Relief Effort Picks Up Steam
15392                 Thatcher #39;s wife lands at Heathrow
47798              Amy Fisher Addresses Her Past in New Bio
45901                          Blair to stay for third term
12315             Convention Protesters Eager to Begin (AP)
116179    Turkish captain foils EU #39;s would-be defenders
90653             Thai PM weighs call for conciliatory body
Name: Title, dtype: object

In [4]:
# Peek at 10 random titles whose 'Class Index' is 2.
train.loc[train['Class Index'] == 2, 'Title'].sample(10)

48890                  Garciaparra still interested in Cubs
11819         Schumacher tipped to win seventh championship
93144           Hawks Rally From 11 Down, Edge Rockets (AP)
45776     Schwarzenegger Vetoes Sports Diet Supplement Bill
32634            NFL Game Summary - Carolina at Kansas City
5093                             Pakistan to rest speed duo
74196                              Peace summit to end feud
115889                                        Senor Moment?
66154     Red Sox Look to Reverse the Curse with Game Se...
98492                  ROVERS BOUNCE BACK IN EWOOD THRILLER
Name: Title, dtype: object

In [5]:
# Peek at 10 random titles whose 'Class Index' is 3.
train.loc[train['Class Index'] == 3, 'Title'].sample(10)

101239       Shortage of steel forces halt at Nissan
12562          Oil prices firm after week of decline
64525                   House prices go on the slide
105487                        Gold Offer Loses Shine
110953    SEC probes auditor's link to fund trustees
54681            Krispy Kreme #39;s Sticky Situation
3735               Qantas posts record annual profit
89522                             The Toy War Begins
119202       Bad Day for Drug Companies and Patients
67697                        Merger wounds JP Morgan
Name: Title, dtype: object

In [6]:
# Peek at 10 random titles whose 'Class Index' is 4.
train.loc[train['Class Index'] == 4, 'Title'].sample(10)

33314    PeopleSoft prepares for questions at Connect show
3717           Bankrupt Commerce One Patents Fetch \$15.5M
68500        Colo. bird population reported in sorry state
48825    Mount Vernon City Library to close books on ru...
73581         Intel eyes remote wireless device management
90676        Opportunity Rover to Pack Up and Leave Crater
28822    Inventor Develops Nose-Steered Web Surfing Sys...
51609                       Web Industry Still Flies Blind
1509     HP releases  quot;carrier quot; grade Linux fo...
9389             Actuate pushes open-source data reporting
Name: Title, dtype: object

In [7]:
# Build the model input text for both splits: title and description joined by a space.
for frame in (train, test):
  frame['content'] = frame['Title'] + " " + frame['Description']

# Cargo embeddings

In [8]:
from gensim.models import KeyedVectors
from gensim import downloader

# `downloader.load` downloads the dataset if needed and returns the loaded
# KeyedVectors, so keep its return value. The previous version discarded it and
# then read 'GoogleNews-vectors-negative300.bin' by a relative path that nothing
# in this notebook creates.
# NOTE(review): the ImportError about `scipy.linalg.triu` recorded for this cell
# looks like a gensim/SciPy version mismatch in the environment — confirm, then
# upgrade gensim or pin an older SciPy.
model = downloader.load('word2vec-google-news-300')

ImportError: cannot import name 'triu' from 'scipy.linalg' (c:\Users\Facu\anaconda3\envs\myenv\Lib\site-packages\scipy\linalg\__init__.py)

Nos quedamos solo con las palabras útiles en un set

In [None]:
import re

# A token is a word (optionally with one apostrophe suffix, e.g. "don't")
# or a single non-space, non-word character such as punctuation.
TOKENIZER_REGEX = r"\w+(?:'\w+)?|[^\w\s]"

# Every distinct lower-cased token that appears in the training texts.
palabras_utiles = {
  token
  for document in train['content'].tolist()
  for token in re.findall(TOKENIZER_REGEX, document.lower())
}

In [None]:
# Number of distinct tokens collected from the training texts.
len(palabras_utiles)

Extraemos los embeddings

In [None]:
# word -> row position inside `embs` (filled in the same order as the vectors).
key_to_index = {}
embs = []

for word in model.index_to_key:
  # To save RAM, skip any vector whose word never occurs in the training texts.
  if word not in palabras_utiles:
    continue
  key_to_index[word] = len(embs)
  embs.append(model[word])

In [None]:
# Turn the list of per-word vectors into a single NumPy array (one row per kept word).
embs = np.asarray(embs)

In [None]:
# Shape of the filtered embedding array built from the kept words.
embs.shape

In [None]:
# Save RAM: drop the references to the full word2vec model and the vocabulary set,
# which are no longer needed once `embs` and `key_to_index` exist.
model = palabras_utiles = None

Creamos un vector para la palabra desconocida usando el promedio de todos los embeddings

In [None]:
# Vector used for out-of-vocabulary tokens: the column-wise mean of all kept embeddings.
unknown_emb = np.mean(embs, axis=0)

In [None]:
# Sanity check: the unknown-word vector should have one entry per embedding dimension.
unknown_emb.shape

Creamos la matriz de embeddings: el índice 0 contiene un vector de ceros que indica que no hay palabra (padding), el índice 1 contiene el vector de palabra desconocida y el resto contiene las palabras del embedding.

In [None]:
# Final lookup table layout:
#   row 0  -> all-zeros vector (padding / "no word")
#   row 1  -> unknown-word vector
#   row 2+ -> the kept pretrained embeddings
# The width and dtype are taken from `embs` instead of hard-coding 300 and relying on
# NumPy's float64 default, which would upcast (and double the size of) the whole matrix.
# NOTE: this cell rebinds `embs`, so re-running it prepends the two rows again;
# run it once per kernel session.
emb_dim = embs.shape[1]
embs = np.concatenate(
  [
    np.zeros((1, emb_dim), dtype=embs.dtype),
    unknown_emb.reshape((1, emb_dim)),
    embs,
  ],
  axis=0,
)
embs.shape

Tokenizamos todos los textos y transformamos cada palabra en índice de la matriz

In [None]:
# Map every training text to a list of embedding-matrix row indices.
# Known words are shifted by 2 (row 0 is the zero vector, row 1 the unknown vector);
# words missing from `key_to_index` fall back to -1 + 2 = 1, the unknown row.
X_train = [
  [key_to_index.get(token, -1) + 2 for token in re.findall(TOKENIZER_REGEX, document.lower())]
  for document in train['content'].tolist()
]

In [None]:
# Map every test text to a list of embedding-matrix row indices.
# Known words are shifted by 2 (row 0 is the zero vector, row 1 the unknown vector);
# words missing from `key_to_index` fall back to -1 + 2 = 1, the unknown row.
X_test = [
  [key_to_index.get(token, -1) + 2 for token in re.findall(TOKENIZER_REGEX, document.lower())]
  for document in test['content'].tolist()
]

In [None]:
# Show the first training text next to its index-encoded form.
print(train['content'].iloc[0])
print(X_train[0])

Paddeamos los textos para que tengan todos el mismo largo

In [None]:
# 80th percentile of the token counts per training text, used to choose a padding length.
sequence_lengths = [len(sequence) for sequence in X_train]
np.quantile(sequence_lengths, 0.8)

In [None]:
from tensorflow.keras.utils import pad_sequences

In [None]:
# Pad (with index 0, the zero vector) or truncate every sequence at the end
# so that all of them have exactly 55 tokens.
pad_options = dict(maxlen=55, padding='post', truncating='post', value=0)
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

X_train.shape, X_test.shape

In [None]:
# Labels as column vectors, shifted down by one so the class ids start at 0.
y_train = train['Class Index'].to_numpy().reshape(-1, 1) - 1
y_test = test['Class Index'].to_numpy().reshape(-1, 1) - 1

y_train.shape, y_test.shape

In [None]:
from tensorflow.keras.layers import Input, Dense, Conv1D, GlobalMaxPooling1D, GRU, Concatenate, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Input: one row of 55 token indices per text.
inp = Input((55,), dtype='int32')
# Embedding lookup initialised from the pretrained matrix and kept frozen (not trainable).
emb_layer = Embedding(
  input_dim=embs.shape[0],
  output_dim=embs.shape[1],
  weights=[embs],
  trainable=False,
)(inp)
# Three parallel convolutions over the same embedded sequence: (filters, kernel width).
conv_specs = [(16, 2), (8, 3), (4, 5)]
conv_outputs = [
  Conv1D(n_filters, kernel_size=width, padding='same')(emb_layer)
  for n_filters, width in conv_specs
]
# Join the feature maps, max-pool over the sequence axis, then classify into 4 classes.
merged = Concatenate()(conv_outputs)
pooled = GlobalMaxPooling1D()(merged)
out = Dense(4, activation='softmax')(pooled)
model = Model(inputs=inp, outputs=out)

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Integer class ids as targets -> sparse categorical cross-entropy.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Stop once validation loss has not improved for 2 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(
  X_train,
  y_train,
  epochs=10,
  validation_split=0.1,
  callbacks=[early_stop],
)

In [None]:
# Predicted class id per test text: position of the highest softmax output.
class_probabilities = model.predict(X_test)
preds = class_probabilities.argmax(axis=1)

In [None]:
# Fraction of test texts whose predicted class id matches the true one.
accuracy_score(y_test, preds)

In [None]:
# Class names in class-id order (0..3).
# NOTE(review): assumes 'Class Index' 1..4 follow the order of the category list at the
# top of the notebook (World, Sports, Business, Science/Technology) — confirm.
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']
# Fix the label set so the matrix is always 4x4 and lines up with `class_names`,
# even if some class is missing from both the true labels and the predictions.
cm = confusion_matrix(y_test, preds, labels=list(range(len(class_names))))
ConfusionMatrixDisplay(cm, display_labels=class_names).plot()