In [1]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [8]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\logonpflocal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\logonpflocal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
df = pd.read_csv('uci-news-aggregator.csv')
df = df[['TITLE','CATEGORY']]
#categories: b = business, t = science and technology, e = entertainment, m = health

In [3]:
# shuffle - embaralha as linhas para evitar problema (aprende apenas amostras de uma categoria e isso pode levar o gradiente a
# ficar preso num mínimo local e só aprender bem sobre)
df = shuffle(df)
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,TITLE,CATEGORY
0,'Agents of S.H.I.E.L.D.' Season Finale Sneak P...,e
1,"Mosquito-borne virus may soon hit GA, health d...",m
2,Chiquita combines with Dublin-based Fyffes to ...,b
3,Memorial Day weekend and meteor shower viewing...,t
4,Meet the Visionary Behind the World's Largest IPO,b


In [4]:
import nltk
import re
import string
import unicodedata

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
    
        
def normalize_accents(text):
    return unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")

def normalize_str(text):
    text = text.lower()
    text = remove_punctuation(text)
    text = normalize_accents(text)
    text = re.sub(re.compile(r" +"), " ",text)
    return " ".join([w for w in text.split()])

def remove_punctuation(text):
    punctuations = string.punctuation
    table = str.maketrans({key: " " for key in punctuations})
    text = text.translate(table)
    return text


def tokenizer(text):
    stop_words = nltk.corpus.stopwords.words("english") # portuguese, caso o dataset seja em português
    if isinstance(text, str):
        text = normalize_str(text)
        text = "".join([w for w in text if not w.isdigit()])
        text = word_tokenize(text)
        text = [x for x in text if x not in stop_words]
        text = [y for y in text if len(y) > 2]
        return [t for t in text]
    else:
        return None


In [9]:
df['Title_Treated'] = df['TITLE'].apply(tokenizer)

In [10]:
df.head()

Unnamed: 0,TITLE,CATEGORY,Title_Treated
0,'Agents of S.H.I.E.L.D.' Season Finale Sneak P...,e,"[agents, season, finale, sneak, peek, coulson,..."
1,"Mosquito-borne virus may soon hit GA, health d...",m,"[mosquito, borne, virus, may, soon, hit, healt..."
2,Chiquita combines with Dublin-based Fyffes to ...,b,"[chiquita, combines, dublin, based, fyffes, cr..."
3,Memorial Day weekend and meteor shower viewing...,t,"[memorial, day, weekend, meteor, shower, viewi..."
4,Meet the Visionary Behind the World's Largest IPO,b,"[meet, visionary, behind, world, largest, ipo]"


In [11]:
labels = np.array(df['CATEGORY']) # label para cada uma das frases

In [12]:
# parâmetros do word2vec
dim_vec = 300
min_count = 10
window = 4
num_workers = multiprocessing.cpu_count()
seed = np.random.seed(42)

In [13]:
# instância do Word2Vec
modelo = Word2Vec(df["Title_Treated"],
                    min_count = min_count, 
                    vector_size = dim_vec, 
                    window = window,
                    seed = seed,
                    workers = num_workers,
                    sg = 1) #sg = 0 -> CBOW e sg = 1 -> skipgram

In [14]:
print("Tamanho do vocabulário do Word2Vec: ", len(modelo.wv))

Tamanho do vocabulário do Word2Vec:  16241


In [15]:
# exemplos das relações semânticas que o word2vec consegue estabelecer
print(modelo.wv.most_similar('samsung'), '\n') # palavra mais similar a 'itau'
print(modelo.wv.similarity('google', 'microsoft'), '\n') # similaridade entre duas palavras
print(modelo.wv.most_similar(positive = ['show', 'movie'], negative = ['home'], topn = 3)) # similaridade considerando exemplos positivos e negativos

[('galaxy', 0.6372635960578918), ('tab', 0.5986858010292053), ('neo', 0.5947222709655762), ('waterproof', 0.583037793636322), ('specification', 0.5822816491127014), ('antutu', 0.5815654397010803), ('electronics', 0.5799608826637268), ('tizen', 0.5776135325431824), ('phablets', 0.5722646117210388), ('spire', 0.5720193982124329)] 

0.29598713 

[('film', 0.44674697518348694), ('gritty', 0.3722717761993408), ('cinematic', 0.368047833442688)]


In [16]:
def meanVector(model,phrase):
    vocab = list(model.wv.index_to_key)
    phrase = " ".join(phrase)
    phrase = [x for x in word_tokenize(phrase) if x in vocab]
    #Quando não houver palavra o vector recebe 0 para todas as posições
    if phrase == []:
        vetor = [0.0]*dim_vec 
    else: 
        #Caso contrário, calculando a matriz da frase
        vetor = np.mean([model.wv[word] for word in phrase],axis=0)
    return vetor

In [17]:
def createFeatures(base): 
    features = [meanVector(modelo,base['Title_Treated'][i])for i in range(len(base))]
    return features

In [18]:
df = createFeatures(df)
df[0]

array([-1.47298977e-01, -1.29128145e-02,  4.25189920e-02, -3.33455647e-03,
       -2.56209701e-01, -5.32733619e-01,  2.47667789e-01,  9.31246504e-02,
       -1.74444035e-01, -2.32230142e-01,  3.82345319e-01, -2.64998171e-02,
       -1.18765354e-01,  1.47529960e-01,  9.99135002e-02,  8.77503380e-02,
       -2.82531500e-01,  4.97776009e-02, -6.84843808e-02, -3.18773687e-01,
        6.18683882e-02, -1.63953587e-01,  4.93220925e-01,  1.01788737e-01,
        5.13082035e-02,  2.60523826e-01, -2.79241234e-01,  2.35770687e-01,
       -7.67095312e-02,  3.23193558e-02, -2.88207293e-01, -7.75894970e-02,
        9.78389662e-03, -1.84486672e-01,  1.87153623e-01,  9.61149558e-02,
       -2.00837851e-03,  1.65779199e-02, -8.18101689e-02, -7.91890845e-02,
       -1.77113846e-01,  3.21817622e-02,  4.30391692e-02,  1.07597478e-01,
       -4.76628244e-02, -2.29110911e-01, -1.29013509e-03, -2.72823513e-01,
       -5.16905077e-02,  5.75333424e-02,  8.29335079e-02,  3.69382948e-01,
       -8.10828879e-02,  

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df[0:100000], labels[0:100000], test_size=0.3,random_state=42)
clf = svm.SVC(kernel='rbf') 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))