In [1]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [2]:
# Load the news-aggregator CSV and keep only the headline text and its category label.
df = pd.read_csv('uci-news-aggregator.csv')
df = df[['TITLE','CATEGORY']]
# categories: b = business, t = science and technology, e = entertainment, m = health

In [3]:
# Shuffle the rows so that samples are not grouped by category (seeing long
# runs of a single category can leave a gradient-based learner stuck in a
# local minimum that only fits that category well).
# A fixed random_state keeps the row order -- and everything derived from it
# further down -- identical on every "Restart & Run All".
df = shuffle(df, random_state=42)
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,TITLE,CATEGORY
0,Honda America to recall over two million vehic...,t
1,Rolf Harris guilty: The stories the jury didn'...,e
2,Kelly Clarkson welcomes baby girl,e
3,"Miley Cyrus Gets Floyd Tribute Tattoo, Flashes...",e
4,"Portland boil water alert: Fred Meyer, other b...",m


In [4]:
def TreatText(data):
    """Clean and tokenize the ``TITLE`` column of ``data``.

    For every title: each character that is not an ASCII letter is replaced
    by a space, the text is lower-cased, split with NLTK's ``word_tokenize``
    and English stop words are dropped.

    Args:
        data: DataFrame with a ``TITLE`` column holding the raw headlines.

    Returns:
        The same DataFrame object. Its ``TITLE`` column is overwritten in
        place with one list of tokens per row.
    """
    stops = set(stopwords.words("english"))  # a set gives O(1) membership tests
    non_letters = re.compile("[^a-zA-Z]")  # digits, punctuation, link symbols, ...

    def tokenize(title):
        # A missing / non-string title (e.g. NaN) yields no tokens instead of
        # making the regex substitution raise a TypeError.
        if not isinstance(title, str):
            return []
        words = word_tokenize(non_letters.sub(" ", title).lower())
        return [w for w in words if w not in stops]

    # Iterate over the column values instead of data['TITLE'][i]: looking rows
    # up by the labels 0..n-1 only works when the index is a fresh RangeIndex.
    data['TITLE'] = [tokenize(title) for title in data['TITLE']]
    return data

Caso o código abaixo dê erro, faça:

    1. Abra o terminal e execute
    2. python
    3. import nltk
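    4. nltk.download('stopwords') (o word_tokenize também precisa de nltk.download('punkt'), ou 'punkt_tab' em versões recentes do NLTK)
    5. alternativamente, execute nltk.download() sem argumentos, selecione all e clique em download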

In [5]:
# Replace every title by its list of cleaned tokens (see TreatText).
df = TreatText(df)

In [6]:
labels = np.array(df['CATEGORY']) # one label per title

In [7]:
# Word2Vec hyperparameters (passed to the Word2Vec constructor below)
dim_vec = 300      # used as vector_size
min_count = 10     # used as min_count
window = 4         # used as window
num_workers = multiprocessing.cpu_count()
# np.random.seed() returns None, so the integer seed is kept in its own
# variable; assigning the call's result would hand seed=None to Word2Vec.
seed = 42
np.random.seed(seed)

In [8]:
# Word2Vec instance built from the tokenized titles
# NOTE(review): per the gensim documentation a fixed `seed` only yields fully
# reproducible vectors with workers=1 (and PYTHONHASHSEED set); with several
# worker threads the result can vary between runs -- confirm if that matters here.
modelo = Word2Vec(df["TITLE"],
                    min_count = min_count, 
                    vector_size = dim_vec, 
                    window = window,
                    seed = seed,
                    workers = num_workers,
                    sg = 1) # sg = 0 -> CBOW, sg = 1 -> skip-gram

In [9]:
# Report how many words the trained model holds vectors for.
print("Tamanho do vocabulário do Word2Vec: ", len(modelo.wv))

Tamanho do vocabulário do Word2Vec:  16523


In [10]:
# examples of the semantic relations word2vec is able to capture
print(modelo.wv.most_similar('samsung')) # words most similar to 'samsung'
print(modelo.wv.similarity('google', 'microsoft')) # similarity between two words
print(modelo.wv.most_similar(positive = ['show', 'movie'], negative = ['home'], topn = 3)) # similarity given positive and negative examples

[('galaxy', 0.6574547290802002), ('neo', 0.5911335349082947), ('waterproof', 0.5883448123931885), ('tab', 0.5882940888404846), ('tizen', 0.5862393379211426), ('fingerprint', 0.5824985504150391), ('exynos', 0.575131356716156), ('antitheft', 0.5749682784080505), ('electronics', 0.570072591304779), ('phablets', 0.5659135580062866)]
0.29914683
[('film', 0.43369606137275696), ('biopic', 0.38826411962509155), ('gritty', 0.38421639800071716)]


In [12]:
def meanVector(model,phrase):
    """Return the average word vector of an already-tokenized phrase.

    Args:
        model: Trained Word2Vec-style model exposing its vectors as ``model.wv``.
        phrase: Iterable of tokens (e.g. one row of the cleaned ``TITLE``
            column). Tokens are used as given; they are not re-tokenized.

    Returns:
        1-D numpy array of length ``model.wv.vector_size``: the element-wise
        mean of the vectors of the tokens found in the model vocabulary, or
        all zeros when none of the tokens is in the vocabulary.
    """
    # key_to_index is a dict, so each membership test is O(1); building a
    # list of the whole vocabulary on every call made each test O(|vocab|).
    key_to_index = model.wv.key_to_index
    known = [word for word in phrase if word in key_to_index]
    if not known:
        # No known word: fall back to a zero vector, sized from the model
        # itself so the function does not depend on a notebook global.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean([model.wv[word] for word in known], axis=0)

In [13]:
def createFeatures(base): 
    """Build one mean-vector feature per row of ``base['TITLE']``.

    Args:
        base: DataFrame whose ``TITLE`` column holds one list of tokens per row.

    Returns:
        List with, for every row in order, the result of
        ``meanVector(modelo, tokens)`` (``modelo`` is the notebook-level model).
    """
    # Iterate over the column values rather than base['TITLE'][i]: looking
    # rows up by the labels 0..n-1 only works on a fresh RangeIndex.
    return [meanVector(modelo, tokens) for tokens in base['TITLE']]

In [14]:
# NOTE: `df` is rebound here from the DataFrame to the list of feature vectors
# returned by createFeatures, so this cell cannot be re-run on its own
# (the list has no 'TITLE' column).
df = createFeatures(df)

In [15]:
# Work on the first 100000 feature vectors / labels only and hold out 30% of
# them for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(df[0:100000], labels[0:100000], test_size=0.3,random_state=109)
# Support-vector classifier with a linear kernel, fitted on the training split.
clf = svm.SVC(kernel='linear') 
clf.fit(X_train, y_train)
# Predicted categories for the held-out split.
y_pred = clf.predict(X_test)

In [16]:
# Accuracy of the predictions against the true labels of the held-out split.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8901666666666667
