In [19]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import string
from sklearn.utils import shuffle
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [2]:
df = pd.read_csv('uci-news-aggregator.csv')
df = df[['TITLE','CATEGORY']]
#categories: b = business, t = science and technology, e = entertainment, m = health

In [3]:
# shuffle - embaralha as linhas para evitar problema (aprende apenas amostras de uma categoria e isso pode levar o gradiente a
# ficar preso num mínimo local e só aprender bem sobre)
df = shuffle(df)
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,TITLE,CATEGORY
0,Robin Thicke cries himself a river on 'Paula' ...,e
1,Mila Kunis and Ashton Kutcher are planning to ...,e
2,iPhone 6 May Have Been Leaked By China Telecom,t
3,Former Islanders owner admits to role in $554 ...,b
4,"Big Hair News: Kaley Cuoco's Chop, Kesha's Old...",e


In [4]:
import nltk
import re
import string
import unicodedata

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
    
        
def normalize_accents(text):
    return unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")

def normalize_str(text):
    text = text.lower()
    text = remove_punctuation(text)
    text = normalize_accents(text)
    text = re.sub(re.compile(r" +"), " ",text)
    return " ".join([w for w in text.split()])

def remove_punctuation(text):
    punctuations = string.punctuation
    table = str.maketrans({key: " " for key in punctuations})
    text = text.translate(table)
    return text


def tokenizer(text):
    stop_words = nltk.corpus.stopwords.words("english") # portuguese, caso o dataset seja em português
    if isinstance(text, str):
        text = normalize_str(text)
        text = "".join([w for w in text if not w.isdigit()])
        text = word_tokenize(text)
        text = [x for x in text if x not in stop_words]
        text = [y for y in text if len(y) > 2]
        return [t for t in text]
    else:
        return None


In [5]:
df['Title_Treated'] = df['TITLE'].apply(tokenizer)

In [6]:
df.head()

Unnamed: 0,TITLE,CATEGORY,Title_Treated
0,Robin Thicke cries himself a river on 'Paula' ...,e,"[robin, thicke, cries, river, paula, album, re..."
1,Mila Kunis and Ashton Kutcher are planning to ...,e,"[mila, kunis, ashton, kutcher, planning, get, ..."
2,iPhone 6 May Have Been Leaked By China Telecom,t,"[iphone, may, leaked, china, telecom]"
3,Former Islanders owner admits to role in $554 ...,b,"[former, islanders, owner, admits, role, milli..."
4,"Big Hair News: Kaley Cuoco's Chop, Kesha's Old...",e,"[big, hair, news, kaley, cuoco, chop, kesha, o..."


In [7]:
labels = np.array(df['CATEGORY']) # label para cada uma das frases

In [8]:
# parâmetros do word2vec
dim_vec = 300
min_count = 10
window = 4
num_workers = multiprocessing.cpu_count()
seed = np.random.seed(42)

In [9]:
# instância do Word2Vec
modelo = Word2Vec(df["Title_Treated"],
                    min_count = min_count, 
                    vector_size = dim_vec, 
                    window = window,
                    seed = seed,
                    workers = num_workers,
                    sg = 1) #sg = 0 -> CBOW e sg = 1 -> skipgram

In [10]:
print("Tamanho do vocabulário do Word2Vec: ", len(modelo.wv))

Tamanho do vocabulário do Word2Vec:  16241


In [11]:
# exemplos das relações semânticas que o word2vec consegue estabelecer
print(modelo.wv.most_similar('samsung'), '\n') # palavra mais similar a 'itau'
print(modelo.wv.similarity('google', 'microsoft'), '\n') # similaridade entre duas palavras
print(modelo.wv.most_similar(positive = ['show', 'movie'], negative = ['home'], topn = 3)) # similaridade considerando exemplos positivos e negativos

[('galaxy', 0.6603061556816101), ('tizen', 0.6135312914848328), ('waterproof', 0.6055197715759277), ('tab', 0.5974157452583313), ('exynos', 0.5933058261871338), ('fingerprint', 0.5907710790634155), ('optical', 0.5883619785308838), ('cameraphone', 0.5811545252799988), ('neo', 0.5780284404754639), ('specification', 0.5770387053489685)] 

0.31254223 

[('film', 0.41209676861763), ('flick', 0.38928914070129395), ('biopic', 0.37935981154441833)]


In [12]:
def meanVector(model,phrase):
    vocab = list(model.wv.index_to_key)
    phrase = " ".join(phrase)
    phrase = [x for x in word_tokenize(phrase) if x in vocab]
    #Quando não houver palavra o vector recebe 0 para todas as posições
    if phrase == []:
        vetor = [0.0]*dim_vec 
    else: 
        #Caso contrário, calculando a matriz da frase
        vetor = np.mean([model.wv[word] for word in phrase],axis=0)
    return vetor

In [13]:
def createFeatures(base): 
    features = [meanVector(modelo,base['Title_Treated'][i])for i in range(len(base))]
    return features

In [14]:
df = createFeatures(df)
df[0]

array([ 7.76788220e-02, -1.56580824e-02, -1.99973091e-01, -3.48755687e-01,
       -3.97247002e-02,  2.27950007e-01, -1.41073996e-02, -1.96009278e-01,
        5.00374734e-02,  1.41638145e-01,  2.69054826e-02, -1.54619977e-01,
       -1.34386718e-01,  2.91046143e-01, -3.90845627e-01,  1.93845406e-01,
        4.82922971e-01, -1.82963740e-02,  1.41421959e-01,  4.47828799e-01,
        3.59108269e-01, -1.69355363e-01, -6.92565218e-02,  2.12228566e-01,
        9.07041654e-02,  2.34435126e-01,  1.68567017e-01, -4.35160324e-02,
        5.07158302e-02, -2.21138984e-01, -1.61573812e-01, -1.99439317e-01,
        7.11930469e-02,  6.99207634e-02,  2.33431920e-01,  1.33546561e-01,
       -3.41324247e-02, -1.06512599e-01, -1.08512983e-01, -2.08253905e-01,
        7.54968747e-02,  1.88351169e-01, -9.04641375e-02,  2.56415606e-01,
        2.01211050e-01,  1.40200958e-01,  2.20510699e-02, -1.12380870e-01,
       -1.53903246e-01,  2.26212487e-01,  4.24410887e-02, -9.68811512e-02,
       -5.77640116e-01,  

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df[0:100000], labels[0:100000], test_size=0.3,random_state=42)
clf = svm.SVC(kernel='rbf') 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [18]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9247666666666666
