## Day32 - NLP | Word2Vec + Sentiment Analysis

In [20]:
import pandas as pd
import numpy as np
from collections import Counter

from nltk import word_tokenize, SnowballStemmer
from nltk.corpus import stopwords
import string
import re

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

import multiprocessing

from time import time

from sklearn.cluster import KMeans

### Data

In [2]:
# Dataset : https://www.kaggle.com/kazanova/sentiment140
# col_0 = target 
# col_5 = text
df = pd.read_csv('../dataset/twitter_SA.csv', encoding = 'ISO-8859-1', header=None)
df = df[[0,5]]
df.columns = ['target','text']

# 0 = negative
# 4 = positive -> 1
df['target'] = df.target.apply(lambda x : 1 if x == 4 else x)

In [86]:
stemmer = SnowballStemmer(language="english")

# cleaning
def clean_text(text):
    
    t = text.lower()                           # lowercase
    t = re.sub(r"[0-9]","",t)                  # remove digits
    t = re.sub(r"_", " ", t)
    t = re.sub(r"[^\w\s]","",t)
    #t = re.sub(r"https:\/\/[\r\n]*","", t) # remove link
    #t = re.sub(r"http:\/\/[\r\n]*","", t) # remove link
    
    tokens = word_tokenize(t)            # tokenization
    stemmed_tokens = [stemmer.stem(tk) for tk in tokens if tk not in stopwords.words('english')]
    
    return stemmed_tokens

df_test = df.sample(80000)
df_test['text'] = df_test['text'].apply(lambda x : clean_text(x))

In [87]:
tweets = df_test.text
y = df_test.target

In [88]:
# source : https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
phrases = Phrases(tweets, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[tweets]

In [89]:
sentences[:2]

<gensim.interfaces.TransformedCorpus at 0x1c800333190>

### Word2Vec

In [90]:
# source : https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

w2v_model.build_vocab(sentences, progress_per=50000)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(4428121, 16851930)

In [95]:
w2v_model.wv.most_similar('car', topn=10)

[('littl', 0.9998283982276917),
 ('hour', 0.9998264908790588),
 ('woke', 0.9998259544372559),
 ('hous', 0.9998244643211365),
 ('let', 0.999824047088623),
 ('next_week', 0.9998226165771484),
 ('yep', 0.999821662902832),
 ('train', 0.9998204112052917),
 ('drive', 0.9998199939727783),
 ('shit', 0.9998198747634888)]