## Here, we re-run the code from the blog post [Word Vector Encoding](https://www.enjoyalgorithms.com/blog/word-vector-encoding-in-nlp/)

In [2]:
import pandas as pd

# Load the Corona tweets training set; the file is decoded as ISO-8859-1.
tweets = pd.read_csv(
    filepath_or_buffer='Corona_NLP_train.csv',
    encoding='ISO-8859-1',
)

# Preview the first five rows to check which columns were loaded.
print(tweets.head(5))

   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             Neutral  
1  advice Talk to your neighbours family to excha...            Positive  
2  Coronavirus Australia: Woolworths to give elde...            Positive  
3  My food stock is not the only one which is emp...            Positive  
4  Me, ready to go at supermarket during the #COV...  Extremely Negative  


In [3]:
# Keep only the tweet body and its label; the body column is renamed to 'Text'.
tweets = tweets[['OriginalTweet', 'Sentiment']].rename(columns={'OriginalTweet': 'Text'})

# Normalise the text in three steps: lower-case it, drop anything that looks
# like a URL, then collapse every run of non-alphanumeric characters into a
# single space.
tweets['Text'] = (
    tweets['Text']
    .str.lower()
    .str.replace(r"http\S+", "", regex=True)
    .str.replace('[^A-Za-z0-9]+', ' ', regex=True)
)

In [4]:
import nltk
from nltk.corpus import stopwords
## NLTK library provides the set of stop words for English

# English stop words, held in a set so each membership test is O(1).
# A separate name keeps the imported `stopwords` corpus reader intact, which
# lets this cell be re-run without failing on `.words`.
english_stopwords = set(stopwords.words('english'))

# Rebuild every tweet from the words that are not stop words.
tweets['Text'] = tweets['Text'].apply(
    lambda words: ' '.join(
        word.lower() for word in words.split() if word not in english_stopwords
    )
)
print(tweets.head())

                                                Text           Sentiment
0                       menyrbie phil gahan chrisitv             Neutral
1  advice talk neighbours family exchange phone n...            Positive
2  coronavirus australia woolworths give elderly ...            Positive
3  food stock one empty please panic enough food ...            Positive
4  ready go supermarket covid19 outbreak paranoid...  Extremely Negative


In [6]:
import nltk

# Shared helpers: a whitespace tokenizer and a WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Return the lemma of every whitespace-separated word in ``text``.

    Args:
        text: The string to split and lemmatize.

    Returns:
        A list with one lemmatized token per word, in the original order.
    """
    lemmas = []
    for word in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


# Store the per-tweet token lists in a new column and preview the frame.
tweets['lemmatized_tokens'] = tweets['Text'].apply(lemmatize_text)
tweets.head()

Unnamed: 0,Text,Sentiment,lemmatized_tokens
0,menyrbie phil gahan chrisitv,Neutral,"[menyrbie, phil, gahan, chrisitv]"
1,advice talk neighbours family exchange phone n...,Positive,"[advice, talk, neighbour, family, exchange, ph..."
2,coronavirus australia woolworths give elderly ...,Positive,"[coronavirus, australia, woolworth, give, elde..."
3,food stock one empty please panic enough food ...,Positive,"[food, stock, one, empty, please, panic, enoug..."
4,ready go supermarket covid19 outbreak paranoid...,Extremely Negative,"[ready, go, supermarket, covid19, outbreak, pa..."


In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Vocabulary: every distinct lemmatized token, in order of first appearance.
# dict.fromkeys de-duplicates in a single pass while keeping insertion order,
# so the cost stays linear in the number of tokens instead of scanning a
# growing list for every token. Iterating the column directly also avoids
# depending on the frame having a default 0..n-1 index.
tokens = list(
    dict.fromkeys(
        token
        for tweet_tokens in tweets['lemmatized_tokens']
        for token in tweet_tokens
    )
)

In [14]:
# Report how many distinct tokens were collected into the vocabulary.
vocabulary_size = len(tokens)
print(vocabulary_size)

51111


In [11]:
# One-hot encoding demo on a nine-token sample of the vocabulary
# (tokens[1:10], i.e. the slice starts at the second token).
# Step 1: map each sampled token to an integer class id.
integer_label_encoded = label_encoder.fit_transform(tokens[1:10])

# Step 2: OneHotEncoder expects a 2-D input, so turn the ids into a column.
label_encoded = integer_label_encoded.reshape(-1, 1)

# Step 3: expand each id into a dense one-hot row.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(label_encoded)

print(onehot_encoded)

[[0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]]


In [22]:
import gensim

from gensim.models import Word2Vec

# Train a Word2Vec model on the lemmatized tweets. `sg` is left at its
# default, which selects the continuous bag-of-words architecture.
# No seed is passed and training uses four worker threads, so the similarity
# printed below can differ from run to run.
CBOW = Word2Vec(
    sentences=tweets['lemmatized_tokens'],
    vector_size=10,
    window=5,
    min_count=1,
    workers=4,
)

cbow_similarity = CBOW.wv.similarity('market', 'stock')
print("Cosine similarity between 'Market' " + "and 'Stock' - Continuous Bag of Word : ", cbow_similarity)

Cosine similarity between 'Market' and 'Stock' - Continuous Bag of Word :  0.756814


In [23]:
# Same corpus and hyper-parameters as the CBOW model, but `sg=1` switches the
# architecture to skip-gram. No seed is passed and training uses four worker
# threads, so the similarity printed below can differ from run to run.
CSG = Word2Vec(
    sentences=tweets['lemmatized_tokens'],
    vector_size=10,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

skip_gram_similarity = CSG.wv.similarity('market', 'stock')
print("Cosine similarity between 'Market' " + "and 'Stock' - Skip Gram: ", skip_gram_similarity)

Cosine similarity between 'Market' and 'Stock' - Skip Gram:  0.87107706


In [24]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(text):
    """Tokenize ``text`` with NLTK and return the Porter stem of each token.

    Args:
        text: The document string to tokenize.

    Returns:
        A list of stemmed tokens, in the order ``nltk.word_tokenize`` yields them.
    """
    # Create the stemmer once per call rather than once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]


# Fit TF-IDF on the cleaned tweet text, using the stemming tokenizer above.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(tweets["Text"])

# to visualize the formed TF-IDF matrix
# NOTE: toarray() converts the sparse result into a full dense array
# (one row per tweet, one column per vocabulary term), which can be large.
tfs.toarray()



array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.36782873, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [27]:
import itertools
import collections
import pandas as pd

# Flatten the per-tweet token lists into one long list of tokens.
lemmatized_tokens = list(tweets["lemmatized_tokens"])
token_list = list(itertools.chain.from_iterable(lemmatized_tokens))

# Count how often each token occurs across all tweets.
counts_no = collections.Counter(token_list)

# most_common() returns (word, count) pairs ordered from most to least
# frequent, so the frame is already sorted by descending count.
clean_tweets = pd.DataFrame(counts_no.most_common(30),
                             columns=['words', 'count'])

# The first 20 rows are therefore the 20 most frequent words.
most_frequent_words = clean_tweets['words'][:20]

In [29]:
import numpy as np

# Binary bag-of-words: one row per tweet, one column per word in
# `most_frequent_words`; a cell is 1 when the word occurs in the tweet.
# NOTE(review): `most_frequent_words` holds lemmatized tokens while the tweets
# are matched on their un-lemmatized text, so inflected forms are not counted.
# Confirm this is intended.
vectors = []

for line in tweets['Text']:
    # A set gives O(1) membership checks. The local name `line_tokens` keeps
    # the notebook-level `tokens` vocabulary list from being overwritten.
    line_tokens = set(nltk.word_tokenize(line))
    vectors.append([1 if word in line_tokens else 0 for word in most_frequent_words])

sentence_vectors = np.asarray(vectors)
# Bag-of-Word Matrix
sentence_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])