# Word Embeddings

In [None]:
#Importing dependencies
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import RNN, SimpleRNN, LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
import random
import matplotlib.pyplot as plt
%matplotlib inline 
from mpl_toolkits import mplot3d
import warnings
warnings.filterwarnings("ignore")

# LAB1: Simple Word Embeddings in TensorFlow

In [None]:
# Toy training corpus: thirteen short lowercase sentences, one string per sentence.
corpus = ['king is a strong man', 
          'queen is a wise woman', 
          'boy is a young man',
          'girl is a young woman', 
          'prince is a young',
          'prince will be strong',
          'princess is young',
          'man is strong', 
          'woman is pretty',
          'prince is a boy', 
          'prince will be king',
          'princess is a girl',
          'princess will be queen']

## Clean the corpus by removing stopwords

Remove stop words.
To make building the word vectors more efficient, we remove commonly used words (stop words) that carry little meaning.

In [None]:
def drop_stop_words(corpus):
    """Return the texts of ``corpus`` with every stop word removed.

    Args:
        corpus: Iterable of strings. Each string is tokenized by splitting
            on single space characters.

    Returns:
        A list with one string per input text, holding the remaining words
        in their original order, re-joined with single spaces.
    """
    # A set gives constant-time membership tests for each token.
    stop_words = {'is', 'a', 'will', 'be', 'was', 'and'}
    # Filter every token: list.remove() would only drop the first occurrence
    # of each stop word and leave repeated ones (e.g. a second 'a') behind.
    return [
        " ".join(word for word in text.split(' ') if word not in stop_words)
        for text in corpus
    ]

In [None]:
corpus_v1 = drop_stop_words(corpus)
print(corpus_v1)

In [None]:
# Flatten the cleaned corpus into one list of tokens, then de-duplicate it
# to obtain the vocabulary.
all_words = []
for text in corpus_v1:
    all_words.extend(text.split(' '))
words = set(all_words)
print(words)

In [None]:
# Give every vocabulary word an integer id. The vocabulary is sorted first so
# the mapping is reproducible: iterating the raw set can yield a different
# order (and therefore different ids) from one interpreter session to the next.
word2int = {word: i for i, word in enumerate(sorted(words))}

# Tokenize each cleaned sentence on whitespace.
sentences = [sentence.split() for sentence in corpus_v1]
    


## Data Preparation - Word to Context

In [None]:
# Number of positions to look at on each side of the current word
# (original author's note: 5-10).
WINDOW_SIZE = 2

# Collect [word, neighbor] pairs: each word is paired with every token inside
# its window whose text differs from the word itself.
data = []
for sentence in sentences:
    for idx, word in enumerate(sentence):
        lo = max(idx - WINDOW_SIZE, 0)
        hi = min(idx + WINDOW_SIZE, len(sentence)) + 1
        data.extend(
            [word, neighbor] for neighbor in sentence[lo:hi] if neighbor != word
        )

In [None]:
import pandas as pd
# Tabulate the pairs collected in `data`: the first element of each pair goes
# in the 'input' column and the second in the 'label' column.

df = pd.DataFrame(data, columns = ['input', 'label'])

In [None]:
print(df)

In [None]:
df.shape

In [None]:
word2int

## One-Hot encoding

In [None]:
ONE_HOT_DIM = len(words)

In [None]:
def to_one_hot_encoding(data_point_index, dim=None):
    """Return a one-hot vector with a 1 at ``data_point_index``.

    Args:
        data_point_index: Position to set to 1.
        dim: Length of the returned vector. When omitted, the module-level
            ``ONE_HOT_DIM`` is used, which matches the original behavior.

    Returns:
        A 1-D NumPy array of zeros of length ``dim`` with a single 1.

    Raises:
        IndexError: If ``data_point_index`` is out of bounds for ``dim``.
    """
    # Resolve the global lazily so callers that pass `dim` do not depend on it.
    size = ONE_HOT_DIM if dim is None else dim
    one_hot_encoding = np.zeros(size)
    one_hot_encoding[data_point_index] = 1
    return one_hot_encoding
# One-hot encode both columns of the pair table:
# X holds the input words, Y the target (neighbor) words.
X = [to_one_hot_encoding(word2int[token]) for token in df['input']]
Y = [to_one_hot_encoding(word2int[token]) for token in df['label']]

# Stack the per-row vectors into 2-D NumPy arrays for training.
X_train = np.asarray(X)
Y_train = np.asarray(Y)
print(X_train.shape)
print(Y_train.shape)

In [None]:
X_train.shape[1]

## Model Building

In [None]:
# Two-layer network: a 3-unit sigmoid hidden layer (its activations are later
# read out as the word vectors) followed by a softmax over the vocabulary.
model1 = Sequential([
    Dense(3, input_dim=X_train.shape[1], activation='sigmoid'),
    Dense(Y_train.shape[1], activation='softmax'),
])
model1.summary()

In [None]:
# Configure the optimizer and loss, then train silently (verbose=0).
model1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = model1.fit(
    X_train,
    Y_train,
    batch_size=4,
    epochs=600,
    verbose=0,
)

In [None]:
# Rebuild only the hidden layer of model1 so that its activations can be read
# out as 3-dimensional word vectors.
model1h = Sequential()
model1h.add(Dense(3, input_dim=X_train.shape[1]))
model1h.add(Activation('sigmoid'))
# Copy the trained weights once the layer exists instead of passing the
# `weights=` constructor argument, which newer Keras releases reject.
model1h.layers[0].set_weights(model1.layers[0].get_weights())

vectors_data = pd.DataFrame(model1h.predict(X_train))
vectors_data["word"] = df["input"]
# X_train has one row per (word, neighbor) pair, so each word appears several
# times. Keep a single row per word; keying on the word alone avoids relying
# on exact float equality between the repeated rows.
vectors_data = vectors_data.drop_duplicates(subset="word")
print(vectors_data)

## Final Word2Vec Data visualization

In [None]:
# Re-label the three embedding columns (0, 1, 2) as x1, x2, x3 and put the
# word column first.
w2v_df = pd.DataFrame({
    "word": vectors_data["word"],
    "x1": vectors_data[0],
    "x2": vectors_data[1],
    "x3": vectors_data[2],
})
print(w2v_df)

In [None]:
plt.rcParams.update({'font.size': 20})

fig = plt.figure(figsize=(15,10))
# Create the 3-D axes exactly once; a second plt.axes() call would add another
# axes on top of the first one.
ax = plt.axes(projection='3d')

xdata = w2v_df["x1"]
ydata = w2v_df["x2"]
zdata = w2v_df["x3"]
names = w2v_df["word"]

ax.scatter3D(xdata, ydata, zdata, s=200, c='green')
# Annotate each point with its word. The loop variable is named `label` so it
# does not overwrite the `names` Series being iterated.
for label, x, y, z in zip(names, xdata, ydata, zdata):
    ax.text(x, y, z, label)
plt.show()

# LAB2 : Word Embeddings example in Gensim

In [None]:
!pip install gensim
!pip install google.cloud

In [None]:
#import gzip
import gensim

In [None]:
import urllib.request 
# Download the King_queen_v1.txt text file and save it in the working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/word2vec_data/King_queen_v1.txt", "King_queen_v1.txt")
data_file="King_queen_v1.txt"

In [None]:
def read_input(input_file):
    """Lazily tokenize ``input_file`` one line at a time.

    The file is opened in binary mode and each raw line is passed to
    ``gensim.utils.simple_preprocess``; the result is yielded per line.
    A progress message is printed for every even line index.
    """
    with open(input_file, 'rb') as handle:
        for line_no, raw_line in enumerate(handle):
            if line_no % 2 == 0:
                print("read {0} lines".format(line_no))
            # Pre-process the line and hand back its tokens.
            yield gensim.utils.simple_preprocess(raw_line)
# Consume the generator so the tokenized lines are held in memory:
# one entry per line of the file, each entry being the value
# returned by gensim.utils.simple_preprocess for that line.
documents = list (read_input (data_file))

In [None]:
print(documents)

In [None]:
import gensim
from gensim.models import Word2Vec

# gensim 4.0 renamed the `size` argument to `vector_size`. Pick whichever name
# the installed release understands so this cell runs on both 3.x and 4.x.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
model = Word2Vec(documents, min_count=1, window=2, **{_dim_kwarg: 3})
# size / vector_size: dimensionality of each word vector (hidden layer width)
# min_count: discard words that appear fewer than this many times
# window: context window size

## Hyperparameters

### size
The number of hidden nodes, i.e. the size of the dense vector used to represent each token or word (this parameter is named `vector_size` in gensim 4.0 and later). If you have very limited data, then size should be a much smaller value. If you have lots of data, it's good to experiment with various sizes.

### window
Context window size. The maximum distance between the target word and its neighboring word. Words that lie farther than this distance to the left or right of the target word are not considered related to it. In theory, a smaller window should give you terms that are more related. If you have lots of data, then the window size should not matter too much, as long as it's a decent-sized window.

### min_count
Minimum frequency count of words. The model ignores words that do not satisfy the min_count. Extremely infrequent words are usually unimportant, so it's best to get rid of those. Unless your dataset is really tiny, this does not really affect the model.

In [None]:
# Look the vectors up through `model.wv`: indexing the Word2Vec object
# directly is not supported by gensim 4.x, while `.wv` works on 3.x and 4.x.
vectors = model.wv[words]
print(vectors)

In [None]:
# Access the vector of a single word. Use `model.wv[...]` because indexing
# the Word2Vec object directly is not supported by gensim 4.x.
print(model.wv['king'])
print(model.wv['man'])
print(model.wv['queen'])

In [None]:
# Print every vocabulary word next to its vector. A plain loop is used
# because the prints are side effects (a comprehension would only build a
# list of None), and the lookup goes through `model.wv` for gensim 4.x.
for word in words:
    print(word, model.wv[word])

In [None]:
# Save the trained Word2Vec model to "model1.bin" (path relative to the
# current working directory).
model.save(r"model1.bin")
# Load the saved file back into a separate object, `new_model`.
new_model = Word2Vec.load(r"model1.bin")

# Application of Word2Vec - Sentiment Analysis

[Word2_vec_Sentiment_Analysis code file](https://colab.research.google.com/drive/1666qEPApWzdw-efCu7JHhulF3MLRG2cw)

# Pre-trained model by Google (3 million words)

## Load the model

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Fetch the Drive file with the given id and store it locally as
# GoogleNews-vectors-negative300.zip, then extract the archive.
downloaded = drive.CreateFile({'id':"13e-z7BhTcm69qCjoFNehc6S0sc2qWod8"})   
downloaded.GetContentFile("GoogleNews-vectors-negative300.zip") 
!unzip GoogleNews-vectors-negative300.zip

#binfile- https://drive.google.com/file/d/1OfDK_9nUPYC1uCvscsr5z4cfBnWMrqX6/view?usp=sharing
#zipfile- https://drive.google.com/file/d/13e-z7BhTcm69qCjoFNehc6S0sc2qWod8/view?usp=sharing


In [None]:
from gensim.models import KeyedVectors
# Load the Google News word2vec vectors from the binary file.
# NOTE: this rebinds `model`; from here on it is a KeyedVectors object,
# not the Word2Vec model trained earlier in this notebook.
filename="GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [None]:
result = model.most_similar(positive=['congress', 'sonia'], topn=10)
print(result)

In [None]:
# Look up the top 6 words most similar to 'polite'.
# `model` is already a KeyedVectors object, so the method is called on it
# directly (the `.wv` alias is not available on KeyedVectors in gensim 4.x).
w1 = ["polite"]
model.most_similar(positive=w1, topn=6)

In [None]:
# Look up the top 6 words most similar to 'france'.
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
w1 = ["france"]
model.most_similar(positive=w1, topn=6)

In [None]:
# Look up the top 6 words most similar to 'india'.
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
w1 = ["india"]
model.most_similar(positive=w1, topn=6)

In [None]:
# Look up the top 6 words most similar to 'dhoni'.
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
w1 = ["dhoni"]
model.most_similar(positive=w1, topn=6)

In [None]:
# Similarity between two different words ('dirty' vs 'smelly').
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
model.similarity(w1="dirty", w2="smelly")

In [None]:
# Similarity between two different words ('dirty' vs 'great').
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
model.similarity(w1="dirty", w2="great")

In [None]:
# Similarity between two different words ('dirty' vs 'ugly').
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
model.similarity(w1="dirty", w2="ugly")

In [None]:
# Which one is the odd one out in this list?
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
model.doesnt_match(["run", "walk", "france"])

In [None]:
# Odd one out among 'run', 'india' and 'france'.
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
model.doesnt_match(["run", "india", "france"])

In [None]:
# Look up the top 10 words most similar to 'Nityananda'.
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
w1 = ["Nityananda"]
model.most_similar(positive=w1, topn=10)

In [None]:
# Look up the top 6 words most similar to 'chess'.
# `model` is a KeyedVectors object, so no `.wv` indirection is needed.
w1 = ["chess"]
model.most_similar(positive=w1, topn=6)