In [None]:
import pandas as pd

# Movie-review CSV hosted on GitHub; it is fetched over the network each time this cell runs.
url = 'https://raw.githubusercontent.com/javaidnabi31/Word-Embeddding-Sentiment-Classification/master/movie_data.csv'
# Load the whole file into a DataFrame; the 'review' column is the text used further down.
df = pd.read_csv(url, encoding='utf-8')

In [None]:
# Print the column summary, then display the first rows as the cell output.
df.info()
df.head()

In [None]:
# Pull the raw review texts out of the DataFrame as a plain Python list
lines = df['review'].tolist()

In [None]:
# Peek at the first five raw reviews before any cleaning.
lines[:5]

In [None]:
import nltk

# Fetch every NLTK resource the cleaning functions rely on:
#   - 'stopwords': the stop-word lists read by stopwords.words('english')
#   - 'punkt':     the tokenizer models word_tokenize needs; without them
#                  word_tokenize raises LookupError on a fresh environment
#   - 'punkt_tab': the resource id recent NLTK releases look up instead of 'punkt'
# nltk.download reports a failure and returns False rather than raising, so
# requesting an id that a given NLTK version does not use is harmless.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
import string
from functools import lru_cache

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
def to_lower(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def del_punctuation(tokens):
    """Delete every character listed in string.punctuation from each token.

    The output has one entry per input token; a token made up only of
    punctuation becomes an empty string rather than being dropped.
    """
    # Map each punctuation character to None so translate() deletes it.
    drop_punct = str.maketrans(dict.fromkeys(string.punctuation))
    return [token.translate(drop_punct) for token in tokens]

def del_alphabet(stripped):
    """Keep only the tokens made up entirely of alphabetic characters.

    Empty strings, numbers and mixed alphanumeric tokens are discarded.
    """
    alphabetic = []
    for token in stripped:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic
  
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    return frozenset(stopwords.words('english'))


def del_stopwords(words):
    """Filter out English stop words.

    Args:
        words: iterable of tokens; they are compared as-is against NLTK's
            English stop-word list, so lower-case them first.

    Returns:
        A new list with the stop words removed, order preserved.
    """
    # The stop-word set is cached: this function is called once per review,
    # and re-reading the corpus file on every call is wasted work.
    stop_words = _english_stop_words()
    return [w for w in words if w not in stop_words]
  
def stemming(words):
    """Return the Porter stem of each word, in the original order."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, words))
  
def text_clean(line, text_stemming=False):
    """Turn one raw review string into a list of cleaned word tokens.

    Pipeline: tokenize -> lower-case -> strip punctuation -> drop
    non-alphabetic tokens -> drop English stop words -> (optional) stem.

    Args:
        line: the raw text to clean.
        text_stemming: when True, each surviving word is replaced by its
            Porter stem.

    Returns:
        List of cleaned tokens (stems if text_stemming is True).
    """
    tokens = word_tokenize(line)
    tokens = to_lower(tokens)
    stripped = del_punctuation(tokens)
    words = del_alphabet(stripped)
    words = del_stopwords(words)
    # Stem last: stemming first would turn stop words into stems such as
    # "thi" or "wa", which no longer match the stop-word list and leak through.
    return stemming(words) if text_stemming else words

In [None]:
# Clean every review; the result holds one list of word tokens per review
review_lines = [text_clean(line) for line in lines]

In [None]:
# Show one review before and after cleaning (the cleaned form is what Word2Vec is trained on)
print(lines[0], '-' * 100, review_lines[0], sep='\n')

## Training

In [None]:
from gensim.models import Word2Vec

# Dimensionality of each learned word vector (passed to Word2Vec below).
EMBEDDING_DIM = 100
# File the trained word vectors are saved to and later reloaded from.
fname = 'imdb_word2vec.bin'

In [None]:
# Train the Word2Vec model on the cleaned, tokenized reviews.
import gensim

# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`, and passing
# the old name raises TypeError there. Pick the keyword that matches the
# installed major version so this cell runs on both 3.x and 4.x.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model = Word2Vec(sentences=review_lines,
                 window=5,      # context window passed to the trainer
                 workers=4,     # number of training worker threads
                 min_count=1,   # keep every token, even those seen only once
                 **{_dim_kwarg: EMBEDDING_DIM})
# Keep a handle on the trained vectors; every query below goes through it.
word_vectors = model.wv

In [None]:
# Display the learned embedding vector for a single word.
word_vectors['potato']

In [None]:
# List the words whose vectors are most similar to "cat".
word_vectors.similar_by_word("cat")

In [None]:
# Similarity score between the vectors of two specific words.
word_vectors.similarity('king', 'queen')

In [None]:
# Odd one out: among woman / king / queen / movie,
# the word we expect to be singled out is "movie".
word_vectors.doesnt_match("woman king queen movie".split())

In [None]:
# Words most similar to 'women' and 'men' taken together (multiplicative "cosmul" scoring).
word_vectors.most_similar_cosmul(positive=['women', 'men'])

In [None]:
# Analogy query: king - man + woman. If the vectors are semantically
# reasonable, words such as "queen" should rank near the top.
word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

## Save model

In [None]:
# Persist only the word vectors (model.wv), not the full trainable model.
# NOTE(review): despite the ".bin" file name this goes through KeyedVectors.save, so it
# should be reloaded with KeyedVectors.load (as done below), not load_word2vec_format.
word_vectors.save(fname)

## Load model

In [None]:
from gensim.models import KeyedVectors
# Reload the vectors saved above; mmap='r' asks gensim to memory-map the arrays read-only.
imdb_word_vectors = KeyedVectors.load(fname, mmap='r')

In [None]:
# Sanity check: look up a vector through the reloaded KeyedVectors.
imdb_word_vectors['hi']