In [None]:
def display(*args, **kargs):
    """No-op stand-in for a notebook `display` helper.

    Accepts any positional and keyword arguments, ignores them, and returns
    None, so cells that call `display(...)` still run.
    """
    return None

# Wikipedia: Word2Vec
 
In this lab, we'll use `Word2Vec` to create vectors for the words found in the Wikipedia dataset.  We'll use `Word2Vec` by passing in a `DataFrame` containing sentences.  We can pass into `Word2Vec` what length of vector to create, with larger vectors taking more time to build.
 
Being able to convert words into vectors provides us with features that can be used in traditional machine learning algorithms.  These vectors can be used to compare word similarity, sentence similarity, or even larger sections of text.

Load the data.

In [None]:
# Read the small Wikipedia dataset from a parquet file under a fixed base directory.
# NOTE(review): `sqlContext` is not created in this notebook; presumably the
# runtime environment provides it -- confirm.
baseDir = '/mnt/ml-class/'
dfSmall = sqlContext.read.parquet(baseDir + 'smallwiki.parquet')

In [None]:
# Number of rows loaded, before any filtering.
dfSmall.count()

Filter out unwanted data.

In [None]:
import pyspark.sql.functions as func
from pyspark.sql.functions import col
# Keep only rows that parsed correctly, are not redirects, and have body text.
parsedOk = col('title') != '<PARSE ERROR>'
notRedirect = col('redirect_title').isNull()
hasText = col('text').isNotNull()

filtered = dfSmall.filter(parsedOk & notRedirect & hasText)

Change all text to lower case.

In [None]:
# Add a lower-cased copy of `text` next to all of the existing columns.
lowerTextCol = func.lower(col('text')).alias('lowerText')
lowered = filtered.select('*', lowerTextCol)

In [None]:
# Swap the original `text` column for its lower-cased version.
withoutOriginalText = lowered.drop('text')
parsed = withoutOriginalText.withColumnRenamed('lowerText', 'text')

Split the Wikipedia text into sentences.

In [None]:
import re

# Sentence boundary: a period followed by a space, or a run of two or more newlines.
pattern = r'(\. |\n{2,})'

# Sanity-check the pattern on a small sample.  The single newline and the final
# period (no trailing space) are intentionally not treated as boundaries.
matches = re.findall(pattern, 'Wiki page. *More information*\n\n And a line\n that continues.')
print(matches)

In [None]:
from pyspark.ml.feature import RegexTokenizer

# Split each article's text on the sentence-boundary pattern, then keep only
# the resulting `sentences` column.
tokenizer = RegexTokenizer(pattern=pattern, inputCol='text', outputCol='sentences')
tokenizedArticles = tokenizer.transform(parsed)
sentences = tokenizedArticles.select('sentences')
display(sentences)

In [None]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

# Flatten the per-article sentence arrays into one row per sentence.  The RDD is
# reached through `.rdd` explicitly, matching the `.rdd.map(...)` call used later
# in this notebook; calling `flatMap` directly on a DataFrame was removed from
# the Python API in Spark 2.0.
sentenceRDD = (sentences
               .rdd
               .flatMap(lambda r: r[0])
               .map(lambda x: Row(sentence=x)))

# A single string column named `sentence`, which the word tokenizer reads next.
sentenceSchema = StructType([StructField('sentence', StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)

display(sentence)

Split the sentences into words.

In [None]:
# Split each sentence on runs of non-word characters and keep only the
# resulting `words` column.
tokenizerWord = RegexTokenizer(pattern=r'\W+', inputCol='sentence', outputCol='words')
tokenizedSentences = tokenizerWord.transform(sentence)
words = tokenizedSentences.select('words')
display(words)

Use our `removeWords` function that we registered in wiki-eda to clean up stop words.

In [None]:
# Drop any existing `words` table first, then expose the `words` DataFrame to
# SQL under that name.
sqlContext.sql('drop table if exists words')
words.registerTempTable('words')

In [None]:
# `removeWords` is not defined in this notebook; the accompanying text says it was
# registered in wiki-eda, so that registration must already be in effect.
# NOTE(review): a `.cache()` call was left commented out on this statement --
# presumably an experiment; confirm whether caching is wanted here.
noStopWords = sqlContext.sql('select removeWords(words) as words from words')
display(noStopWords)

In [None]:
# Discard rows whose word list is empty, then report how many rows remain as
# input for Word2Vec.
wordVecInput = noStopWords.filter(func.size('words') != 0)
wordVecInput.count()

Build the `Word2Vec` model.  This takes about a minute with two workers.

In [None]:
from pyspark.ml.feature import Word2Vec
# vectorSize=150 is the length of each word vector; minCount=50 is the minimum
# number of occurrences a word needs to be kept in the vocabulary; seed=0 fixes
# the random seed.  Vectors produced by the fitted model land in `result`.
word2Vec = Word2Vec(vectorSize=150, minCount=50, inputCol='words', outputCol='result', seed=0)
model = word2Vec.fit(wordVecInput)

Let's see the model in action.

In [None]:
# Ask the model for 10 synonyms of 'house'.
model.findSynonyms('house', 10).collect()

In [None]:
synonyms = model.findSynonyms('fruit', 10).collect()

# The loop variable is named `score`, not `similarity`, so that re-running this
# cell cannot overwrite the `similarity()` function defined later in this notebook.
for word, score in synonyms:
    print("{}: {}".format(word, score))

In [None]:
# Ask the model for 10 synonyms of 'soccer'.
model.findSynonyms('soccer', 10).collect()

How can we calculate similarity between vectors and handle creating a vector for multiple words at once?

In [None]:
from pyspark.sql import Row
# Three tiny "sentences": each word alone, then both words together.
wordLists = [['fruit'], ['flower'], ['fruit', 'flower']]
tmpDF = sqlContext.createDataFrame([Row(words=tokens) for tokens in wordLists])

In [None]:
# Collect one `result` vector per row of tmpDF.  `.rdd` is used explicitly, as in
# the other transform-and-collect cell of this notebook; calling `map` directly
# on a DataFrame was removed from the Python API in Spark 2.0.
vFruit = model.transform(tmpDF).rdd.map(lambda r: r.result).collect()

Let's create a cosine similarity measure.

In [None]:
from numpy.linalg import norm

def similarity(x, y):
    """Return the cosine similarity of vectors `x` and `y`.

    Computed as the dot product of the two vectors divided by the product of
    their norms.
    """
    dotProduct = x.dot(y)
    normProduct = norm(x) * norm(y)
    return dotProduct / normProduct

# First two vectors compared with each other, then the last two.
# print() is called as a function so the cell runs on Python 2 and Python 3.
print(similarity(*vFruit[:2]))
print(similarity(*vFruit[1:]))

`Word2Vec` handles multiple words by averaging the vectors.

In [None]:
# Show the first six components of each vector.  print() is called as a function
# and wraps the whole averaged expression, so the division applies to the
# vectors rather than to the return value of print().
print(vFruit[0][:6])
print(vFruit[1][:6])
print((vFruit[0][:6] + vFruit[1][:6]) / 2)  # Averaging the word vectors gives us the vector for both words in a sentence
print(vFruit[2][:6])

In [None]:
from pyspark.sql import Row
# One single-word row per term of the king/man/woman/queen comparison.
analogyWords = ['king', 'man', 'woman', 'queen']
tmpDF = sqlContext.createDataFrame([Row(words=[token]) for token in analogyWords])

# Collect the model's `result` vector for each row.
transformed = model.transform(tmpDF)
v1 = transformed.rdd.map(lambda r: r.result).collect()

In [None]:
# Unpack in the order the rows were created: king, man, woman, queen.
# NOTE(review): assumes collect() returned the rows in creation order -- confirm.
k, m, w, q = v1

# print() is called as a function so the cell runs on Python 2 and Python 3.
print(similarity(k, q))
print(similarity(k, (q + m)/2))
print(similarity(k, m))
print(similarity(q, m))
print(similarity(q, k - m + w))  # analogy check: king - man + woman vs. queen