In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import LogisticRegression
import pyspark.sql.functions as F
import pyspark.sql.types as T

sc = SparkContext('local')
spark = SparkSession(sc)

In [2]:
df = spark.read.format("csv").option("inferschema","true").option("header", "true").option("delimiter", "\t").load("trainReviews.tsv")

In [3]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(df)
wordsData.show(5)

+----+--------+--------------------+--------------------+
|  id|category|                text|               words|
+----+--------+--------------------+--------------------+
| 858|       0|burnt money is th...|[burnt, money, is...|
|1762|       1|the   italian hit...|[the, , , italian...|
| 235|       0|at times   you d ...|[at, times, , , y...|
| 712|       0|after a marketing...|[after, a, market...|
|1319|       1|john cusack is th...|[john, cusack, is...|
+----+--------+--------------------+--------------------+
only showing top 5 rows



In [4]:
word2Vec = Word2Vec(inputCol="words", outputCol="result")

In [5]:
model = word2Vec.fit(wordsData)

In [6]:
model.getVectors().show(100)

+-------------+--------------------+
|         word|              vector|
+-------------+--------------------+
|     incident|[0.00438264850527...|
|      serious|[0.03595425561070...|
|        brink|[0.00153260945808...|
|     sinister|[-0.0162860918790...|
|       breaks|[-0.0075081149116...|
|    forgotten|[-0.0226922817528...|
|     precious|[-0.0177151039242...|
|        mario|[-0.0197891276329...|
|   compliment|[-0.0030940368305...|
|        lover|[-0.0201899744570...|
|     terrible|[-0.0321051441133...|
|         lion|[0.00814848858863...|
|         rate|[-0.0318052470684...|
|   inevitable|[0.01720425672829...|
|  nonetheless|[-0.0055500366725...|
|       lights|[0.00479545490816...|
|         rage|[0.01458693947643...|
|        knack|[-8.9934479910880...|
|         snow|[-0.0167874917387...|
|     headache|[5.87414833717048...|
|     laughing|[0.02785842493176...|
|      michael|[-0.1444831639528...|
|        looks|[-0.0676502808928...|
|     zemeckis|[-0.0028793253004...|
|

In [7]:
synonyms = model.findSynonyms('sinister', 100)

In [8]:
synonyms.show(100)

+------------+------------------+
|        word|        similarity|
+------------+------------------+
|        pete|0.7448366284370422|
|       ricky|0.7204491496086121|
|  astounding|0.7153885364532471|
|       bitch|0.7118590474128723|
|   homicidal| 0.710591197013855|
|        king|0.7058147192001343|
|       score| 0.699134111404419|
|        skin|0.6960994005203247|
|      delmar|0.6951101422309875|
|        gang|0.6895015239715576|
|         car|0.6856046319007874|
|        buck| 0.681916356086731|
|    producer|0.6787402629852295|
|      quirky|0.6778930425643921|
|        neck|0.6763299703598022|
|        kurt|0.6757290363311768|
|      gentle|0.6725389361381531|
|     madness|0.6722574234008789|
|       terry| 0.670669436454773|
|      jerome|0.6685158610343933|
|      quotes|0.6677120923995972|
|    birthday|0.6663986444473267|
|     fighter| 0.665898323059082|
|        bold|0.6608845591545105|
|     kingpin|0.6598169207572937|
|      malick| 0.658460259437561|
|         cop|