<a href="https://colab.research.google.com/github/devnac221990/ICP5-KDM/blob/main/ICP%205%20KDM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/27/67/5158f846202d7f012d1c9ca21c3549a58fd3c6707ae8ee823adcaca6473c/pyspark-3.0.2.tar.gz (204.8MB)
[K     |████████████████████████████████| 204.8MB 66kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 49.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186687 sha256=ef720fbe17dab12bfd6b37906f0db6ca83435d25d9fb37e7bed4b94038eb9b32
  Stored in directory: /root/.cache/pip/wheels/8b/09/da/c1f2859bcc86375dc972c5b6af4881b3603269bcc4c9be5d16
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2


In [2]:
from __future__ import print_function
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Word2Vec

In [3]:
spark = SparkSession.builder \
          .appName("tfidf ICP") \
          .config("spark.some.config.option", "some-value") \
          .getOrCreate()

In [34]:
import pandas as pd
pdf = pd.DataFrame({
        'terms': [
            ['spark', 'spark', 'spark', 'is', 'awesome', 'awesome'],
            ['I', 'love', 'spark', 'very', 'very', 'much'],
            ['everyone', 'should', 'use', 'spark'],
            ['life','is','very','good'],
            ['it', 'is', 'too', 'cold' ]
        ]
    })
df = spark.createDataFrame(pdf)
df.show(truncate=False)

+-------------------------------------------+
|terms                                      |
+-------------------------------------------+
|[spark, spark, spark, is, awesome, awesome]|
|[I, love, spark, very, very, much]         |
|[everyone, should, use, spark]             |
|[life, is, very, good]                     |
|[it, is, too, cold]                        |
+-------------------------------------------+



In [5]:
from pyspark.ml.feature import HashingTF
from pyspark.ml import Pipeline

hashtf = HashingTF(numFeatures=pow(2, 4), inputCol='terms', outputCol='features(numFeatures), [index], [term frequency]')
stages = [hashtf]
pipeline = Pipeline(stages=stages)

In [14]:
pipeline.fit(df).transform(df).show(truncate=False)

+-------------------------------------------+----------------------------------------------+
|terms                                      |features(vocabSize), [index], [term frequency]|
+-------------------------------------------+----------------------------------------------+
|[spark, spark, spark, is, awesome, awesome]|(15,[0,2,3],[3.0,1.0,2.0])                    |
|[I, love, spark, very, very, much]         |(15,[0,1,5,10,12],[1.0,2.0,1.0,1.0,1.0])      |
|[everyone, should, use, spark]             |(15,[0,6,7,11],[1.0,1.0,1.0,1.0])             |
|[life, is, very, good]                     |(15,[1,2,8,13],[1.0,1.0,1.0,1.0])             |
|[it, is, too, cold]                        |(15,[2,4,9,14],[1.0,1.0,1.0,1.0])             |
+-------------------------------------------+----------------------------------------------+



In [28]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline

countvectorizer = CountVectorizer(minTF=1.0, minDF=1.0, vocabSize=20, 
                                  inputCol='terms', outputCol='features(vocabSize), [index], [term frequency]')
stages = [countvectorizer]
pipeline = Pipeline(stages=stages)

In [116]:
pipeline.fit(df).transform(df).show(truncate=False)

+----------------------------------------+----------------------------------------------+
|terms                                   |features(vocabSize), [index], [term frequency]|
+----------------------------------------+----------------------------------------------+
|[life, be, good]                        |(16,[0,9,14],[1.0,1.0,1.0])                   |
|[student, should, love, coding]         |(16,[1,6,8,13],[1.0,1.0,1.0,1.0])             |
|[everyone, should, be, code]            |(16,[0,1,4,7],[1.0,1.0,1.0,1.0])              |
|[programming, be, the, basic, education]|(16,[0,2,11,12,15],[1.0,1.0,1.0,1.0,1.0])     |
|[the, temperature, be, below, freeze]   |(16,[0,2,3,5,10],[1.0,1.0,1.0,1.0,1.0])       |
+----------------------------------------+----------------------------------------------+



In [117]:
from pyspark.sql.types import StringType
df_vocab = df.select('terms').rdd.\
            flatMap(lambda x: x[0]).\
            toDF(schema=StringType()).toDF('terms')
df_vocab.show()

+-----------+
|      terms|
+-----------+
|       life|
|         be|
|       good|
|    student|
|     should|
|       love|
|     coding|
|   everyone|
|     should|
|         be|
|       code|
|programming|
|         be|
|        the|
|      basic|
|  education|
|        the|
|temperature|
|         be|
|      below|
+-----------+
only showing top 20 rows



In [31]:
vocab_freq = df_vocab.rdd.countByValue()
pdf = pd.DataFrame({
        'term': list(vocab_freq.keys()),
        'frequency': list(vocab_freq.values())
    })
pdf
tf = spark.createDataFrame(pdf).orderBy('frequency', ascending=False)
tf.show()

+----------+---------+
|      term|frequency|
+----------+---------+
|   [spark]|        5|
|      [is]|        3|
|    [very]|        3|
| [awesome]|        2|
|       [I]|        1|
|[everyone]|        1|
|    [love]|        1|
|  [should]|        1|
|     [use]|        1|
|    [good]|        1|
|    [much]|        1|
|    [life]|        1|
|      [it]|        1|
|     [too]|        1|
|    [cold]|        1|
+----------+---------+



In [118]:
spark3 = SparkSession.builder.appName("Word2Vec Example").getOrCreate()

In [119]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark3.createDataFrame([
    ("spark spark spark is awesome awesome.".split(" "), ),
    ("I love spark very very much.".split(" "), ),
    ("everyone should use spark.".split(" "), ),
    ("life is very good".split(" "), ),
    ("it is too cold.".split(" "), )
], ["text"])

In [120]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)

In [121]:
for row in result.collect():
    text, vector = row
    #printing the results
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [spark, spark, spark, is, awesome, awesome.] => 
Vector: [-0.03233379746476809,0.07991064277788003,-0.12412767360607782]

Text: [I, love, spark, very, very, much.] => 
Vector: [-0.08162251487374306,-0.07343332651847352,-0.1039524401227633]

Text: [everyone, should, use, spark.] => 
Vector: [0.056249113753437996,0.005625518970191479,-0.01240360178053379]

Text: [life, is, very, good] => 
Vector: [-0.04824359342455864,0.0063680075109004974,-0.0043945712968707085]

Text: [it, is, too, cold.] => 
Vector: [-0.01781434239819646,0.012281995266675949,-0.02408296149224043]



In [122]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("spark", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|  spark.|0.9528490900993347|
|      is|0.9438766241073608|
|    love|0.8344047665596008|
|     too|0.6483806371688843|
|awesome.| 0.484557569026947|
+--------+------------------+



In [123]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("is", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+------------------+
|  word|        similarity|
+------+------------------+
|spark.|0.9786363840103149|
|  love|0.9445621967315674|
| spark|0.9438766837120056|
|     I|0.5885893702507019|
|   too|0.5793203115463257|
+------+------------------+



In [124]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("very", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+-------------------+
|  word|         similarity|
+------+-------------------+
|     I| 0.9961175322532654|
|  love| 0.6879690885543823|
|    is| 0.5169904232025146|
|spark.|0.46952128410339355|
|    it|0.39674097299575806|
+------+-------------------+



In [125]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("awesome", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|     use|0.9864492416381836|
|awesome.|0.9357233643531799|
|    good|0.5822964310646057|
|   much.|0.5262214541435242|
|      it|0.3276222348213196|
+--------+------------------+



In [126]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("everyone", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+-------------------+
|  word|         similarity|
+------+-------------------+
|should|  0.895369291305542|
|  good|  0.589941143989563|
| much.| 0.4466019570827484|
|    it| 0.4165970981121063|
| cold.|0.04272148385643959|
+------+-------------------+



In [127]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("love", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+------------------+
|  word|        similarity|
+------+------------------+
|spark.|0.9609439969062805|
|    is|0.9445623159408569|
| spark|0.8344048857688904|
|     I|0.7400961518287659|
|  very|0.6879690885543823|
+------+------------------+



In [132]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("I", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+-------------------+
|  word|         similarity|
+------+-------------------+
|  very| 0.9961175322532654|
|  love| 0.7400961518287659|
|    is| 0.5885894894599915|
|spark.| 0.5378295183181763|
|    it|0.33645814657211304|
+------+-------------------+



In [128]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("good", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+-------------------+
|    word|         similarity|
+--------+-------------------+
|  should| 0.7369943261146545|
|     use| 0.6675746440887451|
|everyone| 0.5899412035942078|
| awesome| 0.5822964310646057|
|   cold.|0.48677951097488403|
+--------+-------------------+



In [134]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("use", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+------------------+
|    word|        similarity|
+--------+------------------+
| awesome|0.9864492416381836|
|awesome.|0.8725539445877075|
|    good|0.6675746440887451|
|   much.|0.4450313448905945|
|   cold.|0.2677077054977417|
+--------+------------------+



In [135]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("should", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+-------------------+
|    word|         similarity|
+--------+-------------------+
|everyone|  0.895369291305542|
|    good| 0.7369943857192993|
|   cold.| 0.4831452965736389|
|   much.|0.15484245121479034|
|      it|0.04120621830224991|
+--------+-------------------+



In [39]:
spark2 = SparkSession.builder.appName("Ngram Example").getOrCreate()

In [44]:
 wordDataFrame = spark2.createDataFrame([
 (0, ['spark', 'spark', 'spark', 'is', 'awesome', 'awesome']),
            (1, ['I', 'love', 'spark', 'very', 'very', 'much']),
            (2, ['everyone', 'should', 'use', 'spark']),
            (3, ['life','is','very','good']),
            (4, ['it', 'is', 'too', 'cold' ])], ['id', 'words'])

In [46]:
#creating NGrams with n=2 (two words)
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)

In [47]:
ngramDataFrame.select("ngrams").show(truncate=False)

+-----------------------------------------------------------------+
|ngrams                                                           |
+-----------------------------------------------------------------+
|[spark spark, spark spark, spark is, is awesome, awesome awesome]|
|[I love, love spark, spark very, very very, very much]           |
|[everyone should, should use, use spark]                         |
|[life is, is very, very good]                                    |
|[it is, is too, too cold]                                        |
+-----------------------------------------------------------------+



In [52]:
import pandas as pd
pdf = pd.DataFrame({
        'terms': [
            ['spark', 'spark', 'spark', 'spark', 'spark', 'is', 'is', 'awesome', 'awesome', 'awesome'],
            ['I', 'love', 'love', 'spark', 'spark', 'very', 'very', 'very', 'very', 'much'],
            ['everyone', 'should', 'should', 'use', 'use', 'spark'],
            ['life', 'is', 'is', 'very', 'very', 'good'],
            ['it', 'is', 'is', 'too', 'too', 'cold']
        ]
    })
df = spark.createDataFrame(pdf)
df.show(truncate=False)

+----------------------------------------------------------------------+
|terms                                                                 |
+----------------------------------------------------------------------+
|[spark, spark, spark, spark, spark, is, is, awesome, awesome, awesome]|
|[I, love, love, spark, spark, very, very, very, very, much]           |
|[everyone, should, should, use, use, spark]                           |
|[life, is, is, very, very, good]                                      |
|[it, is, is, too, too, cold]                                          |
+----------------------------------------------------------------------+



In [53]:
from pyspark.ml.feature import HashingTF
from pyspark.ml import Pipeline

hashtf = HashingTF(numFeatures=pow(2, 4), inputCol='terms', outputCol='features(numFeatures), [index], [term frequency]')
stages = [hashtf]
pipeline = Pipeline(stages=stages)

In [54]:
pipeline.fit(df).transform(df).show(truncate=False)

+----------------------------------------------------------------------+------------------------------------------------+
|terms                                                                 |features(numFeatures), [index], [term frequency]|
+----------------------------------------------------------------------+------------------------------------------------+
|[spark, spark, spark, spark, spark, is, is, awesome, awesome, awesome]|(16,[6,9],[5.0,5.0])                            |
|[I, love, love, spark, spark, very, very, very, very, much]           |(16,[0,6,8,12],[2.0,2.0,4.0,2.0])               |
|[everyone, should, should, use, use, spark]                           |(16,[5,6,13],[2.0,1.0,3.0])                     |
|[life, is, is, very, very, good]                                      |(16,[5,8,9],[1.0,3.0,2.0])                      |
|[it, is, is, too, too, cold]                                          |(16,[0,3,6,9],[1.0,2.0,1.0,2.0])                |
+-----------------------

In [55]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline

countvectorizer = CountVectorizer(minTF=1.0, minDF=1.0, vocabSize=20, 
                                  inputCol='terms', outputCol='features(vocabSize), [index], [term frequency]')
stages = [countvectorizer]
pipeline = Pipeline(stages=stages)

In [56]:
pipeline.fit(df).transform(df).show(truncate=False)

+----------------------------------------------------------------------+----------------------------------------------+
|terms                                                                 |features(vocabSize), [index], [term frequency]|
+----------------------------------------------------------------------+----------------------------------------------+
|[spark, spark, spark, spark, spark, is, is, awesome, awesome, awesome]|(15,[0,2,3],[5.0,2.0,3.0])                    |
|[I, love, love, spark, spark, very, very, very, very, much]           |(15,[0,1,6,9,10],[2.0,4.0,2.0,1.0,1.0])       |
|[everyone, should, should, use, use, spark]                           |(15,[0,4,5,11],[1.0,2.0,2.0,1.0])             |
|[life, is, is, very, very, good]                                      |(15,[1,2,12,14],[2.0,2.0,1.0,1.0])            |
|[it, is, is, too, too, cold]                                          |(15,[2,7,8,13],[2.0,2.0,1.0,1.0])             |
+---------------------------------------

In [57]:
from pyspark.sql.types import StringType
df_vocab = df.select('terms').rdd.\
            flatMap(lambda x: x[0]).\
            toDF(schema=StringType()).toDF('terms')
df_vocab.show()

+-------+
|  terms|
+-------+
|  spark|
|  spark|
|  spark|
|  spark|
|  spark|
|     is|
|     is|
|awesome|
|awesome|
|awesome|
|      I|
|   love|
|   love|
|  spark|
|  spark|
|   very|
|   very|
|   very|
|   very|
|   much|
+-------+
only showing top 20 rows



In [58]:
vocab_freq = df_vocab.rdd.countByValue()
pdf = pd.DataFrame({
        'term': list(vocab_freq.keys()),
        'frequency': list(vocab_freq.values())
    })
pdf
tf = spark.createDataFrame(pdf).orderBy('frequency', ascending=False)
tf.show()

+----------+---------+
|      term|frequency|
+----------+---------+
|   [spark]|        8|
|      [is]|        6|
|    [very]|        6|
| [awesome]|        3|
|  [should]|        2|
|     [too]|        2|
|    [love]|        2|
|     [use]|        2|
|       [I]|        1|
|[everyone]|        1|
|    [much]|        1|
|    [life]|        1|
|    [good]|        1|
|      [it]|        1|
|    [cold]|        1|
+----------+---------+



In [136]:
spark4 = SparkSession.builder.appName("Word2Vec Example").getOrCreate()

In [137]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark3.createDataFrame([
    ("spark spark spark spark spark is is awesome awesome awesome".split(" "), ),
    ("I love love spark spark very very very very much.".split(" "), ),
    ("everyone should should use use spark".split(" "), ),
    ("life is is very very good".split(" "), ),
    ("it is is too too cold".split(" "), )
], ["text"])

In [139]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)

In [140]:
for row in result.collect():
    text, vector = row
    #printing the results
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [spark, spark, spark, spark, spark, is, is, awesome, awesome, awesome] => 
Vector: [-0.11181261017918587,0.05737618284765631,-0.12805816382169724]

Text: [I, love, love, spark, spark, very, very, very, very, much.] => 
Vector: [-0.0714676357805729,-0.005499726347625256,-0.04653551112860441]

Text: [everyone, should, should, use, use, spark] => 
Vector: [0.042961226776242256,0.012468320628007252,-0.05180701116720835]

Text: [life, is, is, very, very, good] => 
Vector: [-0.037827527771393456,-0.056995682418346405,-0.05543783182899157]

Text: [it, is, is, too, too, cold] => 
Vector: [-0.10674821585416794,0.017834855243563652,-0.06762616833051045]



In [141]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("spark", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-------+-------------------+
|   word|         similarity|
+-------+-------------------+
|   cold|  0.952985942363739|
|     is| 0.9410983920097351|
|awesome| 0.8373374938964844|
|    too| 0.6503974199295044|
|  much.|0.49109411239624023|
+-------+-------------------+



In [142]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("is", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-------+------------------+
|   word|        similarity|
+-------+------------------+
|   cold|0.9806042313575745|
|awesome|0.9495662450790405|
|  spark|0.9410984516143799|
|     it|0.5934741497039795|
|    too|0.5830553770065308|
+-------+------------------+



In [143]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("very", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-------+------------------+
|   word|        similarity|
+-------+------------------+
|     it|0.9975603222846985|
|awesome|0.6990795731544495|
|     is|0.5392875671386719|
|   cold|0.4887860119342804|
|    use| 0.372077077627182|
+-------+------------------+



In [145]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("awesome", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-----+------------------+
| word|        similarity|
+-----+------------------+
| cold|0.9626889824867249|
|   is|0.9495663046836853|
|spark|0.8373376131057739|
|   it|0.7370890378952026|
| very|0.6990796327590942|
+-----+------------------+



In [146]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("should", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+-------------------+
|    word|         similarity|
+--------+-------------------+
|    love| 0.6148567795753479|
|       I| 0.5274564623832703|
|    good| 0.5179246068000793|
|     too| 0.3981545567512512|
|everyone|0.12000617384910583|
+--------+-------------------+



In [148]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("love", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+--------------------+
|  word|          similarity|
+------+--------------------+
|   too|  0.8810828924179077|
|should|  0.6148567199707031|
| spark| 0.21434985101222992|
|    is|  0.1555756777524948|
|  good|0.030185656622052193|
+------+--------------------+



In [149]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("love", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+--------------------+
|  word|          similarity|
+------+--------------------+
|   too|  0.8810828924179077|
|should|  0.6148567199707031|
| spark| 0.21434985101222992|
|    is|  0.1555756777524948|
|  good|0.030185656622052193|
+------+--------------------+



In [150]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("use", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+-------------------+
|    word|         similarity|
+--------+-------------------+
|    life| 0.9672171473503113|
|   much.|0.38150903582572937|
|    very|  0.372077077627182|
|everyone| 0.3341602683067322|
|      it|0.31852108240127563|
+--------+-------------------+



In [151]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("good", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+--------------------+
|  word|          similarity|
+------+--------------------+
|     I|  0.7308744192123413|
|should|  0.5179246068000793|
|  life| 0.15769793093204498|
|   use| 0.07257138937711716|
|  love|0.030185649171471596|
+------+--------------------+



In [60]:
import spacy

In [61]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [84]:
import pandas as pd
sentence = ('Life is best, students should love coding, everyone should be coding, programming is the basic education, The temperature is below freezing')
    

In [85]:
doc = nlp(sentence)

In [87]:
" ".join([token.lemma_ for token in doc])

'life be good , student should love coding , everyone should be code , programming be the basic education , the temperature be below freeze'

In [104]:
import pandas as pd
pdf = pd.DataFrame({
        'terms': [
           ['life', 'be', 'good'] , 
           ['student', 'should', 'love', 'coding'] , 
           ['everyone', 'should', 'be', 'code'] , 
           ['programming', 'be', 'the', 'basic', 'education'] , 
           ['the', 'temperature', 'be' ,'below', 'freeze']
        ]
    })
df = spark.createDataFrame(pdf)
df.show(truncate=False)

+----------------------------------------+
|terms                                   |
+----------------------------------------+
|[life, be, good]                        |
|[student, should, love, coding]         |
|[everyone, should, be, code]            |
|[programming, be, the, basic, education]|
|[the, temperature, be, below, freeze]   |
+----------------------------------------+



In [105]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline

countvectorizer = CountVectorizer(minTF=1.0, minDF=1.0, vocabSize=20, 
                                  inputCol='terms', outputCol='features(vocabSize), [index], [term frequency]')
stages = [countvectorizer]
pipeline = Pipeline(stages=stages)

In [106]:
pipeline.fit(df).transform(df).show(truncate=False)

+----------------------------------------+----------------------------------------------+
|terms                                   |features(vocabSize), [index], [term frequency]|
+----------------------------------------+----------------------------------------------+
|[life, be, good]                        |(16,[0,6,14],[1.0,1.0,1.0])                   |
|[student, should, love, coding]         |(16,[2,4,5,12],[1.0,1.0,1.0,1.0])             |
|[everyone, should, be, code]            |(16,[0,2,10,13],[1.0,1.0,1.0,1.0])            |
|[programming, be, the, basic, education]|(16,[0,1,3,7,15],[1.0,1.0,1.0,1.0,1.0])       |
|[the, temperature, be, below, freeze]   |(16,[0,1,8,9,11],[1.0,1.0,1.0,1.0,1.0])       |
+----------------------------------------+----------------------------------------------+



In [107]:
from pyspark.sql.types import StringType
df_vocab = df.select('terms').rdd.\
            flatMap(lambda x: x[0]).\
            toDF(schema=StringType()).toDF('terms')
df_vocab.show()

+-----------+
|      terms|
+-----------+
|       life|
|         be|
|       good|
|    student|
|     should|
|       love|
|     coding|
|   everyone|
|     should|
|         be|
|       code|
|programming|
|         be|
|        the|
|      basic|
|  education|
|        the|
|temperature|
|         be|
|      below|
+-----------+
only showing top 20 rows



In [108]:
vocab_freq = df_vocab.rdd.countByValue()
pdf = pd.DataFrame({
        'term': list(vocab_freq.keys()),
        'frequency': list(vocab_freq.values())
    })
pdf
tf = spark.createDataFrame(pdf).orderBy('frequency', ascending=False)
tf.show()

+-------------+---------+
|         term|frequency|
+-------------+---------+
|         [be]|        4|
|        [the]|        2|
|     [should]|        2|
|       [life]|        1|
|       [good]|        1|
|       [love]|        1|
|   [everyone]|        1|
|    [student]|        1|
|     [freeze]|        1|
|     [coding]|        1|
|       [code]|        1|
|      [basic]|        1|
|[programming]|        1|
|  [education]|        1|
|[temperature]|        1|
|      [below]|        1|
+-------------+---------+



In [154]:
spark5 = SparkSession.builder.appName("Word2Vec Example").getOrCreate()

In [155]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark5.createDataFrame([
    ("Life is best".split(" "), ),
    ("students should love coding".split(" "), ),
    ("everyone should be coding".split(" "), ),
    (" programming is the basic education".split(" "), ),
    ("The temperature is below freezing".split(" "), )
], ["text"])

In [156]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)

In [157]:
for row in result.collect():
    text, vector = row
    #printing the results
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Life, is, best] => 
Vector: [-0.11419872442881265,0.006602279841899872,-0.06755329544345537]

Text: [students, should, love, coding] => 
Vector: [-0.04383242875337601,-0.029088133946061134,-0.03560134582221508]

Text: [everyone, should, be, coding] => 
Vector: [-0.0031382720917463303,-0.058355418499559164,0.015217570587992668]

Text: [, programming, is, the, basic, education] => 
Vector: [-0.00033767816300193465,0.011978995171375573,-0.09304226872821648]

Text: [The, temperature, is, below, freezing] => 
Vector: [0.023674151301383974,0.06494050472974777,-0.11245893500745296]



In [158]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("be", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+-------------------+
|    word|         similarity|
+--------+-------------------+
|everyone| 0.8954975008964539|
|     The| 0.7332881093025208|
|        |  0.474439412355423|
|students|0.16039660573005676|
|   basic|0.03834228962659836|
+--------+-------------------+



In [159]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("should", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-----------+-------------------+
|       word|         similarity|
+-----------+-------------------+
|       Life| 0.9959954619407654|
|        the| 0.7121812701225281|
|  education| 0.6875379085540771|
|     coding| 0.5179629325866699|
|temperature|0.47193169593811035|
+-----------+-------------------+



In [161]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("Life", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|     should|0.9959953427314758|
|  education|0.7390859723091125|
|        the|0.7085385322570801|
|     coding|0.5897789001464844|
|temperature|0.5397940874099731|
+-----------+------------------+



In [165]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("everyone", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+-------------------+
|    word|         similarity|
+--------+-------------------+
|      be| 0.8954974412918091|
|     The| 0.5843537449836731|
|students|0.45005717873573303|
|   basic| 0.4135923981666565|
|        |0.03310742601752281|
+--------+-------------------+



In [166]:
synonyms = model.findSynonyms("love", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+------+-------------------+
|  word|         similarity|
+------+-------------------+
|  best|  0.882599413394928|
|      | 0.6451829075813293|
|    is|0.21804609894752502|
|coding|0.16145852208137512|
|    be|0.03328623250126839|
+------+-------------------+



In [170]:
synonyms = model.findSynonyms("coding", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|temperature|0.9798160195350647|
|  education|0.9458795785903931|
|         is|0.9411322474479675|
|       Life|0.5897789001464844|
|       best|0.5811722278594971|
+-----------+------------------+



In [174]:
synonyms = model.findSynonyms("basic", 2)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|students|0.9619790315628052|
|     the|0.7168983817100525|
+--------+------------------+



In [175]:
synonyms = model.findSynonyms("programming", 2)   # its okay for certain words , real bad for others
synonyms.show(5)

+--------+------------------+
|    word|        similarity|
+--------+------------------+
|   below|0.9872907400131226|
|freezing|0.8758460879325867|
+--------+------------------+

