In [1]:
!pip install pyspark



In [2]:
from __future__ import print_function
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import CountVectorizer

In [3]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

True

In [4]:
# creating spark session
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate()

In [5]:
# creating spark dataframe with 5 different txt files
#First, declare each text file into their own dataframe
r1= spark.read.text("/content/drive/MyDrive/Colab Notebooks/ICP5/ChristopherNolan.txt")
r2 = spark.read.text("/content/drive/MyDrive/Colab Notebooks/ICP5/DarthPlagueis.txt")
r3 = spark.read.text("/content/drive/MyDrive/Colab Notebooks/ICP5/FriendLikeMe.txt")
r4 = spark.read.text("/content/drive/MyDrive/Colab Notebooks/ICP5/Never Gonna Give You Up.txt")
r5 = spark.read.text("/content/drive/MyDrive/Colab Notebooks/ICP5/RespectfulDriver.txt")

#Then combine all of them into one dataframe
r12 = r1.union(r2) #NOTE: Since union only combines 2 at a time we have to use multiple
r123 = r12.union(r3)
r1234 = r123.union(r4)
full = r1234.union(r5)
full.show()

+--------------------+
|               value|
+--------------------+
|LOS ANGELES—Visib...|
|Did you ever hear...|
|Well Ali Baba had...|
|We're no stranger...|
|LAKEWOOD, OH—Foll...|
+--------------------+



In [6]:
# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(full)
wordsData.show()

+--------------------+--------------------+
|               value|               words|
+--------------------+--------------------+
|LOS ANGELES—Visib...|[los, angeles—vis...|
|Did you ever hear...|[did, you, ever, ...|
|Well Ali Baba had...|[well, ali, baba,...|
|We're no stranger...|[we're, no, stran...|
|LAKEWOOD, OH—Foll...|[lakewood,, oh—fo...|
+--------------------+--------------------+



In [7]:
#Lemmatizer: First we import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#Then we save the data onto the disk
f = wordsData.select("words").collect()
#Where now we can lemmatize each individual word
for x in f: #For each list
  for y in x: #for row
    for i, z in enumerate(y): #for each individual word
      y[i] = lemmatizer.lemmatize(z)

#Now we save it into an dataframe
lemmaWord = spark.createDataFrame(f)
lemmaWord.show()

+--------------------+
|               words|
+--------------------+
|[los, angeles—vis...|
|[did, you, ever, ...|
|[well, ali, baba,...|
|[we're, no, stran...|
|[lakewood,, oh—fo...|
+--------------------+



In [8]:
#Ngram (we're only gonna do 2)
#NOTE: Since the function below uses "words" column, we have to make another tokenizer
#for specifically ngram

#this section is tokenizing the text, only this time the output column is called ngram
tokenizer = Tokenizer(inputCol="value", outputCol="ngram")
ngrama = tokenizer.transform(full)

#Now we can ngram the new dataset, with the output column being words
ngram = NGram(n=2, inputCol="ngram", outputCol="words")
ngramData = ngram.transform(ngrama)
ngramData.show()

+--------------------+--------------------+--------------------+
|               value|               ngram|               words|
+--------------------+--------------------+--------------------+
|LOS ANGELES—Visib...|[los, angeles—vis...|[los angeles—visi...|
|Did you ever hear...|[did, you, ever, ...|[did you, you eve...|
|Well Ali Baba had...|[well, ali, baba,...|[well ali, ali ba...|
|We're no stranger...|[we're, no, stran...|[we're no, no str...|
|LAKEWOOD, OH—Foll...|[lakewood,, oh—fo...|[lakewood, oh—fol...|
+--------------------+--------------------+--------------------+



In [79]:
#Pass in Dataset with a column named 'words'
#Outputs tfIdf top ten words and those wordsW2V
def tfIdf(wordData):
  # applying tf on the words data
  #NOTE: numFeatures mentioned below
  hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=229 )
  featurizedData = hashingTF.transform(wordData)
  # calculating the IDF
  idf = IDF(inputCol="rawFeatures", outputCol="features")
  idfModel = idf.fit(featurizedData)
  rescaledData = idfModel.transform(featurizedData)
  #Since with Pyspark there's no fast way to output top 10 lemma data 
  #(at least with the 6 hours I've spent looking) We have to make our own function to output the lemma#


  data = rescaledData.select("features").collect() #First we get all the idf data
  wordD = rescaledData.select("words").collect() #And all the words


  ###Due to how hashing works, each index corresponds to a word in the data.
  #When pulling out the data, we're left with a bunch of 0's from the hashing
  #So we must have the num of features equal to the text size to remove excess 0's
  #Since these documents are not the same size, the numFeatures in hashing above
  #Is calculated by amount of total words/Amount of txt files
  #The data would be better if all txt files were same size
  #The way I'm doing this is probably incorrect, but is the only way I was able to 
  #correspond the hash data with the idf values

  idfval = [] #This will store the idf values
  for x in data:
    for y in x:
        for index, z in enumerate(y): #By doing this, you get a double matrix the size of the numFeatures
          idfval.append(z)


  wordVect=[] #This will store the tokenized words
  for x in wordD:
    for y in x:
      for index,z in enumerate(y):
        wordVect.append(z)


   

#Now we merge the itfIdf with the words in a double matrix
  merged = list(zip(idfval,wordVect))
  merged.sort()
  merged.reverse() #Reverse so we get highest value first


  #Next, we need to get only the top 10 values
  top= []
  for i in range(0,10):
    top.append(merged[i][1]) #Add each other word with a space in between
  print(top)



  #W2V5. Finding similar words to top 10

  # Input data is the dataset we passed through
  word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words", outputCol="result")
  model = word2Vec.fit(wordData)

  # showing the synonyms and cosine similarity of the word in input data
  for i in range(0,10):
    print(top[i])
    synonyms = model.findSynonyms(top[i], 5)   # its okay for certain words , real bad for others
    synonyms.show(10)

  return

In [83]:
tfIdf(wordsData)

['give.', 'run', 'never', 'just', 'gonna', 'bona', 'never', 'gonna', 'we', 'lie']
give.
+-------+------------------+
|   word|        similarity|
+-------+------------------+
|ironic.| 0.998630166053772|
|   i’ve|0.9965013861656189|
|   i'll|0.9898534417152405|
|    and| 0.979064404964447|
|  looky|0.9746612906455994|
+-------+------------------+

run
+------+------------------+
|  word|        similarity|
+------+------------------+
|became|0.9924935102462769|
|   it.|0.9891957640647888|
|   and|0.9889644980430603|
|fails.|0.9867961406707764|
|  i’ve|0.9794812798500061|
+------+------------------+

never
+----+------------------+
|word|        similarity|
+----+------------------+
| got|0.9968725442886353|
|more|0.9959987998008728|
|give|0.9895392656326294|
|you.|0.9888480305671692|
|make|0.9872699975967407|
+----+------------------+

just
+------+------------------+
|  word|        similarity|
+------+------------------+
|pincer|0.9988298416137695|
|  take|0.9813953638076782|
| tails

In [84]:
tfIdf(lemmaWord)

['give.', 'run', 'never', 'just', 'gonna', 'never', 'on.', 'gonna', 'we', 'lie']
give.
+------+------------------+
|  word|        similarity|
+------+------------------+
|nolan,|0.9990513324737549|
|became|0.9977195858955383|
| long,|0.9899457693099976|
|friend|0.9891470074653625|
|   not|0.9869879484176636|
+------+------------------+

run
+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|     order,| 0.998065173625946|
|understood.|0.9967820048332214|
|     screen|0.9961744546890259|
|         or|0.9938889741897583|
|      their|0.9898148775100708|
+-----------+------------------+

never
+------+------------------+
|  word|        similarity|
+------+------------------+
| gonna| 0.985480785369873|
|  you.| 0.985102117061615|
|  tell|0.9836183190345764|
|   say|0.9816176891326904|
|around|0.9707440733909607|
+------+------------------+

just
+---------+------------------+
|     word|        similarity|
+---------+------------------+
|

In [85]:
tfIdf(ngramData)

['run around', 'never gonna', 'each other', "d'affaires. i", 'you out', 'give. (give', 'you up.', 'shy to', 'see. never', 'gonna run']
run around
+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
| (ooh, give|0.9989930391311646|
|    me. can|0.9987049698829651|
|ain't never|0.9984701871871948|
| and desert|0.9979979395866394|
|      to be|0.9966205358505249|
+-----------+------------------+

never gonna
+----------+------------------+
|      word|        similarity|
+----------+------------------+
|gonna make|0.9994514584541321|
|gonna give| 0.998485267162323|
|  you cry.|0.9981626272201538|
|   you got|0.9969722628593445|
|  and hurt|0.9965101480484009|
+----------+------------------+

each other
+------------+------------------+
|        word|        similarity|
+------------+------------------+
|      is rub|0.9925941228866577|
|   this plot|0.9920094013214111|
|      and so|0.9846543669700623|
|   “wait, so|0.9833011627197266|
|your 