# Setup Code

Install the necessary libraries

In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/27/67/5158f846202d7f012d1c9ca21c3549a58fd3c6707ae8ee823adcaca6473c/pyspark-3.0.2.tar.gz (204.8MB)
[K     |████████████████████████████████| 204.8MB 67kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186687 sha256=15ece91247aaf7a9a3a36bbc9f440899bb767de26d95c57f7d9b7edb5c13e9e4
  Stored in directory: /root/.cache/pip/wheels/8b/09/da/c1f2859bcc86375dc972c5b6af4881b3603269bcc4c9be5d16
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2


In [1]:
from __future__ import print_function
from pyspark import SparkConf
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Import text files for analysis

In [2]:
with open("/content/doc1.txt","r+") as d1:
    doc1 = d1.read()
with open("/content/doc2.txt","r+") as d2:
    doc2 = d2.read()
with open("/content/doc3.txt","r+") as d3:
    doc3 = d3.read()
with open("/content/doc4.txt","r+") as d4:
    doc4 = d4.read()
with open("/content/doc5.txt","r+") as d5:
    doc5 = d5.read()

documents = [doc1,doc2,doc3,doc4,doc5]

# Task 1

## Find out the Top 10 TF-IDF words

Install necessary libraries needed for part 1

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

Transform the data

In [4]:
vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
pd.set_option('display.max_columns', 20)

Add row for value total

In [5]:
df.loc['Total'] = df.sum()

Print top 10 tfidf values and words

In [6]:
print (df.T.sort_values('Total', ascending=True).tail(10).T)

          trump        it      that      will        is        in        to  \
0      0.000000  0.016196  0.032392  0.022763  0.040491  0.194354  0.267237   
1      0.000000  0.152311  0.030462  0.171254  0.111695  0.132003  0.274160   
2      0.000000  0.121806  0.137032  0.203293  0.114193  0.190322  0.228386   
3      0.337416  0.017864  0.035729  0.000000  0.053593  0.321561  0.160780   
4      0.000000  0.036880  0.110641  0.000000  0.082981  0.184402  0.175182   
Total  0.337416  0.345058  0.346257  0.397311  0.402953  1.022642  1.105746   

            and        of       the  
0      0.332022  0.194354  0.437297  
1      0.304622  0.274160  0.324930  
2      0.213161  0.251225  0.471998  
3      0.196509  0.214374  0.321561  
4      0.156742  0.341144  0.543987  
Total  1.203056  1.275258  2.099774  


## Find out the Top 10 TF-IDF words for the lemmatized input

Install necessary libraries

In [7]:
import nltk;nltk.download('punkt');nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Declare the lemmatizer

In [8]:
lemmatizer = WordNetLemmatizer()

Proceed to tokenize and then lemmatize the words in each document

In [10]:
doc1words = nltk.word_tokenize(doc1)
doc2words = nltk.word_tokenize(doc2)
doc3words = nltk.word_tokenize(doc3)
doc4words = nltk.word_tokenize(doc4)
doc5words = nltk.word_tokenize(doc5)

doc1_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc1words])
doc2_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc2words])
doc3_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc3words])
doc4_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc4words])
doc5_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc5words])

documents = [doc1_lem,doc2_lem,doc3_lem,doc4_lem,doc5_lem]

Transform the data

In [11]:
vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
pd.set_option('display.max_columns', 20)

Add row for value total

In [12]:
df.loc['Total'] = df.sum()

Print top 10 tfidf values and words

In [13]:
print (df.T.sort_values('Total', ascending=True).tail(10).T)

          trump      that        it      will        is        in        to  \
0      0.000000  0.032470  0.024352  0.022818  0.040587  0.194819  0.267876   
1      0.000000  0.030095  0.150477  0.169192  0.110350  0.130413  0.270858   
2      0.000000  0.133939  0.141380  0.198705  0.111616  0.186027  0.223232   
3      0.338136  0.035805  0.017903  0.000000  0.053708  0.322247  0.161124   
4      0.000000  0.110404  0.046002  0.000000  0.082803  0.184007  0.174807   
Total  0.338136  0.342714  0.380114  0.390715  0.399064  1.017513  1.097896   

            and        of       the  
0      0.332815  0.194819  0.438342  
1      0.300953  0.270858  0.321017  
2      0.208350  0.245555  0.461347  
3      0.196929  0.214832  0.322247  
4      0.156406  0.340413  0.542820  
Total  1.195453  1.266476  2.085773  


## Find out the Top 10 TF-IDF words for the n-gram based input

N-gram function that takes n value and integer to make list of n grams

In [14]:
def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input)-n+1):
        output.append(input[i:i+n])
    return output

In [15]:
doc1ngram = ' '.join([' '.join(x) for x in ngrams(doc1, 3)])
doc2ngram = ' '.join([' '.join(x) for x in ngrams(doc2, 3)])
doc3ngram = ' '.join([' '.join(x) for x in ngrams(doc3, 3)])
doc4ngram = ' '.join([' '.join(x) for x in ngrams(doc4, 3)])
doc5ngram = ' '.join([' '.join(x) for x in ngrams(doc5, 3)])

documents = [doc1ngram,doc2ngram,doc3ngram,doc4ngram,doc5ngram]

Transform the data

In [16]:
vect = TfidfVectorizer( ngram_range=(3,3)) # built-in ngram kwarg in TfidfVectorizer
tfidf_matrix = vect.fit_transform(documents)

df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
pd.set_option('display.max_columns', 20)

Add row for value total

In [17]:
df.loc['Total'] = df.sum()

Print top 10 tfidf values and n-gram based words

In [18]:
print (df.T.sort_values('Total', ascending=True).tail(10).T)

       to analyze to  analyze to analyze  qda miner qda  \
0           0.092516            0.092516       0.131052   
1           0.030364            0.030364       0.000000   
2           0.000000            0.000000       0.000000   
3           0.000000            0.000000       0.000000   
4           0.000000            0.000000       0.000000   
Total       0.122879            0.122879       0.131052   

       pandemic the pandemic  miner qda miner  the in the  in the in  \
0                   0.131052         0.131052    0.000000   0.000000   
1                   0.000000         0.000000    0.021203   0.021203   
2                   0.000000         0.000000    0.009545   0.009545   
3                   0.000000         0.000000    0.065284   0.065284   
4                   0.000000         0.000000    0.050015   0.050015   
Total               0.131052         0.131052    0.146046   0.146046   

       the pandemic the  the of the  of the of  
0              0.147434    0.085

# Task 2: Find W2V similar words for the top 10 TF_IDF words

## Setup Code (Task 2)

Create spark session

In [19]:
spark = SparkSession.builder.appName("TFIDF").getOrCreate()

Import text files as spark dataframe for analysis

In [20]:
documentData = spark.createDataFrame([
        (0.0, doc1),
        (0.1, doc2),
        (0.2, doc3),
        (0.3, doc4),
        (0.5, doc5)
    ], ["label", "document"])

Tokenize the documents

In [21]:
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
print (documentData)
wordsData.show()

DataFrame[label: double, document: string]
+-----+--------------------+--------------------+
|label|            document|               words|
+-----+--------------------+--------------------+
|  0.0|Unfortunately, th...|[unfortunately,, ...|
|  0.1|Build or buy? Thi...|[build, or, buy?,...|
|  0.2|A pioneering reac...|[a, pioneering, r...|
|  0.3|In the 2020 presi...|[in, the, 2020, p...|
|  0.5|For centuries wes...|[for, centuries, ...|
+-----+--------------------+--------------------+



## Without NLP

Apply tf on the words data

In [22]:
# each entry is < 1000 words
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
featurizedData = hashingTF.transform(wordsData)

Calculate the IDF

In [23]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

Display the results

In [24]:
rescaledData.select("words", "features").show()

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[unfortunately,, ...|(1000,[1,2,4,8,9,...|
|[build, or, buy?,...|(1000,[2,4,7,8,14...|
|[a, pioneering, r...|(1000,[0,1,9,10,1...|
|[in, the, 2020, p...|(1000,[3,10,15,17...|
|[for, centuries, ...|(1000,[3,4,7,8,10...|
+--------------------+--------------------+



In [25]:
print("TF-IDF without NLP:")
for each in rescaledData.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

TF-IDF without NLP:
(1000,[1,2,4,8,9,10,12,15,17,22,30,33,34,38,41,43,48,49,52,55,56,58,62,66,69,71,72,73,77,84,86,91,92,97,98,102,111,115,116,117,119,121,125,132,133,135,136,141,142,144,145,148,149,151,157,162,164,165,166,169,170,171,172,174,177,185,187,188,190,192,196,197,198,200,202,205,208,209,210,214,216,224,225,226,231,232,237,238,242,243,248,249,252,257,258,259,260,261,263,270,272,275,276,282,284,289,295,296,298,299,300,303,307,308,309,313,314,323,324,325,327,328,330,334,335,336,338,344,346,347,349,350,352,353,354,357,361,362,364,367,372,373,374,377,378,380,382,383,385,387,389,390,391,392,396,400,402,405,406,411,418,420,421,427,428,430,431,432,433,437,441,447,449,452,453,455,456,460,462,467,474,475,480,485,488,489,490,492,493,495,496,505,507,510,512,523,526,527,528,535,536,538,540,541,542,545,546,549,551,552,556,557,559,562,564,569,571,576,578,581,584,585,588,589,592,594,595,596,599,601,602,605,608,610,611,613,615,621,624,629,632,633,643,644,647,650,652,655,658,660,662,663,667,6

## With lemmatization

In [26]:
import nltk;nltk.download('punkt');nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Declare the lemmatizer

In [27]:
lemmatizer = WordNetLemmatizer()

Proceed to tokenize and then lemmatize the words in each document

In [28]:
doc1words = nltk.word_tokenize(doc1)
doc2words = nltk.word_tokenize(doc2)
doc3words = nltk.word_tokenize(doc3)
doc4words = nltk.word_tokenize(doc4)
doc5words = nltk.word_tokenize(doc5)

doc1_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc1words])
doc2_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc2words])
doc3_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc3words])
doc4_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc4words])
doc5_lem = ' '.join([lemmatizer.lemmatize(w) for w in doc5words])

Create spark session

In [29]:
spark = SparkSession.builder.appName("TFIDF").getOrCreate()

Import lemmatized text files as spark dataframe

In [30]:
documentData = spark.createDataFrame([
        (0.0, doc1_lem),
        (0.1, doc2_lem),
        (0.2, doc3_lem),
        (0.3, doc4_lem),
        (0.5, doc5_lem)
    ], ["label", "document"])

Tokenize the documents

In [31]:
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
print (documentData)
wordsData.show()

DataFrame[label: double, document: string]
+-----+--------------------+--------------------+
|label|            document|               words|
+-----+--------------------+--------------------+
|  0.0|Unfortunately , t...|[unfortunately, ,...|
|  0.1|Build or buy ? Th...|[build, or, buy, ...|
|  0.2|A pioneering reac...|[a, pioneering, r...|
|  0.3|In the 2020 presi...|[in, the, 2020, p...|
|  0.5|For century weste...|[for, century, we...|
+-----+--------------------+--------------------+



Apply tf on the words data

In [32]:
# each entry is < 1000 words
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
featurizedData = hashingTF.transform(wordsData)

Calculate the IDF

In [33]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

Display the results

In [34]:
rescaledData.select("words", "features").show()

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[unfortunately, ,...|(1000,[1,2,4,9,10...|
|[build, or, buy, ...|(1000,[2,4,7,14,1...|
|[a, pioneering, r...|(1000,[1,10,11,12...|
|[in, the, 2020, p...|(1000,[2,10,15,17...|
|[for, century, we...|(1000,[0,1,3,4,7,...|
+--------------------+--------------------+



In [35]:
print("TF-IDF with lemmatization:\n")
for each in rescaledData.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

TF-IDF with lemmatization:

(1000,[1,2,4,9,10,12,15,17,22,29,30,34,35,38,43,48,49,52,55,56,57,58,66,69,72,73,74,84,92,96,97,98,99,102,111,112,115,117,118,121,125,132,133,135,141,142,144,145,148,149,151,157,162,165,166,169,170,174,182,185,188,190,192,193,197,198,202,205,206,208,209,210,214,216,217,224,226,228,232,238,242,243,248,252,258,259,260,261,263,267,272,275,282,284,289,295,296,298,299,301,303,309,310,312,313,314,323,324,327,328,329,334,335,338,341,342,344,347,349,350,352,353,354,357,361,362,364,371,372,373,374,377,378,380,383,385,387,389,391,392,400,405,406,410,411,413,417,418,420,425,430,431,432,433,437,442,445,447,449,453,467,469,470,473,474,475,479,480,485,488,489,490,493,495,496,501,505,510,513,519,523,525,526,527,528,530,535,536,538,540,541,543,545,549,557,559,562,567,576,578,581,584,585,588,592,594,597,601,602,605,606,608,609,615,616,617,621,624,626,629,631,643,644,647,649,650,652,655,662,663,667,668,671,674,675,676,677,679,681,683,684,685,686,690,691,692,693,695,696,702,70

## With n-grams

Create spark session

In [36]:
spark = SparkSession.builder.appName("TFIDF").getOrCreate()

Import text files as spark dataframe

In [None]:
documentData = spark.createDataFrame([
        (0.0, doc1.split(' ')),
        (0.1, doc2.split(' ')),
        (0.2, doc3.split(' ')),
        (0.3, doc4.split(' ')),
        (0.5, doc5.split(' '))
    ], ["label", "document"])

Turn the text into n-grams with n = 2

In [39]:
ngram = NGram(n=2, inputCol="document", outputCol="ngrams")

ngramDataFrame = ngram.transform(documentData)

Apply tf on the words data

In [42]:
hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(ngramDataFrame)
featurizedData.cache()

DataFrame[label: double, document: array<string>, ngrams: array<string>, rawFeatures: vector]

Calculate the IDF

In [43]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [44]:
print("TF-IDF with n-grams (n=2):\n")
for each in rescaledData.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

TF-IDF with n-grams (n=2):

(3000,[11,14,17,23,26,30,32,34,37,38,40,43,44,47,49,51,52,56,57,58,60,69,71,72,73,78,79,83,89,103,106,109,111,112,115,116,118,120,122,126,127,128,129,131,134,135,138,140,147,148,162,165,169,175,178,180,181,184,186,189,191,195,198,200,204,208,213,216,233,242,243,245,246,247,250,252,261,265,269,279,282,288,291,297,300,307,309,312,315,321,332,336,337,340,341,345,350,351,352,353,357,359,360,364,368,369,372,378,382,384,388,394,397,399,400,412,413,414,416,420,421,430,431,432,437,441,443,450,467,473,478,481,488,489,493,494,496,499,507,519,537,541,543,553,557,560,563,568,573,574,577,584,588,599,604,608,613,614,619,621,623,631,633,638,639,640,642,645,650,655,661,662,663,672,676,681,682,684,688,695,698,705,706,708,709,712,713,718,735,736,739,743,746,751,752,754,756,757,758,761,762,766,770,774,778,780,784,786,792,796,802,807,811,812,815,823,825,826,832,833,835,838,841,854,855,856,859,861,862,866,875,882,892,898,899,905,906,909,912,913,921,922,924,925,926,927,931,934,93