# Introduction du sujet: TF-IDF

# Computing TF-IDF with Spark ML

# Exemple

In [151]:
# Toy corpus: three short English documents used throughout the notebook.
# Adjacent string literals are concatenated by Python, so each value is a
# single string.
document1 = (
    "Welcome to TutorialKart. "
    "There are many tutorials covering various fields of technology."
)
document2 = (
    "Technology has advanced a lot with the invention of semi-conductor transistor. "
    "Technology is affecting our dailylife a lot."
)
document3 = "You may find this tutorial on transistor technology interesting."

# Création de la session

In [152]:
import os

import pandas as pd
import pyspark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
# Let pandas show up to 150 characters per cell so the document text and
# token lists are not cut off too early when frames are rendered.
pd.options.display.max_colwidth = 150

In [153]:
# Get (or reuse) the Spark session for this notebook.
# NOTE: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("TfIdf Example")
    .getOrCreate()
)

In [154]:
# One row per document: an integer id plus the raw text.
documents = sc.createDataFrame(
    data=[(1, document1), (2, document2), (3, document3)],
    schema=["doc_id", "doc_text"],
)

In [155]:
# Convert the Spark DataFrame to pandas so the notebook renders it as a table.
documents.toPandas()

Unnamed: 0,doc_id,doc_text
0,1,Welcome to TutorialKart. There are many tutorials covering various fields of technology.
1,2,Technology has advanced a lot with the invention of semi-conductor transistor. Technology is affecting our dailylife a lot.
2,3,You may find this tutorial on transistor technology interesting.


In [156]:
# Translation table that deletes the punctuation marks . , ; ? ! :
_PUNCTUATION = str.maketrans("", "", ".,;?!:")


def tokenize(text):
    """Turn raw document text into a list of lowercase word tokens.

    The text is lowercased, the characters ``. , ; ? ! :`` are removed, and
    the result is split on runs of whitespace. Using ``split()`` rather than
    ``split(" ")`` means repeated spaces, tabs and newlines never produce
    empty-string tokens, which would otherwise be hashed as a term.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The tokens, in document order.
    """
    return text.lower().translate(_PUNCTUATION).split()


# Tokenise every document. The token list is stored in a column named
# "features", which is the input column the TF step reads.
data = (
    documents.rdd
    .map(lambda row: (row.doc_id, tokenize(row.doc_text)))
    .toDF(["doc_id", "features"])
)

In [157]:
# Show the tokenised documents (the "features" column holds the token lists).
data.toPandas()

Unnamed: 0,doc_id,features
0,1,"[welcome, to, tutorialkart, there, are, many, tutorials, covering, various, fields, of, technology]"
1,2,"[technology, has, advanced, a, lot, with, the, invention, of, semi-conductor, transistor, technology, is, affecting, our, dailylife, a, lot]"
2,3,"[you, may, find, this, tutorial, on, transistor, technology, interesting]"


# Vectorize: term frequency (TF)

In [163]:
# Hash every token into a fixed-size sparse vector of raw term counts.
# numFeatures is written out explicitly; 1 << 18 == 262144 is the HashingTF
# default and matches the vector size shown in the output below.
htf = HashingTF(
    inputCol="features",
    outputCol="tf",
    numFeatures=1 << 18,
)
tf = htf.transform(data)
tf.head(2)

[Row(doc_id=1, features=['welcome', 'to', 'tutorialkart', 'there', 'are', 'many', 'tutorials', 'covering', 'various', 'fields', 'of', 'technology'], tf=SparseVector(262144, {9639: 1.0, 38068: 1.0, 48935: 1.0, 81213: 1.0, 95306: 1.0, 133828: 1.0, 164221: 1.0, 167122: 1.0, 180535: 1.0, 205044: 1.0, 235787: 1.0, 245044: 1.0})),
 Row(doc_id=2, features=['technology', 'has', 'advanced', 'a', 'lot', 'with', 'the', 'invention', 'of', 'semi-conductor', 'transistor', 'technology', 'is', 'affecting', 'our', 'dailylife', 'a', 'lot'], tf=SparseVector(262144, {9616: 1.0, 9639: 1.0, 15889: 1.0, 38068: 2.0, 85287: 1.0, 96016: 1.0, 103838: 1.0, 113764: 1.0, 126466: 1.0, 128231: 2.0, 153454: 1.0, 171778: 1.0, 205843: 1.0, 227410: 2.0, 234391: 1.0}))]

In [159]:
# Show the term-frequency vectors next to the tokens they were built from.
tf.toPandas()

Unnamed: 0,doc_id,features,tf
0,1,"[welcome, to, tutorialkart, there, are, many, tutorials, covering, various, fields, of, technology]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[technology, has, advanced, a, lot, with, the, invention, of, semi-conductor, transistor, technology, is, affecting, our, dailylife, a, lot]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,"[you, may, find, this, tutorial, on, transistor, technology, interesting]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# IDF: rescaling term frequencies into TF-IDF weights

In [177]:
# Fit inverse document frequencies on the TF vectors, then rescale them.
# NOTE: the output column is named "idf", but after transform() it holds the
# TF-IDF weights (term count x idf), not the bare idf factors. In the output
# below, a term present in all three documents gets weight 0.0, and a term
# counted twice in a document gets twice the weight.
idf = IDF(inputCol="tf", outputCol="idf")
idf_model = idf.fit(tf)
tfidf = idf_model.transform(tf)
tfidf.head(2)

[Row(doc_id=1, features=['welcome', 'to', 'tutorialkart', 'there', 'are', 'many', 'tutorials', 'covering', 'various', 'fields', 'of', 'technology'], tf=SparseVector(262144, {9639: 1.0, 38068: 1.0, 48935: 1.0, 81213: 1.0, 95306: 1.0, 133828: 1.0, 164221: 1.0, 167122: 1.0, 180535: 1.0, 205044: 1.0, 235787: 1.0, 245044: 1.0}), idf=SparseVector(262144, {9639: 0.2877, 38068: 0.0, 48935: 0.6931, 81213: 0.6931, 95306: 0.6931, 133828: 0.6931, 164221: 0.6931, 167122: 0.6931, 180535: 0.6931, 205044: 0.6931, 235787: 0.6931, 245044: 0.6931})),
 Row(doc_id=2, features=['technology', 'has', 'advanced', 'a', 'lot', 'with', 'the', 'invention', 'of', 'semi-conductor', 'transistor', 'technology', 'is', 'affecting', 'our', 'dailylife', 'a', 'lot'], tf=SparseVector(262144, {9616: 1.0, 9639: 1.0, 15889: 1.0, 38068: 2.0, 85287: 1.0, 96016: 1.0, 103838: 1.0, 113764: 1.0, 126466: 1.0, 128231: 2.0, 153454: 1.0, 171778: 1.0, 205843: 1.0, 227410: 2.0, 234391: 1.0}), idf=SparseVector(262144, {9616: 0.6931, 9639: 

In [211]:
# Keep the projected DataFrame in `test`, then print it.
# DataFrame.show() only prints and returns None, so assigning its result
# (as the cell did before) left `test` bound to None.
test = tfidf.select("tf")
test.show()

+--------------------+
|                  tf|
+--------------------+
|(262144,[9639,380...|
|(262144,[9616,963...|
|(262144,[38068,50...|
+--------------------+



In [204]:
# Bring the full result (tokens, TF vectors, TF-IDF vectors) into pandas
# and display it as the cell output.
df = tfidf.toPandas()
df


Unnamed: 0,doc_id,features,tf,idf
0,1,"[welcome, to, tutorialkart, there, are, many, tutorials, covering, various, fields, of, technology]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,"[technology, has, advanced, a, lot, with, the, invention, of, semi-conductor, transistor, technology, is, affecting, our, dailylife, a, lot]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,"[you, may, find, this, tutorial, on, transistor, technology, interesting]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [162]:
def _with_weight_sum(row):
    """Return the row's fields plus the sum of its "idf" vector's stored values.

    The sum is None when the row has no "idf" vector.
    """
    total = row.idf.values.sum() if row.idf is not None else None
    return (row.doc_id, row.features, row.tf, row.idf, total)


res = tfidf.rdd.map(_with_weight_sum)
for record in res.take(10):
    print(record)

(1, ['welcome', 'to', 'tutorialkart', 'there', 'are', 'many', 'tutorials', 'covering', 'various', 'fields', 'of', 'technology'], SparseVector(262144, {9639: 1.0, 38068: 1.0, 48935: 1.0, 81213: 1.0, 95306: 1.0, 133828: 1.0, 164221: 1.0, 167122: 1.0, 180535: 1.0, 205044: 1.0, 235787: 1.0, 245044: 1.0}), SparseVector(262144, {9639: 0.2877, 38068: 0.0, 48935: 0.6931, 81213: 0.6931, 95306: 0.6931, 133828: 0.6931, 164221: 0.6931, 167122: 0.6931, 180535: 0.6931, 205044: 0.6931, 235787: 0.6931, 245044: 0.6931}), 7.219153878051234)
(2, ['technology', 'has', 'advanced', 'a', 'lot', 'with', 'the', 'invention', 'of', 'semi-conductor', 'transistor', 'technology', 'is', 'affecting', 'our', 'dailylife', 'a', 'lot'], SparseVector(262144, {9616: 1.0, 9639: 1.0, 15889: 1.0, 38068: 2.0, 85287: 1.0, 96016: 1.0, 103838: 1.0, 113764: 1.0, 126466: 1.0, 128231: 2.0, 153454: 1.0, 171778: 1.0, 205843: 1.0, 227410: 2.0, 234391: 1.0}), SparseVector(262144, {9616: 0.6931, 9639: 0.2877, 15889: 0.6931, 38068: 0.0, 8

In [171]:
# UDF that sums the stored (non-zero) entries of an ML vector column.
# A null vector yields null instead of raising, matching the None guard used
# by the RDD-based version of this computation.
sum_ = udf(
    lambda v: None if v is None else float(v.values.sum()),
    DoubleType(),
)

# "idf_sum" is the total weight of each document's vector in the "idf" column.
tfidf.withColumn("idf_sum", sum_("idf")).show()

+------+--------------------+--------------------+--------------------+------------------+
|doc_id|            features|                  tf|                 idf|           idf_sum|
+------+--------------------+--------------------+--------------------+------------------+
|     1|[welcome, to, tut...|(262144,[9639,380...|(262144,[9639,380...| 7.219153878051234|
|     2|[technology, has,...|(262144,[9616,963...|(262144,[9616,963...|10.279424672742797|
|     3|[you, may, find, ...|(262144,[38068,50...|(262144,[38068,50...| 5.139712336371398|
+------+--------------------+--------------------+--------------------+------------------+



In [26]:
# Stop the Spark session now that the example is finished.
sc.stop()