<a href="https://colab.research.google.com/github/freddyduitama/GVD/blob/master/0_5_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# instala el ambiente de spark..solo se corre una vez
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar xf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
#Configura variables de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"

In [0]:
#importa package
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import expr
from pyspark.sql.functions import col, column
import numpy as np

In [0]:
# define  la sesion SPARK. 
conf = SparkConf().setAppName("ejemplo").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [0]:
#Crea la sesion
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Datos de entrada y preprocesamiento

In [0]:
# Dataframe como entrada. 
from pyspark.ml.feature import HashingTF, IDF, Tokenizer,  CountVectorizer

sentenceData = spark.createDataFrame([
    (0.0, "I Use Spark tools Tools  Framework"),
    (1.0, "Java Framework Clasess Tools  classes"),
    (2.0, "Use classes and objects"),
    (3.0, "Logistic regression models use simple and powerfull concepts"),
    (4.0,  "model and additional regression models use tools")
], ["id", "sentence"])

In [44]:
#Divide en token cada documento. Tokenizer splits each sentence into words using. For each sentence (bag of words)
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
wordsData.select("id","sentence","words").show(truncate=False)

+---+------------------------------------------------------------+---------------------------------------------------------------------+
|id |sentence                                                    |words                                                                |
+---+------------------------------------------------------------+---------------------------------------------------------------------+
|0.0|I Use Spark tools Tools  Framework                          |[i, use, spark, tools, tools, , framework]                           |
|1.0|Java Framework Clasess Tools  classes                       |[java, framework, clasess, tools, , classes]                         |
|2.0|Use classes and objects                                     |[use, classes, and, objects]                                         |
|3.0|Logistic regression models use simple and powerfull concepts|[logistic, regression, models, use, simple, and, powerfull, concepts]|
|4.0|model and additional regression mode

In [0]:
# Elimina StopWords
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
FilteredDoc = remover.transform(wordsData)

# calculo TF . Dos m√©todos

In [0]:
# Opciion 1. Usar HashingTF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=64)
featurizedData = hashingTF.transform(FilteredDoc)
featurizedData.select("id","words","rawFeatures").show(truncate=False)

In [47]:
# Opcion 2: fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="words", outputCol="TF", vocabSize=128)
cv_model = cv.fit(FilteredDoc)
cv_result = cv_model.transform(FilteredDoc)
print ("Out of CountVectorizer function")
cv_result.select('id','words','TF').show(truncate=False)
print ("Vocabulary from CountVectorizerModel is \n" + str(cv_model.vocabulary))

Out of CountVectorizer function
+---+---------------------------------------------------------------------+------------------------------------------------------------+
|id |words                                                                |TF                                                          |
+---+---------------------------------------------------------------------+------------------------------------------------------------+
|0.0|[i, use, spark, tools, tools, , framework]                           |(19,[0,1,4,7,9,10],[2.0,1.0,1.0,1.0,1.0,1.0])               |
|1.0|[java, framework, clasess, tools, , classes]                         |(19,[0,4,6,7,16,18],[1.0,1.0,1.0,1.0,1.0,1.0])              |
|2.0|[use, classes, and, objects]                                         |(19,[1,2,6,17],[1.0,1.0,1.0,1.0])                           |
|3.0|[logistic, regression, models, use, simple, and, powerfull, concepts]|(19,[1,2,3,5,12,13,14,15],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|4.0|[mod

#Calculo IDF

In [49]:
idf = IDF(inputCol="TF", outputCol="IDF")
idfModel = idf.fit(cv_result)
rescaledData = idfModel.transform(cv_result)
rescaledData.select('id','words',col('IDF').alias('vocabSize,[  index   ],    [idf]')).show(truncate=False)

+---+---------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |words                                                                |vocabSize,[  index   ],    [idf]                                                                                                                                                    |
+---+---------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0|[i, use, spark, tools, tools, , framework]                           |(19,[0,1,4,7,9,10],[0.8109302162163288,0.1823215567939546,0.6931471805599453,0.6931471805599453,1.0986122886681098,1.0986122886681098])                   

#calculo TFiDF

In [0]:
# covert SPARK dataFrame to panda dataFrame
pandaDf = rescaledData.select("IDF").toPandas()

In [54]:
pandaDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
IDF    5 non-null object
dtypes: object(1)
memory usage: 120.0+ bytes


In [56]:
pandaDf

Unnamed: 0,IDF
0,"(0.8109302162163288, 0.1823215567939546, 0.0, ..."
1,"(0.4054651081081644, 0.0, 0.0, 0.0, 0.69314718..."
2,"(0.0, 0.1823215567939546, 0.4054651081081644, ..."
3,"(0.0, 0.1823215567939546, 0.4054651081081644, ..."
4,"(0.4054651081081644, 0.1823215567939546, 0.405..."


In [58]:
pandaDF.values

array([[SparseVector(19, {0: 0.8109, 1: 0.1823, 4: 0.6931, 5: 0.6931, 13: 1.0986, 17: 1.0986})],
       [SparseVector(19, {0: 0.4055, 3: 0.6931, 4: 0.6931, 5: 0.6931, 9: 1.0986, 12: 1.0986})],
       [SparseVector(19, {1: 0.1823, 2: 0.4055, 3: 0.6931, 11: 1.0986})],
       [SparseVector(19, {1: 0.1823, 2: 0.4055, 6: 0.6931, 7: 0.6931, 8: 1.0986, 10: 1.0986, 14: 1.0986, 15: 1.0986})],
       [SparseVector(19, {0: 0.4055, 1: 0.1823, 2: 0.4055, 6: 0.6931, 7: 0.6931, 16: 1.0986, 18: 1.0986})]],
      dtype=object)

# Aggregating Sparse and Dense Vectors in PySpark.
Ver: https://danvatterott.com/blog/2018/07/08/aggregating-sparse-and-dense-vectors-in-pyspark/