# Criando UDF

In [1]:
try:
    !pip install pyspark=="2.4.5"  --quiet
except:
 print("Running throw py file.")

In [2]:
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkFiles
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, ArrayType
import pyspark

In [3]:
spark = SparkSession\
        .builder\
        .appName("Estudo Spark - UDF Functions - Fabio Kfouri")\
        .getOrCreate()

## Leitura de dados usando uma fonte pública

Fonte de Dados:

https://data.humdata.org/dataset/faostat-prices-for-brazil

In [6]:
#spark.sparkContext.addFile('https://data.humdata.org/dataset/bdf7bcca-28ae-47c5-8993-c87a7c5c04c0/resource/c16c15f5-efaf-4950-b848-94935987d312/download/producer-prices_bra.csv')
spark.sparkContext.addFile("sherlock\sherlock.txt")

In [70]:
#df = spark.read.csv(SparkFiles.get("producer-prices_bra.csv"), header = True, sep = ",")
#text
df = spark.read.text(SparkFiles.get("sherlock.txt"))

In [71]:
df.show(5, False)

+----------------------------------------------------------------------------+
|value                                                                       |
+----------------------------------------------------------------------------+
|Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle|
|                                                                            |
|This eBook is for the use of anyone anywhere at no cost and with            |
|almost no restrictions whatsoever.  You may copy it, give it away or        |
|re-use it under the terms of the Project Gutenberg License included         |
+----------------------------------------------------------------------------+
only showing top 5 rows



In [72]:
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="value", outputCol="words")
df = tokenizer.transform(df)
df.show()

+--------------------+--------------------+
|               value|               words|
+--------------------+--------------------+
|Project Gutenberg...|[project, gutenbe...|
|                    |                  []|
|This eBook is for...|[this, ebook, is,...|
|almost no restric...|[almost, no, rest...|
|re-use it under t...|[re-use, it, unde...|
|with this eBook o...|[with, this, eboo...|
|                    |                  []|
|                    |                  []|
|Title: The Advent...|[title:, the, adv...|
|                    |                  []|
|Author: Arthur Co...|[author:, arthur,...|
|                    |                  []|
|Release Date: Nov...|[release, date:, ...|
|Last Updated: May...|[last, updated:, ...|
|                    |                  []|
|   Language: English|[language:, english]|
|                    |                  []|
|Character set enc...|[character, set, ...|
|                    |                  []|
|*** START OF THIS...|[***, star

## Criando um boolean UDF

In [73]:
short_udf = F.udf(lambda x:\
                 True if not x or len(x) < 6 else False, BooleanType())

In [74]:
df.select(short_udf('value').alias('is short'), 'value').show()

+--------+--------------------+
|is short|               value|
+--------+--------------------+
|   false|Project Gutenberg...|
|    true|                    |
|   false|This eBook is for...|
|   false|almost no restric...|
|   false|re-use it under t...|
|   false|with this eBook o...|
|    true|                    |
|    true|                    |
|   false|Title: The Advent...|
|    true|                    |
|   false|Author: Arthur Co...|
|    true|                    |
|   false|Release Date: Nov...|
|   false|Last Updated: May...|
|    true|                    |
|   false|   Language: English|
|    true|                    |
|   false|Character set enc...|
|    true|                    |
|   false|*** START OF THIS...|
+--------+--------------------+
only showing top 20 rows



## Praticando com coluna de Array

In [75]:
TRIVIAL_TOKENS = {'', 'u', 'p', '1', '4', 'r', '7', '0', 'g', 'x', 'n', 'v', '6',\
                  'e', 't', 'm', 'f', 'o', '9', 'z', 'k', '5', 's', 'w', 'b', 'h', \
                  'l', '3', '2', 'c', 'q', 'pp', 'j', '8', 'y'}

In [76]:
# UDF removes items in TRIVIAL_TOKENS from array
rm_trivial_udf = F.udf(lambda x:
                     list(set(x) - TRIVIAL_TOKENS) if x
                     else x,
                     ArrayType(StringType()))

In [84]:
df_after = df.withColumn('in', rm_trivial_udf('words'))\
                    .withColumn('out', rm_trivial_udf('words'))
df_after.show()

+--------------------+--------------------+--------------------+--------------------+
|               value|               words|                  in|                 out|
+--------------------+--------------------+--------------------+--------------------+
|Project Gutenberg...|[project, gutenbe...|[by, doyle, conan...|[by, doyle, conan...|
|                    |                  []|                  []|                  []|
|This eBook is for...|[this, ebook, is,...|[cost, at, this, ...|[cost, at, this, ...|
|almost no restric...|[almost, no, rest...|[copy, it,, away,...|[copy, it,, away,...|
|re-use it under t...|[re-use, it, unde...|[under, re-use, i...|[under, re-use, i...|
|with this eBook o...|[with, this, eboo...|[at, this, or, wi...|[at, this, or, wi...|
|                    |                  []|                  []|                  []|
|                    |                  []|                  []|                  []|
|Title: The Advent...|[title:, the, adv...|[adventures

In [85]:
df_after.where(F.array_contains('words','gutenberg')).show()

+--------------------+--------------------+--------------------+--------------------+
|               value|               words|                  in|                 out|
+--------------------+--------------------+--------------------+--------------------+
|re-use it under t...|[re-use, it, unde...|[under, re-use, i...|[under, re-use, i...|
|*** START OF THIS...|[***, start, of, ...|[start, project, ...|[start, project, ...|
|Produced by an an...|[produced, by, an...|[produced, by, vo...|[produced, by, vo...|
|End of the Projec...|[end, of, the, pr...|[by, adventures, ...|[by, adventures, ...|
|*** END OF THIS P...|[***, end, of, th...|[project, this, *...|[project, this, *...|
|Produced by an an...|[produced, by, an...|[produced, by, vo...|[produced, by, vo...|
|concept and trade...|[concept, and, tr...|[a, concept, is, ...|[a, concept, is, ...|
|THE FULL PROJECT ...|[the, full, proje...|[full, gutenberg,...|[full, gutenberg,...|
|1.C. The Project ...|[1.c., the, proje...|[("the, arc

## Criando um UDF Vetor

In [86]:
from pyspark.ml.feature import CountVectorizer

In [87]:
cv = CountVectorizer (inputCol = 'words', outputCol = 'features')

In [88]:
model = cv.fit(df)
result = model.transform(df)

In [89]:
result.show(5, False)

+----------------------------------------------------------------------------+----------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
|value                                                                       |words                                                                                   |features                                                                                                          |
+----------------------------------------------------------------------------+----------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
|Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle|[project, gutenberg's, the, adventures, of, sherlock, holmes,, by, arthur

In [91]:
df_after.show()

+--------------------+--------------------+--------------------+--------------------+
|               value|               words|                  in|                 out|
+--------------------+--------------------+--------------------+--------------------+
|Project Gutenberg...|[project, gutenbe...|[by, doyle, conan...|[by, doyle, conan...|
|                    |                  []|                  []|                  []|
|This eBook is for...|[this, ebook, is,...|[cost, at, this, ...|[cost, at, this, ...|
|almost no restric...|[almost, no, rest...|[copy, it,, away,...|[copy, it,, away,...|
|re-use it under t...|[re-use, it, unde...|[under, re-use, i...|[under, re-use, i...|
|with this eBook o...|[with, this, eboo...|[at, this, or, wi...|[at, this, or, wi...|
|                    |                  []|                  []|                  []|
|                    |                  []|                  []|                  []|
|Title: The Advent...|[title:, the, adv...|[adventures

In [93]:
result = model.transform(df_after.withColumnRenamed('in', 'words2')\
                               .withColumnRenamed('words2', 'in')\
                                .withColumnRenamed('vec', 'invec'))
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               value|               words|                  in|                 out|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Project Gutenberg...|[project, gutenbe...|[by, doyle, conan...|[by, doyle, conan...|(14556,[0,3,39,92...|
|                    |                  []|                  []|                  []|   (14556,[1],[1.0])|
|This eBook is for...|[this, ebook, is,...|[cost, at, this, ...|[cost, at, this, ...|(14556,[0,2,3,14,...|
|almost no restric...|[almost, no, rest...|[copy, it,, away,...|[copy, it,, away,...|(14556,[1,11,12,4...|
|re-use it under t...|[re-use, it, unde...|[under, re-use, i...|[under, re-use, i...|(14556,[0,3,11,12...|
|with this eBook o...|[with, this, eboo...|[at, this, or, wi...|[at, this, or, wi...|(14556,[17,20,28,...|
|                    |               

In [95]:
result.drop('sentence').show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               value|               words|                  in|                 out|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Project Gutenberg...|[project, gutenbe...|[by, doyle, conan...|[by, doyle, conan...|(14556,[0,3,39,92...|
|                    |                  []|                  []|                  []|   (14556,[1],[1.0])|
|This eBook is for...|[this, ebook, is,...|[cost, at, this, ...|[cost, at, this, ...|(14556,[0,2,3,14,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [99]:
result = model.transform(result.withColumnRenamed('out', 'words3'))\
        .withColumnRenamed('words3', 'out')\
        .withColumnRenamed('vec', 'outvec')
result.select('invec', 'outvec').show(3, False)	

IllegalArgumentException: 'requirement failed: Column features already exists.'

In [62]:
first_udf = F.udf(lambda x:
            float(x.indices[0]) 
            if (x and hasattr(x, "toArray") and x.numNonzeros())
            else 0.0,
            FloatType())

In [43]:
df.select(first_udf("in").alias("result")).distinct().show(5)

AnalysisException: "cannot resolve '`in`' given input columns: [value, words];;\n'Project [<lambda>('in) AS result#206]\n+- Project [value#0, UDF(value#0) AS words#7]\n   +- Relation[value#0] text\n"