In [1]:
try:
    !pip install pyspark=="2.4.5"  --quiet
except:
    print("Running throw py file.")

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lower, col
import pandas as pd
import os

# Criando a session Spark

In [3]:
spark = SparkSession\
        .builder\
        .appName("Analise Sherlock homes - Fabio Kfouri")\
        .getOrCreate()

In [4]:
spark

In [43]:
#read book
df = spark.read.text("sherlock.txt")

Leitura da primeira linha da obra

In [44]:
print(df.first())

Row(value="Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle")


Quantidade de linhas

In [45]:
print(df.count())

12309


Visualizando um trecho da obra. Truncate setado como false permite a visualização de textos mais longos

In [46]:
df.show(15, truncate = False)

+----------------------------------------------------------------------------+
|value                                                                       |
+----------------------------------------------------------------------------+
|Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle|
|                                                                            |
|This eBook is for the use of anyone anywhere at no cost and with            |
|almost no restrictions whatsoever.  You may copy it, give it away or        |
|re-use it under the terms of the Project Gutenberg License included         |
|with this eBook or online at www.gutenberg.net                              |
|                                                                            |
|                                                                            |
|Title: The Adventures of Sherlock Holmes                                    |
|                                                   

Transformando em LowerCase e definido um Alias

In [47]:
df = df.select(lower(col('value')).alias('value'))
df.show(15, truncate = False)

+----------------------------------------------------------------------------+
|value                                                                       |
+----------------------------------------------------------------------------+
|project gutenberg's the adventures of sherlock holmes, by arthur conan doyle|
|                                                                            |
|this ebook is for the use of anyone anywhere at no cost and with            |
|almost no restrictions whatsoever.  you may copy it, give it away or        |
|re-use it under the terms of the project gutenberg license included         |
|with this ebook or online at www.gutenberg.net                              |
|                                                                            |
|                                                                            |
|title: the adventures of sherlock holmes                                    |
|                                                   

Replacing de textos e termos

In [48]:
df = df.select(F.regexp_replace('value', 'Mr\.', 'Mr').alias('value'))
df = df.select(F.regexp_replace('value', 'don\'t', 'do not').alias('value'))
#df = df.select(F.regexp_replace('value', '\'s', 'do not').alias('value'))

Tokenizing Text, retorna uma matriz de sequencias de caracteres

In [55]:
df1= df.select(F.split('Value', '[ ]').alias('words'))
df1.show(truncate=False)

+---------------------------------------------------------------------------------------------+
|words                                                                                        |
+---------------------------------------------------------------------------------------------+
|[project, gutenberg's, the, adventures, of, sherlock, holmes,, by, arthur, conan, doyle]     |
|[]                                                                                           |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]              |
|[almost, no, restrictions, whatsoever., , you, may, copy, it,, give, it, away, or]           |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license, included]              |
|[with, this, ebook, or, online, at, www.gutenberg.net]                                       |
|[]                                                                                           |
|[]                                     

Divide o texto e remove simbolos indesejados

In [56]:
punctuation = "_|.\?\!\",\'\[\]\*():;<>"
df2 = df.select(F.split('value', '[ %s]' % punctuation).alias('words'))
df2.show(truncate=False)

+---------------------------------------------------------------------------------------------------+
|words                                                                                              |
+---------------------------------------------------------------------------------------------------+
|[project, gutenberg, s, the, adventures, of, sherlock, holmes, , by, arthur, conan, doyle]         |
|[]                                                                                                 |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]                    |
|[almost, no, restrictions, whatsoever, , , you, may, copy, it, , give, it, away, or]               |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license, included]                    |
|[with, this, ebook, or, online, at, www, gutenberg, net]                                           |
|[]                                                                               

Explodindo em Array o campo words

In [57]:
df3 = df2.select(F.explode('words').alias('word'))
df3.show()

+----------+
|      word|
+----------+
|   project|
| gutenberg|
|         s|
|       the|
|adventures|
|        of|
|  sherlock|
|    holmes|
|          |
|        by|
|    arthur|
|     conan|
|     doyle|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
+----------+
only showing top 20 rows



Tamanho de comparacao entre os dataframes

In [58]:
print(df.count(), df3.count())

12309 126580


Removendo colunas vazias (empty)

In [59]:
noblank_df = df3.where(F.length('word') > 0)
print(df3.count(), noblank_df.count())

126580 110448


Adicionando um ID no dataset

In [61]:
df4 = noblank_df.select('word', F.monotonically_increasing_id().alias('id'))
df4.show()

+----------+---+
|      word| id|
+----------+---+
|   project|  0|
| gutenberg|  1|
|         s|  2|
|       the|  3|
|adventures|  4|
|        of|  5|
|  sherlock|  6|
|    holmes|  7|
|        by|  8|
|    arthur|  9|
|     conan| 10|
|     doyle| 11|
|      this| 12|
|     ebook| 13|
|        is| 14|
|       for| 15|
|       the| 16|
|       use| 17|
|        of| 18|
|    anyone| 19|
+----------+---+
only showing top 20 rows



### Particionando os dados

In [80]:
df5 = df4.withColumn('title', F.when(df4.id < 25000, 'Preface')
                             .when(df4.id < 25000, 'Chapter 1')
                             .when(df4.id < 25000, 'Chapter 2')
                             .otherwise('Chapter 3'))

In [81]:
df5.show()

+----------+---+-------+
|      word| id|  title|
+----------+---+-------+
|   project|  0|Preface|
| gutenberg|  1|Preface|
|         s|  2|Preface|
|       the|  3|Preface|
|adventures|  4|Preface|
|        of|  5|Preface|
|  sherlock|  6|Preface|
|    holmes|  7|Preface|
|        by|  8|Preface|
|    arthur|  9|Preface|
|     conan| 10|Preface|
|     doyle| 11|Preface|
|      this| 12|Preface|
|     ebook| 13|Preface|
|        is| 14|Preface|
|       for| 15|Preface|
|       the| 16|Preface|
|       use| 17|Preface|
|        of| 18|Preface|
|    anyone| 19|Preface|
+----------+---+-------+
only showing top 20 rows



Colocando uma nova coluna chamada part

In [82]:
df5 = df5.withColumn('part', F.when(df5.id < 25000, 0)
                             .when(df5.id < 25000, 1)
                             .when(df5.id < 25000, 2)
                             .otherwise(3))

In [83]:
df5.show()

+----------+---+-------+----+
|      word| id|  title|part|
+----------+---+-------+----+
|   project|  0|Preface|   0|
| gutenberg|  1|Preface|   0|
|         s|  2|Preface|   0|
|       the|  3|Preface|   0|
|adventures|  4|Preface|   0|
|        of|  5|Preface|   0|
|  sherlock|  6|Preface|   0|
|    holmes|  7|Preface|   0|
|        by|  8|Preface|   0|
|    arthur|  9|Preface|   0|
|     conan| 10|Preface|   0|
|     doyle| 11|Preface|   0|
|      this| 12|Preface|   0|
|     ebook| 13|Preface|   0|
|        is| 14|Preface|   0|
|       for| 15|Preface|   0|
|       the| 16|Preface|   0|
|       use| 17|Preface|   0|
|        of| 18|Preface|   0|
|    anyone| 19|Preface|   0|
+----------+---+-------+----+
only showing top 20 rows



Reparticionando baseado em uma coluna

In [93]:
df6 = df5.repartition(4, 'part')
print(df5.rdd.getNumPartitions(), df6.rdd.getNumPartitions(), )

1 4


In [90]:
df6.show()

+----------+---+-------+----+
|      word| id|  title|part|
+----------+---+-------+----+
|   project|  0|Preface|   0|
| gutenberg|  1|Preface|   0|
|         s|  2|Preface|   0|
|       the|  3|Preface|   0|
|adventures|  4|Preface|   0|
|        of|  5|Preface|   0|
|  sherlock|  6|Preface|   0|
|    holmes|  7|Preface|   0|
|        by|  8|Preface|   0|
|    arthur|  9|Preface|   0|
|     conan| 10|Preface|   0|
|     doyle| 11|Preface|   0|
|      this| 12|Preface|   0|
|     ebook| 13|Preface|   0|
|        is| 14|Preface|   0|
|       for| 15|Preface|   0|
|       the| 16|Preface|   0|
|       use| 17|Preface|   0|
|        of| 18|Preface|   0|
|    anyone| 19|Preface|   0|
+----------+---+-------+----+
only showing top 20 rows



In [97]:
df6.coalesce(1).write.csv('spark_output/df6')