# Text Classification

Este fará usando conceitos de Machine Learning uma previsão da última palavra de uma frase/setença de uma pessoa. 

In [49]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark
import pandas as pd
import os

In [3]:
spark = SparkSession\
        .builder\
        .appName("Spark Endword Prediction  - Fabio Kfouri")\
        .getOrCreate()
spark

In [7]:
#read book
df = spark.read.text("sherlock\sherlock.txt")
df.show(3)

+--------------------+
|               value|
+--------------------+
|Project Gutenberg...|
|                    |
|This eBook is for...|
+--------------------+
only showing top 3 rows



In [137]:
# transformando em minuscula
df1 = df.select(F.lower(F.col('value')).alias('value'))
# Replace de termos
df2 = df1.select(F.regexp_replace('value', 'mr\.', 'mr').alias('value'))
df2 = df1.select(F.regexp_replace('value', 'don\'t', 'do not').alias('value'))
# Tokenizando e removendo simbolos indesejados
punctuation = "_|.\?\!\",\'\[\]\*():;<>”“’"
df3 = df2.select(F.split('value', '[ %s]' % punctuation).alias('word'))
df3.show(5, False)

+------------------------------------------------------------------------------------------+
|word                                                                                      |
+------------------------------------------------------------------------------------------+
|[project, gutenberg, s, the, adventures, of, sherlock, holmes, , by, arthur, conan, doyle]|
|[]                                                                                        |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]           |
|[almost, no, restrictions, whatsoever, , , you, may, copy, it, , give, it, away, or]      |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license, included]           |
+------------------------------------------------------------------------------------------+
only showing top 5 rows



#### Obtendo o ultimo elemento do array

In [138]:
#df4 = df3.filter('doc <> []').withColumn("last", df3['doc'][-1])
#where(F.size('doc') > 0)
df4 = df3.withColumn("endword", F.element_at(F.col("word"),-1)).where(F.length('endword') > 0)
df4.show(5, False)

+------------------------------------------------------------------------------------------+--------+
|word                                                                                      |endword |
+------------------------------------------------------------------------------------------+--------+
|[project, gutenberg, s, the, adventures, of, sherlock, holmes, , by, arthur, conan, doyle]|doyle   |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]           |with    |
|[almost, no, restrictions, whatsoever, , , you, may, copy, it, , give, it, away, or]      |or      |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license, included]           |included|
|[with, this, ebook, or, online, at, www, gutenberg, net]                                  |net     |
+------------------------------------------------------------------------------------------+--------+
only showing top 5 rows



Selecionando/separando as sentenças cujo a ultima palavra seja she, he, hers, his, her, him e com as que não tem essas palavras

In [139]:
df_true = df4.where("endword in ('she', 'he', 'hers', 'his', 'her', 'him')")\
            .withColumn('label', F.lit(1))

df_false = df4.where("endword not in ('she', 'he', 'hers', 'his', 'her', 'him')")\
              .withColumn('label', F.lit(0))

print(df4.count(), df_true.count(), df_false.count())

6337 266 6071


Combinando o postivo com o negativo

In [140]:
df5 = df_true.union(df_false)
df5.show(5, False)
print(df5.count())

+--------------------------------------------------------------------------------------+-------+-----+
|word                                                                                  |endword|label|
+--------------------------------------------------------------------------------------+-------+-----+
|[to, sherlock, holmes, she, is, always, , the, , woman, , i, have, seldom, heard, him]|him    |1    |
|[were, abhorrent, to, his, cold, , precise, but, admirably, balanced, mind, , he]     |he     |1    |
|[from, time, to, time, i, heard, some, vague, account, of, his, doings, , of, his]    |his    |1    |
|[keen, desire, to, see, holmes, again, , and, to, know, how, he, was, employing, his] |his    |1    |
|[against, the, blind, , he, was, pacing, the, room, swiftly, , eagerly, , with, his]  |his    |1    |
+--------------------------------------------------------------------------------------+-------+-----+
only showing top 5 rows

6337


In [141]:
df5.sample(False, .1, 42).show(5, False)

+----------------------------------------------------------------------------------------+-------+-----+
|word                                                                                    |endword|label|
+----------------------------------------------------------------------------------------+-------+-----+
|[writes, upon, bohemian, paper, and, prefers, wearing, a, mask, to, showing, his]       |his    |1    |
|[, there, are, three, hundred, pounds, in, gold, and, seven, hundred, in, notes, , , he]|he     |1    |
|[investigation, which, my, friend, had, on, hand, , there, was, something, in, his]     |his    |1    |
|[she, half, drew, it, out, , when, i, cried, out, that, it, was, a, false, alarm, , she]|she    |1    |
|[an, absolute, imbecile, in, his, profession, , he, has, one, positive, virtue, , he]   |he     |1    |
+----------------------------------------------------------------------------------------+-------+-----+
only showing top 5 rows



## Tranformando Texto para Formato Vetor

In [142]:
TRIVIAL_TOKENS = {'', 'u', 'p', '1', '4', 'r', '7', '0', 'g', 'x', 'n', 'v', '6',\
                  'e', 't', 'm', 'f', 'o', '9', 'z', 'k', '5', 's', 'w', 'b', 'h', \
                  'l', '3', '2', 'c', 'q', 'pp', 'j', '8', 'y'}

In [143]:
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, ArrayType

# UDF removes items in TRIVIAL_TOKENS from array
rm_trivial_udf = F.udf(lambda x:
                     list(set(x) - TRIVIAL_TOKENS) if x
                     else x,
                     ArrayType(StringType()))

In [144]:
df6 = df5.withColumn('in', rm_trivial_udf('word'))
df6.show()

+--------------------+-------+-----+--------------------+
|                word|endword|label|                  in|
+--------------------+-------+-----+--------------------+
|[to, sherlock, ho...|    him|    1|[woman, holmes, i...|
|[were, abhorrent,...|     he|    1|[cold, admirably,...|
|[from, time, to, ...|    his|    1|[his, account, i,...|
|[keen, desire, to...|    his|    1|[again, was, his,...|
|[against, the, bl...|    his|    1|[pacing, room, wa...|
|[i, could, not, h...|    his|    1|[which, at, his, ...|
|[writes, upon, bo...|    his|    1|[writes, his, wea...|
|[barbaric, opulen...|     he|    1|[by, which, was, ...|
|[, five, attempts...|    her|    1|[ransacked, twice...|
|[she, will, do, i...|    she|    1|[it, soul, a, not...|
|[, there, are, th...|     he|    1|[notes, gold, he,...|
|[investigation, w...|    his|    1|[something, which...|
|[times, before, i...|     he|    1|[certain, was, in...|
|[tweed-suited, an...|    his|    1|[of, his, old, as...|
|[, well, , re

### Modo 1 - Criando Model

In [162]:
from pyspark.ml.feature import CountVectorizer


In [163]:
cv = CountVectorizer (inputCol = 'word', outputCol = 'features')

In [164]:
model = cv.fit(df6)
df7 = model.transform(df6)
df7.show(5)

+--------------------+-------+-----+--------------------+--------------------+
|                word|endword|label|                  in|            features|
+--------------------+-------+-----+--------------------+--------------------+
|[to, sherlock, ho...|    him|    1|[woman, holmes, i...|(7659,[0,1,3,6,14...|
|[were, abhorrent,...|     he|    1|[cold, admirably,...|(7659,[0,6,11,13,...|
|[from, time, to, ...|    his|    1|[his, account, i,...|(7659,[0,3,4,6,13...|
|[keen, desire, to...|    his|    1|[again, was, his,...|(7659,[0,2,6,10,1...|
|[against, the, bl...|    his|    1|[pacing, room, wa...|(7659,[0,1,10,11,...|
+--------------------+-------+-----+--------------------+--------------------+
only showing top 5 rows



In [165]:
df8 = model.transform(df5.withColumnRenamed('in', 'words'))\
        .withColumnRenamed('words', 'in')\
        .withColumnRenamed('vec', 'invec')
df8.drop('sentence').show(5)

+--------------------+-------+-----+--------------------+
|                word|endword|label|            features|
+--------------------+-------+-----+--------------------+
|[to, sherlock, ho...|    him|    1|(7659,[0,1,3,6,14...|
|[were, abhorrent,...|     he|    1|(7659,[0,6,11,13,...|
|[from, time, to, ...|    his|    1|(7659,[0,3,4,6,13...|
|[keen, desire, to...|    his|    1|(7659,[0,2,6,10,1...|
|[against, the, bl...|    his|    1|(7659,[0,1,10,11,...|
+--------------------+-------+-----+--------------------+
only showing top 5 rows



## Split para base de treino e base de teste
70% do dataset para a base de treino e 30% para a base de teste, com seed de 42

In [166]:
df_train, df_test = df8.randomSplit((.70, .30), 42)
print(df_train.count(), df_test.count())

4434 1903


## Treinamento usado Regressão Logística

In [167]:
from pyspark.ml.classification import LogisticRegression

Numero de iteracoes de treinamento é 100.

Os parametros regParam e elasticNetParam são associados a regularização liquida elastica

In [174]:
logistic = LogisticRegression(maxIter=100, regParam =0.4, elasticNetParam = 0.0)

Treinamento do modelo usando a funcao fit.

In [175]:
df_fitted = logistic.fit(df_train)
df_fitted

LogisticRegressionModel: uid = LogisticRegression_759de081ecf7, numClasses = 2, numFeatures = 7659

In [176]:
print("Training iterations: ", df_fitted.summary.totalIterations)

Training iterations:  18
