# Text Classification

Este fará usando conceitos de Machine Learning uma previsão da última palavra de uma frase/setença de uma pessoa. 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark
import pandas as pd
import os

In [2]:
spark = SparkSession\
        .builder\
        .appName("Spark Endword Prediction  - Fabio Kfouri")\
        .getOrCreate()
spark

In [3]:
#read book
df = spark.read.text("sherlock\sherlock.txt")
df.show(3)

+--------------------+
|               value|
+--------------------+
|Project Gutenberg...|
|                    |
|This eBook is for...|
+--------------------+
only showing top 3 rows



In [4]:
# transformando em minuscula
df1 = df.select(F.lower(F.col('value')).alias('value'))
# Replace de termos
df2 = df1.select(F.regexp_replace('value', 'mr\.', 'mr').alias('value'))
df2 = df1.select(F.regexp_replace('value', 'don\'t', 'do not').alias('value'))
# Tokenizando e removendo simbolos indesejados
punctuation = "_|.\?\!\",\'\[\]\*():;<>”“’"
df3 = df2.select(F.split('value', '[ %s]' % punctuation).alias('word'))
df3.show(5, False)

+------------------------------------------------------------------------------------------+
|word                                                                                      |
+------------------------------------------------------------------------------------------+
|[project, gutenberg, s, the, adventures, of, sherlock, holmes, , by, arthur, conan, doyle]|
|[]                                                                                        |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]           |
|[almost, no, restrictions, whatsoever, , , you, may, copy, it, , give, it, away, or]      |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license, included]           |
+------------------------------------------------------------------------------------------+
only showing top 5 rows



#### Obtendo o ultimo elemento do array

In [5]:
#df4 = df3.filter('doc <> []').withColumn("last", df3['doc'][-1])
#where(F.size('doc') > 0)
df4 = df3.withColumn("endword", F.element_at(F.col("word"),-1)).where(F.length('endword') > 0)
df4.show(5, False)

+------------------------------------------------------------------------------------------+--------+
|word                                                                                      |endword |
+------------------------------------------------------------------------------------------+--------+
|[project, gutenberg, s, the, adventures, of, sherlock, holmes, , by, arthur, conan, doyle]|doyle   |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]           |with    |
|[almost, no, restrictions, whatsoever, , , you, may, copy, it, , give, it, away, or]      |or      |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license, included]           |included|
|[with, this, ebook, or, online, at, www, gutenberg, net]                                  |net     |
+------------------------------------------------------------------------------------------+--------+
only showing top 5 rows



### Removendo a ultima palavra

In [6]:
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, ArrayType

# UDF remove a ultima palavra
rm_last_word = F.udf(lambda x:
                     x[:-1] ,
                     ArrayType(StringType()))

In [7]:
df4 = df4.withColumn('word', rm_last_word('word'))
df4.show(5, False)

+-----------------------------------------------------------------------------------+--------+
|word                                                                               |endword |
+-----------------------------------------------------------------------------------+--------+
|[project, gutenberg, s, the, adventures, of, sherlock, holmes, , by, arthur, conan]|doyle   |
|[this, ebook, is, for, the, use, of, anyone, anywhere, at, no, cost, and]          |with    |
|[almost, no, restrictions, whatsoever, , , you, may, copy, it, , give, it, away]   |or      |
|[re-use, it, under, the, terms, of, the, project, gutenberg, license]              |included|
|[with, this, ebook, or, online, at, www, gutenberg]                                |net     |
+-----------------------------------------------------------------------------------+--------+
only showing top 5 rows



Selecionando/separando as sentenças cujo a ultima palavra seja she, he, hers, his, her, him e com as que não tem essas palavras

In [8]:
df_true = df4.where("endword in ('she', 'he', 'hers', 'his', 'her', 'him','them', 'us', 'they','himself', 'herself', 'we')")\
            .withColumn('label', F.lit(1))

df_false = df4.where("endword not in ('she', 'he', 'hers', 'his', 'her', 'him','them', 'us', 'they','himself', 'herself', 'we')")\
              .withColumn('label', F.lit(0))

print(df4.count(), df_true.count(), df_false.count())

6337 342 5995


Combinando o postivo com o negativo

In [9]:
df5 = df_true.union(df_false)
df5.show(5, False)
print(df5.count())

+---------------------------------------------------------------------------------+-------+-----+
|word                                                                             |endword|label|
+---------------------------------------------------------------------------------+-------+-----+
|[to, sherlock, holmes, she, is, always, , the, , woman, , i, have, seldom, heard]|him    |1    |
|[were, abhorrent, to, his, cold, , precise, but, admirably, balanced, mind, ]    |he     |1    |
|[from, time, to, time, i, heard, some, vague, account, of, his, doings, , of]    |his    |1    |
|[keen, desire, to, see, holmes, again, , and, to, know, how, he, was, employing] |his    |1    |
|[against, the, blind, , he, was, pacing, the, room, swiftly, , eagerly, , with]  |his    |1    |
+---------------------------------------------------------------------------------+-------+-----+
only showing top 5 rows

6337


In [10]:
df5.sample(False, .1, 42).show(5, False)

+-------------------------------------------------------------------------------------+-------+-----+
|word                                                                                 |endword|label|
+-------------------------------------------------------------------------------------+-------+-----+
|[, quite, so, , , he, answered, , lighting, a, cigarette, , and, throwing]           |himself|1    |
|[, five, attempts, have, been, made, , twice, burglars, in, my, pay, ransacked]      |her    |1    |
|[she, will, do, it, , you, do, not, know, her, , but, she, has, a, soul, of, steel, ]|she    |1    |
|[, she, will, not, be, able, to, , but, i, hear, the, rumble, of, wheels, , it, is]  |her    |1    |
|[the, hurrying, swarm, of, pedestrians, , it, was, difficult, to, realise, as]       |we     |1    |
+-------------------------------------------------------------------------------------+-------+-----+
only showing top 5 rows



## Tranformando Texto para Formato Vetor

In [11]:
TRIVIAL_TOKENS = {'', 'u', 'p', '1', '4', 'r', '7', '0', 'g', 'x', 'n', 'v', '6',\
                  'e', 't', 'm', 'f', 'o', '9', 'z', 'k', '5', 's', 'w', 'b', 'h', \
                  'l', '3', '2', 'c', 'q', 'pp', 'j', '8', 'y'}

In [12]:
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, ArrayType

# UDF removes items in TRIVIAL_TOKENS from array
rm_trivial_udf = F.udf(lambda x:
                     list(set(x) - TRIVIAL_TOKENS) if x
                     else x,
                     ArrayType(StringType()))

In [13]:
df6 = df5.withColumn('in', rm_trivial_udf('word'))
df6.show(3, False)

+---------------------------------------------------------------------------------+-------+-----+---------------------------------------------------------------------------+
|word                                                                             |endword|label|in                                                                         |
+---------------------------------------------------------------------------------+-------+-----+---------------------------------------------------------------------------+
|[to, sherlock, holmes, she, is, always, , the, , woman, , i, have, seldom, heard]|him    |1    |[woman, holmes, i, to, is, seldom, always, sherlock, the, she, have, heard]|
|[were, abhorrent, to, his, cold, , precise, but, admirably, balanced, mind, ]    |he     |1    |[cold, admirably, his, abhorrent, to, but, precise, balanced, mind, were]  |
|[from, time, to, time, i, heard, some, vague, account, of, his, doings, , of]    |his    |1    |[his, account, i, time, vague, to

### Modo 1 - Criando Model

In [14]:
from pyspark.ml.feature import CountVectorizer


In [15]:
cv = CountVectorizer (inputCol = 'word', outputCol = 'features')

In [16]:
model = cv.fit(df6)
df7 = model.transform(df6)
df7.show(5)

+--------------------+-------+-----+--------------------+--------------------+
|                word|endword|label|                  in|            features|
+--------------------+-------+-----+--------------------+--------------------+
|[to, sherlock, ho...|    him|    1|[woman, holmes, i...|(7424,[0,1,3,5,14...|
|[were, abhorrent,...|     he|    1|[cold, admirably,...|(7424,[0,5,13,23,...|
|[from, time, to, ...|    his|    1|[his, account, i,...|(7424,[0,3,4,5,13...|
|[keen, desire, to...|    his|    1|[again, was, he, ...|(7424,[0,2,5,10,1...|
|[against, the, bl...|    his|    1|[pacing, room, wa...|(7424,[0,1,10,11,...|
+--------------------+-------+-----+--------------------+--------------------+
only showing top 5 rows



In [17]:
df8 = model.transform(df5.withColumnRenamed('in', 'words'))\
        .withColumnRenamed('words', 'in')\
        .withColumnRenamed('vec', 'invec')
df8.drop('sentence').show(5)

+--------------------+-------+-----+--------------------+
|                word|endword|label|            features|
+--------------------+-------+-----+--------------------+
|[to, sherlock, ho...|    him|    1|(7424,[0,1,3,5,14...|
|[were, abhorrent,...|     he|    1|(7424,[0,5,13,23,...|
|[from, time, to, ...|    his|    1|(7424,[0,3,4,5,13...|
|[keen, desire, to...|    his|    1|(7424,[0,2,5,10,1...|
|[against, the, bl...|    his|    1|(7424,[0,1,10,11,...|
+--------------------+-------+-----+--------------------+
only showing top 5 rows



## Split para base de treino e base de teste
70% do dataset para a base de treino e 30% para a base de teste, com seed de 42

In [18]:
df_train, df_test = df8.randomSplit((.70, .30), 42)
print(df_train.count(), df_test.count())

4434 1903


## Treinamento usado Regressão Logística

In [19]:
from pyspark.ml.classification import LogisticRegression

Numero de iteracoes de treinamento é 100.

Os parametros regParam e elasticNetParam são associados a regularização liquida elastica

In [20]:
logistic = LogisticRegression(maxIter=100, regParam =0.4, elasticNetParam = 0.0)

Treinamento do modelo usando a funcao fit.

In [21]:
df_trained = logistic.fit(df_train)
df_trained

LogisticRegressionModel: uid = LogisticRegression_6502c330ea17, numClasses = 2, numFeatures = 7424

In [22]:
print("Training iterations: ", df_trained.summary.totalIterations)

Training iterations:  18


## Aplicando um modelo para avaliação

In [23]:
predicted = df_trained.transform(df_test)
predicted.show(3)

+--------------------+-------+-----+--------------------+--------------------+--------------------+----------+
|                word|endword|label|            features|       rawPrediction|         probability|prediction|
+--------------------+-------+-----+--------------------+--------------------+--------------------+----------+
|[, and, the, lady...|    his|    1|(7424,[0,1,2,3,14...|[3.23825423668798...|[0.96224874387154...|       0.0|
|[, but, this, mai...|    her|    1|(7424,[0,3,5,7,19...|[2.87586248878459...|[0.94664025343764...|       0.0|
|[, colonel, lysan...|     he|    1|(7424,[0,6,20,21,...|[3.50581086098324...|[0.97085265437349...|       0.0|
+--------------------+-------+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [24]:
type(predicted)

pyspark.sql.dataframe.DataFrame

In [25]:
for x in predicted.take(8):
    print()
    if x.label != int(x.prediction):
        print("INCORRECT ==> ")
    for y in predicted.schema.names:
        print(y,":", x[y])


INCORRECT ==> 
word : ['', 'and', 'the', 'lady', '', 'i', 'fancy', '', 'is', 'miss', 'stoner', '', '', 'observed', 'holmes', '', 'shading']
endword : his
label : 1
features : (7424,[0,1,2,3,14,33,166,175,453,530,613,6669],[6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])
rawPrediction : [3.238254236687987,-3.238254236687987]
probability : [0.9622487438715439,0.037751256128456055]
prediction : 0.0

INCORRECT ==> 
word : ['', 'but', 'this', 'maid', '', 'alice', '', 'as', 'i', 'understand', '', 'deposes', 'that', 'she', 'went', 'to']
endword : her
label : 1
features : (7424,[0,3,5,7,19,23,29,35,201,275,660,921,4786],[4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])
rawPrediction : [2.8758624887845974,-2.8758624887845974]
probability : [0.9466402534376451,0.053359746562354905]
prediction : 0.0

INCORRECT ==> 
word : ['', 'colonel', 'lysander', 'stark', 'stopped', 'at', 'last', 'before', 'a', 'low', 'door', '', 'which']
endword : he
label : 1
features : (7424,[0,6,20,21,87,95,129,382

In [26]:
x = predicted.first
print("Right" if x.label == int(x.prediction) else "Wrong")

AttributeError: 'function' object has no attribute 'label'

In [27]:
x

<bound method DataFrame.first of DataFrame[word: array<string>, endword: string, label: int, features: vector, rawPrediction: vector, probability: vector, prediction: double]>

## Avaliando a acuracia da classificação

In [28]:
model_stats = df_trained.evaluate(df_test)
model_stats

<pyspark.ml.classification.BinaryLogisticRegressionSummary at 0x22cb82c0f48>

In [29]:
print("\nAcurracy: %2f" % model_stats.areaUnderROC)


Acurracy: 0.536431
