<h1 style="text-align:center">Creación de un modelo de análisis de sentimientos</h1>

### Iniciamos datos, constantes y módulos

In [2]:
import os
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql import functions as F

# Obtain the Spark session through Spark NLP's start() helper; every later
# cell reads and transforms data through this `spark` handle.
spark = sparknlp.start()

# Print the Spark and Spark NLP versions so the run can be reproduced
# (the recorded run used Spark 3.1.1 and Spark NLP 3.0.1).
print(f"Versión de spark: {spark.version}")
print(f"Versión de sparknlp: {sparknlp.version()}")

Versión de spark: 3.1.1
Versión de sparknlp: 3.0.1


In [3]:
# Identifier of the model being trained; it is interpolated into the
# ./models/<MODEL_NAME> path used when saving and re-loading the classifier.
MODEL_NAME = 'webinar-sentiment-dl-v2'
# Relative directory handed to the classifier's setOutputLogsPath().
LOGS_PATH = os.path.join('.', 'logs', MODEL_NAME)

In [4]:
# Read the raw tweets CSV: the first line is the header and fields are
# comma-separated. No schema is supplied here.
# NOTE(review): tweet text can plausibly contain embedded newlines or quotes;
# confirm whether multiLine/escape reader options are needed for this file.
dataframe = (
    spark.read
    .options(header=True, delimiter=",")
    .csv("./data/archive/Tweets.csv")
)

### Creación de pipeline de entrenamiento

In [5]:
# Stage 1: wrap the raw "text" column into a "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder (default pretrained model),
# reading "document" and writing "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable classifier. It consumes the sentence embeddings, uses
# the "airline_sentiment" column as label, holds out 25% of the data for
# validation and writes its training logs under LOGS_PATH.
sentiment = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
    .setLabelColumn("airline_sentiment")
    .setMaxEpochs(100)
    .setValidationSplit(.25)
    .setEnableOutputLogs(True)
    .setOutputLogsPath(LOGS_PATH)
)

# Stage 4: expose the "sentiment" annotations as a plain "result" column.
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setOutputCols(["result"])
)


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [6]:
# Training pipeline. Stage order matters: the classifier sits at index 2,
# which is the index used later to extract and save the trained model.
training_stages = [documentAssembler, use, sentiment, finisher]
sentiment_pipeline = Pipeline(stages=training_stages)

In [7]:
# Keep only the tweet text and its label, discarding rows without text.
has_text = F.col("text").isNotNull()
dataframe = dataframe.select('text', 'airline_sentiment').filter(has_text)

# Show how many examples each sentiment class has.
class_counts = dataframe.groupBy('airline_sentiment').count()
class_counts.show()

+-----------------+-----+
|airline_sentiment|count|
+-----------------+-----+
|         positive| 2363|
|          neutral| 3099|
|         negative| 9170|
+-----------------+-----+



In [8]:
# Balance the classes by down-sampling "negative", which is roughly three
# times larger than either of the other two classes.
#
# The sample uses a fixed seed so that the training set (and therefore the
# trained model) does not change from one run to the next. The class counts
# recorded below were produced before the seed was introduced, so the
# "negative" count may differ slightly after a re-run.
#
# NOTE: this cell rebinds `dataframe`; re-running it without first re-running
# the cells above would down-sample the negatives a second time.
NEGATIVE_SAMPLE_FRACTION = 0.3  # approximate fraction of negative tweets kept
SAMPLE_SEED = 42                # fixed seed for a reproducible sample

pos_df = dataframe.where("airline_sentiment = 'positive'")
neu_df = dataframe.where("airline_sentiment = 'neutral'")
neg_df = dataframe.where("airline_sentiment = 'negative'").sample(
    fraction=NEGATIVE_SAMPLE_FRACTION,
    seed=SAMPLE_SEED,
)

dataframe = pos_df.union(neu_df).union(neg_df)

# Display the class distribution after balancing.
dataframe.groupBy('airline_sentiment').count().show()

+-----------------+-----+
|airline_sentiment|count|
+-----------------+-----+
|         positive| 2363|
|          neutral| 3099|
|         negative| 2766|
+-----------------+-----+



In [9]:
# Fit the training pipeline on the balanced dataframe. This trains the
# ClassifierDLApproach stage (configured above for up to 100 epochs) and
# yields a fitted pipeline whose stages are addressed by index below.
sentiment_model = sentiment_pipeline.fit(dataframe)

In [10]:
# Persist only the trained classifier: index 2 is the position the
# classifier stage occupies in sentiment_pipeline. Any model previously
# saved at the same path is overwritten.
trained_classifier = sentiment_model.stages[2]
trained_classifier.write().overwrite().save(f'./models/{MODEL_NAME}')

### Cargamos el modelo que entrenamos

In [22]:
# Inference pipeline: same layout as the training one, but it reads the
# "description" column and uses the classifier saved to disk.
# NOTE: `use` and `finisher` are rebound here and replace the training-time
# stage objects of the same name.

# Wrap the "description" column into a "document" annotation column.
document = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

# Pretrained Universal Sentence Encoder producing "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Classifier loaded from ./models/<MODEL_NAME>, writing to "class".
classsifierdl = (
    ClassifierDLModel.load(f'./models/{MODEL_NAME}')
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
)

# Expose the "class" annotations as a plain "result" column.
finisher = (
    Finisher()
    .setInputCols(["class"])
    .setOutputCols(["result"])
)

inference_stages = [document, use, classsifierdl, finisher]
pipeline = Pipeline(stages=inference_stages)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
from pyspark.sql.types import StringType

# Hand-written sentences used to smoke-test the loaded model.
test_sentences = [
    "I am disappointed in the service you have given me",
    "I loved traveling with you, thank you for everything",
    "Maybe",
]

# One string column, renamed to "description" to match the column the
# inference pipeline's DocumentAssembler reads.
dfTest = spark.createDataFrame(test_sentences, StringType()).toDF("description")

In [24]:
# Every stage of `pipeline` is already fitted or pretrained, so fit() is
# called only to turn the Pipeline into a PipelineModel that can transform.
# The placeholder frame therefore holds a single empty string. Its column is
# named "description" (not "text") so that it matches the column the
# pipeline's DocumentAssembler reads, and stays valid even if a trainable
# stage is added to this pipeline later.
pipeline_trained = pipeline.fit(spark.createDataFrame([['']]).toDF("description"))

In [26]:
# Run the test sentences through the fitted inference pipeline and print the
# predictions without truncating the text column.
# NOTE(review): the recorded output below shows two rows with different
# wording from the current dfTest definition (three sentences), so it
# predates the last edit of that cell; re-run the notebook top to bottom.
predictions = pipeline_trained.transform(dfTest)
predictions.show(truncate=False)

+---------------------------------------------------+----------+
|description                                        |result    |
+---------------------------------------------------+----------+
|I am disappointed in the service you have given me |[negative]|
|I love traveling with you, thank you for everything|[positive]|
+---------------------------------------------------+----------+

