> # Mestrado Engenharia Informática
>
> ## **Algoritmos para Big Data**

> ### **Felipe Silva** # 121851

> ### **Fernando Piedade** # 109266

> **_*2023/24*_**


# **Model Training**


# Initial settings


## Additional packages and imports


In [1]:
import findspark, pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [2]:
import os
import shutil

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Hide every Python warning raised after this point (applies to all categories and modules)
warnings.filterwarnings("ignore")

In [3]:
import nltk

nltk.download("wordnet")

from nltk.stem import WordNetLemmatizer

# Lemmatization reduces words to their root or base form, known as the lemma. It is performed with WordNetLemmatizer from the nltk.stem module:
# the lemmatizer's lemmatize() function is called on each token in the text and the results are then combined.


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/felipesilva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import RegexTokenizer # tokenizer
from pyspark.ml.feature import HashingTF, IDF # vectorizer
from pyspark.ml.feature import StopWordsRemover # to remove stop words
from pyspark.ml.classification import LogisticRegression, LinearSVC, OneVsRest, NaiveBayes # ml models
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # to evaluate the model
from pyspark.mllib.evaluation import MulticlassMetrics # # performance metrics

In [5]:
# Create the Spark session

findspark.init()
findspark.find()

# Spark configuration keys are case-sensitive: the eager-evaluation switch is
# "spark.sql.repl.eagerEval.enabled". The previous all-lowercase spelling was
# accepted silently but had no effect.
spark = (
    SparkSession.builder
    .appName("ADBFinacialReportsSecModelTraining")
    .config("spark.sql.shuffle.partitions", 100)
    .config("spark.driver.memory", "16G")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/02 23:26:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Import from the public IPython.display module (IPython.core.display is a
# deprecated import location) and import `display` explicitly instead of
# relying on the name being injected by the notebook.
from IPython.display import HTML, display

# Stop <pre> output from wrapping so wide tables stay on a single line.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

## Collect data from local directory


In [7]:
# Directory containing the parquet splits. Keep the trailing slash: file names
# are appended to this value with plain string concatenation.
data_dir = "../Datasets/financial-reports-sec/parquet/large/"

## Useful functions


In [8]:
def remove_if_exists(path):
  """Delete `path` from the local filesystem if it is present.

  Real directories are removed recursively. Regular files and symbolic links
  (including links that point to directories) are unlinked, because
  `shutil.rmtree` raises on both. A path that does not exist is a no-op.

  Args:
    path: Local filesystem path to remove.
  """
  if os.path.isdir(path) and not os.path.islink(path):
    shutil.rmtree(path)
  elif os.path.lexists(path):
    # lexists also catches dangling symlinks, which os.path.exists reports as missing
    os.remove(path)

## Apply Lemmatizer


In [9]:
class Lemmatizer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
  """Pipeline stage that lemmatizes an array-of-strings column with NLTK's WordNetLemmatizer.

  The DefaultParamsReadable / DefaultParamsWritable mixins let the stage be
  saved and loaded together with the Pipeline that contains it.

  Args:
    inputCol: Name of the column holding the token arrays to lemmatize.
    outputCol: Name of the column that receives the lemmatized token arrays.
  """

  def __init__(self, inputCol=None, outputCol=None):
    super(Lemmatizer, self).__init__()
    self.lemmatizer = WordNetLemmatizer()
    # Record only the columns the caller actually supplied. Writing None into
    # the param maps would override any default declared by the shared-param
    # mixins and leave the getters returning None.
    supplied = {
      param: value
      for param, value in (("inputCol", inputCol), ("outputCol", outputCol))
      if value is not None
    }
    if supplied:
      self._set(**supplied)

  def _transform(self, df):
    """Return `df` with the output column holding the lemma of every input token."""
    # Bind the lemmatizer to a local name so the UDF closure captures only that
    # object rather than the whole transformer (`self`).
    wordnet = self.lemmatizer

    def lemmatize_tokens(words):
      # A null array stays null instead of raising inside the Python worker.
      if words is None:
        return None
      return [wordnet.lemmatize(word) for word in words]

    lemmatize_udf = F.udf(lemmatize_tokens, ArrayType(StringType()))
    return df.withColumn(self.getOutputCol(), lemmatize_udf(df[self.getInputCol()]))

## Read parquet files


In [10]:
# Load the held-out and training splits from the dataset directory
df_test = spark.read.parquet(os.path.join(data_dir, "test.parquet"))
df_train = spark.read.parquet(os.path.join(data_dir, "train.parquet"))

## Tokenizer


In [11]:
# Split the "report" text into a list of words, breaking on non-word characters
tokenizer = RegexTokenizer(
  inputCol="report",
  outputCol="words",
  pattern="\\W",
)

# Drop stop words (is, the, in, ...) from the token list
stopwords_remover = StopWordsRemover(
  inputCol="words",
  outputCol="filtered",
)

# Reduce each remaining word to its lemma
lemmatizer = Lemmatizer(
  inputCol="filtered",
  outputCol="lemmatized",
)

# Term-frequency vectors: one hasher reads the filtered tokens, the other the
# lemmatized tokens; both write to the same output column name
hashing_tf = HashingTF(
  inputCol="filtered",
  outputCol="raw_features",
  numFeatures=50000,
)
hashing_tf_with_lemmatizer = HashingTF(
  inputCol="lemmatized",
  outputCol="raw_features",
  numFeatures=50000,
)

# Rescale the term frequencies by inverse document frequency
idf = IDF(
  inputCol="raw_features",
  outputCol="features",
)

## Classification Models

### Logistic Regression

### Linear SVC

### Naive Bayes


In [12]:
# Base estimators
lr = LogisticRegression(maxIter=50, regParam=0.3)
lsvc = LinearSVC(regParam=0.3, maxIter=50)
nb = NaiveBayes()

# One-vs-rest wrappers, one per base estimator
ovr_lr = OneVsRest(classifier=lr)
ovr_lsvc = OneVsRest(classifier=lsvc)
ovr_nb = OneVsRest(classifier=nb)

In [13]:
# Pipelines on the stop-word-filtered tokens
pipeline_lr = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, ovr_lr])
pipeline_lsvc = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, ovr_lsvc])
pipeline_nb = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, ovr_nb])

# Same pipelines with a lemmatization step before hashing. The logistic
# regression variant uses the one-vs-rest wrapper (ovr_lr), like its
# non-lemmatized counterpart, so the two runs differ only in the lemmatizer.
pipeline_lr_lemmatizer = Pipeline(stages=[tokenizer, stopwords_remover, lemmatizer, hashing_tf_with_lemmatizer, idf, ovr_lr])
pipeline_lsvc_lemmatizer = Pipeline(stages=[tokenizer, stopwords_remover, lemmatizer, hashing_tf_with_lemmatizer, idf, ovr_lsvc])
pipeline_nb_lemmatizer = Pipeline(stages=[tokenizer, stopwords_remover, lemmatizer, hashing_tf_with_lemmatizer, idf, ovr_nb])

In [14]:
# Registry of every pipeline to train; "name" is also used as the folder name
# when the pipeline and its fitted model are saved
pipelines = [
  {"name": label, "pipeline": candidate}
  for label, candidate in (
    ("LogisticRegression", pipeline_lr),
    ("LinearSVC", pipeline_lsvc),
    ("NaiveBayes", pipeline_nb),
    ("LogisticRegression_Lemmatized", pipeline_lr_lemmatizer),
    ("LinearSVC_Lemmatized", pipeline_lsvc_lemmatizer),
    ("NaiveBayes_Lemmatized", pipeline_nb_lemmatizer),
  )
]

## Training Pipeline Models


In [15]:
for el in pipelines:
  name, pipeline = el["name"], el["pipeline"]
  print(name)

  # Save the unfitted pipeline definition, replacing any earlier copy
  path = f"pipelines/{name}"
  remove_if_exists(path)
  pipeline.save(path)

  # Fit on the training split, then save the fitted model the same way
  pipeline_model = pipeline.fit(df_train)
  path = f"pipelines_model/{name}"
  remove_if_exists(path)
  pipeline_model.save(path)


LogisticRegression


                                                                                

LinearSVC


                                                                                

NaiveBayes


                                                                                

LogisticRegression_Lemmatized


24/06/03 00:01:36 WARN TaskSetManager: Stage 2888 contains a task of very large size (3197 KiB). The maximum recommended task size is 1000 KiB.


LinearSVC_Lemmatized


                                                                                

NaiveBayes_Lemmatized


                                                                                