# Khai báo biến môi trường

In [None]:
import os

# Select the Python executable PySpark should launch; both variables are set
# to the same interpreter name.
os.environ.update(
    PYSPARK_PYTHON="python3.7",
    PYSPARK_DRIVER_PYTHON="python3.7",
)

# Khởi tạo Spark Session

In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create (or reuse) the session named "ModelSentiment" against the given
# standalone master URL.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ModelSentiment")
    .getOrCreate()
)

# Đọc dữ liệu sau khi gán nhãn để huấn luyện mô hình

In [3]:
# Load the labelled comments from HDFS, treating the first CSV row as the header.
# NOTE(review): the per-label counts printed further down include stray
# sentiment values such as 'however' and 'though."', which looks like quoted
# comment text being split across columns. Confirm the file's quoting/escaping
# convention and consider the CSV reader's `quote` / `escape` / `multiLine`
# options before relying on these rows.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("hdfs://namenode/user/root/input/data_sentiment.csv")
)

In [4]:
# Show the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [5]:
# Discard rows containing a null in any column, then remove exact duplicate rows.
df = df.dropna().distinct()

# Xử lí dữ liệu

In [6]:
# Count rows per distinct sentiment value, most frequent first (up to 30 shown).
(
    df.groupBy("sentiment")
    .count()
    .orderBy("count", ascending=False)
    .show(30)
)

+------------------+------+
|         sentiment| count|
+------------------+------+
|                 1|678723|
|                -1|354761|
|                  |    95|
|                ,1|    20|
|           however|    20|
|          though."|    10|
|       thank you."|     8|
|             etc."|     7|
|               ,-1|     7|
|            though|     6|
|              the |     6|
|              but |     6|
|                I |     6|
|          thanks."|     5|
|               but|     5|
| please fix this."|     5|
|              well|     5|
|                )"|     5|
|                 2|     5|
|           thanks"|     4|
|              and |     4|
|               etc|     4|
|            thanks|     4|
|            simple|     4|
|             still|     3|
|         otherwise|     3|
|               too|     3|
|               fun|     3|
|            but I |     3|
|         nothing."|     3|
+------------------+------+
only showing top 30 rows



In [7]:
# Keep only rows whose sentiment is exactly the string "1" or "-1".
df = df.where(df["sentiment"].isin("1", "-1"))

In [8]:
# Re-check the label distribution after filtering, most frequent first.
(
    df.groupBy("sentiment")
    .count()
    .orderBy("count", ascending=False)
    .show(30)
)

+---------+------+
|sentiment| count|
+---------+------+
|        1|678723|
|       -1|354761|
+---------+------+



In [9]:
# Randomly partition the rows into training and test sets using 0.7 / 0.3
# weights; a fixed seed is passed for reproducibility.
trainingData, testData = df.randomSplit([0.7, 0.3], seed=100)

In [10]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, OneHotEncoder, StringIndexer, VectorAssembler

# Tokenise the "comment" column into "tokens" using a regex that matches
# whitespace runs or any of the characters , . ( ) "
tokenizer = RegexTokenizer(
    inputCol="comment",
    outputCol="tokens",
    pattern="\\s+|[,.()\"]",
)

# Stop-word list handed to the remover. It is empty, so the remover passes
# every token through unchanged. The built-in English list is available via
# StopWordsRemover.loadDefaultStopWords(language='english') if it is wanted.
add_stopwords = []

remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="commentTokens",
    stopWords=add_stopwords,
)

# Bag-of-words term counts over the remaining tokens.
cv = CountVectorizer(
    inputCol="commentTokens",
    outputCol="cv",
    vocabSize=200000,
    minDF=5,
)

# Inverse-document-frequency weighting of the counts, written to "features".
idf = IDF(inputCol="cv", outputCol="features")

# Index the string sentiment values into a numeric "label" column.
label = StringIndexer(inputCol="sentiment", outputCol="label")

# Huấn luyện mô hình

In [11]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression with elastic-net regularisation settings.
lr = LogisticRegression(
    maxIter=100,
    regParam=0.02,
    elasticNetParam=0.3,
)

In [12]:
from pyspark.ml import Pipeline

# Chain the feature stages and the classifier, in execution order.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        cv,
        idf,
        label,
        lr,
    ]
)

In [13]:
# Fit every pipeline stage on the training split; `model` is the fitted pipeline.
model = pipeline.fit(trainingData)

# Đánh giá mô hình

In [14]:
# Run the fitted pipeline over the held-out test split.
predictions = model.transform(testData)

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test-set predictions. The arguments spell out the evaluator's
# defaults, so the value displayed is the area under the ROC curve, not accuracy.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

evaluator.evaluate(predictions)

0.9583883370104008

# Lưu mô hình để sử dụng phân loại comment

In [16]:
# Persist the fitted pipeline to HDFS so it can be loaded later to classify comments.
# write().overwrite() replaces a model left behind by an earlier run; a plain
# save() raises when the target path already exists, which makes the notebook
# fail on a second "Run All".
model.write().overwrite().save("hdfs://namenode/user/root/model")