In [1]:
import os
# Set both PySpark interpreter variables to the same python3.7 binary.
# This must run before the SparkSession is created further down.
os.environ.update(
    {
        "PYSPARK_PYTHON": "python3.7",
        "PYSPARK_DRIVER_PYTHON": "python3.7",
    }
)

In [2]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
[K     |████████████████████████████████| 14.5 MB 526 kB/s eta 0:00:01
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.19.4


In [3]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Connect to the Spark master at spark-master:7077 and obtain a session
# registered under the application name "ModelSentiment".
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ModelSentiment")
    .getOrCreate()
)

In [4]:
# Load the labelled comments from HDFS. The first row supplies the column names;
# no schema is given, so both columns arrive as strings.
# NOTE(review): the sentiment counts printed later contain fragments of comment
# text (e.g. `though."`), which suggests some quoted comments span several lines
# or embed quotes/commas that the default CSV options split incorrectly.
# Consider the reader's multiLine / escape options - TODO confirm against the raw file.
df = (
    spark.read
    .option("header", "true")
    .csv("hdfs://namenode/user/root/input/data_sentiment.csv")
)

In [5]:
# Print the column names and types Spark assigned when reading the CSV.
df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [6]:
# Discard rows that have a null in any column, then collapse exact duplicate rows.
df = df.dropna(how="any").dropDuplicates()

In [7]:
# Inspect the raw label distribution (most frequent first) to spot malformed
# values in the sentiment column before filtering.
(
    df.groupBy("sentiment")
    .count()
    .orderBy("count", ascending=False)
    .show(30)
)

+------------------+------+
|         sentiment| count|
+------------------+------+
|                 1|678723|
|                -1|354761|
|                  |    95|
|           however|    20|
|                ,1|    20|
|          though."|    10|
|       thank you."|     8|
|             etc."|     7|
|               ,-1|     7|
|                I |     6|
|            though|     6|
|              the |     6|
|              but |     6|
|               but|     5|
|          thanks."|     5|
| please fix this."|     5|
|              well|     5|
|                )"|     5|
|                 2|     5|
|           thanks"|     4|
|              and |     4|
|               etc|     4|
|            simple|     4|
|            thanks|     4|
|           in fact|     3|
|             still|     3|
|               too|     3|
|         otherwise|     3|
|               fun|     3|
|            but I |     3|
+------------------+------+
only showing top 30 rows



In [8]:
# Keep only rows whose label is exactly "1" or "-1"; every other value seen in
# the counts above is treated as a parsing artefact and dropped.
df = df.filter(col("sentiment").isin("1", "-1"))

In [9]:
# Re-check the label distribution after filtering: only "1" and "-1" should remain.
(
    df.groupBy("sentiment")
    .count()
    .orderBy("count", ascending=False)
    .show(30)
)

+---------+------+
|sentiment| count|
+---------+------+
|        1|678723|
|       -1|354761|
+---------+------+



In [10]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# Tokenizer: split each comment on runs of non-word characters.
# NOTE(review): the sample output further down shows a non-Latin comment
# tokenised to an empty list, so such rows carry no features - confirm whether
# that is intended.
regexTokenizer = RegexTokenizer(
    inputCol="comment",
    outputCol="words",
    pattern="\\W",
)

# Stop-word list handed to the remover. It is empty, so the remover passes every
# token through unchanged (the "filtered" column equals "words" in the output
# below). To actually drop English stop words, use
# StopWordsRemover.loadDefaultStopWords("english") here instead.
add_stopwords = []

stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag-of-words term counts: vocabulary capped at 10000 terms, each of which must
# occur in at least 5 documents.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

In [11]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Map the string sentiment values to numeric class indices in a "label" column
# (in the pipeline output below, "1" becomes 0.0 and "-1" becomes 1.0).
label_stringIdx = StringIndexer(
    inputCol="sentiment",
    outputCol="label",
)

In [12]:
from pyspark.ml import Pipeline

# Chain tokenising, stop-word removal, term counting and label indexing.
feature_stages = [regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]
pipeline = Pipeline(stages=feature_stages)

# Learn the vocabulary and label mapping, then add the derived columns.
# NOTE(review): the pipeline is fitted on the full DataFrame before the
# train/test split, so the vocabulary is learned from rows that later land in
# the test set. Consider fitting on the training portion only.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

In [13]:
# Preview the first five rows with the tokenised, vectorised and indexed columns.
dataset.show(n=5)

+--------------------+---------+--------------------+--------------------+--------------------+-----+
|             comment|sentiment|               words|            filtered|            features|label|
+--------------------+---------+--------------------+--------------------+--------------------+-----+
|I like that there...|        1|[i, like, that, t...|[i, like, that, t...|(10000,[0,1,2,3,4...|  0.0|
|I've been using t...|        1|[i, ve, been, usi...|[i, ve, been, usi...|(10000,[0,1,2,3,4...|  0.0|
|This  app is a gr...|        1|[this, app, is, a...|[this, app, is, a...|(10000,[4,5,6,7,8...|  0.0|
| Great speed camera.|        1|[great, speed, ca...|[great, speed, ca...|(10000,[30,390,53...|  0.0|
|عند اجراء مزامنة ...|       -1|                  []|                  []|       (10000,[],[])|  1.0|
+--------------------+---------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [14]:
# Random 70/30 train/test split; the fixed seed is there for reproducibility.
trainingData, testData = dataset.randomSplit(weights=[0.7, 0.3], seed=100)

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator comparing the "prediction" column with "label". labelCol and
# metricName are the library defaults, written out so the metric is explicit.
# NOTE(review): this cell's execution count (27) is out of sequence, and the
# number shown beneath it cannot come from this code - an assignment displays
# nothing. Re-run the notebook top to bottom to refresh the outputs.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)

0.7641344028646035

In [15]:
from pyspark.ml.classification import LogisticRegression

# Base logistic-regression estimator reading the "features" and "label" columns
# (the library defaults, written out here). regParam and elasticNetParam are
# starting values that the parameter grid overrides during cross-validation.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

In [16]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyper-parameter grid for cross-validation: 3 regularisation strengths x
# 3 elastic-net mixing values (0.0 = pure ridge) = 9 candidate settings.
# maxIter is not searched; it stays at the value configured on `lr`.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.1, 0.3, 0.5])
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
paramGrid = grid_builder.build()

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score candidates with the evaluator's default metric, F1, against the "label"
# column. Both defaults are spelled out so it is clear that the number printed
# by this cell is an F1 score, not plain accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)

# 5-fold cross-validation over every setting in paramGrid. The seed fixes the
# random fold assignment so that repeated runs evaluate the same folds.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=100,
)

# Slow step: one model is trained per grid setting per fold.
cvModel = cv.fit(trainingData)

# cvModel applies the best model found by cross-validation; score it on the
# held-out test split, which was not used during the search.
predictions = cvModel.transform(testData)

# The same evaluator is reused; the final expression is the cell's output.
evaluator.evaluate(predictions)

0.907730721050583

In [23]:
# Persist the fitted CrossValidatorModel to HDFS. overwrite() replaces a model
# left by an earlier run; a plain save() fails when the target path already
# exists, which breaks re-running the notebook.
# NOTE(review): only cvModel is written here. pipelineFit (tokenizer, vocabulary,
# label indexer) is not persisted, so scoring raw comments later would also
# need that fitted pipeline.
cvModel.write().overwrite().save("hdfs://namenode/user/root/model")

In [29]:
from pyspark.ml.tuning import CrossValidatorModel

# Read the persisted cross-validation model back from HDFS.
model_uri = "hdfs://namenode/user/root/model"
crossValidatorModel = CrossValidatorModel.load(model_uri)