<a href="https://colab.research.google.com/github/dalgual/aidatasci/blob/main/mlp_multi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Amazon books rating


In [None]:
%pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, GBTClassifier, RandomForestClassifier, LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler, CountVectorizer, IDF, Tokenizer, StopWordsRemover, RegexTokenizer, HashingTF, Word2Vec
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from time import time

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.storagelevel import StorageLevel

##### Spark-submit setting

In [None]:
%pyspark

# When True, this cell creates the `spark` and `sc` handles itself. When
# False, the cell does nothing and later cells rely on a `spark` object
# having been provided by the environment.
# NOTE(review): presumably True is meant for spark-submit runs, where no
# session is pre-created -- confirm.
IS_SPARK_SUBMIT_CLI = True
if IS_SPARK_SUBMIT_CLI:
    # Use the documented builder entry point: it returns the already-active
    # session when one exists and only creates a new one (together with its
    # SparkContext) otherwise.
    spark = SparkSession.builder.getOrCreate()
    # Keep `sc` defined for any code that still expects the raw context.
    sc = spark.sparkContext

In [None]:
%pyspark

# Reduce console noise by raising the Spark log threshold to WARN.
spark_context = spark.sparkContext
spark_context.setLogLevel("WARN")

##### Read data

In [None]:
%pyspark

# Location of the raw book-review CSV; the first line is a header and the
# column types are inferred by Spark while reading.
books_csv_path = '/user/hlin54/Books_rating.csv'
df = spark.read.csv(books_csv_path, header=True, inferSchema=True)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4280381872412895>:1[0m
[0;32m----> 1[0m df [38;5;241m=[39m [43mspark[49m[38;5;241;43m.[39;49m[43mread[49m[38;5;241;43m.[39;49m[43mcsv[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43m/user/hlin54/Books_rating.csv[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43minferSchema[49m[38;5;241;43m=[39;49m[38;5;28;43;01mTrue[39;49;00m[43m,[49m[43m [49m[43mheader[49m[38;5;241;43m=[39;49m[38;5;28;43;01mTrue[39;49;00m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m

In [None]:
%pyspark

# Explicit schema for the book-review CSV: (column name, Spark type) pairs in
# file order. Every field is declared non-nullable.
# NOTE(review): nothing in the visible notebook passes this schema to the
# reader -- confirm whether it is still needed.
_book_columns = [
    ("Id", IntegerType()),
    ("Title", StringType()),
    ("Price", FloatType()),
    ("User_id", StringType()),
    ("profileName", StringType()),
    ("review/helpfulness", StringType()),
    ("review/score", IntegerType()),
    ("review/time", IntegerType()),
    ("review/summary", StringType()),
    ("review/text", StringType()),
]

booksSchema = StructType(
    [StructField(column_name, column_type, False) for column_name, column_type in _book_columns]
)

In [None]:
%pyspark

# Sanity check: how many rows were loaded from the CSV.
row_total = df.count()
print('data count:', row_total)

data count: 10000


##### Select the columns we need (for multiclass classification)

In [None]:
%pyspark

# Keep only the modelling columns: two numeric inputs cast to float, the
# free-text summary, and the rating cast to int and exposed as "score".
price_col = col("Price").cast("Float")
review_time_col = col("review/time").cast("Float")
score_col = col("review/score").cast("Int").alias("score")

df_new = df.select(price_col, "review/summary", review_time_col, score_col)

##### Remove invalid values from the label column

In [None]:
%pyspark

# Keep only rows whose score is a valid star rating in the closed range 1-5.
# The previous filter checked the upper bound only, so zero or negative
# scores would have survived and become extra classes. Rows with a null
# score are still dropped, because a null comparison is never true.
df_new = df_new.filter(col("score").between(1, 5))

##### Replace missing values with the column average (columns: Price, review/time)

In [None]:
%pyspark

# Mean imputation for the two numeric columns. Both means are read in one
# aggregate; filling Price does not touch review/time, so the values match
# what two separate aggregations would return.
column_means = df_new.agg(avg("Price"), avg("review/time")).first()
price_avg = column_means[0]
time_avg = column_means[1]

df_new = df_new.fillna(price_avg, subset=["Price"])
df_new = df_new.fillna(time_avg, subset=["review/time"])


##### Clean the review summary text, then drop NA values

In [None]:
%pyspark

# Normalise the review summary text in place.
summary_col = "review/summary"

# Step 1: turn slashes and parentheses into spaces so the words they join
# stay separate once other non-letters are stripped.
for punct in ("/", "(", ")"):
    df_new = df_new.withColumn(summary_col, translate(col(summary_col), punct, " "))

# Step 2: drop everything that is not a letter or a space, collapse runs of
# spaces into one, and lower-case the result.
letters_only = regexp_replace(col(summary_col), '[^A-Za-z ]+', '')
single_spaced = regexp_replace(letters_only, ' +', ' ')
df_new = df_new.withColumn(summary_col, lower(single_spaced))

# Preview the first ten cleaned summaries without truncation.
df_new.select(summary_col).show(10, False)

+----------------------------------------------------------+
|review/summary                                            |
+----------------------------------------------------------+
|nice collection of julie strain images                    |
|really enjoyed it                                         |
|essential for every personal and public library           |
|phlip nel gives silly seuss a serious treatment           |
|good academic overview                                    |
|one of americas greatest creative talents                 |
|a memorably excellent survey of dr seuss many achievements|
|academia at its best                                      |
|and to think that i read it on the tram                   |
|fascinating account of a genius at work                   |
+----------------------------------------------------------+
only showing top 10 rows



In [None]:
%pyspark

# Treat the literal string "nan" as missing, then discard every row that has
# a null in any column.
df_new = df_new.replace("nan", None).na.drop()

In [None]:
# Mark the cleaned frame for persistence at the DISK_ONLY_2 storage level so
# the counts and model fits below can reuse it rather than recompute it.
df_new.persist(StorageLevel.DISK_ONLY_2)

##### Check that the label column does not contain invalid values

In [None]:
%pyspark

# List the distinct label values that remain after cleaning.
print("unique score")
unique_names = df_new.select(col("score")).distinct()
unique_names.show()

unique score
+-----+
|score|
+-----+
|    1|
|    3|
|    5|
|    4|
|    2|
+-----+



In [None]:
%pyspark

# Preview the first 20 rows of the cleaned frame.
df_new.show(n=20)

+---------+--------------------+-----------+-----+
|    Price|      review/summary|review/time|score|
+---------+--------------------+-----------+-----+
|19.570179|nice collection o...| 9.406368E8|    4|
|19.570179|   really enjoyed it|1.0957248E9|    5|
|19.570179|essential for eve...|1.0787904E9|    5|
|19.570179|phlip nel gives s...|1.0907136E9|    4|
|19.570179|good academic ove...|1.1079936E9|    4|
|19.570179|one of americas g...|1.1271744E9|    4|
|19.570179|a memorably excel...|1.1001312E9|    5|
|19.570179|academia at its best|   1.2312E9|    5|
|19.570179|and to think that...|1.2098592E9|    5|
|19.570179|fascinating accou...|1.0763712E9|    4|
|     19.4|outstanding resou...|   9.9144E8|    5|
|     19.4|small churches ca...|1.2917664E9|    5|
|     19.4|not just for pastors|1.2483072E9|    5|
|     19.4|small church past...|  1.22256E9|    5|
|    10.95|            not good|1.1170656E9|    1|
|    10.95|  here is my opinion|1.1195712E9|    4|
|    10.95|        buyer beware

##### Check that the data does not contain missing values

In [None]:
%pyspark

# Build one "number of nulls" aggregate per column and show them in a single
# row. `when` without `otherwise` yields null for non-matching rows, and
# `count` skips nulls, so only the null rows of each column are counted.
null_counters = []
for column_name in df_new.columns:
    is_missing = isnull(column_name)
    null_counters.append(count(when(is_missing, column_name)).alias(column_name))

df_new.select(null_counters).show()

+-----+--------------+-----------+-----+
|Price|review/summary|review/time|score|
+-----+--------------+-----------+-----+
|    0|             0|          0|    0|
+-----+--------------+-----------+-----+



In [None]:
%pyspark

# Print the class distribution as "<score>: <row count> <share of all rows>".
# One total count and one grouped count replace the fifteen separate Spark
# jobs the copy-pasted version launched (each filter counted twice, plus a
# full count per line).
total_rows = df_new.count()
score_counts = {
    row["score"]: row["count"]
    for row in df_new.groupBy("score").count().collect()
}

for score_value in range(1, 6):
    # A score with no rows is reported as 0 rather than omitted.
    n_rows = score_counts.get(score_value, 0)
    # Guard against an empty frame so the cell does not divide by zero.
    share = n_rows / total_rows if total_rows else 0.0
    print("{}:".format(score_value), n_rows, share)

1: 726 0.0730161922960877
2: 569 0.05722618927888967
3: 864 0.08689530322840189
4: 2049 0.2060746253645781
5: 5735 0.5767876898320426


### Feature engineering and modeling (following a Medium tutorial)

In [None]:
%pyspark

# Pack the two numeric columns into a single vector column ("numVec") ...
vec_assembler = (
    VectorAssembler()
    .setInputCols(["Price", "review/time"])
    .setOutputCol("numVec")
)
df_numvector = vec_assembler.transform(df_new)

# ... and min-max scale that vector into "normVec".
# NOTE(review): the scaler is fit on the full frame, before the train/test
# split performed later in the notebook.
minmax_price = MinMaxScaler().setInputCol("numVec").setOutputCol("normVec")
model_num = minmax_price.fit(df_numvector)
normVec = model_num.transform(df_numvector)

# Debug preview (disabled):
#normVec.show(2,False)

+---------+--------------------------------------+-----------+-----+--------------------------------+----------------------------------------+
|Price    |review/summary                        |review/time|score|numVec                          |normVec                                 |
+---------+--------------------------------------+-----------+-----+--------------------------------+----------------------------------------+
|19.570179|nice collection of julie strain images|9.406368E8 |4    |[19.570178985595703,9.406368E8] |[0.08098151160973638,0.2415293751942804]|
|19.570179|really enjoyed it                     |1.0957248E9|5    |[19.570178985595703,1.0957248E9]|[0.08098151160973638,0.5205160087037612]|
+---------+--------------------------------------+-----------+-----+--------------------------------+----------------------------------------+
only showing top 2 rows



In [None]:
%pyspark

# Text pipeline for the review summary, applied stage by stage:
# tokenize -> remove stop words -> term counts -> TF-IDF weights.

# 1) Split each summary into tokens.
tokenizer1 = Tokenizer().setInputCol("review/summary").setOutputCol("words1")
words1 = tokenizer1.transform(normVec)

# 2) Remove stop words from the token lists.
stopwords_remover1 = (
    StopWordsRemover()
    .setInputCol('words1')
    .setOutputCol('filtered_tokens1')
)
filtered_tokens1 = stopwords_remover1.transform(words1)

# 3) Term counts over a vocabulary capped at 2**12 entries.
cv1 = (
    CountVectorizer()
    .setVocabSize(2**12)
    .setInputCol("filtered_tokens1")
    .setOutputCol('cv1')
)
model_cv = cv1.fit(filtered_tokens1)
cv1_result = model_cv.transform(filtered_tokens1)

# 4) Re-weight the counts with IDF (minDocFreq=5).
idf1 = IDF().setInputCol('cv1').setOutputCol("features1").setMinDocFreq(5)
model_idf = idf1.fit(cv1_result)
idf1_result = model_idf.transform(cv1_result)

# Debug preview (disabled):
#idf1_result.show(2,False)

+---------+--------------------------------------+-----------+-----+--------------------------------+----------------------------------------+---------------------------------------------+-----------------------------------------+----------------------------------------------------+---------------------------------------------------------------------------------+
|Price    |review/summary                        |review/time|score|numVec                          |normVec                                 |words1                                       |filtered_tokens1                         |cv1                                                 |features1                                                                        |
+---------+--------------------------------------+-----------+-----+--------------------------------+----------------------------------------+---------------------------------------------+-----------------------------------------+--------------------------------------

In [None]:
%pyspark

# Index the score column into a "label" column, ordering the label values
# alphabetically ascending.
label_stringIdx = StringIndexer(
    inputCol="score",
    outputCol="label",
    stringOrderType="alphabetAsc",
)
label_model = label_stringIdx.fit(idf1_result)
feature_data = label_model.transform(idf1_result)

# Join the scaled numeric vector and the TF-IDF vector into the single
# "features" column that the classifier consumes.
#feature_data = feature_data.withColumnRenamed("features1","features")
featVect = VectorAssembler(
    inputCols=["normVec", "features1"],
    outputCol="features",
)
output_df = featVect.transform(feature_data)

# Debug preview (disabled):
#output_df.show(2,False)

+---------+--------------------------------------+-----------+-----+--------------------------------+----------------------------------------+---------------------------------------------+-----------------------------------------+----------------------------------------------------+---------------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------------+
|Price    |review/summary                        |review/time|score|numVec                          |normVec                                 |words1                                       |filtered_tokens1                         |cv1                                                 |features1                                                                        |label|features                                                                                         |
+---------+--------------------------------------+----------

In [None]:
%pyspark

#output_df.select("features").show(2,False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(6947,[0,1,79,115],[0.08098151160973638,0.2415293751942804,5.5411629870609005,5.708217071724067])                                                               |
|(6947,[0,1,40,431],[0.08098151160973638,0.5205160087037612,5.015069891164122,6.6397752757290105])                                                               |
|(6947,[0,1,63,93,235,397,703],[0.08098151160973638,0.490052844264843,5.312904335079921,5.621205694734437,6.208992359636556,6.565667303575289,7.007500055854328])|
|(6947,[0,1,405,793,10

In [None]:
%pyspark

# Keep only what the classifier needs, then split 70/30 into train and test.
output_df = output_df.select('features', 'label')

# A fixed seed makes the split repeatable across runs; without it every
# execution trains and evaluates on different rows, so metrics are not
# comparable between runs.
SPLIT_SEED = 1234
train, test = output_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

print("train_test_split")

In [None]:
%pyspark

# Size the network from the data: the input layer is as wide as the feature
# vector, and the output layer has one unit per distinct label.

# Read a single row: only the length of one feature vector is needed, so
# collecting the whole column to the driver is unnecessary.
first_feature_row = output_df.select('features').first()
if first_feature_row is None:
    raise ValueError("output_df is empty; cannot determine the feature vector size")
features_count = len(first_feature_row[0])

class_count = output_df.select(countDistinct("label")).collect()
classes = class_count[0][0]

# Alternative layer layouts that were tried:
#layers = [features_count, features_count+1, features_count, classes]
#layers = [features_count, 21, 20, classes]
#layers = [features_count, 51, 50, classes]
layers = [features_count, 20, classes]
print("layer_setting_complete")

# blockSize and maxIter are supplied through the parameter grid in a later
# cell. The explicit seed pins the random state so repeated runs are
# comparable.
#mlp = MultilayerPerceptronClassifier(maxIter = 100, layers = layers, blockSize = 128, seed = 1234)
mlp = MultilayerPerceptronClassifier(layers = layers, seed = 1234)

print("mlp_setting_complete")

In [None]:
%pyspark

# Report the feature-vector length, which is used as the MLP input-layer width.
print("features_count:", features_count)

features_count: 6947


In [None]:
%pyspark

# Hyper-parameter grid for the MLP. It currently holds a single combination
# (blockSize=128, maxIter=100). The wider search that was tried before is
# kept here for reference:
#paramGrid_mlp = (ParamGridBuilder()
#             .addGrid(mlp.blockSize, [64, 128])
#             .addGrid(mlp.maxIter, [50, 100])
#             .build())

grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(mlp.blockSize, [128])
grid_builder = grid_builder.addGrid(mlp.maxIter, [100])
paramGrid_mlp = grid_builder.build()

In [None]:
%pyspark

# Fit the MLP with cross-validation over the parameter grid and report the
# wall-clock time of the fit.
print("start_CV")
start3 = time()

# The explicit seed makes the fold assignment repeatable, so reruns train on
# the same folds and their metrics can be compared.
cv_mlp = CrossValidator(
    estimator=mlp,
    evaluator=MulticlassClassificationEvaluator(),
    estimatorParamMaps=paramGrid_mlp,
    seed=1234,
)
model_mlp = cv_mlp.fit(train)

# Alternatives that were tried (single train/validation split, or a plain fit):
#tv_mlp = TrainValidationSplit(estimator=mlp, evaluator=MulticlassClassificationEvaluator(), estimatorParamMaps=paramGrid_mlp, trainRatio=0.7)
#model_mlp = tv_mlp.fit(train)

#model_mlp = mlp.fit(train)

end3 = time()
phrase = 'Multilayer perceptron classifier'
print('{} takes {} seconds'.format(phrase, (end3 - start3)))

Multilayer perceptron classifier takes 722.4632556438446 seconds


In [None]:
%pyspark

# Score the held-out test set with the fitted model.
predictions_mlp = model_mlp.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol="prediction")

# Evaluate each metric by overriding metricName per call, in the same order
# as before: precision, recall, accuracy, F1.
metric_scores = {}
for metric in ("weightedPrecision", "weightedRecall", "accuracy", "f1"):
    metric_scores[metric] = evaluator.evaluate(predictions_mlp, {evaluator.metricName: metric})

precision = metric_scores["weightedPrecision"]
recall = metric_scores["weightedRecall"]
accuracy = metric_scores["accuracy"]
F1 = metric_scores["f1"]

# Print the evaluation metrics
print("Multilayer perceptron")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", F1)

LogisticRegression
Accuracy: 0.5442448436460412
Precision: 0.5079834114464995
Recall: 0.5442448436460412
F1: 0.5217976663355133


In [None]:
%pyspark

# Preview the first 20 scored rows (20 is the default row count for show()).
predictions_mlp.show(n=20)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(6947,[0,1],[0.01...|  4.0|[-0.0370655029532...|[0.14397816085843...|       4.0|
|(6947,[0,1],[0.02...|  2.0|[-0.0376089105461...|[0.14318424165549...|       4.0|
|(6947,[0,1],[0.03...|  4.0|[-0.0342110245494...|[0.13778691447038...|       4.0|
|(6947,[0,1],[0.03...|  3.0|[-0.0597215066347...|[0.11363872314568...|       4.0|
|(6947,[0,1],[0.03...|  4.0|[-0.0362711391178...|[0.14069885849411...|       4.0|
|(6947,[0,1],[0.04...|  4.0|[-0.1236578178016...|[0.08746596986687...|       4.0|
|(6947,[0,1],[0.06...|  3.0|[-0.0631485570580...|[0.11337313644069...|       4.0|
|(6947,[0,1],[0.07...|  4.0|[-0.0440803833424...|[0.14248497417651...|       4.0|
|(6947,[0,1],[0.08...|  2.0|[-0.2086787800975...|[0.06451973754811...|       4.0|
|(6947,[0,1],[0.

In [None]:
%pyspark

# Build the confusion matrix from (prediction, label) pairs using the
# RDD-based MulticlassMetrics helper.
prediction_label_df = predictions_mlp.select("prediction", "label")
predictionAndLabels_mlp = prediction_label_df.rdd

metrics_mlp = MulticlassMetrics(predictionAndLabels_mlp)
confusion_matrix_mlp = metrics_mlp.confusionMatrix().toArray()

# NOTE(review): per the MulticlassMetrics docs, predicted classes run along
# the columns -- confirm before interpreting rows as true labels.
print("Confusion matrix_MLP:")
print(confusion_matrix_mlp)



Confusion matrix_MLP:
[[  52.   23.   17.   29.  103.]
 [  21.   35.   21.   20.   80.]
 [  19.   13.   32.   60.  134.]
 [  13.   19.   40.  172.  361.]
 [  46.   38.   58.  255. 1345.]]
