In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os
import re
import string
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.mllib.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import array_remove
import wordcloud

spark = SparkSession \
    .builder \
    .master("local") \
    .appName("Exploratory_Analysis") \
    .config("spark.executor.memory", '8g') \
    .config("spark.executor.cores", '4') \
    .config('spark.cores.max', '4') \
    .config('spark.driver.memory', '8g') \
    .getOrCreate()

sc = spark.sparkContext

In [3]:
data = spark.read.format('csv'). \
    option("header", "true"). \
    option("inferSchema", "true"). \
    load("C:/Users/Eri/Documents/PSTAT 135/model_df.csv")

In [4]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- look: string (nullable = true)
 |-- smell: string (nullable = true)
 |-- taste: string (nullable = true)
 |-- feel: string (nullable = true)
 |-- overall: string (nullable = true)
 |-- score: string (nullable = true)
 |-- style: string (nullable = true)



In [5]:
data.show(5)

+---+--------------------+----+-----+-----+----+-------+-----+------------+
|_c0|                text|look|smell|taste|feel|overall|score|       style|
+---+--------------------+----+-----+-----+----+-------+-----+------------+
|  0|   Beautiful, cry...|4.75|  4.0| 4.25|4.25|   4.25| 4.22|American IPA|
|  1|   Poured a bit l...|3.75| 3.75| 3.75|3.75|   3.75| 3.75|American IPA|
|  2|   355ml can. Bri...|4.25|  4.0|  4.0|4.25|    4.0| 4.04|American IPA|
|  3|   Quite balanced...|4.25|  4.5| 4.25| 4.5|   4.25| 4.34|American IPA|
|  4|   Can: Poured a ...|3.75| 3.75| 3.75|3.75|   3.75| 3.75|American IPA|
+---+--------------------+----+-----+-----+----+-------+-----+------------+
only showing top 5 rows



In [6]:
data.take(5)

[Row(_c0=0, text='\xa0\xa0 Beautiful, crystal clear pour with a nice head. Good retention, awesome lace. Pleasant citrus aroma, but not as powerful as some. Real good flavour here, lots of late hops, with a good bitterness and a light hop acidity. Thanks Yardsale! \xa0', look='4.75', smell='4.0', taste='4.25', feel='4.25', overall='4.25', score='4.22', style='American IPA'),
 Row(_c0=1, text='\xa0\xa0 Poured a bit lively and had to wait for it to settle very nice lacing. Slightly hazy golden straw. Floral notes maybe a hint of citrus. Quite smooth on the palate a touch bitter and definitely a pungent finish as advertised. Somewhat of a dry finish and something about the lingering taste is off. The mouth coating is a touch creamy. \xa0', look='3.75', smell='3.75', taste='3.75', feel='3.75', overall='3.75', score='3.75', style='American IPA'),
 Row(_c0=2, text='\xa0\xa0 355ml can. Bright copper with a dense foamy head that dissipates slowly leaving thin lacing. Pine, lemon, lime on the n

In [36]:
from pyspark.sql.functions import regexp_replace,col
data1 = data.select('text', 'style')
#for multiple regex expressions use OR |
data1 = data1.withColumn('text', regexp_replace(col('text'), "\\.|\xa0|!|,|:", ""))

In [10]:
data1.show(10)

+--------------------+------------+
|                text|       style|
+--------------------+------------+
|[Beautiful, cryst...|American IPA|
|[Poured, a, bit, ...|American IPA|
|[355ml, can, Brig...|American IPA|
|[Quite, balanced,...|American IPA|
|[Can, Poured, a, ...|American IPA|
|[Can, bought, onl...|American IPA|
|[Passing, through...|American IPA|
|[Yes, please, I'v...|American IPA|
|[A, well, put, to...|American IPA|
|[355ml, can, the,...|American IPA|
+--------------------+------------+
only showing top 10 rows



In [38]:
#get rid of white space and separate each word 
data1 = data1.withColumn('text', split(data1['text'], ' '))

In [39]:
data1 = data1.withColumn('text', array_remove('text', ''))

In [40]:
remover = StopWordsRemover(inputCol="text", outputCol="filtered")
data2 = remover.transform(data1)
data2 = data2.select('filtered', 'style')

In [41]:
data2.show(10)

+--------------------+------------+
|            filtered|       style|
+--------------------+------------+
|[Beautiful, cryst...|American IPA|
|[Poured, bit, liv...|American IPA|
|[355ml, Bright, c...|American IPA|
|[Quite, balanced,...|American IPA|
|[Poured, clear, a...|American IPA|
|[bought, online, ...|American IPA|
|[Passing, Swift, ...|American IPA|
|[Yes, please, hop...|American IPA|
|[well, put, toget...|American IPA|
|[355ml, latest, n...|American IPA|
+--------------------+------------+
only showing top 10 rows



In [42]:
cloud = data2.join()

TypeError: join() missing 1 required positional argument: 'other'

In [43]:
data2.printSchema()

root
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- style: string (nullable = true)



In [44]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
wordcloud = WordCloud(max_font_size=40).generate(cloud)

TypeError: expected string or bytes-like object

In [None]:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()