In [7]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline 
from pyspark.sql import functions as F
import os
import re
import string
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *

# Build (or reuse) the Spark session shared by every cell below.
# NOTE(review): master is "local" while executor memory/cores and cores.max
# are also set -- confirm these settings take effect in local mode, or
# whether something like "local[4]" was intended.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exploratory_Analysis")
    .config("spark.executor.memory", '8g')
    .config("spark.executor.cores", '4')
    .config('spark.cores.max', '4')
    .config('spark.driver.memory', '8g')
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [8]:
# Location of the input CSV. Override it by setting the MODEL_DF_CSV
# environment variable; otherwise the original local path is used, so the
# notebook behaves as before on the author's machine.
MODEL_DF_CSV = os.environ.get(
    "MODEL_DF_CSV",
    "/Users/gnolasco/Desktop/Python_Projects/model_df.csv",
)

# Read the CSV with a header row and let Spark infer the column types.
data = (
    spark.read.format('csv')
    .option("header", "true")
    .option("inferSchema", "true")
    .load(MODEL_DF_CSV)
)

In [9]:
# Stratified sample on "style": each listed style is sampled with fraction
# 75000 / <denominator>; styles not listed here are not given a fraction.
# NOTE(review): the denominators are presumably the per-style row counts in
# the full dataset (so each style ends up with roughly 75000 rows) -- confirm.
target_per_style = 75000
style_denominators = {
    'American IPA': 264697,
    'American Imperial IPA': 188767,
    'American Pale Ale (APA)': 109400,
    'Belgian Saison': 78402,
}
style_fractions = {
    style_name: target_per_style / denominator
    for style_name, denominator in style_denominators.items()
}

# Fixed seed so the sample is repeatable.
data = data.sampleBy("style", fractions=style_fractions, seed=69)

In [10]:
from pyspark.sql.functions import regexp_replace, col, trim

# Keep only the free-text column and the style column used as the target.
main = data.select('text', 'style')

# Remove periods, non-breaking spaces, exclamation marks, commas and colons.
# For multiple regex expressions use OR |
main = main.withColumn('text', regexp_replace(col('text'), "\\.|\xa0|!|,|:", ""))

# Drop rows that have no usable text. Comparing against a single space only
# caught rows that were exactly ' '; after the replacement above a row can also
# be empty ('') or several spaces, so trim first and compare with the empty
# string. Rows with a null text are dropped by this comparison as before.
main = main.filter(trim(col('text')) != '')

In [15]:
# Feature-engineering stages, applied in this order:
#   text -> words      (tokenize)
#   words -> filtered  (drop stop words)
#   filtered -> tf     (hashed term frequencies, 10000 buckets)
#   style -> label     (string label to numeric index)
#   tf -> features     (assemble the final feature vector)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
htf = HashingTF(inputCol="filtered", outputCol="tf", numFeatures=10000)
stringIndexer = StringIndexer(inputCol="style", outputCol="label")
va = VectorAssembler(inputCols=["tf"], outputCol="features")

# Run the stages as one pipeline: fit on `main` (only the StringIndexer has
# anything to learn) and then transform the same DataFrame.
feature_stages = [tokenizer, remover, htf, stringIndexer, va]
main = Pipeline(stages=feature_stages).fit(main).transform(main)

In [16]:
# 80/20 random split into training and held-out sets; the seed keeps it repeatable.
train, test = main.randomSplit(weights=[0.8, 0.2], seed=69)

In [17]:
# Logistic regression over the assembled "features" vector, predicting the
# indexed "label"; capped at 10 iterations with regularization 0.01.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.01,
)

# Fit on the training split only.
model = lr.fit(train)

In [18]:
# Score the held-out split with the fitted model; the result is what the
# evaluation cell reads its "prediction" column from.
prediction = model.transform(test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the "prediction" column against "label" on the held-out rows.
# labelCol and metricName are spelled out here with the evaluator's default
# values, so the reported number is the same F1 score as before.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
test_f1 = evaluator.evaluate(prediction)
test_f1