In [19]:
# Locate the Spark installation with the findspark package; this has to
# happen before anything from pyspark is imported in the cells below.
import findspark
findspark.init()

In [20]:
# Build (or reuse) the Spark session on YARN: 50 single-core executors with
# 5g of memory each, a 30g driver, FIFO scheduling and a 4g cap on results
# collected to the driver.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("yarn")
    .appName("testing")
    .config("spark.executor.instances", "50")
    .config("spark.executor.memory", "5g")
    .config("spark.driver.memory", "30g")
    .config("spark.executor.cores", "1")
    .config("spark.scheduler.mode", "FIFO")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

In [54]:
# Shut the Spark session down once all work is finished.
# This cell sits before the cells that load data through `spark`, so an
# unconditional stop would end the session before it is used when the
# notebook is run top to bottom. Set the flag to True (and re-run this cell)
# when the session should really be stopped.
STOP_SPARK_SESSION = False
if STOP_SPARK_SESSION:
    spark.stop()

In [None]:
# Load the Yelp review, business and user tables from parquet. Each table is
# repartitioned (300 / 100 / 200 partitions) and marked for caching.
review = (
    spark.read.parquet('/yelp/review.parquet')
    .repartition(300)
    .cache()
)
business = (
    spark.read.parquet('/yelp/business.parquet')
    .repartition(100)
    .cache()
)
users = (
    spark.read.parquet('/yelp/users.parquet')
    .repartition(200)
    .cache()
)

In [None]:
from pyspark.sql.functions import greatest

# Total number of votes a review received across the three vote types.
review = review.withColumn(
    'totalvotes',
    review['cool'] + review['funny'] + review['useful'],
)

# Largest single vote count and its share of the total.
review = review.withColumn(
    'max_vote',
    greatest(review['cool'], review['funny'], review['useful']),
)
review = review.withColumn('max_ratio', review['max_vote'] / review['totalvotes'])

# Keep reviews whose largest vote type makes up at least 40% of all votes
# (ratio >= 0.4) ...
review = review.where(review['max_ratio'] >= 0.4)

# ... and that received at least 10 votes in total.
review = review.where(review['totalvotes'] >= 10)
review.count()

In [26]:
# Load the lemmatized reviews from JSON, spread them over 150 partitions and
# mark the result for caching.
lemmatized = (
    spark.read
    .json("/yelp/flat_lem.json")
    .repartition(150)
    .cache()
)


In [40]:
from pyspark.sql.functions import greatest

# Total number of votes a review received across the three vote types.
lemmatized = lemmatized.withColumn(
    'totalvotes',
    lemmatized['cool'] + lemmatized['funny'] + lemmatized['useful'],
)

# Largest single vote count and its share of the total.
lemmatized = lemmatized.withColumn(
    'max_vote',
    greatest(lemmatized['cool'], lemmatized['funny'], lemmatized['useful']),
)
lemmatized = lemmatized.withColumn(
    'max_ratio',
    lemmatized['max_vote'] / lemmatized['totalvotes'],
)

# Keep reviews whose largest vote type makes up strictly more than 50% of
# all votes ...
lemmatized = lemmatized.where(lemmatized['max_ratio'] > 0.5)

# ... and that received strictly more than 10 votes in total.
lemmatized = lemmatized.where(lemmatized['totalvotes'] > 10)
lemmatized.count()

34607

In [53]:
# WORD2VEC fitting
import time
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec

start = time.time()

# No Tokenizer step is applied: StopWordsRemover reads the 'text' column
# directly (presumably 'text' already holds token arrays produced by the
# lemmatization step — TODO confirm).
stopremove = StopWordsRemover(inputCol='text', outputCol='cleaned')
dataset = stopremove.transform(lemmatized)

# The original text column is not needed once the cleaned tokens exist.
dataset = dataset.drop('text').repartition(300).cache()

# Fit 64-dimensional Word2Vec embeddings on the cleaned tokens, add them as
# the 'word2vec' column and drop the token column afterwards.
word2Vec = Word2Vec(vectorSize=64, minCount=0, numPartitions=300, inputCol="cleaned", outputCol="word2vec")
model = word2Vec.fit(dataset)
dataset = model.transform(dataset).drop('cleaned')

end = time.time()
# Report the elapsed wall-clock time of the steps above.
print('Word2Vec fitting took {:.1f} s'.format(end - start))

In [None]:
# Inspect the fitted Word2Vec model by showing a few learned word vectors.
# The cell previously held only the incomplete expression `model.`, which is
# a syntax error.
model.getVectors().show(5)

In [47]:
from pyspark.ml.feature import CountVectorizer

# Bag-of-words counts over the 'text' column, limited to a vocabulary of 3
# terms that each occur in at least 2 documents.
# NOTE: this rebinds `model`, the same name the Word2Vec cell assigns.
cv = CountVectorizer(
    inputCol="text",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)
model = cv.fit(lemmatized)
result = model.transform(lemmatized)


In [51]:
# Peek at the first ten count vectors produced by the CountVectorizer.
result.select('features').take(10)

[Row(features=SparseVector(3, {0: 1.0})),
 Row(features=SparseVector(3, {1: 2.0, 2: 1.0})),
 Row(features=SparseVector(3, {0: 2.0, 1: 1.0, 2: 1.0})),
 Row(features=SparseVector(3, {0: 2.0, 1: 1.0, 2: 2.0})),
 Row(features=SparseVector(3, {})),
 Row(features=SparseVector(3, {0: 1.0, 1: 8.0, 2: 1.0})),
 Row(features=SparseVector(3, {1: 1.0})),
 Row(features=SparseVector(3, {0: 1.0, 2: 2.0})),
 Row(features=SparseVector(3, {0: 1.0, 1: 1.0, 2: 3.0})),
 Row(features=SparseVector(3, {1: 1.0}))]

In [33]:
from pyspark.ml.feature import StringIndexer

# Encode the string column 'max_compliment_type' as the numeric 'label'
# column used by the classifiers.
# NOTE(review): no cell visible here creates 'max_compliment_type' — confirm
# where that column is added to `dataset`.
indexer = StringIndexer(inputCol="max_compliment_type", outputCol="label")
dataset = (
    indexer
    .fit(dataset)
    .transform(dataset)
    .cache()
)

In [34]:
# Explicit imports instead of `import *`, so it is clear where each
# classifier name comes from.
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    GBTClassifier,
    LogisticRegression,
    RandomForestClassifier,
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Candidate classifiers, all trained on the 'word2vec' features against the
# indexed 'label' column.
logit = LogisticRegression(featuresCol='word2vec', labelCol='label')
cart = DecisionTreeClassifier(featuresCol='word2vec', labelCol='label')
# NOTE: the recorded error output of the cv_gbt fit shows that GBTClassifier
# only accepts binary labels ({0,1}); it fails on a 'label' column with more
# than two classes.
gbt = GBTClassifier(featuresCol='word2vec', labelCol='label')
rf = RandomForestClassifier(featuresCol='word2vec', labelCol='label')

# Hyper-parameter grids, one per classifier.
paramGrid_logit = (
    ParamGridBuilder()
    .addGrid(logit.regParam, [0, 0.01, 0.07, 0.1, 0.5, 0.75, 1, 2])
    .build()
)

paramGrid_cart = (
    ParamGridBuilder()
    .addGrid(cart.maxDepth, [10, 12, 15])
    .build()
)

paramGrid_rf = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [15])
    .addGrid(rf.numTrees, [100])
    .build()
)

paramGrid_gbt = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [10, 12, 15])
    .addGrid(gbt.stepSize, [0.01])
    .addGrid(gbt.maxIter, [20])
    .build()
)

# One shared evaluator (its default metric) scores every candidate.
evaluator = MulticlassClassificationEvaluator(labelCol='label')

# 5-fold cross-validation over each classifier's grid.
cv_logit = CrossValidator(estimator=logit, evaluator=evaluator, estimatorParamMaps=paramGrid_logit, numFolds=5)
cv_cart = CrossValidator(estimator=cart, evaluator=evaluator, estimatorParamMaps=paramGrid_cart, numFolds=5)
cv_gbt = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid_gbt, numFolds=5)
cv_rf = CrossValidator(estimator=rf, evaluator=evaluator, estimatorParamMaps=paramGrid_rf, numFolds=5)

In [35]:
# Run the 5-fold cross-validation for logistic regression over its regParam grid.
cvmodel_logit = cv_logit.fit(dataset)

In [38]:
# Average cross-validation metric, one value per parameter setting in the grid.
# NOTE: cvmodel_cart is assigned in the cell that follows this one, so that
# cell has to be run before the second print can succeed.
print(cvmodel_logit.avgMetrics)
print(cvmodel_cart.avgMetrics)

[0.7871575761399972, 0.7780178339788828, 0.7741612149013655, 0.7740891864963063, 0.7716377694783928, 0.7712821644531765, 0.7710310540338433, 0.7707834248325989]
[0.7964712722846773, 0.793964562429327, 0.7877835003671942]


In [37]:
# Run the 5-fold cross-validation for the decision tree over its maxDepth grid.
cvmodel_cart = cv_cart.fit(dataset)

In [16]:
# Run the cross-validation for the gradient-boosted trees.
# NOTE: the recorded error below shows this fit failing with "Labels must be
# in {0,1}" for label 2.0 — GBTClassifier only supports binary classification.
cvmodel_gbt = cv_gbt.fit(dataset)

Py4JJavaError: An error occurred while calling o3817.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8635.0 failed 4 times, most recent failure: Lost task 0.3 in stage 8635.0 (TID 767402, slave-1, executor 11): java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0.  Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:153)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:151)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1354)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:112)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:105)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:125)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:291)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:53)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:175)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:59)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.GeneratedMethodAccessor86.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0.  Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:153)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:151)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:
# Run the 5-fold cross-validation for the random forest (single grid point).
cvmodel_rf = cv_rf.fit(dataset)

In [39]:
# Display the fitted random-forest cross-validation model.
# NOTE: needs the cv_rf.fit cell to have been run first; the recorded
# NameError below shows it had not been.
cvmodel_rf

NameError: name 'cvmodel_rf' is not defined

In [None]:
# Reduce the Word2Vec features to 10 principal components to see how the
# variance is distributed; the projection is stored in 'pca_text'.
from pyspark.ml.feature import PCA

pca = PCA(
    k=10,
    inputCol="word2vec",
    outputCol="pca_text",
)
pca_model = pca.fit(dataset)
pca_result = pca_model.transform(dataset)

In [None]:
# Explained-variance values of the fitted PCA model, one per component.
pca_model.explainedVariance.values

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Explained variance of each principal component of the fitted PCA model.
explained_variance = np.array(pca_model.explainedVariance.values)

# The x axis is sized from the data instead of a hard-coded 10, so the plot
# stays correct if the number of PCA components (k) is changed.
plt.plot(np.arange(len(explained_variance)), explained_variance)
plt.title('Explained Variance - PCA')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.show()

In [None]:
# Print the column names and types of the PCA-transformed DataFrame.
pca_result.printSchema()

In [None]:
# Collect the PCA result into pandas and tag every review with its dominant
# vote type: 'maxcat' is the name of the column (cool / useful / funny) with
# the highest count, 'maxcat_code' the integer code of that category.
pca_result_pd = pca_result.toPandas()
pca_result_pd['maxcat'] = (
    pca_result_pd[['cool', 'useful', 'funny']]
    .idxmax(axis=1)
    .astype('category')
)
pca_result_pd['maxcat_code'] = pca_result_pd['maxcat'].cat.codes

In [None]:
# Wrap the 'pca_text' column of pca_result in a NumPy array.
# NOTE(review): pca_result comes from pca_model.transform(...), so
# pca_result.pca_text is presumably a Spark column reference rather than
# collected values — confirm this yields the data that is intended.
np.array(pca_result.pca_text)

In [None]:
def extract_pca(row):
    """Flatten the ``pca_text`` vector of one row into a tuple of numbers.

    ``row`` must expose a ``pca_text`` attribute whose ``toArray()`` returns
    an array; the array elements are returned as a plain tuple.
    """
    components = row.pca_text.toArray()
    return tuple(components.tolist())
# Rebuild pca_result from the flattened PCA values only: every row becomes
# the tuple returned by extract_pca, so the other columns are no longer part
# of pca_result after this line.
pca_result = pca_result.rdd.map(extract_pca).toDF()

In [None]:
# Collect the current pca_result into a pandas DataFrame on the driver.
pca_result_pd = pca_result.toPandas()

In [None]:
# Show only the first rows instead of rendering the whole frame.
pca_result_pd.head()

In [None]:
from MulticoreTSNE import MulticoreTSNE as TSNE

# t-SNE embedding of the PCA-reduced review vectors.
tsne = TSNE(early_exaggeration=10, n_jobs=20)

# pca_result_pd is collected from the DataFrame built out of the flattened
# extract_pca tuples, so it has no 'pca_text' column; the whole frame is the
# numeric input matrix.
tsne_output = tsne.fit_transform(pca_result_pd.values)

In [None]:
# Scatter plot of the first two t-SNE dimensions.
# The call previously ended in `,, c`, which is a syntax error and refers to
# an undefined name; the colour argument is left out until a per-point colour
# array (e.g. category codes) is available.
plt.scatter(tsne_output[:, 0], tsne_output[:, 1])
plt.title('t-SNE of PCA-reduced review embeddings')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

In [None]:
# 3-D scatter plot of the columns _1, _2 and _3 of result_pd.
# NOTE(review): result_pd is not assigned in any cell visible here — confirm
# where it comes from before running this cell.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111,projection='3d')

ax.scatter(xs=result_pd._1, ys=result_pd._2, zs = result_pd._3)
plt.title('3d Representation of Word2Vec Embeddings')
plt.show()

In [None]:
# Tag each row of result_pd with its dominant vote type: 'maxcat' is the name
# of the column (cool / useful / funny) of `data` with the highest value,
# stored as a categorical, and 'maxcat_code' is its integer category code.
# NOTE(review): neither `data` nor `result_pd` is assigned in any cell visible
# here — confirm both exist and share the same index.
result_pd['maxcat'] = data[['cool','useful','funny']].idxmax(axis = 1 )
result_pd['maxcat'] = result_pd['maxcat'].astype('category')
result_pd['maxcat_code'] = result_pd['maxcat'].cat.codes

In [None]:
# Display result_pd.
# NOTE(review): result_pd is not assigned in any cell visible here — confirm.
result_pd