# Chapter 6

## Handling imbalanced data

In [29]:
from pyspark.sql import SparkSession 

spark = SparkSession.builder \
    .master('local[*]') \
    .appName("Intro") \
    .getOrCreate()

In [30]:
df = spark.createDataFrame(
    [
        (0, "Hi I think pyspark is cool ","happy"),
        (1, "All I want is a pyspark cluster","indifferent"),
        (2, "I finally understand how ML works","fulfill"),
        (3, "Yet another sentence about pyspark and ML","indifferent"),
        (4, "Why didn't I know about mllib before","sad"),
        (5, "Yes, I can","happy")
    ], 
    ["id", "sentence", "sentiment"]
)

# Start Featurization process

# Leverage algo to target Imbalanced Data: 
Similar featurization process as before, only now we have one label for every data entry

In [31]:
from pyspark.ml.feature import StopWordsRemover, Tokenizer

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="clean_words")

df = remover.transform(tokenizer.transform(df))

df.show()

25/10/13 17:02:05 WARN StopWordsRemover: Default locale set was [en_GT]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


+---+--------------------+-----------+--------------------+--------------------+
| id|            sentence|  sentiment|               words|         clean_words|
+---+--------------------+-----------+--------------------+--------------------+
|  0|Hi I think pyspar...|      happy|[hi, i, think, py...|[hi, think, pyspa...|
|  1|All I want is a p...|indifferent|[all, i, want, is...|[want, pyspark, c...|
|  2|I finally underst...|    fulfill|[i, finally, unde...|[finally, underst...|
|  3|Yet another sente...|indifferent|[yet, another, se...|[yet, another, se...|
|  4|Why didn't I know...|        sad|[why, didn't, i, ...|       [know, mllib]|
|  5|          Yes, I can|      happy|      [yes,, i, can]|              [yes,]|
+---+--------------------+-----------+--------------------+--------------------+



In [32]:
from pyspark.ml.feature import NGram

ngram = NGram(n=3, inputCol="clean_words", outputCol="ngrams")

test = ngram.transform(df)
test.show(5,truncate=False)

+---+-----------------------------------------+-----------+-------------------------------------------------+-------------------------------------+---------------------------------------------------------------------+
|id |sentence                                 |sentiment  |words                                            |clean_words                          |ngrams                                                               |
+---+-----------------------------------------+-----------+-------------------------------------------------+-------------------------------------+---------------------------------------------------------------------+
|0  |Hi I think pyspark is cool               |happy      |[hi, i, think, pyspark, is, cool]                |[hi, think, pyspark, cool]           |[hi think pyspark, think pyspark cool]                               |
|1  |All I want is a pyspark cluster          |indifferent|[all, i, want, is, a, pyspark, cluster]          |[want, pyspark, clu

In [33]:
df = test

In [34]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

hashtf = HashingTF(numFeatures=2**16, inputCol="ngrams", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=2) # minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

pipeline = Pipeline(stages=[hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(test)
train_df = pipelineFit.transform(test)

train_df.select("features","label").show(5,truncate=False)

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|(65536,[16887,26010],[0.0,0.0])          |0.0  |
|(65536,[57587],[0.0])                    |1.0  |
|(65536,[34782,39758],[0.0,0.0])          |2.0  |
|(65536,[11730,34744,49304],[0.0,0.0,0.0])|1.0  |
|(65536,[],[])                            |3.0  |
+-----------------------------------------+-----+
only showing top 5 rows


25/10/13 17:02:06 WARN DAGScheduler: Broadcasting large task binary with size 1079.9 KiB
25/10/13 17:02:06 WARN DAGScheduler: Broadcasting large task binary with size 1079.9 KiB
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1079.9 KiB


# Using GBTClassifier with specifiying strategy

In [35]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features", featureSubsetStrategy="log2")
model = rf.fit(train_df)

25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1107.7 KiB
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1107.7 KiB
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1107.6 KiB
25/10/13 17:02:07 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 6 (= number of training instances)
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1751.7 KiB
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/10/13 17:02:07 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 6 (= number of training instances)
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1751.7 KiB
25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB


In [36]:
# Make predictions.
predictions = model.transform(train_df)

# Select example rows to display.
predictions

DataFrame[id: bigint, sentence: string, sentiment: string, words: array<string>, clean_words: array<string>, ngrams: array<string>, tf: vector, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [37]:
predictions.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clean_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngrams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [38]:
predictions.select("rawPrediction", "label", "probability", "prediction").show(5, truncate=True)

25/10/13 17:02:07 WARN DAGScheduler: Broadcasting large task binary with size 1148.1 KiB
25/10/13 17:02:08 WARN DAGScheduler: Broadcasting large task binary with size 1148.1 KiB
25/10/13 17:02:08 WARN DAGScheduler: Broadcasting large task binary with size 1148.1 KiB


+--------------------+-----+--------------------+----------+
|       rawPrediction|label|         probability|prediction|
+--------------------+-----+--------------------+----------+
|[7.06428571428571...|  0.0|[0.35321428571428...|       1.0|
|[7.06428571428571...|  1.0|[0.35321428571428...|       1.0|
|[7.06428571428571...|  2.0|[0.35321428571428...|       1.0|
|[7.06428571428571...|  1.0|[0.35321428571428...|       1.0|
|[7.06428571428571...|  3.0|[0.35321428571428...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows
