## PySpark for NLP 

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

ModuleNotFoundError: No module named 'pyspark'

### Create a fake example
  - Need to import pyspark.sql module for dataframe

In [1]:
from pyspark.sql import SparkSession
# initialise sparkContext
spark = SparkSession.builder \
    .master("local") \
    .appName("fakeExample") \
    .config("spark.executor.memory", "5gb") \
    .config("spark.cores.max", "6") \
    .getOrCreate()

In [42]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
dt = sc.parallelize(['aaa\tgggg:0.2|ggs:.3','bbb\tpppp:0.2|pps:.3' ])
dt2 = dt.map(lambda x: x.replace("\t", ":none|")).flatMap(lambda x: x.split("|")).map(lambda x: x.split(":"))
dt2.toDF().show()

+----+----+
|  _1|  _2|
+----+----+
| aaa|none|
|gggg| 0.2|
| ggs|  .3|
| bbb|none|
|pppp| 0.2|
| pps|  .3|
+----+----+



In [3]:
sentenceData = spark.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

sentenceData.show()
sentenceData.printSchema()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+

root
 |-- label: double (nullable = true)
 |-- sentence: string (nullable = true)



## Tokernization

In [4]:
# initialize an instance
tokenizer = Tokenizer(inputCol= "sentence", outputCol="terms")
termData = tokenizer.transform(sentenceData)
display(termData)
termData.show()
termData.printSchema()

DataFrame[label: double, sentence: string, terms: array<string>]

+-----+--------------------+--------------------+
|label|            sentence|               terms|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+

root
 |-- label: double (nullable = true)
 |-- sentence: string (nullable = true)
 |-- terms: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [45]:
hashingTF = HashingTF(inputCol="terms",  outputCol="rawfeature", numFeatures=10)
featureizeData = hashingTF.transform(termData)
display(featureizeData)
featureizeData.show()
termData.printSchema()
featureizeData.select("rawfeature").show()
a = featureizeData.collect()
a

DataFrame[label: double, sentence: string, terms: array<string>, rawfeature: vector]

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               terms|          rawfeature|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(10,[0,5,7,9],[1....|
|  0.0|I wish Java could...|[i, wish, java, c...|(10,[2,3,5,7,9],[...|
|  1.0|Logistic regressi...|[logistic, regres...|(10,[3,4,5,6,8],[...|
+-----+--------------------+--------------------+--------------------+

root
 |-- label: double (nullable = true)
 |-- sentence: string (nullable = true)
 |-- terms: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+
|          rawfeature|
+--------------------+
|(10,[0,5,7,9],[1....|
|(10,[2,3,5,7,9],[...|
|(10,[3,4,5,6,8],[...|
+--------------------+



[Row(label=0.0, sentence='Hi I heard about Spark', terms=['hi', 'i', 'heard', 'about', 'spark'], rawfeature=SparseVector(10, {0: 1.0, 5: 1.0, 7: 2.0, 9: 1.0})),
 Row(label=0.0, sentence='I wish Java could use case classes', terms=['i', 'wish', 'java', 'could', 'use', 'case', 'classes'], rawfeature=SparseVector(10, {2: 1.0, 3: 1.0, 5: 1.0, 7: 1.0, 9: 3.0})),
 Row(label=1.0, sentence='Logistic regression models are neat', terms=['logistic', 'regression', 'models', 'are', 'neat'], rawfeature=SparseVector(10, {3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 8: 1.0}))]

In [47]:
idf = IDF(inputCol="rawfeature", outputCol="feature")
idfModel = idf.fit(featureizeData)
rescalData = idfModel.transform(featureizeData)

rescalData.select("label", "feature").show()

+-----+--------------------+
|label|             feature|
+-----+--------------------+
|  0.0|(10,[0,5,7,9],[0....|
|  0.0|(10,[2,3,5,7,9],[...|
|  1.0|(10,[3,4,5,6,8],[...|
+-----+--------------------+

