In [42]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline


from src.nlp_pipeline import get_pipeline


In [43]:
# Read the excerpts corpus from JSON into a Spark DataFrame.
data_file = 'data/excerpts.json'
raw_df = spark.read.json(data_file)

# Working handle used by the cells that follow. This binds a second name to the
# same DataFrame object; it is not a copy. `raw_df` keeps pointing at the
# untouched load as long as later cells rebind `df` rather than `raw_df`.
df = raw_df

# Quick look at what was loaded: schema, total row count, first three rows.
raw_df.printSchema()
print("row count: ", raw_df.count())
raw_df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- title: string (nullable = true)

row count:  9050
+--------------+--------------------+--------------+---------------+
|        author|             excerpt|excerpt_number|          title|
+--------------+--------------------+--------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|             0|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|             1|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|             2|AChristmasCarol|
+--------------+--------------------+--------------+---------------+
only showing top 3 rows



In [44]:
# Expose the working DataFrame to Spark SQL under the view name "df".
df.createOrReplaceTempView("df")

# Small sample for experimentation: only rows whose excerpt_number is 25,
# sorted by author and then by title.
tiny_df = spark.sql("""
        SELECT author, title, excerpt, excerpt_number
        FROM df
        WHERE excerpt_number = 25
        ORDER BY author, title
        """)

# Persist the sample because later cells reuse it.
tiny_df = tiny_df.persist()

tiny_df.show()

+--------------+--------------------+--------------------+--------------+
|        author|               title|             excerpt|excerpt_number|
+--------------+--------------------+--------------------+--------------+
|CharlesDickens|     AChristmasCarol|It was not an agr...|            25|
|CharlesDickens|    ATaleOfTwoCities|“So soon?” || Mis...|            25|
|CharlesDickens|    DavidCopperfield|‘Peggotty,’ says ...|            25|
|CharlesDickens|   GreatExpectations|“What’s in the bo...|            25|
|CharlesDickens|         OliverTwist|'Walk in,' said t...|            25|
|    JaneAusten|                Emma|She was so busy i...|            25|
|    JaneAusten|       MansfieldPark|Fanny was too muc...|            25|
|    JaneAusten|          Persuasion|But Mrs Clay was ...|            25|
|    JaneAusten|   PrideAndPrejudice|“Not as you repre...|            25|
|    JaneAusten| SenseAndSensibility|"It is but a cott...|            25|
|      JohnMuir|MyFirstSummerInTh...|H

In [15]:
# Build the project's NLP pipeline (get_pipeline comes from src.nlp_pipeline).
nlp_pipeline = get_pipeline()

# Fit on the sample, then apply the fitted model to that same sample and
# persist the result.
# NOTE: this rebinds `tiny_df` to its own transformed output, so re-running this
# cell feeds the already-transformed frame back into the pipeline. Re-run the
# cell that builds `tiny_df` from SQL first.
nlp_model = nlp_pipeline.fit(tiny_df)
tiny_df = nlp_model.transform(tiny_df).persist()

tiny_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)



In [18]:
# Remove the two label columns that appeared after the NLP pipeline transform;
# numeric ids for author and title are rebuilt with StringIndexer further down.
tiny_df = tiny_df.drop("author_label", "title_label")
tiny_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)



In [38]:
# Indexers that map a string column to a numeric id column.
author_labler = StringIndexer(inputCol="author", outputCol="author_id")
title_labler = StringIndexer(inputCol="title", outputCol="title_id")
# NOTE: row_labler is defined here but is not one of the id_pipeline stages
# below, so no "row_id" column is produced by that pipeline.
row_labler = StringIndexer(inputCol="excerpt", outputCol="row_id")

# Packs the two generated ids plus excerpt_number into one vector column.
vector_ider = VectorAssembler(
    inputCols=["author_id", "title_id", "excerpt_number"],
    outputCol="id_vector")

# The assembler goes last because it reads the columns the indexers create.
# The stage list must use `vector_ider`, the name bound above; the previous
# `vec_ider` is not defined anywhere in this notebook and raises NameError on
# a fresh kernel.
id_pipeline = Pipeline(stages=[author_labler, title_labler, vector_ider])

In [39]:
# Fit the id pipeline on the sample, then use the fitted model to append the
# id columns to that same sample.
id_model = id_pipeline.fit(tiny_df)
labeled_df = id_model.transform(tiny_df)

labeled_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)



In [41]:
# Register the labeled sample as the SQL view "Labeled".
labeled_df.createOrReplaceTempView("Labeled")

# Show author/title names beside their generated ids, together with the id
# vector and the words column, sorted by the id vector.
id_overview = spark.sql("""
        SELECT author, author_id
             , title, title_id
             , id_vector, words
        FROM Labeled
        Order BY id_vector
        """)
id_overview.show()

+--------------+---------+--------------------+--------+---------------+--------------------+
|        author|author_id|               title|title_id|      id_vector|               words|
+--------------+---------+--------------------+--------+---------------+--------------------+
|     MarkTwain|      0.0|  TheInnocentsAbroad|     0.0| [0.0,0.0,25.0]|[we, steamed, dow...|
|     MarkTwain|      0.0|TheAdventuresOfHu...|     2.0| [0.0,2.0,25.0]|[“, how, you, tal...|
|     MarkTwain|      0.0|          RoughingIt|     7.0| [0.0,7.0,25.0]|[when, a, party, ...|
|     MarkTwain|      0.0|TheTragedyofPuddn...|    12.0|[0.0,12.0,25.0]|[", dey, ai, n't,...|
|     MarkTwain|      0.0|AConnecticutYanke...|    17.0|[0.0,17.0,25.0]|[now, sir, kay, a...|
|CharlesDickens|      1.0|   GreatExpectations|     6.0| [1.0,6.0,25.0]|[“, what, ’s, in,...|
|CharlesDickens|      1.0|     AChristmasCarol|    11.0|[1.0,11.0,25.0]|[it, was, not, an...|
|CharlesDickens|      1.0|    ATaleOfTwoCities|    14.0|[1.0