# Building the full dataframe:

With our full pipeline defined in a script (`src/nlp_pipeline.py`), we can now save the resulting dataframe to a parquet file for easy access in future notebooks!

In [None]:
from src.nlp_pipeline import get_pipeline

%autoreload 2

In [None]:
# Path to the excerpts JSON file.
data_file = 'data/excerpts.json'

# Build the NLP pipeline object provided by src.nlp_pipeline.
pipeline = get_pipeline()

# Read the excerpts into a Spark DataFrame.
# NOTE(review): `spark` is not created in this notebook; presumably the
# kernel provides a SparkSession under that name — confirm.
df = spark.read.format("json").load(data_file)

# Create a Small Test DF

In [None]:
# Register the full frame as the temp view "df" so Spark SQL can query it.
df.createOrReplaceTempView("df")

# Select only the rows whose excerpt_number lies between 15 and 25,
# ordered by author and title.
sample_query = """
        SELECT author, title, excerpt, excerpt_number
        FROM df
        WHERE excerpt_number BETWEEN 15 AND 25
        ORDER BY author, title
        """

# Persist the sample because it is reused by the following cells.
tiny_df = spark.sql(sample_query).persist()

In [None]:
# Register the sample frame as the temp view "Sample".
tiny_df.createOrReplaceTempView("Sample")

# Count the distinct excerpts per (author, title) pair in the sample.
summary_query = """
        SELECT author, title, COUNT(DISTINCT excerpt) as num_excerpts
        FROM Sample
        GROUP BY author, title
        ORDER BY author, title
        """

spark.sql(summary_query).show()

In [None]:
%%time
tiny_df = pipeline.fit(tiny_df).transform(tiny_df)
save_loc = "data/sample_df.parquet"
tiny_df.write.mode('overwrite').save(save_loc, format="parquet")

# Create the Full Dataframe

In [None]:
%%time

# Word2Vec takes quite a while on my tiny computer!
# CPU times: user 578 ms, sys: 358 ms, total: 936 ms
# Wall time: 2h 12min 45s

save_loc = "data/data.parquet"

df_final = pipeline.fit(df).transform(df)
df_final.write.mode('overwrite').save(save_loc, format="parquet")

# Consider: Add Spark SQL queries for some of the more interesting columns (just for kicks)

In [None]:
# Table expression that lets Spark SQL query the parquet output directly by
# path, without registering a view first.
T = "parquet.`data/data.parquet`"

# Preview five rows of selected columns from the saved dataset.
preview_query = """
        SELECT author_id, title
             , words, tfidf, w2v_2D
        FROM {}
        LIMIT 5
        """.format(T)

spark.sql(preview_query).show()

In [None]:
# create multiple dataframes:
# tfidf_author
# tfidf_title
# w2v_author
# w2v_title
# w2v2d_author_title

# TODO: include a column with the excerpt id?
#   (for quick lookup of misclassified passages)