## Assignment 6 Part 2
by Charlie Mei cm3947

Write a Pyspark program based on the other provided Class Exercise, which:

- Loads your previously obtained dataset of Webhose news articles into a Spark dataframe
- Cleans up and tokenizes article bodies using the RegexTokenizer and Stopword remover functions provided in the Class Exercise
- Trains a Word2Vec model based on the output column produced in step 2
- Implements any sample search query, as shown in Class Exercise, and produces matching article titles

In [2]:
import numpy as np
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load(disable=['parser', 'tagger','ner'])

In [4]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/8e/b0/bf9020b56492281b9c9d8aae8f44ff51e1bc91b3ef5a884385cb4e389a40/pyspark-3.0.0.tar.gz (204.7MB)
[K     |████████████████████████████████| 204.7MB 53kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 38.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044182 sha256=b7a62d87e47d4cc2d34e0a3eaa27567fc54260a94192dccf82bc08ede3a04651
  Stored in directory: /root/.cache/pip/wheels/57/27/4d/ddacf7143f8d5b76c45c61ee2e43d9f8492fc5a8e78ebd7d37
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.0


In [5]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext() 
sqlContext = SQLContext(sc)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

In [6]:
def cleanup_pretokenize(text):
    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'http\S+', '', text)
    text = text.replace("'s", " ")
    text = text.replace("n't", " not ")
    text = text.replace("'ve", " have ")
    text = text.replace("'re", " are ")
    text = text.replace("I'm"," I am ")
    text = text.replace("you're"," you are ")
    text = text.replace("You're"," You are ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("%"," percent ")
    return text

lmtzr = WordNetLemmatizer()
def text_cleanup(row):
    desc = row[2].strip().lower()
    tokens = [w.lemma_ for w in nlp(cleanup_pretokenize(desc))]
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if len(token) > 3]
    #tokens = [lmtzr.lemmatize(token,'v') for token in tokens]
    row[2] = ' '.join(tokens)
    return row

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'tokens')
swr = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')

def cossim(v1, v2): 
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))+.1)

#### Loading the Webhose Dataset and Data Cleaning

In [7]:
# Load data into Spark Dataframe
crunch_df = sqlContext.read.option('header', 'true').option('delimiter', ',').option('inferSchema', 'true').json('webhose_netflix.json')

In [8]:
crunchbase_data = crunch_df['uuid','title','text']

cols = [0, 1, 2]
crunchbase_rdd = crunchbase_data.select('*').rdd.map(lambda row: [row[i] for i in cols]).filter(lambda row: row[2] is not None)
crunchbase_df = sqlContext.createDataFrame(crunchbase_rdd, ['uuid', 'title', 'text'])
crunchbase_df.show(2)

+--------------------+--------------------+--------------------+
|                uuid|               title|                text|
+--------------------+--------------------+--------------------+
|f890670c140631022...|13 Reasons Why: T...|The controversial...|
|f1da1d6c5ddf6b095...|Judge gives contr...|A federal judge i...|
+--------------------+--------------------+--------------------+
only showing top 2 rows



In [9]:
# Tokenize and remove stopwords
df_tokens = regexTokenizer.transform(crunchbase_df)
desc_swr = swr.transform(df_tokens)
desc_swr.show(3)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                uuid|               title|                text|              tokens|   tokens_sw_removed|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|f890670c140631022...|13 Reasons Why: T...|The controversial...|[the, controversi...|[controversial, 1...|
|f1da1d6c5ddf6b095...|Judge gives contr...|A federal judge i...|[a, federal, judg...|[federal, judge, ...|
|f431c194e4eddacdd...|A TV reboot of Bo...|WhatsApp If you e...|[whatsapp, if, yo...|[whatsapp, enjoye...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



#### Training a Word2Vec Model

In [10]:
# Train a word2vec model
word2vec = Word2Vec(vectorSize = 300, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(desc_swr)
wordvectors = model.transform(desc_swr)
#wordvectors.select('wordvectors').show(1, truncate = True)
crunchbase_desc = wordvectors.select('uuid','title','wordvectors').rdd.toDF()
crunchbase_desc.show(5)

+--------------------+--------------------+--------------------+
|                uuid|               title|         wordvectors|
+--------------------+--------------------+--------------------+
|f890670c140631022...|13 Reasons Why: T...|[-0.0801881681807...|
|f1da1d6c5ddf6b095...|Judge gives contr...|[-0.0088495124698...|
|f431c194e4eddacdd...|A TV reboot of Bo...|[-0.0838881237240...|
|5930a57af03089f5d...|2-Pack: Ideaworks...|[-0.0542835764252...|
|050149948217f53d4...|Already-Obese Ave...|[-0.0393892194748...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



#### Searching for matching articles

In [19]:
chunk = crunchbase_desc.take(50000)
def search_for_similar_articles(SEARCH_QUERY):
    # Turn query into a word vector
    query_df  = sc.parallelize([(1,SEARCH_QUERY)]).toDF(['index','text'])
    query_tok = regexTokenizer.transform(query_df)
    query_swr = swr.transform(query_tok)
    query_vec = model.transform(query_swr)
    query_vec = query_vec.select('wordvectors').collect()[0][0]

    # Find similar articles
    sim_rdd = sc.parallelize((i[0], i[1], float(cossim(query_vec, i[2]))) for i in chunk)
    sim_df  = sqlContext.createDataFrame(sim_rdd).\
                   withColumnRenamed('_1', 'crunchbase_uuid').\
                   withColumnRenamed('_2', 'name').\
                   withColumnRenamed('_3', 'similarity').\
                   orderBy("similarity", ascending = False)
    return sim_df

In [20]:
sim_df = search_for_similar_articles(SEARCH_QUERY = '13 Reasons Why')
sim_df.show(20, truncate=False)

+----------------------------------------+-------------------------------------------------------------------------------------------------------------------+------------------+
|crunchbase_uuid                         |name                                                                                                               |similarity        |
+----------------------------------------+-------------------------------------------------------------------------------------------------------------------+------------------+
|1ac02b291edbd50ab93054d6a47cea96339ed17f|Watch the Trailer for the Final Season of '13 Reasons Why'!                                                        |0.6813239495615256|
|b27f092a2c9be2a7150a912639f9791a2381eb74|Netflix’s 13 Reasons Why Final Season Trailer Released                                                             |0.628123089230771 |
|ffe5e336bfbd2663e10900cbaf176ba0f3eed48a|13 Reasons Why: The Final Season gets new trailer from Netflix      