* Visit http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
* Read about word2vec and download the Google pre-trained model binary

**Required**:

* For any selected title in your Webhose dataset (of size 10000), write a Python program than calculates 100 most similar titles from the remaining 9999:
    * Calculate pairwise similarity using word2vec model
    * Sort in reverse order to determine 100 titles with highest scores
    * Print the selected title and those 100 similar titles
* Review information about Apache Spark here
* Review and download DataBricks Word2Vec Spark Training notebook here
* Train word2vec model, based on the texts of the articles in your Webhose dataset, using Spark Word2Vec function

Submission should consist of Jupyter Notebook file along with output 


# **Google pre-trained model binary Word2vec**

In [2]:
import os
import json

import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

model_path = '/home/elia/Dropbox/Harrisburg/ANLY_610/codes/models/wordvec/'
data_path = '/home/elia/Dropbox/Harrisburg/ANLY_610/codes/data/'

In [7]:
def load_wordvec_model(modelName, modelFile, flagBin):
    print('Loading ' + modelName + ' model...')
    model = KeyedVectors.load_word2vec_format(model_path + modelFile, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [4]:
def load_webhouse_data(data_name): 
    data_file = data_path + data_name + '.json'
    with open(data_file) as json_data:
        data = json.load(json_data)
        return data
    
data = load_webhouse_data('trump')

In [48]:
def vec_similarity(input1, input2, vectors):
    term_vectors = [np.zeros(300), np.zeros(300)]
    terms = [input1, input2]
        
    for index, term in enumerate(terms):
        for i, t in enumerate(term.split(' ')):
            try:
                term_vectors[index] += vectors[t]
            except:
                term_vectors[index] += 0
        
    result = (1 - spatial.distance.cosine(term_vectors[0], term_vectors[1]))
    if result is 'nan':
        result = 0
        
    return result

# function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    
    output = list()
    for word in words:
        if word in vectors.vocab:
            output.append(word.strip())
            
    return output

# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))
    if len(s1words) == 0 or len(s2words) == 0:
        return 0 
    output = vectors.n_similarity(s1words, s2words)
    return output

In [76]:
def get_most_similar_titles(data, data_id, sim_tit_num=100): 
    if data_id >= len(data): 
        raise IOError('The id to select title should be between 0 and {}'.format(len(data)))
    title = data[data_id]['title']        
    comparison_matrix = np.array(
        [[data[index]['title'], calc_similarity(title, data[index]['title'], model_word2vec)] 
         for index in range(len(data)) if index != data_id])
    return np.sort(comparison_matrix, axis=0)[::-1][:100]

title_id = 23
print("100 titles most similar to {}".format(data[title_id]['title']))
print("---------------------------------------------------------------------------------------------------------")
print("similarity | title ")
print("---------------------------------------------------------------------------------------------------------")
for line in get_most_similar_titles(data, title_id): 
    print("{0:.7f}".format(float(line[1])) + "  | {}".format(str(line[0])))

100 titles most similar to The Latest: President Trump signs Bibles at Alabama church
---------------------------------------------------------------------------------------------------------
similarity | title 
---------------------------------------------------------------------------------------------------------
0.9999999  | 🔥 Trump just blurted out that Michael Cohen “directly asked me for a pardon” — thereby declaring himself a fact witness and fair game for deposition. Stable Genius Strikes Again.
0.9999999  | 🔥 Trump calls Russia investigation a ‘collusion witch hoax’ in rambling and inaccurate White House rant: he wrongly suggested the judge who sentenced Paul Manafort, to 47 months in jail had said “there was no collusion with Russia”.
0.9999999  | 👉 Trump says his former lawyer and fixer Michael Cohen 'directly asked me for a pardon' via Hvper.com
0.9999999  | 👉 Trump says his former lawyer and fixer Michael Cohen 'directly asked me for a pardon' via Hvper.com
0.9999999  | 👉


# **pyspark section**

In [2]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

from pyspark.conf import SparkConf
from pyspark import SparkContext
sc = SparkContext() 
config = sc.getConf()
config.set('spark.cores.max','4')
config.set('spark.executor.memory', '4G')
config.set('spark.driver.maxResultSize', '4g')
config.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
config.set('spark.kryoserializer.buffer.max', '256m')
config.set("spark.driver.cores", "3")

sc.stop()

In [3]:
from pyspark.sql import SQLContext
sc = SparkContext(conf = config) 
sqlContext = SQLContext(sc)
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

print("Using Apache Spark Version", sc.version)

Using Apache Spark Version 2.4.1


In [4]:
import spacy

def cleanup_pretokenize(text):
    #text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'http\S+', '', text)
    text = text.replace("'s", " ")
    text = text.replace("n't", " not ")
    text = text.replace("'ve", " have ")
    text = text.replace("'re", " are ")
    text = text.replace("I'm"," I am ")
    text = text.replace("you're"," you are ")
    text = text.replace("You're"," You are ")
    text = text.replace("-"," ")
    text = text.replace("/"," ")
    text = text.replace("("," ")
    text = text.replace(")"," ")
    text = text.replace("%"," percent ")
    return text

lmtzr = WordNetLemmatizer()
def text_cleanup(row):
    desc = row[2].strip().lower()
    #tokens = [w.lemma_ for w in nlp(cleanup_pretokenize(desc))]
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if len(token) > 3]
    tokens = [lmtzr.lemmatize(token,'v') for token in tokens]
    row[2] = ' '.join(tokens)
    return row

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'tokens')
swr = StopWordsRemover(inputCol = 'tokens', outputCol = 'tokens_sw_removed')

In [6]:
# load json 
def load_json_data(data_name): 
    data_file = data_path + data_name + '.json'
    return sqlContext.read.json(data_file)

data_df = load_json_data('trump')
data_df.printSchema()

root
 |-- author: string (nullable = true)
 |-- crawled: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- locations: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- sentiment: string (nullable = true)
 |    |-- organizations: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- sentiment: string (nullable = true)
 |    |-- persons: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- sentiment: string (nullable = true)
 |-- external_links: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- highlightText: string (nullable = true)
 |-- highlightTitle: string (nullable = true)
 |-- language: string (nullable = true)
 |-- ord_in_thread: long (nullable = 

In [9]:
text_df = data_df.select('text')
text_df.show(5)
df_tokens = regexTokenizer.transform(text_df)
df_tokens.show(5)
desc_swr = swr.transform(df_tokens)
desc_swr.show(5)

+--------------------+
|                text|
+--------------------+
|WASHINGTON (AP) -...|
|Trump surveys Ala...|
|The Beat with Ari...|
|BEAUREGARD, Ala. ...|
|Trump should reco...|
+--------------------+
only showing top 5 rows

+--------------------+--------------------+
|                text|              tokens|
+--------------------+--------------------+
|WASHINGTON (AP) -...|[washington, ap, ...|
|Trump surveys Ala...|[trump, surveys, ...|
|The Beat with Ari...|[the, beat, with,...|
|BEAUREGARD, Ala. ...|[beauregard, ala,...|
|Trump should reco...|[trump, should, r...|
+--------------------+--------------------+
only showing top 5 rows

+--------------------+--------------------+--------------------+
|                text|              tokens|   tokens_sw_removed|
+--------------------+--------------------+--------------------+
|WASHINGTON (AP) -...|[washington, ap, ...|[washington, ap, ...|
|Trump surveys Ala...|[trump, surveys, ...|[trump, surveys, ...|
|The Beat with Ari...|

In [10]:
word2vec = Word2Vec(vectorSize = 300, minCount = 5, inputCol = 'tokens_sw_removed', outputCol = 'wordvectors')
model = word2vec.fit(desc_swr)

In [11]:
wordvectors = model.transform(desc_swr)
text_desc = wordvectors.select('text','wordvectors')
text_desc.show(5)

+--------------------+--------------------+
|                text|         wordvectors|
+--------------------+--------------------+
|WASHINGTON (AP) -...|[-0.0749518844910...|
|Trump surveys Ala...|[-0.0200785594274...|
|The Beat with Ari...|[-0.1803129903661...|
|BEAUREGARD, Ala. ...|[-0.0259655292435...|
|Trump should reco...|[-0.0499572096919...|
+--------------------+--------------------+
only showing top 5 rows



In [12]:
synonyms = model.findSynonyms("pompeo", 20)   
synonyms.show()

+------------+-------------------+
|        word|         similarity|
+------------+-------------------+
|        mike| 0.5678425431251526|
|  emphasised| 0.5327044725418091|
|     hawkish| 0.4932403564453125|
|     gokhale|0.47736334800720215|
|      bolton|0.47003406286239624|
|      echoed|0.45853370428085327|
|       vijay| 0.4505883753299713|
|     advisor| 0.4262145161628723|
|   secretary| 0.4251237213611603|
|      mullen|0.40693843364715576|
|counterparts|0.40623241662979126|
|  undermined|0.38990429043769836|
|      scored| 0.3882525563240051|
|  emphasized| 0.3881252706050873|
|      yaqing| 0.3880544602870941|
|     qureshi| 0.3836938738822937|
|      biegun|  0.380886971950531|
|        hale| 0.3719295263290405|
|       pence|0.37069055438041687|
| embarrassed|0.36694759130477905|
+------------+-------------------+



In [13]:
SEARCH_QUERY = "secretary of energy in the Trump administration"
query_df  = sc.parallelize([(1,SEARCH_QUERY)]).toDF(['index', 'text'])
query_tok = regexTokenizer.transform(query_df)
query_swr = swr.transform(query_tok)
query_swr.show()
query_vec = model.transform(query_swr)
query_vec = query_vec.select('wordvectors').collect()[0][0]

+-----+--------------------+--------------------+--------------------+
|index|                text|              tokens|   tokens_sw_removed|
+-----+--------------------+--------------------+--------------------+
|    1|secretary of ener...|[secretary, of, e...|[secretary, energ...|
+-----+--------------------+--------------------+--------------------+



In [17]:
def cossim(v1, v2): 
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / (np.sqrt(np.dot(v2, v2))+.1)
chunk = text_desc.collect()
sim_rdd = sc.parallelize((i[0], float(cossim(query_vec, i[1]))) for i in chunk)

In [19]:
sim_df  = sqlContext.createDataFrame(sim_rdd).\
                   withColumnRenamed('_1', 'name').\
                   withColumnRenamed('_2', 'similarity').\
                   orderBy("similarity", ascending = False)
sim_df.show(2, truncate = False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------