# Word2Vec: PySpark using Spark and Hadoop 3

This was previously running on 2.4.4. See also the spark-util notebook.

This can run on a YARN cluster or against the standalone master directly. YARN can use the local file system if the path is specified as a `file://` URI.

Hive support is enabled, and extra class paths are added for classes and database drivers.

The k1 host has systemd services to start hadoop and hive under a spark.target. 

## Spark

Configuration

In [1]:
# Display the value of every top-level expression statement in a cell,
# rather than only the last one. Later cells rely on this to show several
# results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
%config Application.log_level="WARN"

import sys
import os
import shutil
import tempfile
import urllib.request

import findspark
# findspark.init() has to run before the pyspark imports further down
# (SparkContext, SparkSession, ...) can succeed.
findspark.init() # You need to init before you can import the Spark Context

In [3]:
# Show the environment variables that locate the Hadoop/Spark configuration
# and class path. Each lookup raises KeyError if the variable is not set,
# so a broken environment is caught here rather than at session start-up.
os.environ['HADOOP_CONF_DIR']
os.environ['SPARK_CONF_DIR']
os.environ['PATH']
os.environ['SPARK_DIST_CLASSPATH']

'/misc/build/1/hadoop/etc/hadoop/hadoop'

'/misc/build/1/hadoop/etc/spark/conf'

'.:/home/weaves/.local/bin:/home/weaves/bin:/misc/conda/condabin:/misc/conda/envs/toot/bin:/misc/conda/bin:/a/hypr/home/hypr/disk01/W-media/cache/sdkman/candidates/scala/current/bin:/a/hypr/home/hypr/disk01/W-media/cache/sdkman/candidates/sbt/current/bin:/a/hypr/home/hypr/disk01/W-media/cache/sdkman/candidates/maven/current/bin:/a/hypr/home/hypr/disk01/W-media/cache/sdkman/candidates/kotlin/current/bin:/a/hypr/home/hypr/disk01/W-media/cache/sdkman/candidates/java/current/bin:/a/hypr/home/hypr/disk01/W-media/cache/sdkman/candidates/gradle/current/bin:/snap/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/libexec:/usr/lib/git-core:/usr/local/games:/usr/games:/misc/sistemoj/share/bin'

'/misc/build/1/hadoop/etc/hadoop/hadoop:/misc/share/1/hadoop-3.3.5/share/hadoop/common/lib/*:/misc/share/1/hadoop-3.3.5/share/hadoop/common/*:/misc/share/1/hadoop-3.3.5/share/hadoop/hdfs:/misc/share/1/hadoop-3.3.5/share/hadoop/hdfs/lib/*:/misc/share/1/hadoop-3.3.5/share/hadoop/hdfs/*:/misc/share/1/hadoop-3.3.5/share/hadoop/mapreduce/*:/misc/share/1/hadoop-3.3.5/share/hadoop/yarn:/misc/share/1/hadoop-3.3.5/share/hadoop/yarn/lib/*:/misc/share/1/hadoop-3.3.5/share/hadoop/yarn/*'

In [4]:
import logging

# Configure root logging for the notebook at WARNING level.
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before applying this configuration, so the call takes effect even if
# logging was configured earlier in the kernel. This replaces the previous
# reload(logging) trick, which re-created the root logger and left loggers
# made before the reload attached to the old, unconfigured one.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.WARNING,
    datefmt='%I:%M:%S',
    force=True,
)

<module 'logging' from '/misc/conda/envs/toot/lib/python3.9/logging/__init__.py'>

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.mllib.feature import Word2Vec

from pyspark.conf import SparkConf

In [6]:
# Cluster manager selection: set useYARN to False to target the standalone
# master at spark://k1:7077 instead of YARN.
useYARN = True

# YARN - test which access will work.
cstr = "yarn" if useYARN else "spark://k1:7077"

# Candidate input locations:
# - the HDFS url needs the file to be put there using the hdfs tool;
# - the HTTP one needs the web-server on (flora) j1 to be running;
# - the file url works for both YARN and master-slave.
input_urls = {
    "hdfs": "hdfs:///weaves/input/text8",
    "http": "http://flora/cache/text/text",
    "file": "file:///a/l/X-image/cache/text/text8",
}
url = input_urls["file"]

In [7]:
# Base Spark configuration: the application name plus the master selected in cstr.
conf = SparkConf().setAppName("word2vec").setMaster(cstr)

In [8]:
# Driver/executor sizing, Hive warehouse and catalogue settings, file-creation
# umask and the extra driver class path (classes directory and PostgreSQL driver).
# The settings are applied inside a loop so that, with the "all" interactivity
# setting above, the return value of conf.set() is not echoed for every line.
spark_settings = {
    "spark.driver.cores": 4,
    "spark.driver.memory": "4g",
    "spark.executor.cores": 4,
    "spark.executor.memory": "4g",
    "spark.executor.instances": 4,
    "spark.sql.warehouse.dir": "file:///home/hadoop/data/hive",
    "spark.sql.catalogImplementation": "hive",
    "spark.hadoop.fs.permissions.umask-mode": "002",
    "spark.driver.extraClassPath": ":/misc/build/0/classes/:/usr/share/java/postgresql.jar",
}
for key, value in spark_settings.items():
    conf.set(key, value)
conf.getAll()

[('spark.app.name', 'word2vec'),
 ('spark.master', 'yarn'),
 ('spark.driver.cores', '4'),
 ('spark.driver.memory', '4g'),
 ('spark.executor.cores', '4'),
 ('spark.executor.memory', '4g'),
 ('spark.executor.instances', '4'),
 ('spark.sql.warehouse.dir', 'file:///home/hadoop/data/hive'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.hadoop.fs.permissions.umask-mode', '002'),
 ('spark.driver.extraClassPath',
  ':/misc/build/0/classes/:/usr/share/java/postgresql.jar')]

## Spark Instantiation

Create the Spark session and check it.

In [9]:
# Build (or reuse) a Hive-enabled session from the configuration prepared in
# conf, then take the SparkContext that backs that session.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext

In [10]:
# The settings made on conf should now be visible in the running context's
# configuration; list them to check.
spark.sparkContext.getConf().getAll() 

[('spark.executor.instances', '4'),
 ('spark.driver.cores', '4'),
 ('spark.driver.memory', '4g'),
 ('spark.hadoop.fs.permissions.umask-mode', '002'),
 ('spark.executor.cores', '4'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.use

In [11]:
# The configuration can also be checked in the Web UI on the Environment tab.
# The Executors tab will show only one (presumably a single entry at this
# point - TODO confirm what the original note meant).
sc

In [12]:
## to copy to a local file - not useful on a Spark Yarn cluster
# with urllib.request.urlopen(url) as response:
#    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
#        shutil.copyfileobj(response, tmp_file)

## Hive

Load a file and send it to Hive.

In [13]:
# Lazy: this only describes the work (read the text file, split every line on
# single spaces); nothing is run on the executors yet.
rdd0 = (
    sc.textFile(url)
    .map(lambda line: line.split(" "))
)
type(rdd0)

pyspark.rdd.PipelinedRDD

In [14]:
# Alternatively, read the file as a DataFrame; this will appear in the
# SQL/DataFrame tab of the Web UI. This is an immediate operation.
df0 = spark.read.text(url)
df0.printSchema()
df0.show()

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
| anarchism origin...|
+--------------------+



In [15]:
# Compare the two handles on the same input: an RDD and a DataFrame.
type(rdd0)
type(df0)

pyspark.rdd.PipelinedRDD

pyspark.sql.dataframe.DataFrame

In [16]:
# Ask the catalogue for the list of databases; the result is shown in the next cell.
df1 = spark.sql("show databases")

In [17]:
# Display the result of the "show databases" query.
df1.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [18]:
# finalTable should persist between invocations of Hive, so it is expected to
# be listed here if an earlier run created it.

df1 = spark.sql("show tables")
df1.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [19]:
# Read the input text as a DataFrame again; it is registered as a temporary
# view in the next cell.
df0 = spark.read.text(url)
df0.printSchema()
df0.show()

root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
| anarchism origin...|
+--------------------+



In [20]:
# Expose the DataFrame to SQL under the name tempTable (a temporary view).
df0.createOrReplaceTempView("tempTable")

In [21]:
# Recreate finalTable from the temporary view. Dropping it first keeps the
# cell re-runnable when the table already exists.
spark.sql("drop table if exists finalTable")
spark.sql("create table finalTable AS select * from tempTable")

DataFrame[]

DataFrame[]

In [23]:
# List the databases and tables, then confirm that finalTable was populated.
df0 = spark.sql("show databases")
df0.show()
df0 = spark.sql("show tables")
df0.show()
# The count query is lazy: without show() it was never executed or displayed.
df0 = spark.sql("select count(*) from finalTable")
df0.show()

+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|finaltable|      false|
|  default|    xusers|      false|
|         | temptable|       true|
+---------+----------+-----------+



## MLLib Method

Demonstrate access to an MLLib method

In [None]:
# This takes a 45 minutes for a 100 MByte file.
word2Vec = Word2Vec()
model = word2Vec.fit(rdd0)

In [None]:
# Ask the model for 40 synonyms of 'iran', show the type of the result, then
# materialise it as a list (syns0) and show how many entries came back.
synonyms = model.findSynonyms('iran', 40)
type(synonyms)
syns0=list(synonyms)
len(syns0)

In [None]:
# Display the synonym list collected by the most recent findSynonyms cell.
syns0

In [None]:
# Same query for 'insensible'; this overwrites synonyms and syns0.
synonyms = model.findSynonyms('insensible', 40)
type(synonyms)
syns0=list(synonyms)
len(syns0)

In [None]:
# Same query for 'sensible'; this overwrites synonyms and syns0 again, so the
# cells below work on the 'sensible' result when the notebook is run in order.
synonyms = model.findSynonyms('sensible', 40)
type(synonyms)
syns0=list(synonyms)
len(syns0)

In [None]:
# Inspect the model: list its attributes and count the entries returned by getVectors().
# NOTE(review): dir(model) is the conventional spelling of model.__dir__(),
# and the second call repeats the first.
model.__dir__()
len(model.getVectors())
model.__dir__()
## This shows it isn't a list (presumably referring to the object returned by getVectors()).

In [None]:
# A sample of words: turn the getVectors() result into a list and keep the
# entries at index 1 through 9.
# NOTE(review): presumably iterating the result yields the words - confirm.
vv=list(model.getVectors())
s0=vv[1:10]

In [None]:
# Vector the model produces for the first word of the sample; len() shows
# how many components it has.
v0 = model.transform(s0[0])
len(v0)

In [None]:
# Display the vector computed for the first sample word.
v0

In [None]:
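# Synonyms ranked by cosine similarity. (The original note refers to 'china', but no cell above queries that word; syns0 holds the result of the most recent findSynonyms call.)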

In [None]:
# syns0 is already a list, so its length can be taken directly.
len(syns0)

In [None]:
import pandas as pd

# Put the pairs held in syns0 into a two-column frame: 'L' is the word and
# 'R' is the score returned alongside it.
# NOTE: syns0 comes from the most recent findSynonyms call ('sensible' when the
# notebook is run in order); nothing above queries 'china'.
l0 = pd.DataFrame.from_records(syns0, columns=['L', 'R'])
print(l0.head())

In [None]:
# Follow the chain one step: take the word in the first row of l0, fetch 40
# synonyms for it, keep the entries at index 1 through 9 and rebuild l0 from them.
# This cell overwrites syns0 and l0, so re-running it moves one step further.
synonyms = model.findSynonyms(l0['L'].iloc[0], 40)
syns0 = list(synonyms)[1:10]
l0 = pd.DataFrame.from_records(syns0, columns=['L', 'R'])
print(l0.head())

In [None]:
# Request a single synonym for the word in the first row of l0 and pull that
# one entry out of the returned iterator.
synonyms = model.findSynonyms(l0.iloc[0].loc['L'], 1)
next(synonyms)

In [None]:
# The word in the first row of the current synonym frame.
l0.iloc[0].loc['L']


In [None]:
# Show each column of the synonym frame as its own single-column frame.
l0[['L']]
l0[['R']]

In [None]:
# Reminder of which input location this run used.
url

In [None]:
# finalTable is already created by the Hive section earlier in this notebook,
# so a bare CREATE TABLE fails when the notebook is run top to bottom.
# Drop it first, in the same way as that earlier cell, so this cell is re-runnable.
spark.sql("drop table if exists finalTable")
spark.sql("create table finalTable as select * from tempTable")