In [4]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'

os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.162.103)] [                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.162.103)] [0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connected to cloud.r-proje                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connected to cloud.r-proje                                                                               Hit:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
0% [1 InRelease gpgv 88.7 kB] [Connected to cloud.r-project.org (108.157.162.10                                                   

In [5]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [6]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [7]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.1/22-big-data/day_2/unreal_restaurant_tweets.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("unreal_restaurant_tweets.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+--------------------+
|   Restaurant Tweets|
+--------------------+
|@UnrealRestaurant...|
|@UnrealRestaurant...|
|@UnrealRestaurant...|
|@UnrealRestaurant...|
|"@UnrealRestauran...|
+--------------------+



In [9]:
# Tokenize DataFrame
tokenizer = Tokenizer(inputCol="Restaurant Tweets", outputCol="tokens")
wordsData = tokenizer.transform(df)
wordsData.show()

+--------------------+--------------------+
|   Restaurant Tweets|              tokens|
+--------------------+--------------------+
|@UnrealRestaurant...|[@unrealrestauran...|
|@UnrealRestaurant...|[@unrealrestauran...|
|@UnrealRestaurant...|[@unrealrestauran...|
|@UnrealRestaurant...|[@unrealrestauran...|
|"@UnrealRestauran...|["@unrealrestaura...|
+--------------------+--------------------+



In [10]:
# Remove stop words
# Instantiate Remover
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
# Transform and show data
remover.transform(wordsData).show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|Restaurant Tweets                                                                                                     |tokens                                                                                                                                 |filtered                                                                                                     |
+----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+----------

In [11]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="tokens", outputCol="hashedValues")

# Transform into a DF
hashed_df = hashing.transform(wordsData)


In [12]:
# Display new DataFrame
hashed_df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Restaurant Tweets                                                                                                     |tokens                                                                                                                                 |hashedValues                                                                                                                                                                                  |
+-------------------------------------------------------------------------------------------------------

In [13]:
# Fit the IDF on the data set 
idf = IDF(inputCol="hashedValues", outputCol="features")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [14]:
# Display the DataFrame
rescaledData.select("tokens", "features").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tokens                                                                                                                                 |features                                                                                                                                                                                                                                                                               