## NLP - Hashing

---

In [1]:
#Import SparkSession and the ML feature transformers used below
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover

In [2]:
#Create (or fetch) the SparkSession, registered under the app name "Airline"
spark = (
    SparkSession.builder
    .appName("Airline")
    .getOrCreate()
)

In [3]:
#Load the tweets CSV, with the 'header' option switched on
dataframe = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('data/airlines.csv')
)
dataframe.show()

+--------------------+
|      Airline Tweets|
+--------------------+
|@VirginAmerica pl...|
|@VirginAmerica se...|
|@VirginAmerica do...|
|@VirginAmerica Ar...|
|@VirginAmerica aw...|
+--------------------+



In [4]:
#Split each tweet into a list of tokens, stored in a new 'words' column
tokened = Tokenizer(
    outputCol='words',
    inputCol='Airline Tweets',
)
tokened_transform = tokened.transform(dataframe)
tokened_transform.show()

+--------------------+--------------------+
|      Airline Tweets|               words|
+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|
|@VirginAmerica se...|[@virginamerica, ...|
|@VirginAmerica do...|[@virginamerica, ...|
|@VirginAmerica Ar...|[@virginamerica, ...|
|@VirginAmerica aw...|[@virginamerica, ...|
+--------------------+--------------------+



In [5]:
#Drop the custom stop tokens from 'words', writing the result to 'filtered'
# Only the tokens listed here are removed: everyday words such as "you" and
# "the" still appear in the 'filtered' column.
stop_list = [
    '@virginamerica',
    '$30',
    '@VirginAmerica',
]
remover = StopWordsRemover(
    stopWords=stop_list,
    inputCol='words',
    outputCol='filtered',
)
removed_dataframe = remover.transform(tokened_transform)
removed_dataframe.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
|Airline Tweets                                                                                                                         |words                                                                                                                                                          |filtered                                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------------

In [6]:
#Hash the filtered tokens into a fixed-length term-frequency vector
# 2 ** 4 gives 16 buckets.
# NOTE(review): 16 looks small for real text (presumably chosen to keep the
# demo output readable) - confirm before reusing elsewhere.
hashing = HashingTF(
    numFeatures=2 ** 4,
    inputCol='filtered',
    outputCol='hashedValues',
)

#Append the 'hashedValues' column to the DataFrame
hashed_df = hashing.transform(removed_dataframe)
hashed_df.show()

+--------------------+--------------------+--------------------+--------------------+
|      Airline Tweets|               words|            filtered|        hashedValues|
+--------------------+--------------------+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|[plus, you've, ad...|(16,[3,4,5,7,8,9,...|
|@VirginAmerica se...|[@virginamerica, ...|[seriously, would...|(16,[0,1,2,3,4,9,...|
|@VirginAmerica do...|[@virginamerica, ...|[do, you, miss, m...|(16,[0,1,8,10,11,...|
|@VirginAmerica Ar...|[@virginamerica, ...|[are, the, hours,...|(16,[0,1,2,4,7,9,...|
|@VirginAmerica aw...|[@virginamerica, ...|[awaiting, my, re...|(16,[0,3,4,6,7,8,...|
+--------------------+--------------------+--------------------+--------------------+



In [7]:
# Fit an IDF model on the hashed term frequencies, then use it to add a
# 'features' column to the same DataFrame
idf = IDF(
    outputCol='features',
    inputCol='hashedValues',
)
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [9]:
#Print the 'words' tokens next to their 'features' vectors, untruncated
(
    rescaledData
    .select('words', 'features')
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|words                                                                                                                                                          |features                                                                                                                                                                                                             |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------

In [10]:
#Stop the SparkSession now that the last cell has run
spark.stop()