In [None]:
#To start a Spark Session:

from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a new one named 'nlp'.
spark = SparkSession.builder.appName('nlp').getOrCreate()

#To stop:
# Kept commented out on purpose: every cell below uses `spark`, so stopping the
# session here makes the rest of the notebook fail on "Restart & Run All".
# Run it once, after all other work is finished.
# spark.stop()

In [None]:
#Spark Dataframes

#To create
# Name the columns explicitly: without a schema Spark calls them `_1` and `_2`,
# while the tokenizer cell further down looks up a column called 'sentence'.
dataframe = spark.createDataFrame(
    [
        (0, 'Example 1'),
        (1, 'Example 2'),
        (2, 'Example 3'),
    ],
    ['id', 'sentence'],
)

#To display
dataframe.show()

In [None]:
#Loading data
from pyspark import SparkFiles

# Placeholder address; replace it with the real location of the CSV file.
url = 'blah.com'

# Register the file with the Spark job so it is downloaded for local access.
spark.sparkContext.addFile(url)

# `spark.read` is the DataFrameReader and `csv` is a method on it; a
# SparkSession has no `read_csv` attribute (that spelling is the pandas API).
# SparkFiles.get() resolves the local path of a file registered via addFile(),
# so the name passed here must match the file name at the end of `url`.
df = spark.read.csv(SparkFiles.get('data.csv'), sep = ',', header = True)


In [None]:
#To tokenize words

from pyspark.ml.feature import Tokenizer

# Configure the tokenizer: it reads the 'sentence' column and writes its
# result to a new 'words' column.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

# Apply it to the example frame and print the rows without shortening long cells.
tokenized = tokenizer.transform(dataframe)
tokenized.show(truncate=False)

In [None]:
#To remove stop words
from pyspark.ml.feature import StopWordsRemover

# Filter the 'words' column that the Tokenizer cell adds to `tokenized`;
# the cleaned token lists are written to a new 'filtered' column.
remover = StopWordsRemover(inputCol = 'words', outputCol = 'filtered')

remover.transform(tokenized).show(truncate = False)

# Custom stop words are supplied as a plain Python list through `stopWords`.
# NOTE(review): this list appears to be used in place of the built-in defaults
# rather than merged with them; confirm against the StopWordsRemover docs if
# both sets of words should be removed.
stop_list = ["blah", "whatever", "okay"]

remover = StopWordsRemover(
    inputCol='words',
    outputCol='filtered',
    stopWords=stop_list,
)

In [None]:
#Hashing

from pyspark.ml.feature import HashingTF

# Term-frequency hashing into 2**4 = 16 feature slots.
# NOTE(review): assumes `df` already has an array-of-strings 'tokens' column;
# TODO confirm, since a frame read straight from CSV would not.
hashing = HashingTF(
    inputCol='tokens',
    outputCol='hashedValues',
    numFeatures=2 ** 4,
)
hashed_df = hashing.transform(df)

In [None]:
#IDF

from pyspark.ml.feature import IDF

# IDF is applied in two steps: fit() produces a model from the hashed frame,
# and that model's transform() writes its output to the 'features' column.
idf = IDF(
    inputCol='hashedValues',
    outputCol='features',
)
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

In [None]:
#User Defined functions in spark

# The module is `pyspark.sql.functions` (plural); `pyspark.sql.function`
# does not exist and raises ModuleNotFoundError.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection (for example a list of tokens), or
            ``None`` when the value is missing.

    Returns:
        ``len(word_list)``, or ``None`` if ``word_list`` is ``None``.
    """
    # len(None) raises TypeError; pass the missing value through instead so a
    # single null row does not abort the whole computation.
    if word_list is None:
        return None
    return len(word_list)

# Wrap the plain Python function as a Spark UDF that returns an integer column.
count_tokens = udf(word_list_length, IntegerType())

#To put that all together with SQL
# Add the count first, while the 'words' column is still present, and only
# then narrow the frame to the columns of interest. `.show()` is called on the
# resulting DataFrame; a Column (what col('words') returns) cannot be shown.
(
    tokenized
    .withColumn('New Column', count_tokens(col('words')))
    .select('Column A', 'Column B', 'New Column')
    .show(truncate = False)
)