## Data preparation

We apply the following transformations to the input text data:

+ Clean strings
+ Tokenize (`String -> Array<String>`)
+ Remove stop words
+ Stem words
+ Create bigrams
+ Filter out small/empty vectors

#### 0. Create DataFrame

In [3]:
# Source table and sampling parameters
table_name = "faam_dataset_v4"
SAMPLE_FRACTION = 0.2
# Fixed seed so that repeated runs sample deterministically
# (given unchanged table contents and partitioning)
SAMPLE_SEED = 42

# Create DF from table
tweet_df = sqlContext.table(table_name)

# Random sampling (20%) without replacement, seeded for reproducibility
tweet_df = tweet_df.sample(False, SAMPLE_FRACTION, SAMPLE_SEED)

# Display schema and number of sampled records
tweet_df.printSchema()
print("Size of the DataFrame: {} records".format(tweet_df.count()))

#### 1. Clean text string

In [5]:
from pyspark.sql.functions import col, lower, regexp_replace, split

def clean_text(c):
    """Build a column expression that normalizes raw tweet text.

    The following steps are applied in order:

    1. lower-case the text;
    2. drop a leading ``"rt "`` marker;
    3. drop ``http://`` / ``https://`` URLs;
    4. drop every character that is not a letter, a digit or whitespace.

    Tokenization is intentionally not done here.

    :param c: the text column to clean (e.g. ``col("text")``)
    :return: the cleaned column expression
    """
    c = lower(c)
    # Raw strings keep the regex backslashes intact without relying on
    # Python passing unknown escape sequences through unchanged.
    c = regexp_replace(c, r"^rt ", "")
    c = regexp_replace(c, r"https?://\S+", "")
    c = regexp_replace(c, r"[^a-zA-Z0-9\s]", "")
    return c

# Apply the cleaning steps to the "text" column, keeping the column name
cleaned_text_col = clean_text(col("text")).alias("text")
clean_text_df = tweet_df.select(cleaned_text_col)

# Inspect the schema and the first 10 cleaned rows
clean_text_df.printSchema()
clean_text_df.show(10)

#### 2. Tokenize

In [7]:
from pyspark.ml.feature import Tokenizer

# Tokenize the cleaned "text" column into the "vector" column
tokenizer = Tokenizer(inputCol="text", outputCol="vector")
tokenized_df = tokenizer.transform(clean_text_df)

# Keep only the token column for the next steps
vector_df = tokenized_df.select("vector")

# Inspect the schema and the first 10 rows
vector_df.printSchema()
vector_df.show(10)

#### 3. Remove stop words

In [9]:
from pyspark.ml.feature import StopWordsRemover

# Instantiate the remover without arguments so it uses its default stop word list
remover = StopWordsRemover()
stopwords = remover.getStopWords()

# Peek at the first ten entries of that default list
stopwords[:10]

In [10]:
# Read tokens from "vector" and write the filtered tokens to "vector_no_stopw"
remover.setInputCol("vector").setOutputCol("vector_no_stopw")

# Run the StopWordsRemover and keep only its output column
filtered_df = remover.transform(vector_df)
vector_no_stopw_df = filtered_df.select("vector_no_stopw")

# Inspect the schema and a sample of rows
vector_no_stopw_df.printSchema()
vector_no_stopw_df.show()

#### 4. Stem tokens

In [12]:
# Import stemmer library
from nltk.stem.porter import *

# Build the Porter stemmer used by the rest of the notebook
stemmer = PorterStemmer()

# Sanity check: print the stem of a few sample words, one per line
tokens = ["thanks", "its", "proverbially", "unexpected", "running"]
print(*map(stemmer.stem, tokens), sep="\n")

In [13]:
# Create stemmer python function
def stem(in_vec):
    """Stem every token of ``in_vec`` and keep only stems longer than 2 characters.

    Uses the module-level ``stemmer`` object to stem each token.

    :param in_vec: iterable of token strings, or ``None``
    :return: list of stemmed tokens whose length is > 2; an empty list when
        ``in_vec`` is ``None`` or empty
    """
    # A SQL NULL array reaches a Python UDF as None; return an empty vector
    # instead of raising and failing the whole job.
    if in_vec is None:
        return []
    stems = (stemmer.stem(t) for t in in_vec)
    return [s for s in stems if len(s) > 2]

# Create user defined function for stemming with return type Array<String>.
# `udf` is imported explicitly so this cell does not depend on the runtime
# environment pre-defining it.
from pyspark.sql.functions import udf
from pyspark.sql.types import *
stemmer_udf = udf(stem, ArrayType(StringType()))

# Create new df with vectors containing the stemmed tokens
vector_stemmed_df = (
    vector_no_stopw_df
        .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
        .select("vector_stemmed")
  )

# Rename df and column for clarity
production_df = vector_stemmed_df.select(col("vector_stemmed").alias("unigrams"))

# Display
production_df.printSchema()
production_df.show()

#### 5. Create bigrams

In [15]:
from pyspark.ml.feature import NGram

# Define NGram transformer producing bigrams from the "unigrams" column
ngram = NGram(n=2, inputCol="unigrams", outputCol="bigrams")

# Add the "bigrams" column using the NGram transformer.
# Projecting to "unigrams" first keeps this cell re-runnable: without it, a
# second run fails because the "bigrams" output column already exists.
production_df = ngram.transform(production_df.select("unigrams"))

# Display
production_df.printSchema()
production_df.show()

#### 6. Filter out small/empty vectors

In [17]:
from pyspark.sql.functions import col, size

# Keep only rows whose "bigrams" array has at least two entries
has_enough_bigrams = size(col("bigrams")) >= 2
production_df = production_df.filter(has_enough_bigrams)

# Inspect the schema and a sample of the remaining rows
production_df.printSchema()
production_df.show()

### Save production data as Spark table for analysis

In [19]:
# Persist the prepared data as a Spark table.
# The save mode is set explicitly: the default mode raises an error when the
# table already exists, which makes the notebook fail on every re-run.
# NOTE(review): "overwrite" replaces any existing table contents; confirm
# that this is the intended behavior for this table.
production_df.write.mode("overwrite").saveAsTable("faam_dataset_production")