

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/TEXT_PREPROCESSING.ipynb)




# **Pre-Process text:**
## **Convert text to tokens, remove punctuation, stop words, perform stemming and lemmatization using Spark NLP's annotators**

**Demo of the following annotators:**


* SentenceDetector
* Tokenizer
* Normalizer
* Stemmer
* Lemmatizer
* StopWordsCleaner

## 1. Colab Setup

In [None]:
# Install java
!apt-get update -qq
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!java -version

# Install pyspark
!pip install --ignore-installed -q pyspark==2.4.4

# Install Sparknlp
!pip install --ignore-installed spark-nlp

In [None]:
import pandas as pd
import numpy as np
import os
# Point the JVM lookup at the Java 8 JDK and put its bin/ directory first on PATH.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = f"{os.environ['JAVA_HOME']}/bin:{os.environ['PATH']}"
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [None]:
# Start the session through Spark NLP's helper; `spark` is used below to build DataFrames.
spark = sparknlp.start()

## 3. Setting sample text

In [None]:
# Sample input: one short news paragraph that every annotator below is run on.
text_list = [
    """The Geneva Motor Show, the first major car show of the year, opens tomorrow with U.S. Car makers hoping to make new inroads into European markets due to the cheap dollar, automobile executives said. Ford Motor Co and General Motors Corp sell cars in Europe, where about 10.5 mln new cars a year are bought. GM also makes a few thousand in North American plants for European export.""",
]

## 4. Download the lemma reference file (you may also use a pre-trained lemmatization model)

In [None]:
#getting lemma files
!wget https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt

## 5. Define Spark NLP pipeline

In [None]:
# Wrap the raw "text" column into Spark NLP's document annotation.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Split each document into sentences (displayed in the results section).
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

# Tokenize the whole document; note this reads "document", not "sentences".
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Lowercase tokens and strip characters matching the cleanup pattern.
# The pattern is a raw string so the regex escapes (\w, \d, \s) are passed through
# unchanged without relying on Python tolerating invalid string escapes.
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized") \
    .setLowercase(True) \
    .setCleanupPatterns([r"[^\w\d\s]"])

# Drop stop words from the raw tokens, ignoring case.
stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("removed_stopwords") \
    .setCaseSensitive(False)

# Reduce each raw token to its stem.
stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

# Map each raw token to its lemma using the downloaded dictionary file, whose
# entries are parsed with "->" as the key delimiter and a tab as the value delimiter.
lemmatizer = Lemmatizer() \
    .setInputCols(["token"]) \
    .setOutputCol("lemma") \
    .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter="\t", key_delimiter="->")

# Normalizer, stop-word cleaner, stemmer and lemmatizer all read the same "token"
# column, so each output column shows that annotator's effect in isolation.
nlpPipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    stemmer,
    lemmatizer,
])



## 6. Run pipeline

In [None]:
# Fit the pipeline on a placeholder frame holding a single empty "text" value
# (presumably because fitting here does not depend on the input text; confirm).
empty_df = spark.createDataFrame([[""]], ["text"])
pipelineModel = nlpPipeline.fit(empty_df)

# Run the fitted pipeline over the sample text.
df = spark.createDataFrame(pd.DataFrame(text_list, columns=["text"]))
result = pipelineModel.transform(df)

## 7. Visualize Results

In [None]:
# Sentences in the text: one row per sentence.
# Explode the result array directly rather than zipping a single array and
# reading back the auto-generated struct field '0'.
result.select(F.explode('sentences.result').alias("sentences")) \
    .show(truncate=False)


In [None]:
# Tokens in the text: one row per token.
# Explode the result array directly rather than zipping a single array and
# reading back the auto-generated struct field '0'.
result.select(F.explode('token.result').alias("tokens")) \
    .show(truncate=False)

In [None]:
# Normalized tokens (lowercased, with characters matching the cleanup pattern removed).
# Explode the result array directly rather than zipping a single array and
# reading back the auto-generated struct field '0'.
result.select(F.explode('normalized.result').alias("normalized_tokens")) \
    .show(truncate=False)

In [None]:
# Stemmed tokens: one row per stem.
# Explode the result array directly rather than zipping a single array and
# reading back the auto-generated struct field '0'.
result.select(F.explode('stem.result').alias("token_stems")) \
    .show(truncate=False)

In [None]:
# Tokens remaining after stop-word removal: one row per kept token.
# Explode the result array directly rather than zipping a single array and
# reading back the auto-generated struct field '0'.
result.select(F.explode('removed_stopwords.result').alias("removed_stopwords")) \
    .show(truncate=False)

In [None]:
# Lemmatized tokens: one row per lemma.
# Explode the result array directly rather than zipping a single array and
# reading back the auto-generated struct field '0'.
result.select(F.explode('lemma.result').alias("lemma")) \
    .show(truncate=False)