In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting click
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp310-cp310-macosx_10_9_x86_64.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.9/293.9 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.3 nltk-3.8.1 regex-2022.10.31

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


# NLP and NLTK Basics
---

## SparkContext and SparkSession

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the session first and take the context from it.
# Constructing SparkContext(...) directly raises "Cannot run multiple
# SparkContexts at once" when this cell is re-run in a live kernel, whereas
# getOrCreate() returns the already-running session instead.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# The context that backs the session; kept under the name `sc`.
sc: SparkContext = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/20 10:21:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Many of the examples in this article are borrowed from the book by **Bird et al. (2009)**. Wherever possible, I have implemented the book's examples with Spark.

Refer to the book for more details: Bird, Steven, Ewan Klein, and Edward Loper. *Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit*. O'Reilly Media, Inc., 2009.

## Basic terminology

* **text**: a sequence of words and punctuation.
* **frequency distribution**: the frequency of words in a text object.
* **collocation**: a **sequence of words** that occur together unusually often.
* **bigrams**: word pairs. Bigrams that occur unusually often are collocations.
* **corpus**: a large body of text
* **wordnet**: a lexical database in which English words are grouped into sets of synonyms (**also called synsets**).
* **text normalization**: the process of transforming text into a single canonical form, e.g., converting text to lowercase, removing punctuation and so on.
* **Lemmatization**: the process of grouping variant forms of the same word so that they can be analyzed as a single item.
* **Stemming**: the process of reducing inflected words to their **word stem**.
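* **tokenization**: the process of splitting text into smaller units (tokens), such as words and punctuation marks.
* **segmentation**: the process of dividing text into meaningful units, such as sentences.
* **chunking**: the process of grouping tokens into larger syntactic units, such as noun phrases.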

## Texts as lists of words

Create a data frame consisting of text elements.

In [3]:
import pandas as pd

# Three short, already-tokenized sentences: every row of the 'texts' column
# holds one list of words.
pdf = pd.DataFrame(
    data={
        'texts': [
            ['I', 'like', 'playing', 'basketball'],
            ['I', 'like', 'coding'],
            ['I', 'like', 'machine', 'learning', 'very', 'much'],
        ],
    }
)

# Convert the pandas frame to a Spark frame and print it with the word
# lists shown in full.
df = spark.createDataFrame(pdf)
df.show(truncate=False)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():
[Stage 0:>                                                          (0 + 1) / 1]

+----------------------------------------+
|texts                                   |
+----------------------------------------+
|[I, like, playing, basketball]          |
|[I, like, coding]                       |
|[I, like, machine, learning, very, much]|
+----------------------------------------+



                                                                                

## Ngrams and collocations

Transform the texts into 2-gram, 3-gram and 4-gram collocations.

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import NGram

# One NGram transformer per window size; each reads the word lists in
# 'texts' and writes to its own '<n>-grams' column.
ngrams = [
    NGram(n=size, inputCol='texts', outputCol=f'{size}-grams')
    for size in (2, 3, 4)
]

# Chain the three transformers into a single pipeline.
pipeline = Pipeline(stages=ngrams)

# Fit the pipeline on df, then apply the fitted model to the same frame.
texts_ngrams = pipeline.fit(df).transform(df)

In [5]:
# display result
texts_ngrams.select('2-grams').show(truncate=False)
texts_ngrams.select('3-grams').show(truncate=False)
texts_ngrams.select('4-grams').show(truncate=False)

+------------------------------------------------------------------+
|2-grams                                                           |
+------------------------------------------------------------------+
|[I like, like playing, playing basketball]                        |
|[I like, like coding]                                             |
|[I like, like machine, machine learning, learning very, very much]|
+------------------------------------------------------------------+

+----------------------------------------------------------------------------------+
|3-grams                                                                           |
+----------------------------------------------------------------------------------+
|[I like playing, like playing basketball]                                         |
|[I like coding]                                                                   |
|[I like machine, like machine learning, machine learning very, learning very much]|
+-----

## Access corpora from the NLTK package

### The `gutenberg` corpus

#### Get file ids in gutenberg corpus

In [9]:
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /Users/user/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [10]:
from nltk.corpus import gutenberg

gutenberg_fileids = gutenberg.fileids()
gutenberg_fileids

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Absolute path of a file

In [11]:
gutenberg.abspath(gutenberg_fileids[0])

FileSystemPathPointer('/Users/user/nltk_data/corpora/gutenberg/austen-emma.txt')

#### Raw text

In [12]:
gutenberg.raw(gutenberg_fileids[0])[:200]

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; an'

#### The words of the entire corpus

In [13]:
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [14]:
len(gutenberg.words())

2621613

#### Sentences of a specific file

In [16]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
gutenberg.sents(gutenberg_fileids[0])

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ...]

In [18]:
len(gutenberg.sents(gutenberg_fileids[0]))

7752

### Loading custom corpus

Let's create a corpus consisting of all files from the **./data** directory.

In [19]:
from nltk.corpus import PlaintextCorpusReader
# Build a plain-text corpus rooted at './data'. The second argument is the
# pattern used to select file ids; '.*' accepts every file under the root,
# including files in sub-directories.
corpus_data = PlaintextCorpusReader('./data', '.*')

#### Files in the corpus *corpus_data*

In [20]:
data_fileids = corpus_data.fileids()
data_fileids

['Advertising.csv',
 'BicycleWeather.csv',
 'FremontBridge.csv',
 'SparkData/Advertising.csv',
 'SparkData/Credit.csv',
 'SparkData/WineData.csv',
 'SparkData/airquality.csv',
 'SparkData/churn-bigml-20.csv',
 'SparkData/churn-bigml-80.csv',
 'SparkData/cuse_binary.csv',
 'SparkData/horseshoe_crab.csv',
 'SparkData/hsb2.csv',
 'SparkData/hsb2_modified.csv',
 'SparkData/iris.csv',
 'SparkData/kaggle-titanic-gender_submission.csv',
 'SparkData/kaggle-titanic-test.csv',
 'SparkData/kaggle-titanic-train.csv',
 'SparkData/mtcars.csv',
 'SparkData/prostate.csv',
 'SparkData/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc',
 'SparkData/saved-mtcars/_SUCCESS',
 'SparkData/saved-mtcars/part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv',
 'SparkData/saved-twitter/.part-00000.crc',
 'SparkData/saved-twitter/_SUCCESS',
 'SparkData/saved-twitter/part-00000',
 'SparkData/titanic/gender_submission.csv',
 'SparkData/titanic/test.csv',
 'SparkData/titanic/train.csv',
 

#### Raw text in *twitter.txt*

In [23]:
corpus_data.raw('twitter.txt')

'Fresh install of XP on new computer. Sweet relief! fuck vista\t1018769417\t1.0\nWell. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl\t10284216536\t1.0\n"Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting."\t10298589026\t1.0\nMitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever!\t109017669432377344\t1.0\n\'Cheap Eats in SLP\' - http://t.co/4w8gRp7\t109642968603963392\t1.0\nTeenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW\t10995492579\t1.0\nNew demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa\t11713360136\t1.0\nhi all - i\'m going to be tweeting things lookstat at the @lookstat twitter account. please follow me there\t1208319583\t1.0\nHoly carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D?\t121330835726155776\t1.0\n"Di

#### Words and sentences in file *twitter.txt*

In [24]:
corpus_data.words(fileids='twitter.txt')

['Fresh', 'install', 'of', 'XP', 'on', 'new', ...]

In [25]:
len(corpus_data.words(fileids='twitter.txt'))

253

In [26]:
corpus_data.sents(fileids='twitter.txt')

[['Fresh', 'install', 'of', 'XP', 'on', 'new', 'computer', '.'], ['Sweet', 'relief', '!'], ...]

In [27]:
len(corpus_data.sents(fileids='twitter.txt'))

14

## WordNet

The `nltk.corpus.wordnet.synsets()` function loads all synsets with a given lemma and part-of-speech tag.

Load all synsets for the lemma `car` into a Spark data frame.

In [29]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...


True

In [30]:
from nltk.corpus import wordnet

# Collect the name (e.g. 'car.n.01') of every synset returned for the lemma
# 'car'. Synset.name() is the public accessor; the `_name` attribute is
# private and should not be read directly.
# NOTE: this rebinds `pdf` and `df`, which earlier held the example texts.
pdf = pd.DataFrame({
        'car_synsets': [synset.name() for synset in wordnet.synsets('car')]
    })
df = spark.createDataFrame(pdf)
df.show()

+--------------+
|   car_synsets|
+--------------+
|      car.n.01|
|      car.n.02|
|      car.n.03|
|      car.n.04|
|cable_car.n.01|
+--------------+



  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


### Get lemma names given a synset

In [31]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from nltk.corpus import wordnet

def lemma_names_from_synset(x):
    """Return the lemma names of the WordNet synset called ``x``.

    Args:
        x: Synset name as accepted by ``wordnet.synset``, e.g. ``'car.n.02'``.

    Returns:
        Whatever ``lemma_names()`` yields for that synset; for ``'car.n.02'``
        this is ``['car', 'railcar', 'railway_car', 'railroad_car']``.
    """
    return wordnet.synset(x).lemma_names()

lemma_names_from_synset('car.n.02')
# synset_lemmas_udf = udf(lemma_names_from_synset, ArrayType(StringType()))


['car', 'railcar', 'railway_car', 'railroad_car']