In [1]:
# Initialise findspark first, then import pyspark and start the session.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Build (or reuse) the session named "sparkNlp"; the Spark NLP artifact is
# requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("sparkNlp")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)

In [3]:
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from functools import reduce
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud 
import pandas as pd
import re
import string

In [4]:
# Glob matching the part files of the June 2017 CSV export on S3.
CONG_2017_PATH = "s3://502finalprojbucky/congresstweets/data/June2017.csv/*.part"

# header=True takes the column names from the first row of each part file.
# No schema is supplied, so every column is read as a string.
# NOTE(review): if the `text` field can contain embedded newlines or quotes,
# confirm whether the multiLine / escape reader options are needed.
cong_2017 = spark.read.csv(CONG_2017_PATH, header=True)

In [5]:
# Print the column names and types of the loaded frame as a sanity check.
cong_2017.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- id: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- time: string (nullable = true)
 |-- link: string (nullable = true)
 |-- text: string (nullable = true)
 |-- source: string (nullable = true)
 |-- user_id: string (nullable = true)



In [10]:
# Download the NLTK 'punkt' tokenizer data to the local nltk_data directory.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Download the NLTK corpora: 'stopwords' backs the English stop-word list
# built in a later cell.
# NOTE(review): 'words' is not referenced in the visible cells; confirm it is needed.
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [18]:
# Build the stop-word list handed to the StopWordsCleaner stage.
from nltk.corpus import stopwords

# NLTK's English stop words, followed by the extra 'xxxx' token.
eng_stopwords = [*stopwords.words('english'), 'xxxx']

In [15]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA, LDAModel
from pyspark.ml import Pipeline
import pyspark.sql.functions as f
import boto3, os, datetime

In [22]:
# Spark NLP stages that turn the `text` column into cleaned lemma tokens.
# Every chain is wrapped in parentheses instead of using backslash
# continuations, so no statement depends on a trailing "\" being followed
# by a blank line.

# Wrap the `text` column into a Spark NLP document annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Normalize the tokens. Lower-casing is switched on explicitly so that
# tokens differing only by case end up identical; pass
# .setLowercase(False) to keep the input case instead.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# The lemmatizer needs a dictionary, so the pretrained model is used
# (pretrained() with no arguments downloads the default model).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Drop stop words (case-insensitively) using the NLTK-based list.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# The finisher converts the annotations into human-readable output.
# setCleanAnnotations(False) keeps the intermediate annotation columns
# in the transformed frame.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [23]:
# Chain the annotators in execution order; each stage consumes the output
# column of the stage before it.
nlp_stages = [
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
pipeline = Pipeline(stages=nlp_stages)

In [24]:
# Fit the pipeline on the tweets, then apply the fitted model to the same
# frame to append the annotation columns.
pipeline_model = pipeline.fit(cong_2017)
cong_2017_token = pipeline_model.transform(cong_2017)

In [28]:
# List the columns of the transformed frame to see what the pipeline added.
cong_2017_token.columns

['_c0',
 'id',
 'screen_name',
 'time',
 'link',
 'text',
 'source',
 'user_id',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma']

In [29]:
# expand the "finished_clean_lemma" column so that the words are not in a list
from pyspark.sql.functions import explode, col

all_words = cong_2017_token.withColumn("exploded_text", explode(col("finished_clean_lemma")))

In [30]:
# Count the number of rows for each distinct token.
counts = (
    all_words
    .groupBy('exploded_text')
    .count()
)

In [40]:
# Confirm the shape of the aggregated frame: one token column and its count.
counts.printSchema()

root
 |-- exploded_text: string (nullable = true)
 |-- count: long (nullable = false)



In [None]:
# Show the most frequent tokens, highest count first.
# Only `desc` is imported: a wildcard import from pyspark.sql.functions would
# shadow Python builtins such as `sum`, `min`, `max` and `round` for the rest
# of the notebook.
from pyspark.sql.functions import desc

# Cap the rows brought back to the driver; collecting every distinct token
# can exhaust driver memory and floods the cell output.
TOP_N_WORDS = 50
counts.orderBy(desc("count")).limit(TOP_N_WORDS).collect()