# Prepare for Model Creation
## Download packages
There are two versions of Spark NLP that we could use (both work); however, we opted for the newer Apache Spark 3.1.2, which works with the newest version of Spark NLP (version 3.3.1).

In [1]:
import os
# > Old Package Versions
# # Install java
# ! apt-get update -qq
# ! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# ! java -version
# # Install pyspark
# ! pip install --ignore-installed pyspark==2.4.4
# # Install Spark NLP
# ! pip install --ignore-installed spark-nlp==2.5.1

# > New Package Versions
! pip install -q pyspark==3.1.2 spark-nlp


We also need to install the packages required by the main script and its functions.

In [2]:
# Install necessary packages for main script
! pip install html2text
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Upload model
We upload the ToneItPipeline archive to our Google Colab environment and then unpack it with `tar`.

In [3]:
# Unpack the uploaded archive; it extracts into the ToneItPipeline/ directory.
!tar -xzvf ToneItPipeline.tar.gz

ToneItPipeline/
ToneItPipeline/stages/
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/.use_tensorflow.crc
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/metadata/
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/metadata/._SUCCESS.crc
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/metadata/part-00000
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/metadata/_SUCCESS
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/metadata/.part-00000.crc
ToneItPipeline/stages/1_UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74/use_tensorflow
ToneItPipeline/stages/2_ClassifierDLModel_b2de7b745e33/
ToneItPipeline/stages/2_ClassifierDLModel_b2de7b745e33/.classifierdl_tensorflow.crc
ToneItPipeline/stages/2_ClassifierDLModel_b2de7b745e33/fields/
ToneItPipeline/stages/2_ClassifierDLModel_b2de7b745e33/fields/datasetParams/
ToneItPipeline/stages/2_ClassifierDLM

## Run the main script

In [4]:
import nltk.data
from collections import defaultdict, deque
import random

import html2text
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

import sparknlp
from pyspark.ml import PipelineModel
from sparknlp.base import LightPipeline


def predict_with_engine(engine, input) -> str:
  """Return the first tone label that ``engine`` assigns to ``input``.

  ``engine`` must expose ``annotate(text)`` returning a mapping whose
  ``'class'`` entry is a sequence of labels; only the first label is used.
  """
  annotations = engine.annotate(input)
  predicted_labels = annotations['class']
  return predicted_labels[0]

def html_to_string(link) -> list:
  """Download ``link`` and return the text of its ``<p>`` elements.

  Args:
    link: URL of the page to fetch.

  Returns:
    A list of strings, one per paragraph that contains something other
    than newlines, with every newline replaced by a space.

  Raises:
    urllib.error.URLError: (or its subclass ``HTTPError``) if the request
      fails; the error is not caught here.
  """
  # Send a browser-like User-Agent
  # (presumably because some sites reject urllib's default one).
  req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
  # The context manager closes the HTTP response even if read() raises.
  with urlopen(req) as response:
    html_text = response.read()
  soup = BeautifulSoup(html_text, "html.parser")

  paragraphs_list = []
  for p_tag in soup.find_all('p'):
    text_result = html2text.html2text(p_tag.text)
    # Skip paragraphs that are empty or made up solely of newlines.
    if text_result.strip('\n'):
      paragraphs_list.append(text_result.replace('\n', ' '))
  return paragraphs_list

def predict_sentences(engine, sentence_arr):
  """Predict a tone for every sentence and report the most frequent tone(s).

  Returns a tuple ``(indices_by_tone, top_tones)``:
    * ``indices_by_tone`` maps each predicted tone to the list of indices
      (positions in ``sentence_arr``) of the sentences given that tone;
    * ``top_tones`` lists every tone tied for the highest sentence count,
      in order of first appearance.

  For an empty ``sentence_arr`` the sentinel ``(0, "DNE")`` is returned
  instead.
  """
  if len(sentence_arr) == 0:
    return (0, "DNE")

  indices_by_tone = defaultdict(list)
  for position, sentence in enumerate(sentence_arr):
    tone = predict_with_engine(engine, sentence)
    indices_by_tone[tone].append(position)

  # size of the largest group of sentences sharing one tone
  top_count = max(len(hits) for hits in indices_by_tone.values())
  top_tones = [
    tone for tone, hits in indices_by_tone.items() if len(hits) == top_count
  ]
  return (indices_by_tone, top_tones)

def indices_to_sentences(sentence_arr, matched_indices) -> deque:
    """Pair every sentence with a flag telling whether its index was matched.

    ``matched_indices`` is consumed from the front: a sentence is flagged
    ``True`` only when its position equals the index currently at the head
    of the queue, which is then discarded.

    Returns a deque of ``(sentence, matched)`` tuples in sentence order.
    """
    pending = deque(matched_indices)
    labelled = deque([])
    for position, sentence in enumerate(sentence_arr):
        if len(pending) != 0 and position == pending[0]:
            pending.popleft()
            labelled.append((sentence, True))
        else:
            labelled.append((sentence, False))
    return labelled
        
def toneit(link):
    """Classify the tone of every paragraph on a web page and of its sentences.

    The page is fetched with ``html_to_string``, each paragraph is split into
    sentences with the NLTK Punkt tokenizer, and both the whole paragraph and
    each sentence are classified with the ``ToneItPipeline`` model loaded from
    the working directory.

    Args:
        link: URL passed to ``html_to_string``.

    Returns:
        A list with one dict per paragraph:
          * ``'paragraphTone'``: tone predicted for the whole paragraph.
          * ``'sentenceMatchesParagraph'``: ``True`` when the paragraph tone is
            among the most frequent sentence tones.
          * ``'sentences'``: present only when the flag above is ``True``; a
            deque of ``(sentence, matched)`` tuples where ``matched`` tells
            whether that sentence shares the paragraph tone.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sparknlp.start()
    pipeline = LightPipeline(PipelineModel.load('ToneItPipeline'))
    paragraph_list = html_to_string(link)

    final_result = []
    for paragraph in paragraph_list:
        sentence_arr = tokenizer.tokenize(paragraph)
        tone_to_indices, best_tones = predict_sentences(pipeline, sentence_arr)
        paragraph_result = predict_with_engine(pipeline, paragraph)

        temp_result = {
            'paragraphTone': paragraph_result,
            'sentenceMatchesParagraph': False
        }

        # With no sentences, predict_sentences returns the (0, "DNE") sentinel;
        # checking sentence_arr first avoids a substring test against "DNE"
        # followed by indexing the integer 0.
        if sentence_arr and paragraph_result in best_tones:
            temp_result['sentenceMatchesParagraph'] = True
            temp_result['sentences'] = indices_to_sentences(
                sentence_arr, tone_to_indices[paragraph_result]
            )

        final_result.append(temp_result)
    return final_result


We can then pass a link to the `toneit` function, which returns a `JSON`-style list of dictionaries that can be read to highlight the relevant data.

In [5]:
# Example page to analyse; `link` is reused by the following cell.
link = 'https://americanliterature.com/author/stephen-crane/short-story/a-dark-brown-dog'
# Last expression of the cell, so the notebook displays the returned list.
toneit(link)

[{'paragraphTone': 'Anger',
  'sentenceMatchesParagraph': True,
  'sentences': deque([('A Child was standing on a street-corner.', True),
         ('He leaned with one shoulder against a high board-fence and swayed the other to and fro, the while kicking carelessly at the gravel.',
          False)])},
 {'paragraphTone': 'Joy',
  'sentenceMatchesParagraph': True,
  'sentences': deque([('Sunshine beat upon the cobbles, and a lazy summer wind raised yellow dust which trailed in clouds down the avenue.',
          True),
         ('Clattering trucks moved with indistinctness through it.', True),
         ('The child stood dreamily gazing.', True)])},
 {'paragraphTone': 'Joy',
  'sentenceMatchesParagraph': True,
  'sentences': deque([('After a time, a little dark-brown dog came trotting with an intent air down the sidewalk.',
          True),
         ('A short rope was dragging from his neck.', False),
         ('Occasionally he trod upon the end of it and stumbled.', True)])},
 {'paragra

In [6]:
# Show the paragraphs extracted from the same page (`link` is defined in the previous cell).
html_to_string(link)

['A Child was standing on a street-corner. He leaned with one shoulder against a high board-fence and swayed the other to and fro, the while kicking carelessly at the gravel.  ',
 'Sunshine beat upon the cobbles, and a lazy summer wind raised yellow dust which trailed in clouds down the avenue. Clattering trucks moved with indistinctness through it. The child stood dreamily gazing.  ',
 'After a time, a little dark-brown dog came trotting with an intent air down the sidewalk. A short rope was dragging from his neck. Occasionally he trod upon the end of it and stumbled.  ',
 'He stopped opposite the child, and the two regarded each other. The dog hesitated for a moment, but presently he made some little advances with his tail. The child put out his hand and called him. In an apologetic manner the dog came close, and the two had an interchange of friendly pattings and waggles. The dog became more enthusiastic with each moment of the interview, until with his gleeful caperings he threaten