Let's set up SparkNLP.

In [3]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2024-07-23 16:21:08--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 3.86.22.73
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|3.86.22.73|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2024-07-23 16:21:08--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2024-07-23 16:21:08 (58.3 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.3 and Spark NLP 5.4.1
setup Colab for PySpark 3.2.3 and Spark NLP 5.4.1
[2

In [4]:
import sparknlp
spark = sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline

In [5]:
pipeline = PretrainedPipeline("explain_document_ml")

explain_document_ml download started this may take some time.
Approx size to download 9 MB
[OK!]


We can use some recent headlines.

In [16]:
hls = [
		"She ran.",
		"He ran.",
		"I saw her.",
		"I saw him.",
		"I know her name.",
		"I know his name.",
		"That is hers.",
		"That is his."
	]

Let's use SparkNLP to analyze these headlines.

In [17]:
# Use dataframes, or...
# data = spark.createDataFrame(hls).toDF("text")
# dfs = pipeline.transform(data)
# ... use list comprehension
dfs = [pipeline.annotate(hl) for hl in hls] # I don't know how to use dataframes

In [18]:
# its big
dfs

[{'document': ['She ran.'],
  'spell': ['She', 'ran', '.'],
  'pos': ['PRP', 'VBD', '.'],
  'lemmas': ['She', 'run', '.'],
  'token': ['She', 'ran', '.'],
  'stems': ['she', 'ran', '.'],
  'sentence': ['She ran.']},
 {'document': ['He ran.'],
  'spell': ['He', 'ran', '.'],
  'pos': ['PRP', 'VBD', '.'],
  'lemmas': ['He', 'run', '.'],
  'token': ['He', 'ran', '.'],
  'stems': ['he', 'ran', '.'],
  'sentence': ['He ran.']},
 {'document': ['I saw her.'],
  'spell': ['I', 'saw', 'her', '.'],
  'pos': ['PRP', 'VBD', 'PRP', '.'],
  'lemmas': ['I', 'see', 'she', '.'],
  'token': ['I', 'saw', 'her', '.'],
  'stems': ['i', 'saw', 'her', '.'],
  'sentence': ['I saw her.']},
 {'document': ['I saw him.'],
  'spell': ['I', 'saw', 'him', '.'],
  'pos': ['PRP', 'VBD', 'PRP', '.'],
  'lemmas': ['I', 'see', 'he', '.'],
  'token': ['I', 'saw', 'him', '.'],
  'stems': ['i', 'saw', 'him', '.'],
  'sentence': ['I saw him.']},
 {'document': ['I know her name.'],
  'spell': ['I', 'know', 'her', 'name', '.'],

Let's say we want to fuse part-of-speech tags to words, to make word differentiation easier.

In [19]:
# Extract words and parts-of-speech
tok_tag = [(df['token'],df['pos']) for df in dfs]

In [20]:
# Still big
tok_tag

[(['She', 'ran', '.'], ['PRP', 'VBD', '.']),
 (['He', 'ran', '.'], ['PRP', 'VBD', '.']),
 (['I', 'saw', 'her', '.'], ['PRP', 'VBD', 'PRP', '.']),
 (['I', 'saw', 'him', '.'], ['PRP', 'VBD', 'PRP', '.']),
 (['I', 'know', 'her', 'name', '.'], ['PRP', 'VBP', 'PRP$', 'NN', '.']),
 (['I', 'know', 'his', 'name', '.'], ['PRP', 'VBP', 'PRP$', 'NN', '.']),
 (['That', 'is', 'hers', '.'], ['DT', 'VBZ', 'NNS', '.']),
 (['That', 'is', 'his', '.'], ['DT', 'VBZ', 'PRP$', '.'])]

In [21]:
# fuse pos to word
zips = [list(zip(tt[0], tt[1])) for tt in tok_tag]

In [22]:
# not too big
zips

[[('She', 'PRP'), ('ran', 'VBD'), ('.', '.')],
 [('He', 'PRP'), ('ran', 'VBD'), ('.', '.')],
 [('I', 'PRP'), ('saw', 'VBD'), ('her', 'PRP'), ('.', '.')],
 [('I', 'PRP'), ('saw', 'VBD'), ('him', 'PRP'), ('.', '.')],
 [('I', 'PRP'), ('know', 'VBP'), ('her', 'PRP$'), ('name', 'NN'), ('.', '.')],
 [('I', 'PRP'), ('know', 'VBP'), ('his', 'PRP$'), ('name', 'NN'), ('.', '.')],
 [('That', 'DT'), ('is', 'VBZ'), ('hers', 'NNS'), ('.', '.')],
 [('That', 'DT'), ('is', 'VBZ'), ('his', 'PRP$'), ('.', '.')]]

In [23]:
tagged = [" ".join(["".join(word) for word in hl]) for hl in zips]

In [24]:
tagged

['ShePRP ranVBD ..',
 'HePRP ranVBD ..',
 'IPRP sawVBD herPRP ..',
 'IPRP sawVBD himPRP ..',
 'IPRP knowVBP herPRP$ nameNN ..',
 'IPRP knowVBP hisPRP$ nameNN ..',
 'ThatDT isVBZ hersNNS ..',
 'ThatDT isVBZ hisPRP$ ..']

What about ebooks?

In [40]:
he_count = 0
him_count = 0
she_count = 0
her_count = 0
with open('dune.txt') as dune:
    i = 1
    masculine_object = [('Him', 'PRP'), ('him', 'PRP')]
    masculine_subject = [('He', 'PRP'), ('he', 'PRP')]
    feminine_object = [('Her', 'PRP'), ('her', 'PRP')]
    feminine_subject = [('She', 'PRP'), ('she', 'PRP')]
    feminine_possessive = [('Her', 'PRP$'), ('her', 'PRP$')]
    masculine_possessive = [('His', 'PRP$'), ('his', 'PRP$')]
    line = dune.readline()
    while line:
      annotated_line = pipeline.annotate(line)
      merged_list = tuple(zip(annotated_line['token'], annotated_line['pos']))

      #Count matches per line in each category
      her_count = her_count + len(set(feminine_object).intersection(merged_list))
      she_count = she_count + len(set(feminine_subject).intersection(merged_list))
      he_count = he_count + len(set(masculine_subject).intersection(merged_list))
      him_count = him_count + len(set(masculine_object).intersection(merged_list))

      i = i + 1
      line = dune.readline()
      #if i > 200:
      #  break;

print("He Count: ", he_count)
print("She Count: ", she_count)
print("Him Count: ",  him_count)
print("Her Count: ",  her_count)

He Count:  2655
She Count:  1490
Him Count:  950
Her Count:  190


In [32]:
def find_pronouns(book):
	he_count = 0
	him_count = 0
	she_count = 0
	her_count = 0
	her_possessive_count = 0
	his_possessive_count = 0
	masculine_object = [('Him', 'PRP'), ('him', 'PRP')]
	masculine_subject = [('He', 'PRP'), ('he', 'PRP')]
	feminine_object = [('Her', 'PRP'), ('her', 'PRP')]
	feminine_subject = [('She', 'PRP'), ('she', 'PRP')]
	feminine_possessive = [('Her', 'PRP$'), ('her', 'PRP$')]
	masculine_possessive = [('His', 'PRP$'), ('his', 'PRP$')]
	with open(book) as booktext:
		i = 1
		line = booktext.readline()
		while line:
			annotated_line = pipeline.annotate(line)
			merged_list = tuple(zip(annotated_line['token'], annotated_line['pos']))

			#Count matches per line in each category
			her_count = her_count + len(set(feminine_object).intersection(merged_list))
			she_count = she_count + len(set(feminine_subject).intersection(merged_list))
			he_count = he_count + len(set(masculine_subject).intersection(merged_list))
			him_count = him_count + len(set(masculine_object).intersection(merged_list))
			her_possessive_count = her_possessive_count + len(set(feminine_possessive).intersection(merged_list))
			his_possessive_count = his_possessive_count + len(set(masculine_possessive).intersection(merged_list))

			i = i + 1
			line = booktext.readline()
    	#if i > 200:
    	#  break;

	print("He Count: ", he_count)
	print("She Count: ", she_count)
	print("Him Count: ",  him_count)
	print("Her Count: ",  her_count)
	print("Her possessive Count: ", her_possessive_count)
	print("His possessive Count: ", his_possessive_count)

In [27]:
!curl "https://raw.githubusercontent.com/cml-data/mktsv/main/dune.txt" -o dune.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1079k  100 1079k    0     0  1905k      0 --:--:-- --:--:-- --:--:-- 1903k


In [28]:
!curl "https://raw.githubusercontent.com/cml-data/mktsv/main/parable.txt" -o parable.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  538k  100  538k    0     0  1097k      0 --:--:-- --:--:-- --:--:-- 1097k


In [33]:
find_pronouns('dune.txt')

He Count:  2655
She Count:  1490
Him Count:  950
Her Count:  190
Her possessive Count:  963
His possessive Count:  2250


In [34]:
find_pronouns('parable.txt')

He Count:  1023
She Count:  615
Him Count:  408
Her Count:  137
Her possessive Count:  452
His possessive Count:  370
