# Use of regular expressions to extract the following information


#### Load the book

In [4]:
# Read the entire book into memory as one string; the context manager closes
# the file handle as soon as the read finishes.
with open("miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book = file.read()

In [5]:
# Sanity check: file.read() should have produced a single str.
type(book)

str

## The number of chapters in the book
#### With string methods

In [7]:
# str.count tallies every occurrence of the substring "Chapter", whether or
# not a chapter number follows it.
book.count("Chapter")

11

#### With regex

In [9]:
import re

In [10]:
# Only count real headings: the word "Chapter", a space, then one or more digits.
pattern = re.compile("Chapter [0-9]+")
findings = pattern.findall(book)
len(findings)

10

## Sentences where "love" was used in the book

In [12]:
# A "sentence" here starts at a capital letter and runs up to the next full stop.
# The closing full stop is escaped: an unescaped "." is a wildcard, so a trailing
# fragment with no full stop at the end of the text could still be counted.
# The match is case-sensitive and needs a non-lowercase character on both sides
# of "love", so "glove"/"loved" are skipped, and so is a capitalised "Love".
pattern = re.compile(r"[A-Z][^.]*[^a-z]love[^a-z][^.]*\.")
findings = pattern.findall(book)
len(findings)

67

## Paragraphs where "love" was used in the book

In [14]:
# A "paragraph" is a run of characters containing no newline. The pattern needs
# at least one character on each side of "love" and does not check word
# boundaries, so words that merely contain "love" also count.
pattern = re.compile("[^\n]+love[^\n]+")
findings = pattern.findall(book)
len(findings)

60

## Chapter titles
#### Method 1

In [16]:
# A title is a line made only of letters, spaces and commas that is followed by
# a blank line. The match includes the two newlines, so trim them afterwards.
# str.strip/rstrip take a *set* of characters, not a suffix, so "\n" is enough.
pattern = re.compile("[a-zA-Z ,]+\n\n")
findings = [title.rstrip("\n") for title in pattern.findall(book)]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

#### Method 2

In [18]:
# With a capture group, findall returns only the group's text, so the trailing
# blank line is required for a match but never included in the result.
pattern = re.compile("([a-zA-Z ]+)\n\n")
findings = pattern.findall(book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

## The most used word in the book

In [20]:
# Lower-case the text first so counts are case-insensitive, then take every
# maximal run of ASCII letters as a word (digits and punctuation split words).
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
findings[:5]

['chapter', 'before', 'it', 'was', 'friday']

In [21]:
# Tally how often each word occurs; dict.get supplies 0 the first time a word is seen.
d = {}
for word in findings:
    d[word] = d.get(word, 0) + 1

In [22]:
# Flip each (word, count) item to (count, word) so that sorting orders by count.
d_list = [(occurrences, token) for token, occurrences in d.items()]

In [23]:
# Highest counts first. Tuples compare element by element, so words with equal
# counts end up in reverse alphabetical order.
d_list = sorted(d_list, reverse=True)
# Display the head of the list so the section actually shows the most used words.
d_list[:5]

## Function that finds the occurrence of any word

In [25]:
def find(w, text=None):
    """Return how many times the word ``w`` occurs in a text.

    Words are maximal runs of ASCII letters and are compared
    case-insensitively, so ``find("Love")`` and ``find("love")`` agree.

    Parameters
    ----------
    w : str
        The word to look up.
    text : str, optional
        The text to search. When omitted, the notebook-level ``book`` string
        is used, which keeps existing ``find(word)`` calls working.

    Returns
    -------
    int or str
        The number of occurrences, or the message
        ``'The book does not contain the word "<w>"'`` when there are none.
    """
    if text is None:
        text = book

    # The text is lower-cased before tokenising, so the query must be too;
    # otherwise any capitalised query could never match.
    words = re.findall(r"[a-zA-Z]+", text.lower())
    occurrences = words.count(w.lower())

    # An explicit zero check replaces the bare ``except:``, which also hid
    # unrelated failures (e.g. ``book`` not being loaded) behind this message.
    if occurrences:
        return occurrences
    return f'The book does not contain the word "{w}"'

## Call the function

In [27]:
# A word that occurs in the book: find returns its count as an int.
find('love')

83

In [28]:
# A word that never occurs: find returns a message string instead of a count.
find('hate')

'The book does not contain the word "hate"'

## The most used words (excluding stop words)

In [30]:
# Record the interpreter version this notebook was run with.
from platform import python_version
python_version()

'3.12.4'

In [31]:
import nltk

from nltk.corpus import stopwords

# The stop-word corpus is not bundled with nltk; on a fresh environment the
# first access raises LookupError. Download it once on demand, then retry.
try:
    english_stopwords = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    english_stopwords = stopwords.words("english")

In [32]:
# english_stopwords

In [33]:
# Set membership is O(1); testing against the raw list rescans it for every word.
stopword_set = set(english_stopwords)

# d_list holds (count, word) pairs; keep non-stop-words, flipped to (word, count).
# Iteration order is preserved, so the result stays in d_list's order.
filtered_words = [
    (word, count) for count, word in d_list if word not in stopword_set
]

In [59]:
# First ten entries; filtered_words keeps d_list's order (sorted by descending count).
filtered_words[:10]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165)]

## Sentiment Analysis: What's the most positive and the most negative chapter?

In [64]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [68]:
# The analyzer needs the VADER lexicon, which is a separate nltk download; on a
# fresh environment construction raises LookupError. Fetch it once, then retry.
try:
    analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    analyzer = SentimentIntensityAnalyzer()

In [94]:
# dir(analyzer)

In [90]:
# Score the whole book in a single call.
# NOTE(review): the compound value comes back as 1.0 here, i.e. at the end of
# its scale, so the neg/neu/pos proportions are the more informative numbers.
analyzer.polarity_scores(book)

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

#### Sentiment analysis per chapter

In [117]:
# Cut the book at every "Chapter <number>" heading; the headings themselves are
# dropped and the first piece is whatever text comes before the first heading.
pattern = re.compile("Chapter [0-9]+")
chapters = pattern.split(book)

In [121]:
# Split on the chapter headings and drop the first piece, which is whatever
# text precedes the first heading. The list is rebuilt from `book` rather than
# slicing `chapters` in place, so re-running this cell is idempotent and does
# not discard one more chapter each time.
chapters = re.split(r"Chapter [0-9]+", book)[1:]

In [125]:
# Print one line of polarity scores per chapter, numbered from 1.
for nr, chapter in enumerate(chapters, start=1):
    scores = analyzer.polarity_scores(chapter)
    print(nr, scores)

1 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
2 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
3 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
4 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
5 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
6 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
7 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
8 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
9 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
10 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
