In [1]:
with open("miracle_in_the_andes.txt", "r", encoding="UTF-8") as file:
    book = file.read()

## How many chapters

### With string methods

In [2]:
book.count('Chapter')

11

### With regex

In [3]:
import re


In [4]:
pattern = re.compile("Chapter [0-9]+")
re.findall(pattern, book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [5]:
len(re.findall(pattern, book))

10

### Sentences where "love" was used

In [6]:
pattern = re.compile("[A-Z]{1}[^.]* love[^a-zA-Z]+[^.]*.")
findings = re.findall(pattern, book)
# len(findings)
findings

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.',
 'Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.',
 'Under the guidance of the Christian Brothers, both of us grew to love the game of rugby with a consuming passion.',
 'That rowdiness came to an abrupt end for Guido in 1969, when he met and fell in love with the beautiful daughter of a Chilean diplomat.',
 'I believe he had a great hunger for the love and comforts of a family that was happy and whole.',
 'He shared, with my father and me, a love for cars and driving, and he loved going with us to auto races.',
 'The house had a beautiful view of the sea, and this more than anything made my mother love it.',
 'She was a true tower of strength

### What are the most used words?

In [7]:
pattern = re.compile("[a-zA-Z]+")
findings = re.findall(pattern, book)


In [8]:
d = {}
for word in findings:
    if word in d.keys():
        d[word] = d[word] + 1  
    else:
        d[word] = 1 
# d

In [9]:
d_list = [(value, key) for key, value in d.items()]
sorted(d_list[:10], reverse=True)

[(5013, 'the'),
 (2053, 'of'),
 (333, 'We'),
 (10, 'Chapter'),
 (9, 'Before'),
 (7, 'October'),
 (3, 'WAS'),
 (1, 'thirteenth'),
 (1, 'IT'),
 (1, 'FRIDAY')]

### Extract the chapter titles

In [10]:
pattern = re.compile("[a-zA-Z ]+\n\n")
findings = re.findall(pattern, book)
findings = [item.strip("\n\n") for item in findings]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# Using NLP library

### FInging the most used words (non-articles)

In [11]:
d_list = [(value, key) for key, value in d.items()]
sorted(d_list[:10], reverse=True)

[(5013, 'the'),
 (2053, 'of'),
 (333, 'We'),
 (10, 'Chapter'),
 (9, 'Before'),
 (7, 'October'),
 (3, 'WAS'),
 (1, 'thirteenth'),
 (1, 'IT'),
 (1, 'FRIDAY')]

In [14]:
pip install nltk




In [16]:
import nltk
from nltk.corpus import stopwords

# english_stopwords = stopwords.words("english")
# english_stopwords


# Sentiment Analysis

### What is the most positive and negative chapter

In [17]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [19]:
analyzer = SentimentIntensityAnalyzer()

In [None]:
analyzer.polarity_scores()

In [None]:
scores = analyzer.polarity_scores(book)

In [None]:
if scores["pos"] > scores["neg"]:
    print("The text contains positive sentiment")
else:
    print("The text contains negative sentiment")

### Analyzing sentiment by chapters

#### We first split the book by chapters using regex

In [24]:
pattern = re.compile("Chapter [0-9]*")
chapters = re.split(pattern, book)[1:]
len(chapters)

10

In [None]:
for chapter in chapters:
    scores = analyzer.polarity_scores(chapter)
    print(scores)