# Scrape Texts For NLP Tasks

In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd
from bs4 import BeautifulSoup
import spacy


# Custom imports

# Built-in library
import itertools
import re
import json
import requests
from typing import Union, Optional, Any
import logging
import warnings

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
fp = "../data/sample_text.txt"

with open(fp, "r") as f:
    data = [line.strip() for line in f.readlines()]

print(data)

['Leeds United: Where did it go wrong for Jesse Marsch and who could replace him?', '', 'Leeds United are looking for their third manager in less than year after the sacking of Jesse Marsch.', '', 'After replacing Marcelo Bielsa, the American helped Leeds avoid relegation on the final day of last season.', '', "But they are 17th, above the relegation zone only on goal difference after 20 games this campaign, with Saturday's 1-0 defeat by Nottingham Forest leading to Marsch's dismissal on Monday.", '', 'Why did Marsch fail? Who is best suited to take over? And what do they need to do to keep Leeds up?', '', 'Watch the Football News Show - Who next for Leeds United?', "Bamford 'a joke' for criticising Marsch tactics - Sutton", 'MNC podcast: Jobless Jesse, Man City charged & pealess whistles', "Don't Go To Bed Just Yet podcast - reaction to Marsch sacking", 'Where did it go wrong for Marsch?', 'For a section of supporters it was from the beginning, because he was not his predecessor Marce

In [3]:
# Preprocess the data. Convert the list of sentences to a document
# and store in a list. i.e a list of a single document.
data_str = "".join(data)
data_doc = [data_str]

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
class Tokenizer:
    """This is used to tokenize documents"""

    def __init__(self) -> None:
        self.nlp = nlp

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        # Tokenize
        doc = nlp(doc)
        tokenized_doc = [word.text.lower() for word in doc]
        return tokenized_doc


class Sentencizer:
    """This is used to convert a document into a list of sentences.
    It returns sentences."""

    def __init__(self) -> None:
        self.nlp = nlp

    def __call__(self, doc: str, *args: Any, **kwargs: Any) -> list[str]:
        # Tokenize
        doc = nlp(doc)
        sentences = list(doc.sents)
        tokenized_sentences = [str(sentence) for sentence in sentences]
        return tokenized_sentences

In [6]:
# Load spaCy stopwords
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
spacy_stopwords = list(spacy_stopwords)
spacy_stopwords.extend(
    ("'", "d", "ll", "m", "ve", "‘", "", '"', "-", ":", "?", ",", ".")
)
spacy_stopwords[:5]

['has', 'by', 'please', 'is', 'twelve']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(
    stop_words=spacy_stopwords, max_features=3_000, tokenizer=Tokenizer()
)
X = tfidf.fit_transform(data_doc)



In [8]:
word_2_idx = tfidf.vocabulary_

# Map each number to its equvalent word
idx_2_word = {idx: word for word, idx in word_2_idx.items()}

In [9]:
# Convert to a 1-D array
scores = X.toarray().flatten()

# Sort the scores descending order and return the indices
indices = (-scores).argsort()

# Print the top n most important words
for idx in indices[:5]:
    print(idx_2_word[idx])

marsch
leeds
bielsa
club
players


### Text Summarization

In [10]:
tfidf = TfidfVectorizer(
    stop_words=spacy_stopwords, max_features=3_000, tokenizer=Sentencizer()
)
X = tfidf.fit_transform(data)

In [11]:
def calculate_sentence_score(tfidf_row):
    """This returns the average score of the non-zero tfidf value
    for a given sentence."""
    x = tfidf_row[tfidf_row != 0]  # Select the non-zero values
    return x.mean()

In [12]:
sents = Sentencizer()
sentences = sents(doc=data_str)
print(len(sentences))

40


In [13]:
# Vectorize the sentences
tfidf = TfidfVectorizer(stop_words=spacy_stopwords, norm="l1")
X_tr = tfidf.fit_transform(sentences)
X_tr.shape

(40, 409)

In [14]:
# Initialize the score
scores = np.zeros(len(sentences))

# Calculate the score for each sentence
for idx in range(len(sentences)):
    score = calculate_sentence_score(X_tr[idx, :])
    scores[idx] = score

In [15]:
# Sort the scores in descending order
sort_idx = np.argsort(-scores)
sort_idx

array([ 3,  4, 11, 18,  5, 34, 15, 29, 30, 28, 26, 21,  7,  0, 25,  1, 24,
       35, 32, 12, 13,  8, 39, 27, 31,  2, 23,  6, 38, 36, 37,  9, 20, 19,
       22, 33, 16, 14, 10, 17])

In [16]:
for i in sort_idx[:5]:
    print(f"{i}: {round(scores[i], 3)} {sentences[i]}")

3: 0.5 Why did Marsch fail?
4: 0.5 Who is best suited to take over?
11: 0.333 The squad was arguably stronger.
18: 0.25 Fans saw and thought differently.
5: 0.2 And what do they need to do to keep Leeds up?Watch the Football News Show -


In [17]:
top_idx = sort_idx[:5]
sorted_idx = [idx for idx in top_idx]
top_sentences = [sentences[idx] for idx in top_idx]

result = tuple(itertools.zip_longest(sorted_idx, top_sentences))

result = sorted(result, key=lambda x: x[0])

for _, sent in result:
    print(sent)

Why did Marsch fail?
Who is best suited to take over?
And what do they need to do to keep Leeds up?Watch the Football News Show -
The squad was arguably stronger.
Fans saw and thought differently.


In [19]:
filepath = "../data/sample_text.txt"

main(filepath=filepath, num=None)

Number of sentences in input document: 40

Leeds United: Where did it go wrong for Jesse Marsch and who could replace him?Leeds United are looking for their third manager in less than year after the sacking of Jesse Marsch.
After replacing Marcelo Bielsa, the American helped Leeds avoid relegation on the final day of last season.
Why did Marsch fail?
Who is best suited to take over?
And what do they need to do to keep Leeds up?Watch the Football News Show -
sackingWhere did it go wrong for Marsch?For a section of supporters it was from the beginning, because he was not his predecessor Marcelo Bielsa.
The squad was arguably stronger.
There were glimpses of what might have been with a stunning 3-0 win over Chelsea in August.
Fans saw and thought differently.
And so decisive action was taken following the defeat at Nottingham Forest.
Indeed, Radrizzani suggested in a tweet on Tuesday evening that the new man could be in place as soon as Wednesday morning.
Rayo Vallecano's Andoni Iraola is

In [20]:
0.2 * 95

19.0