In [1]:
# Record the Python/IPython versions, the listed package versions and the conda
# environment name so the notebook's outputs can be tied to a concrete setup.
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
mlxtend  : 0.23.1
omegaconf: 2.3.0

conda environment: torch_p11



In [2]:
# Notebook-wide imports and display configuration.
# The body of this cell had been pasted twice; it is kept once here, with the
# settings that were effective after both copies ran (Polars shows 500 rows).

# Built-in library
import json
import logging
import re
import warnings
from pathlib import Path
from pprint import pprint
from typing import Any, Optional, Union

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Rich console with named styles for info / warning / error messages
custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=500)

# Silence every warning for the rest of the session (this also hides
# deprecation notices, so disable it when debugging library upgrades).
warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# Auto-reload imported modules so edits to local .py files are picked up live
%load_ext autoreload
%autoreload 2

In [18]:
import json
from typing import List, Dict, Any


def _convert_to_iob(data: Dict[str, Any]) -> Dict[str, List[str]]:
    """
    Convert a single data item to IOB format.

    Parameters
    ----------
    data : Dict[str, Any]
        A dictionary containing 'text' and 'label' keys.

    Returns
    -------
    Dict[str, List[str]]
        A dictionary with 'tokens' and 'iob_tags' keys, both containing lists of strings.
    """
    text: str = data["text"]
    labels: List[List[Any]] = data["label"]

    tokens: List[str] = text.split()
    iob_tags: List[str] = ["O"] * len(tokens)

    for label in labels:
        start: int
        end: int
        entity: str
        start, end, entity = label
        start_token_idx: int = len(text[:start].split())
        end_token_idx: int = len(text[:end].split())

        if start_token_idx == end_token_idx:
            iob_tags[start_token_idx] = f"B-{entity}"
        else:
            iob_tags[start_token_idx] = f"B-{entity}"
            for i in range(start_token_idx + 1, end_token_idx):
                iob_tags[i] = f"I-{entity}"

    return {"tokens": tokens, "iob_tags": iob_tags}


def convert_to_iob(
    data: List[Dict[str, Any]], filepath: str = "output.jsonl"
) -> List[Dict[str, List[str]]]:
    """
    Convert a list of data items to IOB format and write to a JSONL file.

    Parameters
    ----------
    data : List[Dict[str, Any]]
        A list of dictionaries, each containing 'text' and 'label' keys.
    filepath : str, optional
        The path to the output file (default is "output.jsonl").

    Returns
    -------
    List[Dict[str, List[str]]]
        The converted items, one dictionary with 'tokens' and 'iob_tags' keys
        per input item, in input order. The same records are written to
        ``filepath``. Callers that ignored the previous ``None`` return value
        are unaffected.
    """
    data_list: List[Dict[str, List[str]]] = [_convert_to_iob(d) for d in data]
    convert_to_jsonl(data_list, filepath)
    # Hand the records back so the caller can inspect them without re-reading
    # the file that was just written.
    return data_list


def convert_to_jsonl(
    data: List[Dict[str, List[str]]], filepath: str = "output.jsonl"
) -> None:
    """
    Serialize records as JSON Lines: one JSON object per line.

    Parameters
    ----------
    data : List[Dict[str, List[str]]]
        Records to serialize; each is a dictionary with 'tokens' and
        'iob_tags' keys holding lists of strings.
    filepath : str, optional
        Destination file, created or truncated (default is "output.jsonl").

    Returns
    -------
    None
    """
    with open(filepath, "w") as out_file:
        # Each record becomes a single line terminated by a newline.
        out_file.writelines(f"{json.dumps(record)}\n" for record in data)

In [19]:
# Example JSON data: one dict per document. Each entry of "label" is
# [start_char, end_char, entity_type] with an exclusive end offset, e.g.
# text[28:41] == "New York City" in the first document.
data = [
    {
        "id": 1,
        "text": "The meeting will be held in New York City next week.",
        "label": [[28, 41, "Location"]],
        "Comments": [],
    },
    {
        "id": 2,
        "text": "John Smith was born on December 25, 1990.",
        "label": [[0, 10, "Person"], [23, 40, "Date"]],
        "Comments": [],
    },
]

# The call writes the converted records to convert_to_iob's default filepath.
# NOTE(review): `results` is only meaningful if convert_to_iob returns the
# converted records; verify its return value. Each record maps keys to lists of
# strings, so the element type is dict[str, list[str]] rather than dict[str, str].
results: list[dict[str, str]] = convert_to_iob(data)

results

In [24]:
import spacy
import re

# Load the spaCy pipeline once at module level; extract_info() reads this global
# instead of loading the model on every call.
nlp = spacy.load("en_core_web_sm")


def extract_info(text):
    """
    Pull entities, score lines and scorers out of a match report.

    Parameters
    ----------
    text : str
        The article text to analyse with the module-level ``nlp`` pipeline.

    Returns
    -------
    dict
        A dictionary with three keys:

        - 'entities': maps each entity label to an entity text. Only one text
          is kept per label; a later entity with the same label replaces the
          earlier one.
        - 'scores': every substring of ``text`` that matches one or two digits,
          a hyphen, and one or two digits between word boundaries.
        - 'player_performance': maps the text of each PERSON entity found in a
          sentence containing "scored" or "strike" (case-insensitive) to the
          string "scored".
    """
    parsed = nlp(text)

    # One entry per label: assigning in document order means the last entity
    # seen for a label is the one that survives.
    label_to_text = {}
    for entity in parsed.ents:
        label_to_text[entity.label_] = entity.text

    # Score lines such as "2-1"
    found_scores = re.findall(r"\b(\d{1,2}[-]\d{1,2})\b", text)

    # People mentioned in sentences that talk about scoring
    scorers = {}
    for sentence in parsed.sents:
        lowered = sentence.text.lower()
        if "scored" not in lowered and "strike" not in lowered:
            continue
        for entity in sentence.ents:
            if entity.label_ == "PERSON":
                scorers[entity.text] = "scored"

    return {
        "entities": label_to_text,
        "scores": found_scores,
        "player_performance": scorers,
    }

In [25]:
# Example usage: a two-sentence match report with one score line ("2-1") and
# one sentence containing "scored".
article_text = (
    "Manchester United defeated Chelsea 2-1 at Old Trafford. Rashford scored "
    "the winning goal in the 80th minute."
)
extracted_info = extract_info(article_text)
print(extracted_info)

{'entities': {'ORG': 'Old Trafford', 'PERSON': 'Rashford', 'CARDINAL': '2-1'}, 'scores': ['2-1'], 'player_performance': {'Rashford': 'scored'}}


In [26]:
# Example usage: a longer report where the goals are described with "strike"
# and "goal" rather than "scored".
# NOTE: the recorded output for this cell labels 'Raheem Sterling' as ORG and
# 'Citizens' as PERSON, and reports 'Citizens' as the only scorer, so treat the
# extracted fields as rough heuristics rather than ground truth.
article_text = (
    "In a thrilling Premier League clash, Chelsea and Manchester City played "
    "out a 1-1 draw at Stamford Bridge. The Citizens took the lead early on through a "
    "stunning Erling Haaland strike, but the Blues fought back and equalized through a well-taken "
    "goal from Raheem Sterling. Both teams had chances to win the match, but neither could find a "
    "decisive goal. The result leaves both sides in the hunt for the Premier League title."
)
extracted_info = extract_info(article_text)
print(extracted_info)

{'entities': {'ORG': 'Raheem Sterling', 'GPE': 'Blues', 'CARDINAL': '1', 'FAC': 'Stamford Bridge', 'PERSON': 'Citizens'}, 'scores': ['1-1'], 'player_performance': {'Citizens': 'scored'}}


In [31]:
# Load the football articles CSV into a Polars DataFrame.
# NOTE(review): the path climbs seven directories relative to the notebook's
# working directory, so it only resolves on a matching machine layout; consider
# a configurable data directory.
fp: str = "../../../../../../../Downloads/football_data/allfootball.csv"
df: pl.DataFrame = pl.read_csv(fp)
# Report the (rows, columns) shape, then preview the first two rows.
print(f"{df.shape = }")
df.head(2)

df.shape = (800, 6)


Unnamed: 0_level_0,title,link,publish_time,content,author
i64,str,str,str,str,str
0,"""Exclusive: Former Chelsea manager not offered himself to Barca as Xavi successor""","""https://m.allfootballapp.com/news/Headline/Exclusive-Former-Chelsea-manager-not-offered-himself-to-Barca-as-Xavi-successor/3299287""","""11 Feb""","""The Barcelona managerial situation has been the subject of speculation ever since Xavi Hernandez announced that he would be stepping down as head coach at the end of the season. Club bosses wonât rush into making a decision, although there has been plenty of noise over the last two weeks. Barca Atletic coach Rafael Marquez and Bolognaâs Thiago Motta have both been linked with the position, although as Fabrizio Romano has told Caught Offside in his exclusive column, neither are in line to be a serious candidate. The same can be said for Antonio Conte, who reportedly offered himself to Barcelona in recent days. âIâm told itâs not true that Antonio Conte has been offered to the club as a candidate to be their next manager. At the moment there is nothing ongoing between Conte and Barca, itâs all quiet. The same is true with Thiago Motta â he is not in the frame for the job at the moment, heâs going to stay in Italy and work in Italy next season. Itâs also completely qui…","""There are sure to be plenty of twists and turns on the Barcelona managerial situation over the coming months. One thing for certain is that Xavi is not considering staying on in the role, despite an upturn in form over the last couple of weeks."""
1,"""Akram Afif is the first ever player to score a hat-trick in the Asian Cup final""","""https://m.allfootballapp.com/news/Headline/Akram-Afif-is-the-first-ever-player-to-score-a-hat-trick-in-the-Asian-Cup-final/3298670""","""11 Feb""","""Akram Afif is the first ever player to score a hat-trick in the Asian Cup final. ð© He scored three penalties against Jordan to help Qatar win back-to-back Asian Cups and take his tally to eight for the tournament, winning himself the Golden Boot. In doing so, he also became just the third player to score 8+ goals in a single edition of the tournament after Ali Daei (1996) and teammate Almoez Ali (2019). """,


In [36]:
# Build a single-column frame holding the cleaned article text.
# Step 1 replaces every non-word character (punctuation, whitespace, symbols)
# with a space.
# NOTE(review): step 2 can never match, because the characters in [.,!:?] are
# non-word characters and step 1 has already turned them into spaces. If the
# intent was to delete that punctuation outright (no extra space), step 2 has to
# run before step 1 — confirm the intended order.
df_cleaned: pl.DataFrame = df.with_columns(
    content=pl.col("content")
    .str.replace_all(r"\W", " ")
    .str.replace_all(r"[.,!:?]", "")
).select(["content"])

df_cleaned.head(2)

content
str
"""The Barcelona managerial situation has been the subject of speculation ever since Xavi Hernandez announced that he would be stepping down as head coach at the end of the season Club bosses wonâ t rush into making a decision although there has been plenty of noise over the last two weeks Barca Atletic coach Rafael Marquez and Bolognaâ s Thiago Motta have both been linked with the position although as Fabrizio Romano has told Caught Offside in his exclusive column neither are in line to be a serious candidate The same can be said for Antonio Conte who reportedly offered himself to Barcelona in recent days â Iâ m told itâ s not true that Antonio Conte has been offered to the club as a candidate to be their next manager At the moment there is nothing ongoing between Conte and Barca itâ s all quiet The same is true with Thiago Motta â he is not in the frame for the job at the moment heâ s going to stay in Italy and work in Italy next season Itâ s also completely qui…"
"""Akram Afif is the first ever player to score a hat trick in the Asian Cup final ð He scored three penalties against Jordan to help Qatar win back to back Asian Cups and take his tally to eight for the tournament winning himself the Golden Boot In doing so he also became just the third player to score 8 goals in a single edition of the tournament after Ali Daei 1996 and teammate Almoez Ali 2019 """


In [37]:
# Destination text file: one cleaned article per line.
sp: str = "article.txt"

# Use an explicit UTF-8 encoding: the cleaned content contains non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere, which
# can raise UnicodeEncodeError or produce files that differ between machines.
with open(sp, "w", encoding="utf-8") as f:
    for row in df_cleaned.iter_rows(named=True):
        f.write(row["content"] + "\n")