In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Gemini 2.0 Flash API Call Script

In [None]:
import os
from google import genai
from google.genai import types

def generate(comments, api_key):
    """Classify each commentary line into an event code with Gemini 2.0 Flash.

    One streaming request is sent per comment, using the system instruction
    below to ask the model for a single event number.

    Args:
        comments: Iterable of commentary strings.
        api_key: API key used to create the Gemini client.

    Returns:
        A list with one stripped response string per comment, in input order.
    """
    client = genai.Client(api_key=api_key)

    model = "gemini-2.0-flash"
    system_instruction = [
        types.Part.from_text(text="""
The output should be any of the following integers:

1 - Indicates a Goal is scored at the moment.
2 - Indicates a red card given at the moment.
3 - Indicates a yellow card given at the moment.
4 - A substitution took place at the moment.
5 - Extra time granted at the moment.
6 - Half Time.
7 - Full Time.
8 - Narration of match info like team members, formation, subs etc.
9 - Offside took place at the moment.
0 - None (If none of the above events occur then output a 0)

Only one of the following events number should be outputed, incase 2 events occur at a time then output the number of the most relevant event and the less relevant events as comma separated values in parenthesis (eg: 1 (2, 3)).
All highlight commentary must be None (output 0)
""")
    ]

    # The generation settings are the same for every comment, so build them once
    # instead of once per loop iteration.
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_mime_type="text/plain",  # or JSON if structured
        system_instruction=system_instruction
    )

    results = []

    for comment in comments:
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=comment)]
            )
        ]

        pieces = []
        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        ):
            # The attribute can be present but hold None (a chunk without text);
            # hasattr() alone let that through and `str += None` raised TypeError.
            chunk_text = getattr(chunk, "text", None)
            if chunk_text:
                pieces.append(chunk_text)

        results.append("".join(pieces).strip())

    return results

In [None]:
event_responses_all = []

In [None]:
# Colab's secret store. Imported in this cell so it does not depend on a later
# cell having been executed first.
from google.colab import userdata

# Label only the first 100 comments.
# NOTE(review): `dataframe` is not created by any earlier cell in this notebook;
# it must be loaded (with a "Comments" column) before this cell runs.
comments = dataframe["Comments"][:100].tolist()

api_key = userdata.get("GEMINI_API_KEY")
event_responses = generate(comments, api_key)

event_responses_all.extend(event_responses)

In [None]:
event_responses_all

['8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '1',
 '0',
 '9',
 '0',
 '0',
 '0',
 '4',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '5',
 '0',
 '6',
 '6',
 '6',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '4',
 '4',
 '0',
 '0',
 '0',
 '0',
 '0',
 '4',
 '1',
 '4',
 '4',
 '4',
 '4',
 '0',
 '0',
 '1',
 '5',
 '4',
 '4',
 '3',
 '2',
 '4',
 '7',
 '1 (2)',
 '7',
 '7',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '8',
 '0',
 '8',
 '8',
 '8',
 '8',
 '0',
 '0',
 '0',
 '0',
 '0',
 '9',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1']

In [None]:
# Pad the rows that were not sent to the model with the "None" label. The model
# answers are strings, so the padding must be the string "0" too; an int 0 would
# give the column mixed types and break string comparisons such as == "1".
event_responses_all.extend(["0"] * (len(dataframe) - len(event_responses_all)))

In [None]:
dataframe["Event Type"] = event_responses_all

In [None]:
dataframe[dataframe["Event Type"] == "1"]

Unnamed: 0,Comments,Event Type
12,GOAL!!! HAALAND PUTS CITY AHEAD!!! Having nett...,1
13,Rodri's header was clever to put it on a plate...,1
17,Haaland – who scored twice in City's opening P...,1
30,GOAL!!! HAALAND AGAIN!!! It is a quite brillia...,1
31,Alvarez's lay-off was perfectly weighted for H...,1
55,GOAL!!! RODRI SURELY KILLS IT OFF!!! After Cit...,1
62,Rodri's strike has capped an excellent perform...,1
99,NKETIAH!!! 1-0 ARSENAL! The Gunners have the b...,1


1. Indicates a Goal is scored at the moment
2. Indicates a red card given at the moment
3. Indicates a yellow card given at the moment
4. A substitution took place at the moment
5. Extra time granted at the moment
6. Half Time
7. Full Time
8. Team formation and players playing in the game (stats/team info)
9. Offside took place at the moment
0. None (If none of the above events occur then output a 0)

## Web-Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import html
import pickle
import requests
import json

In [None]:
# Load the pickled collection of match commentary URLs from the mounted Drive.
# NOTE(review): pickle.load can run arbitrary code while deserialising, so this
# file must only ever be one that was produced by a trusted source.
with open("/content/drive/MyDrive/commentary_urls.pickle", "rb") as file:
    urls = pickle.load(file)

In [None]:
# Split every match URL into its two team slugs, its URL code and its match id.
urls_list = []

for url in urls:
    parameters = url.split('/')
    # Path segment 4 is "<team_a>-vs-<team_b>", segment 5 is "<url_code>#<id>".
    team_a, team_b = parameters[4].split('-vs-')
    url_code, id = parameters[5].split('#')
    print(team_a, team_b, url_code, id)

    url_parameters = {
        'team_a': team_a,
        'team_b': team_b,
        'url_code': url_code,
        'id': id,
    }
    urls_list.append(url_parameters)

fulham manchester-united 3cqww9 4506263
liverpool ipswich-town 2ugv0q 4506264
wolverhampton-wanderers arsenal 2t3bl7 4506265
everton brighton-hove-albion 2y16fs 4506266
southampton newcastle-united 2weqvy 4506267
afc-bournemouth nottingham-forest 2y4tjb 4506268
west-ham-united aston-villa 2yexzh 4506269
crystal-palace brentford 38a4jj 4506270
chelsea manchester-city 2d55kw 4506271
leicester-city tottenham-hotspur 2buxqa 4506272
brighton-hove-albion manchester-united 3goccs 4506275
west-ham-united crystal-palace 2toa9m 4506276
leicester-city fulham 2pa0dp 4506277
manchester-city ipswich-town 2sc4e3 4506279
southampton nottingham-forest 2vri2l 4506280
tottenham-hotspur everton 2gmqxt 4506281
arsenal aston-villa 3c06vg 4506274
afc-bournemouth newcastle-united 2ysbu8 4506273
chelsea wolverhampton-wanderers 2emb2j 4506282
liverpool brentford 2uusjv 4506278
arsenal brighton-hove-albion 3bfk5g 4506283
southampton brentford 2stunb 4506284
everton afc-bournemouth 2hkv2h 4506286
fulham ipswich-t

In [None]:
urls_list

[{'team_a': 'fulham',
  'team_b': 'manchester-united',
  'url_code': '3cqww9',
  'id': '4506263'},
 {'team_a': 'liverpool',
  'team_b': 'ipswich-town',
  'url_code': '2ugv0q',
  'id': '4506264'},
 {'team_a': 'wolverhampton-wanderers',
  'team_b': 'arsenal',
  'url_code': '2t3bl7',
  'id': '4506265'},
 {'team_a': 'everton',
  'team_b': 'brighton-hove-albion',
  'url_code': '2y16fs',
  'id': '4506266'},
 {'team_a': 'southampton',
  'team_b': 'newcastle-united',
  'url_code': '2weqvy',
  'id': '4506267'},
 {'team_a': 'afc-bournemouth',
  'team_b': 'nottingham-forest',
  'url_code': '2y4tjb',
  'id': '4506268'},
 {'team_a': 'west-ham-united',
  'team_b': 'aston-villa',
  'url_code': '2yexzh',
  'id': '4506269'},
 {'team_a': 'crystal-palace',
  'team_b': 'brentford',
  'url_code': '38a4jj',
  'id': '4506270'},
 {'team_a': 'chelsea',
  'team_b': 'manchester-city',
  'url_code': '2d55kw',
  'id': '4506271'},
 {'team_a': 'leicester-city',
  'team_b': 'tottenham-hotspur',
  'url_code': '2buxqa'

In [None]:
'https://www.fotmob.com/api/ltc?ltcUrl=data.fotmob.com/webcl/ltc/gsm/4506505_en.json.gz&teams=["Brighton+&+Hove+Albion","Arsenal"]'

In [None]:
curl 'https://www.fotmob.com/api/ltc?ltcUrl=data.fotmob.com%2Fwebcl%2Fltc%2Fgsm%2F4506505_en.json.gz&teams=%5B%22Brighton+%26+Hove+Albion%22%2C%22Arsenal%22%5D' \
  -H 'sec-ch-ua-platform: "macOS"' \
  -H 'Referer: https://www.fotmob.com/matches/arsenal-vs-brighton-hove-albion/3bfk5g' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' \
  -H 'x-mas: eyJib2R5Ijp7InVybCI6Ii9hcGkvbHRjP2x0Y1VybD1kYXRhLmZvdG1vYi5jb20lMkZ3ZWJjbCUyRmx0YyUyRmdzbSUyRjQ1MDY1MDVfZW4uanNvbi5neiZ0ZWFtcz0lNUIlMjJCcmlnaHRvbislMjYrSG92ZStBbGJpb24lMjIlMkMlMjJBcnNlbmFsJTIyJTVEIiwiY29kZSI6MTc0MTg2OTQyNjY2OCwiZm9vIjoicHJvZHVjdGlvbjo1MTgzYjk4ZDlkNDM2NmRhZGM4NDZiNzE1ZDY0MGQxOWMwMzY2ZThiLXVuZGVmaW5lZCJ9LCJzaWduYXR1cmUiOiIzREY0QTQ0MEU1QUQ1NzQwMTZFMzE2NTg1RjNDNTZFMiJ9' \
  -H 'sec-ch-ua: "Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"' \
  -H 'sec-ch-ua-mobile: ?0'

In [None]:
def get_commentary(team_a, team_b, url_code, id, timeout=30):
    """Download the live-text commentary events of one FotMob match.

    Args:
        team_a: First team slug as used in the match URL (e.g. 'fulham').
        team_b: Second team slug.
        url_code: Match URL code; only used to build the Referer header.
        id: Match id used to build the commentary JSON location.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under the "events" key of the JSON response.

    Raises:
        requests.HTTPError: if the server replies with an error status.
        requests.Timeout: if no reply arrives within `timeout` seconds.
        KeyError: if the JSON response has no "events" key.
    """
    # Team slugs use '-' between words; the query string uses '+' instead.
    url = f"https://www.fotmob.com/api/ltc?ltcUrl=data.fotmob.com%2Fwebcl%2Fltc%2Fgsm%2F{id}_en.json.gz&teams=%5B%22{team_a.replace('-', '+')}%22%2C%22{team_b.replace('-', '+')}%22%5D"

    headers = {
        "sec-ch-ua-platform": '"macOS"',
        "Referer": f"https://www.fotmob.com/matches/{team_a}-vs-{team_b}/{url_code}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        # NOTE(review): hard-coded signed header copied from a browser session;
        # presumably tied to one request and may stop being accepted. Confirm.
        "x-mas": "eyJib2R5Ijp7InVybCI6Ii9hcGkvbHRjP2x0Y1VybD1kYXRhLmZvdG1vYi5jb20lMkZ3ZWJjbCUyRmx0YyUyRmdzbSUyRjQ1MDY2MTJfZW4uanNvbi5neiZ0ZWFtcz0lNUIlMjJXb2x2ZXJoYW1wdG9uK1dhbmRlcmVycyUyMiUyQyUyMkFyc2VuYWwlMjIlNUQiLCJjb2RlIjoxNzQxODQ3MjgwODQ2LCJmb28iOiJwcm9kdWN0aW9uOmI1MzIwODMyN2U5ZGQxOTcyM2VjNTEzM2ExYzQ3Y2Q1NDYwNzMyZWMtdW5kZWZpbmVkIn0sInNpZ25hdHVyZSI6Ijk1QjIxQkUzMEQwQThCMTIzNUQ3NzgxMkY2RjM2OTE5In0=",
        "sec-ch-ua": '"Brave";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0"
    }

    # Without a timeout a stalled connection would hang the whole scrape.
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()

    json_object = response.json()

    return json_object["events"]

In [None]:
# Column-oriented accumulator: one list per output column, one entry per event.
dataset_dict = {"Team A": [], "Team B": [], "URL Code": [], "ID": [], "Comment": [], "Type": [], "Players": [], "Minute": []}

for url_params in urls_list:
    team_a, team_b, url_code, id = (
        url_params[key] for key in ('team_a', 'team_b', 'url_code', 'id')
    )

    data = get_commentary(team_a, team_b, url_code, id)

    for event in data:
        dataset_dict["Comment"].append(event["text"].lower())
        dataset_dict["Type"].append(event["type"])

        time_info = event["time"]
        if time_info:
            # Drop the first character of "added" and the last character of
            # "main" before converting them to integers, then sum the two.
            extra = int(time_info["added"][1:]) if time_info["added"] else 0
            minute = int(time_info["main"][:-1]) + extra
        else:
            # Events without time information get no minute.
            minute = None
        dataset_dict["Minute"].append(minute)

        if len(event["players"]) != 0:
            dataset_dict["Players"].append([player["name"] for player in event["players"]])
        else:
            dataset_dict["Players"].append(event["players"])

        # Repeat the match identifiers on every event row.
        for column, value in (("Team A", team_a), ("Team B", team_b), ("URL Code", url_code), ("ID", id)):
            dataset_dict[column].append(value)

In [None]:
# Dataset Raw

dataset = pd.DataFrame(dataset_dict)

NameError: name 'dataset_dict' is not defined

## Data Modification

In [None]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.12.2


In [None]:
def generate_url(team_a, team_b, url_code, id):
    """Return the FotMob ticker-tab URL for a match given its slugs, code and id."""
    match_slug = f'{team_a}-vs-{team_b}'
    return f'https://www.fotmob.com/matches/{match_slug}/{url_code}#{id}:tab=ticker'

In [None]:
def directional_partial_ratio(short, long_):
    """Score `short` against `long_` with rapidfuzz.

    Uses `fuzz.ratio` when `short` is strictly longer than `long_`, and
    `fuzz.partial_ratio` otherwise.
    """
    scorer = fuzz.ratio if len(short) > len(long_) else fuzz.partial_ratio
    return scorer(short, long_)

In [None]:
from rapidfuzz import fuzz, process

def name_in_text(player, comment, threshold):
    """Find the word of `comment` that best matches any part of a player's name.

    Each space-separated part of `player` is lower-cased and scored against
    each whitespace-separated word of `comment` (used as given) with
    `directional_partial_ratio`; the first highest-scoring pair wins.

    Args:
        player: Player name, e.g. "Sandro Tonali".
        comment: Commentary text to search.
        threshold: Score the best match must exceed to count as found.

    Returns:
        A 5-tuple `(match, score, word, name, comment)`: `match` is the best
        word when `score > threshold` and False otherwise, `word` is the best
        word regardless of the threshold, `name` is the name part that produced
        it, and `comment` is passed through unchanged.
    """
    # split() without an argument also breaks on non-breaking spaces ('\xa0').
    # split(' ') left words such as 'yellow!\xa0tonali' glued together, which
    # dragged the score of a genuine name match below the threshold.
    comment_list = comment.split()
    name_parts = player.split(' ')

    highest_ratio = 0
    # Fallbacks reported when nothing scores above zero (e.g. an empty comment).
    best_word = comment_list[0] if comment_list else ''
    best_name = name_parts[-1]

    for name in name_parts:
        candidate = name.lower()
        for word in comment_list:
            ratio = directional_partial_ratio(candidate, word)
            if ratio > highest_ratio:
                # Record the winning word and name part together, so the
                # reported name is the one that actually produced the score.
                highest_ratio, best_word, best_name = ratio, word, name

    match = best_word if highest_ratio > threshold else False
    return match, highest_ratio, best_word, best_name, comment

In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/commentary_final_dataset.csv")

In [None]:
dataset

Unnamed: 0,Team A,Team B,URL Code,ID,Comment,Type,Players,Minute,Player Name In Commentary
0,fulham,manchester-united,3cqww9,4506263,full-time: manchester united 1-0 fulham,comment,[],95.0,[]
1,fulham,manchester-united,3cqww9,4506263,how did he miss that?! bassey completely misse...,highlight,[],94.0,[]
2,fulham,manchester-united,3cqww9,4506263,united go searching for the game-clinching sec...,comment,[],92.0,[]
3,fulham,manchester-united,3cqww9,4506263,reed is also introduced for these latter stage...,SI,"['Harrison Reed', 'Andreas Pereira']",91.0,[]
4,fulham,manchester-united,3cqww9,4506263,one final throw of the dice from silva. the fi...,SI,"['Jay Stansfield', 'Sasa Lukic']",91.0,[]
...,...,...,...,...,...,...,...,...,...
18159,west-ham-united,newcastle-united,2yilb8,4506351,west ham in the ascendency in this opening 7 m...,comment,[],8.0,[]
18160,west-ham-united,newcastle-united,2yilb8,4506351,bowen gets away down the right side this time ...,comment,[],6.0,[]
18161,west-ham-united,newcastle-united,2yilb8,4506351,newcastle have settled into possession after t...,comment,[],4.0,[]
18162,west-ham-united,newcastle-united,2yilb8,4506351,huge chance! kudus turns schar inside out on t...,highlight,[],1.0,[]


In [None]:
# Keep only comments that sit on the match timeline, i.e. rows that have a minute.
dataset.dropna(subset=["Minute"], inplace=True)

In [None]:
# Discard the rows whose Type is one of the two summary categories.
summary_types = ["post_match summary", "half_time summary"]
dataset.drop(index=dataset.index[dataset["Type"].isin(summary_types)], inplace=True)

In [None]:
import ast

# For every row with tagged players, store the comment words that match those
# players' names, as the text of a list of lower-cased words.
dataset['Player Name In Commentary'] = '[]'
for index, row in dataset[["Comment", "Players"]].iterrows():
    comment, players = row
    if players != '[]':
        matched_words = []
        # Players holds the text of a Python list; literal_eval parses it
        # without executing arbitrary code the way eval would.
        for player in ast.literal_eval(players):
            name = name_in_text(player, comment, 60)[0]
            if name:
                matched_words.append(name.lower())
        if matched_words:
            # `index` is a row label, not a position: rows were dropped earlier,
            # so positional .iloc[index] would point at a different row.
            dataset.at[index, "Player Name In Commentary"] = str(matched_words)

In [None]:
dataset_nodirection = dataset.query("`Players` != '[]' and `Player Name In Commentary` == '[]'")

In [None]:
dataset_direction = dataset.query("`Players` != '[]' and `Player Name In Commentary` == '[]'")

In [None]:
import ast

# For every row whose tagged players were found in the comment, print the match
# URL followed by the matching diagnostics of each tagged player.
matched_rows = dataset.query("`Players` != '[]' and `Player Name In Commentary` != '[]'")
for index, row in matched_rows.iterrows():
    # Fields are read by column name, so an extra column in the frame (such as
    # a saved index column) cannot break a fixed-length positional unpacking.
    url = generate_url(row["Team A"], row["Team B"], row["URL Code"], row["ID"])
    print(url, end="\n")
    # Players holds the text of a Python list; literal_eval parses it safely.
    for player in ast.literal_eval(row["Players"]):
        # Printed tuple: (highest_ratio, best matching word, name part, comment)
        print(name_in_text(player, row["Comment"], 60)[1:5])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
https://www.fotmob.com/matches/southampton-vs-tottenham-hotspur/2ekh9c#4506451:tab=ticker
(100.0, 'yukinari', 'Sugawara', "manning's been one of few southampton players to not put many feet wrong in this game, but comes off now for yukinari sugawara to slot into the hosts' defence.")
(100.0, "manning's", 'Manning', "manning's been one of few southampton players to not put many feet wrong in this game, but comes off now for yukinari sugawara to slot into the hosts' defence.")
https://www.fotmob.com/matches/southampton-vs-tottenham-hotspur/2ekh9c#4506451:tab=ticker
(100.0, 'johnson', 'Johnson', 'johnson prevents southampton taking a free-kick quickly for some reason – completely redundant when 5-0 up – and is shown a yellow card for time-wasting.')
https://www.fotmob.com/matches/southampton-vs-tottenham-hotspur/2ekh9c#4506451:tab=ticker
(100.0, 'maddison', 'Maddison', 'maddison drags downes to ground from behind as the sain

In [None]:
# Print the match URL and the name-matching diagnostics for every row of
# whatever ds1.difference(ds2) yields.
# NOTE(review): ds1 and ds2 are not defined anywhere in the visible notebook;
# this cell depends on kernel state that a fresh run will not have.
for index, row in pd.DataFrame(ds1.difference(ds2)).iterrows():
    # Expected field order: Team A, Team B, URL Code, ID, Comment, Type, Players, Minute, Player Name In Commentary
    team_a, team_b, url_code, id, comment, type_, players, minute, player_name_in_comment = row
    url = generate_url(team_a, team_b, url_code, id)
    print(url, end="\n")
    if players != '[]':
        # NOTE(review): eval executes whatever text is stored in Players;
        # ast.literal_eval would parse the list without running code.
        players_list = eval(players)
        for player in players_list:
            # Printed tuple: elements 1-4 of name_in_text's result, i.e.
            # (highest_ratio, best matching word, name part, comment)
            print(name_in_text(player, comment, 60)[1:5])

https://www.fotmob.com/matches/leicester-city-vs-west-ham-united/2cjg6k#4506429:tab=ticker
(45.16129032258065, 'a\xa0consolation.\xa0fullkrug', 'Füllkrug', "goallll!!! it's too little too late for west ham but they have a\xa0consolation.\xa0fullkrug grabs his first goal in a hammers shirt after his summer transfer from borussia dortmund and return from injury. the german\xa0gets on the end of a summerville flick-on from a corner to convert. 3-1.")
https://www.fotmob.com/matches/fulham-vs-newcastle-united/3crcfp#4506620:tab=ticker
(60.0, 'yellow!\xa0tonali', 'Tonali', 'yellow!\xa0tonali is booked after a late tackle on traore.')
https://www.fotmob.com/matches/leicester-city-vs-west-ham-united/2cjg6k#4506429:tab=ticker
(60.0, 'past', 'Daka', 'goaalll!!!! leicester have their third!!\xa0daka who has looked lively since coming on, pinches the ball off kilman and drives towards goal. the zambian fires into the roof of the net and past fabianski. game over. 3-0!')
https://www.fotmob.com/matc

In [None]:
generate_url('everton', 'aston-villa', '2ykmb4', '4506294')

https://www.fotmob.com/matches/everton-vs-aston-villa/2ykmb4#4506294:tab=ticker


In [None]:
# Integer-encode the Type column of the frame being labelled. The codes must be
# derived from `dataset` itself: factorizing a different frame (`dataframe`)
# yields codes for other rows, or a length mismatch on assignment.
dataset['labels'], label_mapping = pd.factorize(dataset['Type'])

Label Mapping: {0: 'comment', 1: 'highlight', 2: 'SI', 3: 'AS', 4: 'G', 5: 'YC', 6: 'kick off', 7: 'var', 8: 'RC', 9: 'PSG', 10: 'OG', 11: 'full time', 12: 'stats', 13: 'Y2C', 14: 'penalty save', 15: 'half time', 16: 'PM'}


In [None]:
# Save Changes
dataset.to_csv("/content/drive/MyDrive/commentary_final_dataset.csv", index=False)

## Data Inspection and Analysis

In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/commentary_final_dataset.csv")

In [None]:
dataset.columns

Index(['Team A', 'Team B', 'URL Code', 'ID', 'Comment', 'Type', 'Players',
       'Minute'],
      dtype='object')

In [None]:
dataset["Type"].unique()

array(['post_match summary', 'comment', 'highlight', 'SI', 'AS', 'G',
       'YC', 'team news', 'kick off', 'half_time summary', 'var', 'RC',
       'PSG', 'OG', 'full time', 'stats', 'Y2C', 'penalty save',
       'half time', 'PM'], dtype=object)

In [None]:
dataset.loc[dataset["Players"] != '[]']["Type"].unique()

array(['SI', 'AS', 'G', 'YC', 'RC', 'PSG', 'OG', 'Y2C', 'penalty save',
       'PM'], dtype=object)

In [None]:
set(dataset["Type"].unique()).difference(set(dataset.loc[dataset["Minute"].astype(str) != 'nan']["Type"].unique()))

{'post_match summary', 'team news'}

In [None]:
dataset.loc[dataset['Type'] == 'Y2C'][["Comment", "Team A", "Team B", "URL Code", "Minute"]].iloc[0]

Unnamed: 0,1615
Comment,It's all gone a bit silly. Veltman is fouled a...
Team A,arsenal
Team B,brighton-hove-albion
URL Code,3bfk5g
Minute,49.0


In [None]:
dataset.query("`ID` == 4506283")

Unnamed: 0,Team A,Team B,URL Code,ID,Comment,Type,Players,Minute
1581,arsenal,brighton-hove-albion,3bfk5g,4506283,The two unbeaten records continue as this game...,post_match summary,[],
1582,arsenal,brighton-hove-albion,3bfk5g,4506283,FULL-TIME: ARSENAL 1-1 BRIGHTON,comment,[],96.0
1583,arsenal,brighton-hove-albion,3bfk5g,4506283,The free-kick is low and Arsenal block. Bright...,comment,[],96.0
1584,arsenal,brighton-hove-albion,3bfk5g,4506283,Joao Pedro wriggles past Partey and is clipped...,YC,['Joao Pedro'],96.0
1585,arsenal,brighton-hove-albion,3bfk5g,4506283,Raya enters the book for time-wasting. He's be...,YC,['David Raya'],94.0
...,...,...,...,...,...,...,...,...
1661,arsenal,brighton-hove-albion,3bfk5g,4506283,"It’s just the one change for Arsenal, as Leand...",team news,[],
1662,arsenal,brighton-hove-albion,3bfk5g,4506283,"ARSENAL SUBSTITUTES: Tommy Setford, Jakub Kiwi...",comment,[],
1663,arsenal,brighton-hove-albion,3bfk5g,4506283,"ARSENAL (4-3-3): David Raya; Jurrien Timber, G...",comment,[],
1664,arsenal,brighton-hove-albion,3bfk5g,4506283,Brighton prevailed 2-1 over Manchester United ...,comment,[],


In [None]:
dataset["Comment"].iloc[15334:]

Unnamed: 0,Comment
15334,We are underway at the American Express Stadium.
15335,The teams are in the tunnel and kick-off will ...
15336,"Meanwhile, Mikel Arteta makes three changes fr..."
15337,Fabian Hurzeler makes three changes from the B...
15338,"ARSENAL SUBS: Kieran Tierney, Martin Odegaard,..."
...,...
21780,NEWCASTLE UNITED (4-3-3): Nick Pope; Tino Livr...
21781,"WEST HAM SUBS: Lukasz Fabianski, Carlos Soler,..."
21782,WEST HAM (3-4-2-1): Alphonse Areola; Aaron Cre...
21783,After a difficult start to life under Graham P...


In [None]:
dataset.loc[dataset["URL Code"] == "3bfk5g"]

Unnamed: 0,Team A,Team B,URL Code,Comment,Type,Players,Minute
1581,arsenal,brighton-hove-albion,3bfk5g,The two unbeaten records continue as this game...,post_match summary,[],
1582,arsenal,brighton-hove-albion,3bfk5g,FULL-TIME: ARSENAL 1-1 BRIGHTON,comment,[],96.0
1583,arsenal,brighton-hove-albion,3bfk5g,The free-kick is low and Arsenal block. Bright...,comment,[],96.0
1584,arsenal,brighton-hove-albion,3bfk5g,Joao Pedro wriggles past Partey and is clipped...,YC,['Joao Pedro'],96.0
1585,arsenal,brighton-hove-albion,3bfk5g,Raya enters the book for time-wasting. He's be...,YC,['David Raya'],94.0
...,...,...,...,...,...,...,...
15340,arsenal,brighton-hove-albion,3bfk5g,"BRIGHTON SUBS: Tariq Lamptey, Adam Webster, So...",comment,[],
15341,arsenal,brighton-hove-albion,3bfk5g,BRIGHTON (4-2-3-1): Bart Verbruggen; Pervis Es...,comment,[],
15342,arsenal,brighton-hove-albion,3bfk5g,"Meanwhile, visitors Arsenal travel to the sout...",comment,[],
15343,arsenal,brighton-hove-albion,3bfk5g,Hosts Brighton welcome Mikel Arteta’s Arsenal ...,comment,[],


In [None]:
dataset["Type"].unique()

array(['post_match summary', 'comment', 'highlight', 'SI', 'AS', 'G',
       'YC', 'team news', 'kick off', 'half_time summary', 'var', 'RC',
       'PSG', 'OG', 'full time', 'stats', 'Y2C', 'penalty save',
       'half time', 'PM'], dtype=object)

In [None]:
dataset.query("`Type` == 'stats' and `Minute`.notnull()")[["Comment", "Team A", "Team B", "URL Code", "ID"]].iloc[5]

Unnamed: 0,1771
Comment,bournemouth have won four of their last six pr...
Team A,everton
Team B,afc-bournemouth
URL Code,2hkv2h
ID,4506286


In [None]:
dataset.query("`Minute`.isnull()")["Type"].unique()

array(['post_match summary', 'comment', 'team news', 'half_time summary',
       'full time'], dtype=object)

In [None]:
team_a = 'tottenham-hotspur'
team_b = 'wolverhampton-wanderers'
url_code = '2fydv8'
id = '4506501'
f"https://www.fotmob.com/matches/{team_a}-vs-{team_b}/{url_code}#{id}:tab=ticker"

'https://www.fotmob.com/matches/tottenham-hotspur-vs-wolverhampton-wanderers/2fydv8#4506501:tab=ticker'

In [None]:
AS - Assist
SI - Substitution
G - Goal
YC - Yellow Card
RC - Red Card
var - Video Assistant Referee
Y2C - Yellow and Red Card
PM - Penalty Miss

Discard the following:
post_match summary
half_time summary

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataset["Comment"], dataset[["Player Name In Commentary", "Type"]], train_size=0.8, random_state=42)

In [None]:
pd.concat([X_train, y_train], axis=1).to_csv("Football_Commentary_train.csv", index=False)

In [None]:
pd.concat([X_test, y_test], axis=1).to_csv("Football_Commentary_test.csv", index=False)

## bert-base Fine Tuning

In [None]:
!pip install --upgrade transformers datasets evaluate huggingface_hub torch

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0mSuccessfully installed datasets-3.4.1 dill-0.3.8 evaluate-0.4.3 fsspec-2024.12.0 multiprocess-0.70.16 transformers-4.50.0 xxhash-3.5.0


In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

### Model training

In [None]:
from datasets import DatasetDict, Dataset, load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                         TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
dataframe = pd.read_csv("/content/drive/MyDrive/commentary_final_dataset.csv")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of commentary types and store each row's integer class id.
le = LabelEncoder()
dataframe["labels-m1"] = le.fit_transform(dataframe["Type"])

In [None]:
from datasets import Dataset, DatasetDict
# Imported here as well: this cell has failed with
# "NameError: name 'train_test_split' is not defined" when run on its own.
from sklearn.model_selection import train_test_split

# 80/20 split of the (comment, label id) pairs. The seed is fixed so the split,
# and therefore the reported metrics, are reproducible across runs.
train, test = train_test_split(dataframe[["Comment", "labels-m1"]], test_size=0.2, random_state=42)
train = Dataset.from_pandas(train, preserve_index=False)
test = Dataset.from_pandas(test, preserve_index=False)

dataset_dict = DatasetDict({
    "train": train,
    "test": test
})

NameError: name 'train_test_split' is not defined

In [None]:
# Build both lookup tables from the classes the encoder actually learned,
# instead of assuming there are exactly 17 of them. `le.classes_[i]` is the
# label that `le.inverse_transform([i])` returns. Using a comprehension also
# avoids a loop variable named `id` shadowing the builtin for later cells.
id2label = {class_id: str(label) for class_id, label in enumerate(le.classes_)}
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
# define pre-trained model path
model_path = "google-bert/bert-base-uncased"

# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Size the classification head from the label map rather than a hard-coded 17,
# so the head always agrees with the labels found in the data.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze every base-model parameter except those of the pooler, which stay
# trainable (decided by whether "pooler" occurs in the parameter name).
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

In [None]:
# define text preprocessing
def preprocess_function(examples):
    """Tokenize the "Comment" field of a batch, truncating and padding to 128 tokens."""
    # return tokenized text with truncation
    # NOTE(review): padding='max_length' already pads every example to 128
    # tokens, so the DataCollatorWithPadding created in a later cell has nothing
    # left to pad. Dropping padding= here would presumably enable cheaper
    # dynamic padding; confirm before changing.
    return tokenizer(examples["Comment"], truncation=True, padding='max_length', max_length=128)

# preprocess all datasets
# batched=True passes groups of examples to preprocess_function in one call.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/14531 [00:00<?, ? examples/s]

Map:   0%|          | 0/3633 [00:00<?, ? examples/s]

In [None]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# load metrics
import evaluate
import numpy as np
from scipy.special import softmax

accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Return the accuracy of the evaluation predictions, rounded to 3 decimals.

    Args:
        eval_pred: A `(predictions, labels)` pair; predictions are per-class
            scores with one row per example.

    Returns:
        A dict with the single key "Accuracy".
    """
    logits, references = eval_pred

    # Cast to the dtypes the metric expects.
    references = references.astype(np.int32)
    class_probs = softmax(logits, axis=1).astype(np.float32)

    # The predicted class is the one with the highest probability.
    predicted = np.argmax(class_probs, axis=1)

    result = accuracy.compute(predictions=predicted, references=references)
    return {"Accuracy": np.round(result["accuracy"], 3)}

In [None]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

# Log, evaluate and save a checkpoint once per epoch; when training ends the
# best checkpoint is loaded back into the model (load_best_model_at_end).
# NOTE(review): no report_to is set and the recorded run prompted for a wandb
# API key, which blocks an unattended "Run All" — confirm whether wandb logging
# is wanted.
training_args = TrainingArguments(
    output_dir="football_commentary-EE",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Train on the tokenized train split and evaluate on the test split after every
# epoch, using compute_metrics for the accuracy column.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    # `tokenizer=` is deprecated in the transformers release this notebook
    # installs (4.50) and emits a warning; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfalahmanalodi[0m ([33mfalahmanalodi-mea-engineering-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.639,0.522651,0.838
2,0.4581,0.404724,0.865
3,0.4276,0.390102,0.875
4,0.4092,0.382989,0.879
5,0.3899,0.370513,0.884
6,0.3778,0.345553,0.89
7,0.3678,0.347023,0.891
8,0.351,0.343369,0.89
9,0.3497,0.343954,0.892
10,0.3446,0.339237,0.894


TrainOutput(global_step=18170, training_loss=0.4114720194957777, metrics={'train_runtime': 1360.8989, 'train_samples_per_second': 106.775, 'train_steps_per_second': 13.351, 'total_flos': 9559454147443200.0, 'train_loss': 0.4114720194957777, 'epoch': 10.0})

In [None]:
# Raise the epoch budget to 27 in total and continue from a saved checkpoint
# (resume_from_checkpoint=True) instead of starting over.
# NOTE(review): resuming presumably restores the saved optimizer/scheduler
# state, in which case the learning_rate assigned here may have little or no
# effect — confirm against the Trainer documentation.
# NOTE(review): the recorded output starts at epoch 26 although the previous
# cell stops at epoch 10, so intermediate runs are not captured in this
# notebook; a fresh run will not reproduce that log.
trainer.args.num_train_epochs = 27
trainer.args.learning_rate = 1e-5
trainer.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss,Accuracy
26,0.3206,0.331241,0.895
27,0.3128,0.329716,0.898


TrainOutput(global_step=49059, training_loss=0.023456319764132854, metrics={'train_runtime': 271.9101, 'train_samples_per_second': 1442.892, 'train_steps_per_second': 180.424, 'total_flos': 2.581052619809664e+16, 'train_loss': 0.023456319764132854, 'epoch': 27.0})

### Training Log

In [None]:
training_log = pd.DataFrame(trainer.state.log_history)

In [None]:
training_log[["epoch", "eval_loss", "eval_Accuracy"]].dropna().style.hide()

epoch,eval_loss,eval_Accuracy
1.0,0.522651,0.838
2.0,0.404724,0.865
3.0,0.390102,0.875
4.0,0.382989,0.879
5.0,0.370513,0.884
6.0,0.345553,0.89
7.0,0.347023,0.891
8.0,0.343369,0.89
9.0,0.343954,0.892
10.0,0.339237,0.894


### Save the model

In [None]:
trainer.model.save_pretrained("/content/my-bert-finetuned")
tokenizer.save_pretrained("/content/my-bert-finetuned")

('/content/my-bert-finetuned/tokenizer_config.json',
 '/content/my-bert-finetuned/special_tokens_map.json',
 '/content/my-bert-finetuned/vocab.txt',
 '/content/my-bert-finetuned/added_tokens.json',
 '/content/my-bert-finetuned/tokenizer.json')

In [None]:
import shutil

In [None]:
shutil.make_archive("/content/my-bert-finetuned", 'zip', "/content/my-bert-finetuned")

### Model testing

In [None]:
# prompt: import fine tuned model from hugging face transformers

from transformers import pipeline

# Replace with your model and tokenizer paths
model_path = "falahmanalodi/Football-Commentary-EE"

# Load the fine-tuned model and tokenizer
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)


config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'comment', 'score': 0.9362394213676453}]

In [None]:
# Example usage: classify some text
text = "Ronaldo Scores a fantastic goal"
result = classifier(text)

result

[{'label': 'AS', 'score': 0.6525484323501587}]

### New dataset preparation for Model 2 fine-tuning

In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/commentary_final_dataset.csv")

In [None]:
import pandas as pd
import random

In [None]:
dataset.columns

Index(['Unnamed: 0', 'Team A', 'Team B', 'URL Code', 'ID', 'Comment', 'Type',
       'Players', 'Minute', 'Player Name In Commentary', 'labels-m1'],
      dtype='object')

In [None]:
TRUNCATE_PERCENT = 0.4  # fraction of the rows (40%) that will be sampled for truncation

# Sample that fraction of the original dataset (fixed seed) to create the Incomplete examples
truncate_samples = dataset.sample(frac=TRUNCATE_PERCENT, random_state=42).copy()

In [None]:
# Cut a comment short so that only a random 10%–90% prefix of its words remains
def truncate_text(text, min_keep=0.1, max_keep=0.9, rng=None):
    """Return a randomly shortened prefix of ``text``.

    Args:
        text: The comment to shorten. Non-string values (e.g. NaN from a
            missing cell) are returned unchanged instead of raising.
        min_keep: Lower bound of the fraction of words to keep.
        max_keep: Upper bound of the fraction of words to keep.
        rng: Optional ``random.Random``-like object providing ``uniform``;
            the module-level ``random`` generator is used when omitted.

    Returns:
        The first ``k`` whitespace-separated words joined by single spaces,
        where ``k`` is at least 1. Texts of three words or fewer are returned
        as-is.
    """
    if not isinstance(text, str):
        return text  # nothing sensible to truncate

    words = text.strip().split()
    if len(words) <= 3:
        return text  # don't truncate very short ones

    source = random if rng is None else rng
    keep_ratio = source.uniform(min_keep, max_keep)
    # int() floors, so with max_keep < 1 at least one word is always dropped.
    keep_len = max(1, int(len(words) * keep_ratio))
    return ' '.join(words[:keep_len])

In [None]:
truncate_text("Hello world, this is falah. im a cool")

'Hello world, this is'

In [None]:
truncate_samples["Comment"] = truncate_samples["Comment"].apply(truncate_text)

In [None]:
truncate_samples["Type-m2"] = "Incomplete"

In [None]:
final_data = pd.concat([dataset, truncate_samples], ignore_index=True)
final_data = final_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [None]:
final_data.to_csv("/content/drive/MyDrive/commentary_final_dataset.csv", index=False)

In [None]:
# Rows that were not truncated carry no "Type-m2" value yet: label them "Complete".
# The mask and the assignment must target the same frame (final_data); using the
# shorter, unshuffled `dataset` here would label the wrong rows.
final_data.loc[final_data["Type-m2"].isna(), "Type-m2"] = "Complete"

In [None]:
# Persist the combined (original + truncated) frame. Writing `dataset` here would
# overwrite the file with the original rows only and lose the Incomplete examples.
# fillna keeps this cell correct on its own: any row without a label is an
# untouched original comment, i.e. "Complete".
final_data["Type-m2"] = final_data["Type-m2"].fillna("Complete")
final_data.to_csv("/content/drive/MyDrive/commentary_final_dataset.csv", index=False)

## microsoft/MiniLM-L12-H384-uncased - Fine-tuning for finding semantically connected sentences

In [None]:
!pip install --upgrade transformers datasets evaluate huggingface_hub torch

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohap

In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `Colab` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Colab`


In [None]:
from datasets import DatasetDict, Dataset, load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                         TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
dataframe = pd.read_csv("/content/drive/MyDrive/commentary_final_dataset.csv")

In [None]:
dataframe.columns

Index(['Team A', 'Team B', 'URL Code', 'ID', 'Comment', 'Type', 'Players',
       'Minute', 'Player Name In Commentary', 'Type-m2'],
      dtype='object')

In [None]:
# Remove label columns left over from earlier runs. errors="ignore" keeps the cell
# re-runnable when the columns are absent (the column listing shown for this CSV
# contains neither of them), and rebinding avoids in-place mutation.
dataframe = dataframe.drop(columns=["labels-m1", "labels-m2"], errors="ignore")

In [None]:
# load_dataset() without `split=` returns a DatasetDict (one entry per split), which
# cannot be column-indexed like a pandas frame. Merge the splits into one DataFrame
# so the cells that follow can keep treating `dataframe` as pandas.
# NOTE(review): assumes the Hub dataset carries the "Comment" and "Type-m2" columns — confirm.
hub_splits = load_dataset("falahmanalodi/Football-Commentary")
dataframe = pd.concat(
    [split.to_pandas() for split in hub_splits.values()],
    ignore_index=True,
)

README.md:   0%|          | 0.00/841 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.38M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/602k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20344 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5086 [00:00<?, ? examples/s]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer encoding from the distinct "Type-m2" values, then store the
# encoded column next to the original one.
distinct_types = list(dataframe["Type-m2"].unique())
le = LabelEncoder().fit(distinct_types)
dataframe["labels-m2"] = le.transform(dataframe["Type-m2"])

In [None]:
le.transform(["Incomplete"]), le.transform(["Complete"])

(array([1]), array([0]))

In [None]:
len(le.classes_)

2

In [None]:
# Build both lookup tables from the fitted encoder. `le.classes_[i]` is the label
# that `le.inverse_transform([i])` returns, and using comprehensions avoids
# rebinding the builtin `id` in the notebook namespace.
id2label = {index: str(name) for index, name in enumerate(le.classes_)}
label2id = {name: index for index, name in id2label.items()}

In [None]:
from datasets import Dataset, DatasetDict

# Hold out 20% of the rows for evaluation. A fixed seed makes the split
# reproducible across kernel restarts, and stratifying keeps the
# Complete/Incomplete ratio identical in both parts.
train, test = train_test_split(
    dataframe[["Comment", "labels-m2"]],
    test_size=0.2,
    random_state=42,
    stratify=dataframe["labels-m2"],
)
train = Dataset.from_pandas(train, preserve_index=False)
test = Dataset.from_pandas(test, preserve_index=False)

dataset_dict = DatasetDict({
    "train": train,
    "test": test
})

In [None]:
dataset_dict = dataset_dict.rename_column("labels-m2", "labels")

In [None]:
# define pre-trained model path
model_path = "microsoft/MiniLM-L12-H384-uncased"

# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id,)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Single pass over the base model: every parameter is frozen unless its name
# contains "pooler", in which case it stays trainable.
found_pooler = False
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name
    found_pooler |= param.requires_grad

if not found_pooler:
    print("⚠️ No pooler layer found — all base layers remain frozen.")

In [None]:
# define text preprocessing
def preprocess_function(examples):
    """Tokenize the "Comment" field of a batch, truncating to 128 tokens.

    No padding is applied here: the DataCollatorWithPadding used by the Trainer
    pads each batch to its own longest sequence, which avoids storing and
    processing 128-token rows for short comments.
    """
    return tokenizer(examples["Comment"], truncation=True, max_length=128)

# preprocess all datasets
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/20344 [00:00<?, ? examples/s]

Map:   0%|          | 0/5086 [00:00<?, ? examples/s]

In [None]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# load metrics
import evaluate
import numpy as np
from scipy.special import softmax

accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds one
            row of class logits per example and ``labels`` the reference ids.

    Returns:
        ``{"Accuracy": value}`` with the accuracy rounded to three decimals.
    """
    predictions, labels = eval_pred

    labels = np.asarray(labels).astype(np.int32)  # the metric expects integer ids

    # argmax over raw logits picks the same class as argmax over softmax
    # probabilities, so no softmax is needed here.
    predicted_classes = np.argmax(predictions, axis=1)

    result = accuracy.compute(predictions=predicted_classes, references=labels)
    return {"Accuracy": np.round(result["accuracy"], 3)}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# hyperparameters
lr = 2e-5
batch_size = 16  # used for both the training and the evaluation batches
num_epochs = 8

# Log, evaluate and checkpoint once per epoch, then reload the best checkpoint
# when training finishes.
# NOTE(review): no metric_for_best_model is set, so "best" presumably falls back
# to the library default — confirm that is the intended criterion.
training_args = TrainingArguments(
    output_dir="football_commentary-comment-clusterer",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [None]:
# Wire model, data, collator and metric together. The tokenizer is passed as
# `processing_class`; the older `tokenizer=` keyword is deprecated in the
# transformers release installed above and only triggers a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5492,0.492415,0.755
2,0.4301,0.409461,0.797
3,0.3811,0.378941,0.815
4,0.3588,0.361153,0.833
5,0.3418,0.344307,0.844
6,0.331,0.332801,0.852
7,0.3243,0.32891,0.854
8,0.3215,0.328222,0.854


TrainOutput(global_step=10176, training_loss=0.37972915697397674, metrics={'train_runtime': 433.5554, 'train_samples_per_second': 375.537, 'train_steps_per_second': 23.471, 'total_flos': 2680229642797056.0, 'train_loss': 0.37972915697397674, 'epoch': 8.0})

In [None]:
trainer.args.num_train_epochs = 20
trainer.train(resume_from_checkpoint=True)

Epoch,Training Loss,Validation Loss,Accuracy
17,0.2771,0.283733,0.876
18,0.274,0.279873,0.879
19,0.274,0.281268,0.878
20,0.2753,0.278918,0.879


TrainOutput(global_step=25440, training_loss=0.05502149114068949, metrics={'train_runtime': 207.563, 'train_samples_per_second': 1961.043, 'train_steps_per_second': 122.565, 'total_flos': 6700574106992640.0, 'train_loss': 0.05502149114068949, 'epoch': 20.0})

In [None]:
trainer.model.cpu()
trainer.model.save_pretrained("/content/microsoft_MiniLM")
tokenizer.save_pretrained("/content/microsoft_MiniLM")

('/content/microsoft_MiniLM/tokenizer_config.json',
 '/content/microsoft_MiniLM/special_tokens_map.json',
 '/content/microsoft_MiniLM/vocab.txt',
 '/content/microsoft_MiniLM/added_tokens.json',
 '/content/microsoft_MiniLM/tokenizer.json')

In [None]:
import shutil

In [None]:
shutil.make_archive("/content/microsoft_MiniLM", 'zip', "/content/microsoft_MiniLM")

'/content/microsoft_MiniLM.zip'

### Testing model

In [None]:
bert_model = AutoModelForSequenceClassification.from_pretrained("falahmanalodi/Football-Commentary-EE")
tokenizer = AutoTokenizer.from_pretrained("falahmanalodi/Football-Commentary-EE")

config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def predict(comment_):
    """Classify one commentary string with ``bert_model`` and print its label.

    The label name is read from the model's own ``config.id2label`` rather than
    from the notebook-level ``id2label`` dict, which may belong to a different
    model depending on which cells were run last.
    """
    import torch

    inputs = tokenizer(comment_, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        outputs = bert_model(**inputs)
    predicted_class = outputs.logits.argmax(dim=1).item()
    print(bert_model.config.id2label[int(predicted_class)])

In [None]:
import time

In [None]:
start = time.perf_counter()
predict("brilliant pass from rice and a good save from martinez!! the ball finds its way back to rice on the left wing after the corner and he holds onto it, waiting for a run. he spots martinelli in space on the far post and clips in a lovely cross. the angle isn't clean but the brazilian still gets a good touch on it to force martinez into making a neat save.")
stop = time.perf_counter()

highlight


In [None]:
stop - start

0.5868516250000084

In [None]:
microsoft_model = AutoModelForSequenceClassification.from_pretrained("/content/microsoft_MiniLM")
tokenizer = AutoTokenizer.from_pretrained("/content/microsoft_MiniLM")

In [None]:
def predict(comment_):
    """Classify one commentary string with ``microsoft_model`` and print its label.

    The label name is read from the model's own ``config.id2label`` rather than
    from the notebook-level ``id2label`` dict, which may belong to a different
    model depending on which cells were run last.
    """
    import torch

    inputs = tokenizer(comment_, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        outputs = microsoft_model(**inputs)
    predicted_class = outputs.logits.argmax(dim=1).item()
    print(microsoft_model.config.id2label[int(predicted_class)])

In [None]:
import time

In [None]:
start = time.perf_counter()
predict("palace come flying out of the blocks to put pressure on the hosts early. it is perhaps a little too intense as gillett gives a foul to chelsea deep in their own half.	")
stop = time.perf_counter()
print(stop - start)

Complete
0.023227099999985512


## BERT-BASE-NER Fine Tuning

### Data pre-processing

In [None]:
dataset = load_dataset("falahmanalodi/Football-Commentary")

In [None]:
dataframe = dataset["train"].to_pandas()

In [None]:
label2id = {"O": 0, "B-PLAYER": 1}
id2label = {0: "O", 1: "B-PLAYER"}

In [None]:
def generate_tokens_and_tags(text, player_names, label2id):
    """Split ``text`` on whitespace and tag the tokens that mention a player.

    Matching ignores case, surrounding punctuation and a trailing possessive
    ("martinez!!", "rice's"). Multi-word names are matched as a contiguous run
    of tokens, and every mention in the text is tagged, not only the first.

    Args:
        text: The commentary string.
        player_names: Iterable of player names to look for (may be empty or None).
        label2id: Mapping with at least the keys "O" and "B-PLAYER".

    Returns:
        ``(tokens, tag_ids)``: the original whitespace tokens and a list of the
        same length holding ``label2id`` values.
    """
    import string

    strip_chars = string.punctuation + "’‘“”"

    def _normalize(word):
        # Lower-case, trim punctuation at both ends, then drop a possessive suffix.
        word = word.strip(strip_chars).casefold()
        for suffix in ("'s", "’s"):
            if word.endswith(suffix):
                word = word[:-len(suffix)]
        return word

    tokens = text.strip().split()
    normalized = [_normalize(token) for token in tokens]
    ner_tags = ["O"] * len(tokens)

    for player in player_names or ():
        parts = [part for part in (_normalize(w) for w in str(player).split()) if part]
        if not parts:
            continue
        width = len(parts)
        for start in range(len(tokens) - width + 1):
            if normalized[start:start + width] == parts:
                # Only one entity label exists, so every word of the name gets it.
                ner_tags[start:start + width] = ["B-PLAYER"] * width

    return tokens, [label2id[tag] for tag in ner_tags]  # return lists

In [None]:
dataframe["tokens"] = None
dataframe["ner_tags"] = None

In [None]:
import ast

# Tokenize and tag every comment. The player list is stored as the text of a
# Python list; ast.literal_eval parses it without executing arbitrary code,
# which eval() on dataset content would allow.
for index, row in dataframe.iterrows():
    player_names = ast.literal_eval(row["Player Name In Commentary"])
    tokens, ner_tags = generate_tokens_and_tags(row["Comment"], player_names, label2id)
    dataframe.at[index, "tokens"] = tokens
    dataframe.at[index, "ner_tags"] = ner_tags

### Model creation

In [None]:
!pip install transformers datasets evaluate



In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Split dataframe
train_df, test_df = train_test_split(dataframe[["tokens", "ner_tags"]], test_size=0.2, random_state=42)

# Convert to HuggingFace Dataset
dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True)),
})

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"  # or microsoft/MiniLM-L12-H384-uncased if you prefer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and expand its word tags to sub-word tokens.

    Positions with no source word (``word_ids()`` yields ``None``) get the
    label -100; every other position, including continuation pieces of a
    word, gets the tag of the word it came from.
    """
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

    word_tags = example["ner_tags"]
    encoding["labels"] = [
        -100 if word_idx is None else word_tags[word_idx]
        for word_idx in encoding.word_ids()
    ]
    return encoding

tokenized_datasets = dataset_dict.map(tokenize_and_align_labels)


Map:   0%|          | 0/20344 [00:00<?, ? examples/s]

Map:   0%|          | 0/5086 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# The token-classification model has no pooler (the original version of this cell
# printed the "No pooler layer found" warning), so freezing everything except the
# pooler left the whole encoder frozen and only the linear head was trained.
# Keep the top encoder layers trainable instead, so the token representations
# can adapt to the task.
NUM_TRAINABLE_ENCODER_LAYERS = 2

num_layers = model.config.num_hidden_layers
trainable_prefixes = tuple(
    f"encoder.layer.{layer}."
    for layer in range(max(0, num_layers - NUM_TRAINABLE_ENCODER_LAYERS), num_layers)
)

found_pooler = False
trainable_tensors = 0
for name, param in model.base_model.named_parameters():
    is_pooler = "pooler" in name
    param.requires_grad = is_pooler or name.startswith(trainable_prefixes)
    found_pooler = found_pooler or is_pooler
    trainable_tensors += int(param.requires_grad)

if trainable_tensors == 0:
    print("⚠️ No trainable base-model parameters found — all base layers remain frozen.")

⚠️ No pooler layer found — all base layers remain frozen.


In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(p, positive_label=1):
    """Token-level metrics for the player tagger.

    Accuracy alone is misleading here: almost every token is "O", so a model
    that never predicts a player still scores very high. Precision, recall and
    F1 of the player class are therefore reported alongside it.

    Args:
        p: A ``(logits, labels)`` pair; ``logits`` has one score per class in
            its last axis and ``labels`` uses -100 for positions to ignore.
        positive_label: Id of the player class (1 in this notebook's ``label2id``).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = p
    predicted = np.asarray(logits).argmax(-1)
    labels = np.asarray(labels)

    keep = labels != -100  # drop special tokens and padding
    y_pred = predicted[keep]
    y_true = labels[keep]

    total = int(keep.sum())
    accuracy_value = float((y_pred == y_true).mean()) if total else 0.0

    true_pos = int(((y_pred == positive_label) & (y_true == positive_label)).sum())
    pred_pos = int((y_pred == positive_label).sum())
    real_pos = int((y_true == positive_label).sum())

    precision = true_pos / pred_pos if pred_pos else 0.0
    recall = true_pos / real_pos if real_pos else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return {
        "accuracy": accuracy_value,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# Evaluate and checkpoint once per epoch and reload the best checkpoint at the end.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# keyword and matches the classifier training cell earlier in this notebook.
training_args = TrainingArguments(
    output_dir="player-name-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated and only triggers a warning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1052,0.101322,0.973302
2,0.0936,0.091364,0.973307
3,0.0896,0.086878,0.973292
4,0.086,0.084262,0.973338
5,0.0853,0.082507,0.973515
6,0.0831,0.081352,0.973598
7,0.0826,0.080577,0.973634
8,0.0799,0.080068,0.973686
9,0.0847,0.079782,0.973743
10,0.0815,0.079693,0.973754


TrainOutput(global_step=25430, training_loss=0.09411921229270608, metrics={'train_runtime': 1331.8862, 'train_samples_per_second': 152.746, 'train_steps_per_second': 19.093, 'total_flos': 1.328955304759296e+16, 'train_loss': 0.09411921229270608, 'epoch': 10.0})

In [None]:
trainer.model.cpu()
trainer.model.save_pretrained("/content/player-name-ner-model")
tokenizer.save_pretrained("/content/player-name-ner-model")

('/content/player-name-ner-model/tokenizer_config.json',
 '/content/player-name-ner-model/special_tokens_map.json',
 '/content/player-name-ner-model/vocab.txt',
 '/content/player-name-ner-model/added_tokens.json',
 '/content/player-name-ner-model/tokenizer.json')

In [None]:
import shutil

In [None]:
shutil.make_archive("/content/player-name-ner-model", 'zip', "/content/player-name-ner-model")

'/content/player-name-ner-model.zip'

In [None]:
dataset_dict['train'].to_pandas()[["tokens", "ner_tags"]]

Unnamed: 0,tokens,ner_tags
0,"[mbeumo, has, assisted, his, fourth, goal, in,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[a, rare, chance, for, watkins, is, ultimately...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[faes, gets, carried, away, in, his, efforts, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[konate, drags, cunha, back, to, prevent, any,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[hudson-odoi, tried, a, couple, of, times, to,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
20339,"[vardy, nearly, causes, a, disastrous, error, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20340,"[nunez, and, badiashile, clash, again,, this, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20341,"[diaz, weaves, into, the, united, box,, but, h...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
20342,"[it's, another, corner, for, villa, after, wat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
ner_model = AutoModelForTokenClassification.from_pretrained("/content/player-name-ner-model")
tokenizer = AutoTokenizer.from_pretrained("/content/player-name-ner-model")

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load model and tokenizer
# (relative path: resolved against the current working directory)
tokenizer = AutoTokenizer.from_pretrained("player-name-ner-model")
model = AutoModelForTokenClassification.from_pretrained("player-name-ner-model")
label_map = model.config.id2label

# Input text
# NOTE(review): the training tokens previewed above are all lower-case while this
# sample is capitalised and the tokenizer comes from a cased checkpoint — confirm
# that the casing mismatch is intended.
text = "Goal by Cristiano Ronaldo assisted by Vinicius Jr"
tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**tokens)
# Highest-scoring class id for every token position
predictions = torch.argmax(outputs.logits, dim=2)

# Convert tokens + labels
tokens_list = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
predicted_labels = [label_map[label.item()] for label in predictions[0]]

# One "token → label" line per sub-word token, special tokens included
for token, label in zip(tokens_list, predicted_labels):
    print(f"{token:15} → {label}")

[CLS]           → O
Goal            → O
by              → O
C               → O
##rist          → O
##iano          → O
Ronald          → O
##o             → O
assisted        → O
by              → O
Vin             → O
##ici           → O
##us            → O
Jr              → O
[SEP]           → O


## MP4 to WAV converter

In [None]:
!pip install moviepy



In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
from pydub import AudioSegment
import os

def convert_to_wav_resample_pydub(input_path, output_path=None, target_sr=48000):
    """Decode an MP4 file, resample it, downmix to mono and write a WAV file.

    Args:
        input_path: Path of the MP4 file to read.
        output_path: Destination WAV path; defaults to ``input_path`` with its
            extension replaced by ``.wav``.
        target_sr: Frame rate of the exported audio in Hz.

    Returns:
        The path the WAV file was written to.
    """
    # Decode -> resample -> mono, as one chained expression
    mono_audio = (
        AudioSegment.from_file(input_path, format="mp4")
        .set_frame_rate(target_sr)
        .set_channels(1)
    )

    if output_path is None:
        output_path = os.path.splitext(input_path)[0] + ".wav"

    mono_audio.export(output_path, format="wav")
    print(f"✅ Saved WAV @ {target_sr} Hz → {output_path}")
    return output_path

In [None]:
# Call the converter under the name it is defined with in the cell above
# (`convert_mp4_to_wav_resample_pydub` is not defined anywhere in this section).
convert_to_wav_resample_pydub("audio_samples/premier_league.mp4")

✅ Saved WAV @ 48000 Hz → audio_samples/premier_league.wav


'audio_samples/premier_league.wav'

## Noise Filter

In [None]:
!pip install torch torchaudio -f https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
!pip install deepfilternet --quiet

In [None]:
from df.enhance import enhance, init_df, load_audio, save_audio
from df.utils import download_file

if __name__ == "__main__":
    # Load default model
    model, df_state, _ = init_df()
    # Download and open some audio file. You use your audio files here
    audio_path = download_file(
        "https://github.com/Rikorose/DeepFilterNet/raw/e031053/assets/noisy_snr0.wav",
        download_dir=".",
    )
    audio, _ = load_audio(audio_path, sr=df_state.sr())
    # Denoise the audio
    enhanced = enhance(model, df_state, audio)
    # Save for listening
    save_audio("enhanced.wav", enhanced, df_state.sr())

[32m2025-03-25 18:11:11[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-03-25 18:11:11[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /root/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2025-03-25 18:11:11[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-03-25 18:11:11[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /root/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2025-03-25 18:11:11[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2025-03-25 18:11:11[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


TypeError: argument 'input': 'ndarray' object cannot be converted to 'PyArray<T, D>'

### Enhance audio file

In [None]:
from df.enhance import enhance, init_df, load_audio, save_audio
from df.utils import download_file
import torch

def noise_filter(filepath, output_path=None):
    """Denoise an audio file with the default DeepFilterNet model.

    Args:
        filepath: Path of the audio file to enhance.
        output_path: Where to write the enhanced audio. Defaults to
            ``filepath`` with its extension replaced by ``.wav`` — note that
            for a ``.wav`` input this default overwrites the input file.

    Returns:
        The path the enhanced audio was written to.
    """
    import os

    # Load default model
    model, df_state, _ = init_df()

    if output_path is None:
        output_path = os.path.splitext(filepath)[0] + ".wav"

    # Read the file at the sample rate the model state reports
    audio, _ = load_audio(filepath, sr=df_state.sr())
    # Denoise the audio
    enhanced = enhance(model, df_state, audio)
    # Save for listening
    save_audio(output_path, enhanced, df_state.sr())
    return output_path

In [None]:
from df.enhance import enhance, init_df, load_audio, save_audio
from df.utils import download_file

if __name__ == "__main__":
    # Load default model
    model, df_state, _ = init_df()
    # Download and open some audio file. You use your audio files here
    audio_path = "/content/premier_league.wav"
    audio, _ = load_audio(audio_path, sr=df_state.sr())
    # Denoise the audio
    enhanced = enhance(model, df_state, audio)
    # Save for listening
    save_audio("enhanced.wav", enhanced, df_state.sr())

[32m2025-03-24 12:46:13[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-03-24 12:46:13[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /root/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2025-03-24 12:46:13[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-03-24 12:46:14[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /root/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2025-03-24 12:46:14[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2025-03-24 12:46:14[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


### Live enhancement

In [None]:
from df.enhance import enhance, init_df, load_audio
import torch
import time

model, df_state, _ = init_df()

torch.cuda.empty_cache()
def noise_filter_live(chunk):
    """Denoise one in-memory audio chunk with the module-level model.

    The chunk is converted to a float32 tensor, passed through ``enhance``
    together with the shared ``model`` and ``df_state``, and returned as a
    NumPy array with the leading axis squeezed away.
    """
    samples = torch.tensor(chunk, dtype=torch.float32)
    denoised = enhance(model, df_state, samples)
    return denoised.squeeze(0).numpy()

[32m2025-03-25 10:17:09[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-03-25 10:17:09[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /root/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2025-03-25 10:17:09[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-03-25 10:17:09[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /root/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2025-03-25 10:17:09[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cpu[0m
[32m2025-03-25 10:17:09[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


In [None]:
from df.enhance import enhance, init_df, load_audio, save_audio
from df.utils import download_file

if __name__ == "__main__":
    # Load default model
    model, df_state, _ = init_df()
    # Download and open some audio file. You use your audio files here
    audio_path = download_file(
        "https://github.com/Rikorose/DeepFilterNet/raw/e031053/assets/noisy_snr0.wav",
        download_dir=".",
    )
    audio, _ = load_audio(audio_path, sr=df_state.sr())
    # Denoise the audio
    enhanced = enhance(model, df_state, audio)
    # Save for listening
    save_audio("enhanced.wav", enhanced, df_state.sr())

[32m2025-03-25 10:16:22[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-03-25 10:16:22[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /root/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2025-03-25 10:16:22[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-03-25 10:16:22[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /root/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2025-03-25 10:16:22[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cpu[0m
[32m2025-03-25 10:16:22[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


TypeError: argument 'input': 'ndarray' object cannot be converted to 'PyArray<T, D>'

In [None]:
!deepFilter /content/noisy_snr0.wav

  from torchaudio.backend.common import AudioMetaData
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on torch 2.6.0+cu124[0m
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on host de1701537176[0m
fatal: not a git repository (or any of the parent directories): .git
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at /root/.cache/DeepFilterNet/DeepFilterNet3[0m
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint /root/.cache/DeepFilterNet/DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2025-03-25 10:15:57[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cpu[0m
[32m2025-03-25 10:15:57[0m | [1mINF

### Live Enhancement with DEMUCS

In [None]:
!pip install -U demucs --quiet

      Successfully uninstalled nvidia-cusparse-cu12-12.5.1.3
  Attempting uninstall: nvidia-cudnn-cu12
    Found existing installation: nvidia-cudnn-cu12 9.3.0.75
    Uninstalling nvidia-cudnn-cu12-9.3.0.75:
      Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75
  Attempting uninstall: nvidia-cusolver-cu12
    Found existing installation: nvidia-cusolver-cu12 11.6.3.83
    Uninstalling nvidia-cusolver-cu12-11.6.3.83:
      Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83
Successfully installed antlr4-python3-runtime-4.9.3 demucs-4.0.1 dora-search-0.1.12 julius-0.2.7 lameenc-1.8.1 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127 omegaconf-2.3.0 openunmix-1.3.0 retrying-1.3.4 submitit-1.5.2 treetable-0.2.5


In [None]:
!demucs --two-stems=vocals /content/premier_league.mp4

[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to /root/.cache/torch/hub/checkpoints/955717e8-8726e21a.th
100% 80.2M/80.2M [00:00<00:00, 214MB/s]
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /content/separated/htdemucs
Separating track /content/premier_league.mp4
100%|████████████████████████████████████████████████████████████████████████| 117.0/117.0 [00:08<00:00, 13.98seconds/s]


In [None]:
!pip install rnnoise --quiet

[31mERROR: Could not find a version that satisfies the requirement rnnoise (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for rnnoise[0m[31m
[0m

## Silero VAD based EOS

In [None]:
!pip install numpy==2.0.0

Collecting numpy==2.0.0
  Downloading numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/60.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
deepfilternet 0.5.6 requires numpy<2.0,>=1.22, b

In [None]:
import torch
import numpy as np
import os
import shutil

In [None]:
# Fetch the Silero VAD model together with its helper utilities via torch.hub
vad_model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    trust_repo=True,
)
# `utils` is unpacked positionally into its five helpers
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /root/.cache/torch/hub/master.zip


In [None]:
# VAD configuration
SAMPLING_RATE = 16000  # sample rate passed to the VAD model
CHUNK_SIZE = 512       # number of samples per VAD window
# Half a second of accumulated silence marks the end of an utterance
silence_samples_threshold = SAMPLING_RATE // 2

# Mutable segmentation state shared by the VAD helper functions
CHUNK_ID, silence_counter = 0, 0
utterance_buffer = list()

### VAD from live audio

In [None]:
# Flush the buffered utterance
def output_buffer():
    """Return the buffered utterance as one array and empty the buffer.

    Every chunk currently held in the module-level ``utterance_buffer`` is
    joined with ``np.concatenate``; the buffer is then cleared so the next
    utterance starts from scratch.

    Returns:
        The concatenated samples, on the same scale as the buffered chunks,
        or ``None`` when the buffer is empty.
    """
    global utterance_buffer
    if len(utterance_buffer) == 0:
        return None

    audio_data = np.concatenate(utterance_buffer)
    # Empty the list in place; the list object itself is kept.
    utterance_buffer.clear()
    return audio_data

In [None]:
# Run VAD over one block of live audio
def process_audio_chunk(chunk_sample):
    """Segment one block of audio into utterances using the VAD model.

    The block is walked in windows of ``CHUNK_SIZE`` samples. Windows whose
    speech probability exceeds 0.7 are buffered. Once an utterance has
    started, lower-probability windows are still buffered until
    ``silence_samples_threshold`` samples of them have accumulated; the
    utterance is then flushed through ``output_buffer()``. Anything still
    buffered when the block ends is flushed as well.

    Args:
        chunk_sample: 1-D sequence of samples accepted by ``vad_model``
            (the streaming loop passes a 16 kHz torch tensor -- TODO confirm
            for any other caller).

    Returns:
        list: the utterances returned by ``output_buffer()`` during this
        call, in the order they were flushed (possibly empty).
    """
    # `silence_counter` is assigned below, so it must be declared global;
    # otherwise `silence_counter += ...` raises UnboundLocalError.
    global utterance_buffer, silence_counter

    utterances = []

    # Iterate in fixed-size windows
    for i in range(0, len(chunk_sample), CHUNK_SIZE):
        chunk = chunk_sample[i:i + CHUNK_SIZE]

        if len(chunk) < CHUNK_SIZE:
            break  # trailing partial window is dropped

        speech_prob = vad_model(chunk, SAMPLING_RATE).item()

        if speech_prob > 0.7:
            utterance_buffer.append(chunk)
            silence_counter = 0
            print(f"{i / SAMPLING_RATE:.2f}s - Speech detected")
        elif len(utterance_buffer) > 0:
            # Silence only counts once an utterance is in progress.
            silence_counter += len(chunk)
            if silence_counter >= silence_samples_threshold:
                finished = output_buffer()
                if finished is not None:
                    utterances.append(finished)
                silence_counter = 0
            else:
                utterance_buffer.append(chunk)
                print(f"{i / SAMPLING_RATE:.2f}s - Weak speech, added to buffer")

    # Final flush if the block ends while an utterance is still buffered
    if len(utterance_buffer) > 0:
        finished = output_buffer()
        if finished is not None:
            utterances.append(finished)
        silence_counter = 0

    return utterances

### Live audio simulation

In [None]:
import librosa
import base64
import numpy as np
import time
import IPython.display as ipd
from io import BytesIO
import soundfile as sf
import torchaudio

In [None]:
# Pause and rewind every <audio> element already on the page so that earlier
# playback does not overlap with newly played audio.
js_code = """
var allAudios = document.getElementsByTagName('audio');
for (var i = 0; i < allAudios.length; i++) {
    allAudios[i].pause();
    allAudios[i].currentTime = 0;
}
"""
ipd.display(ipd.Javascript(js_code))  # run the snippet in the notebook front end

<IPython.core.display.Javascript object>

In [None]:
# Load the full recording at the rate used by the live-stream simulation
AUDIO_FILE = "/content/premier_league.wav"
# Deliberately NOT named SAMPLING_RATE: that constant is the 16 kHz rate the VAD
# helpers pass to the VAD model and use for their timestamps. Rebinding it to
# 48000 here would silently change their behaviour for every later call.
STREAM_SAMPLING_RATE = 48000
CHUNK_DURATION = 1  # in seconds

audio, sr = librosa.load(AUDIO_FILE, sr=STREAM_SAMPLING_RATE)
chunk_size = int(sr * CHUNK_DURATION)  # samples per streamed chunk
BUFFER = []

In [None]:
# Serialize an audio chunk as base64-encoded WAV text
def encode_audio(audio_chunk, sr):
    """Encode audio samples as the base64 text of an in-memory WAV file.

    Args:
        audio_chunk: samples handed to ``sf.write``.
        sr: sample rate handed to ``sf.write``.

    Returns:
        str: base64 encoding of the written bytes, decoded as UTF-8 text.
    """
    with BytesIO() as wav_stream:
        sf.write(wav_stream, audio_chunk, sr, format="wav")
        wav_bytes = wav_stream.getvalue()
    return base64.b64encode(wav_bytes).decode("utf-8")

In [None]:
# Stream audio chunks
# The 48 kHz -> 16 kHz resampler is the same for every chunk, so build it once.
resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)

for start in range(0, len(audio), chunk_size):
    chunk = audio[start: start + chunk_size]

    # DeepFilterNet enhancement. The timer uses its own names so that `start`
    # keeps meaning the chunk's sample offset.
    enh_start = time.perf_counter()
    enhanced_chunk = noise_filter_live(chunk)
    enh_stop = time.perf_counter()

    # Resample the enhanced chunk to 16 kHz for the Silero VAD step
    waveform_16k = resampler(torch.tensor(enhanced_chunk, dtype=torch.float32))

    # Silero VAD
    vad_chunk = process_audio_chunk(waveform_16k)

    base64_audio = encode_audio(enhanced_chunk, 48000)

    # JavaScript to autoplay audio
    js_code = f"""
    var audio = new Audio("data:audio/wav;base64,{base64_audio}");
    audio.play();
    """
    ipd.display(ipd.Javascript(js_code))

    # Simulate real-time delay: wait for the duration of the chunk just played
    delay = (len(chunk) / sr)
    time.sleep(delay)

    # Enhancement latency for this chunk, in seconds
    print(enh_stop - enh_start)


TypeError: argument 'input': 'ndarray' object cannot be converted to 'PyArray<T, D>'

### VAD from audio file

In [None]:
# Start from an empty 'utterances' directory: remove any existing one
# (ignoring errors, e.g. when it does not exist), then create it.
shutil.rmtree(path='utterances', ignore_errors=True)
os.makedirs(name='utterances', exist_ok=True)

# Save utterance chunk
def save_utterance():
    """Write the buffered utterance to ``utterances/utterance_<CHUNK_ID>.wav``.

    A buffer holding fewer than two chunks is discarded without being
    written. Otherwise the chunks are concatenated, converted to 16-bit PCM
    and written at ``SAMPLING_RATE``, and ``CHUNK_ID`` is advanced. In both
    cases the buffer is emptied, so a discarded chunk cannot leak into the
    next utterance.
    """
    global CHUNK_ID, utterance_buffer
    # Imported locally so this cell does not rely on a `write` name having
    # been imported by some earlier cell.
    from scipy.io import wavfile

    if len(utterance_buffer) > 1:
        audio_data = np.concatenate(utterance_buffer)
        # Clip before casting: a sample of exactly 1.0 scales to 32768, which
        # does not fit in int16.
        audio_data_int16 = np.clip(audio_data * 32768, -32768, 32767).astype(np.int16)
        filename = f'utterances/utterance_{CHUNK_ID}.wav'
        wavfile.write(filename, SAMPLING_RATE, audio_data_int16)
        print(f'[✅ SAVED] {filename} (Length: {len(audio_data) / SAMPLING_RATE:.2f}s)')
        CHUNK_ID += 1

    # Always reset, including when the buffer was too short to save.
    utterance_buffer.clear()

# Segment an audio file into utterances
def process_audio_file(file_path):
    """Run the VAD over an audio file and save each detected utterance.

    The file is read with ``read_audio`` at ``SAMPLING_RATE`` and walked in
    windows of ``CHUNK_SIZE`` samples. Windows scoring above 0.8 are buffered
    as speech. Once an utterance has started, lower-scoring windows are still
    buffered until ``silence_samples_threshold`` samples of them have
    accumulated, at which point ``save_utterance()`` is called. A trailing
    window shorter than ``CHUNK_SIZE`` ends the scan, and whatever is still
    buffered afterwards is saved too.

    Args:
        file_path: path of the audio file, passed to ``read_audio``.
    """
    global utterance_buffer, silence_counter

    wav = read_audio(file_path, sampling_rate=SAMPLING_RATE)

    for offset in range(0, len(wav), CHUNK_SIZE):
        window = wav[offset:offset + CHUNK_SIZE]
        if len(window) < CHUNK_SIZE:
            break  # end of audio

        is_speech = vad_model(window, SAMPLING_RATE).item() > 0.8
        if is_speech:
            utterance_buffer.append(window)
            silence_counter = 0
            print(f"{offset / SAMPLING_RATE:.2f}s - Speech detected")
            continue

        # Non-speech before any utterance has started is ignored.
        if len(utterance_buffer) == 0:
            continue

        silence_counter += len(window)
        if silence_counter < silence_samples_threshold:
            utterance_buffer.append(window)
            print(f"{offset / SAMPLING_RATE:.2f}s - Weak speech, added to buffer")
            continue

        # Enough silence accumulated: close the current utterance.
        save_utterance()
        silence_counter = 0

    # Save whatever is still buffered when the audio ends
    if len(utterance_buffer) > 0:
        save_utterance()

In [None]:
# Run the file-based VAD segmentation on this recording; process_audio_file()
# hands each finished utterance to save_utterance().
process_audio_file('/content/audio_samples/premier_league_converted.wav')

## Faster-Whisper

In [None]:
!pip install faster-whisper

Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-14.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.1.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m

In [None]:
import os
import time
import shutil
from faster_whisper import WhisperModel

In [None]:
# Environment flag set before the model is created.
# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably here to tolerate a duplicate
# OpenMP runtime being loaded -- confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

# Whisper checkpoint to load; run on GPU with FP16
model_size = "distil-large-v3"
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

def stt(filename, utterance_dir='/content/drive/MyDrive/utterances'):
    """Transcribe one utterance file and return its text as a single string.

    Args:
        filename: name of the audio file inside ``utterance_dir``.
        utterance_dir: directory holding the utterance files. Defaults to the
            Drive folder this notebook has always read from.

    Returns:
        str: the text of every segment with newlines removed, each followed by
        a single space; the empty string when there are no segments.
    """
    segments, _info = model.transcribe(
        f'{utterance_dir}/{filename}', language="en", vad_filter=False
    )
    # The segments are iterated exactly once here.
    return "".join(str(segment.text).replace("\n", "") + " " for segment in segments)

In [None]:
# Transcribe every utterance file, in sorted file-name order
utterance_files = sorted(os.listdir('/content/drive/MyDrive/utterances'))
comments = []
for filename in utterance_files:
    comment = stt(filename)
    comments.append(comment)

In [None]:
# Display the collected transcripts (one string per utterance file)
comments

[" tell with some time and space again running at castania again oh brilliant from matty's  towel what a run and he plays it across the face of goal might be better going for it himself  now beryvale ",
 ' hangs across in to Solanky.  This is as good as spurs the fashion so far.  Benton Corr.  Tell again. ',
 ' Madison in, son. ',
 ' Selanky and say by Leno. ',
 ' Curling effort, beaten away by Leno and Selanky over.  Great chance.  Possibly Spurs best.  Clips his heel and off goes Pereira.  Muniz in the middle.  Vicaria with a punch.  William in his kind of shooting area and he goes for it. ',
 ' William, Robinson. ',
 ' Briaray Pereira Muniz just managed to guide it past Vicario. ',
 ' Vail, great feat.  Gets around Bassie! ',
 ' No, says Calvin Bassi. Dominic Selanky had a good view with that and was appealing. ',
 ' Leah Bassi in a real panic. ',
 " It's not a great challenge for Calvin Bassie. ",
 ' Ben Davis, having to watch the flight of that as Sessian goes after it.  Sassignon

## Data processing pipeline