In [1]:
from src.config import *
from src.preprocessing import *
from src.utils import load_data
import matplotlib.pyplot as plt

prep = Preprocessor()

In [2]:
news_df = load_data('../'+DEVELOPMENT_PATH)
X = news_df.drop(columns='y')
X_prep, idxs = prep.fit_transform(X.copy())

In [3]:
X_prep

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 816301 stored elements and shape (79996, 6335)>

In [4]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack

from src.config import *


class Preprocessor2:
    
    def __init__(self):
        self.df = None
        self.is_fit = False
        self.vectorizer = None
        self.ohe = None
        self.top_50 = None
        self.title_cols_to_keep_idxs = []

    def timestamp_management(self):
        # 1) Parse timestamp: treat invalid placeholders as missing (NaT)
        ts = self.df["timestamp"].replace("0000-00-00 00:00:00", pd.NA)
        ts = pd.to_datetime(ts, errors="coerce")  # invalid -> NaT
        self.df["timestamp"] = ts

        # 2) Missingness flag (important!)
        self.df["timestamp_missing"] = self.df["timestamp"].isna().astype(int)

        # 3) Create time features ONLY where timestamp is valid
        valid = self.df["timestamp"].notna()

        # is_weekend: keep as 0/1; set to 0 for missing (or leave NaN if you prefer)
        self.df["is_weekend"] = 0
        self.df.loc[valid, "is_weekend"] = (
            self.df.loc[valid, "timestamp"].dt.day_of_week.isin([5, 6]).astype(int)
        )

        # hour sin/cos
        self.df["hour_sin"] = 0.0
        self.df["hour_cos"] = 0.0
        h = self.df.loc[valid, "timestamp"].dt.hour.astype(float)
        self.df.loc[valid, "hour_sin"] = np.sin(2 * np.pi * h / 24)
        self.df.loc[valid, "hour_cos"] = np.cos(2 * np.pi * h / 24)

        # month sin/cos
        self.df["month_sin"] = 0.0
        self.df["month_cos"] = 0.0
        m = self.df.loc[valid, "timestamp"].dt.month.astype(float)
        self.df.loc[valid, "month_sin"] = np.sin(2 * np.pi * m / 12)
        self.df.loc[valid, "month_cos"] = np.cos(2 * np.pi * m / 12)

        # year (numeric): set to -1 for missing (or keep NaN)
        self.df["year"] = -1
        self.df.loc[valid, "year"] = self.df.loc[valid, "timestamp"].dt.year.astype(int)

    def pagerank_manegement(self):
        self.df['page_rank'] = self.df['page_rank'].map({2:0, 3:0, 4:1, 5:2})

    def na_management(self): 
        null_title_idx = self.df[self.df['title'].isna()].index 
        self.df.drop(null_title_idx, inplace=True)
        
    def title_management(self):
        if self.is_fit:
            self.vectorizer = TfidfVectorizer(stop_words="english", min_df=10, ngram_range=(1,3), sublinear_tf=True)
            tfidf_matrix = self.vectorizer.fit_transform(self.df["title"].fillna(""))
            pattern = r"^\d+$"
            feat = self.vectorizer.get_feature_names_out()
            matches = [m[0] for w in feat if (m := re.findall(pattern, w))]
            title_cols_to_drop = [c for c in feat if len(c) <= 2] + matches    
            self.title_cols_to_keep_idxs = [i for i,c in enumerate(feat) if c not in title_cols_to_drop]

        else:
            tfidf_matrix = self.vectorizer.transform(self.df["title"].fillna(""))

        tfidf_matrix = tfidf_matrix[:, self.title_cols_to_keep_idxs]
        return tfidf_matrix,  [f for f in feat if f not in title_cols_to_drop]

    def source_management(self):
        df = self.df["source"].fillna("MISSING")

        if self.is_fit:
            self.top_50 = df.value_counts()[:50].index.to_list()
            self.ohe = OneHotEncoder(categories=[self.top_50], handle_unknown="ignore", sparse_output=True)
            X = df.where(df.isin(self.top_50), np.nan).to_frame()
            X_ohe = self.ohe.fit_transform(X)
        else:
            X = df.where(df.isin(self.top_50), np.nan).to_frame()
            X_ohe = self.ohe.transform(X)
        
        return X_ohe

    def full_prep(self,df):
        self.df = df
        self.na_management()
        self.timestamp_management()
        X_ohe = self.source_management()
        tfidf_matrix = self.title_management()
        output = self.df.copy()
        idxs = output.index
        self.df = None


        output.drop(columns=COLUMNS_TO_DROP, inplace=True, errors='ignore')
        output = csr_matrix(output.to_numpy(dtype=np.float32))
        X = hstack([tfidf_matrix, X_ohe, output], format="csr")
        return X, idxs


    def fit_transform(self, df):
        self.is_fit = True
        return self.full_prep(df)

    def transform(self, df):
        if self.vectorizer is None or self.ohe is None or self.top_50 is None:
            raise RuntimeError("Preprocessor.fit_transform called before fit.")
        
        self.is_fit = False
        return self.full_prep(df)

In [5]:
prep2 = Preprocessor2()
prep2.is_fit = True

prep2.df = news_df
prep2.na_management()
prep2.timestamp_management()
tfidf_matrix, feat = prep2.title_management()
# output = prep2.df.copy()
# idxs = output.index



In [6]:
row = tfidf_matrix[0]
nonzero_idxs = row.nonzero()[1]

tokens = np.array(feat)[nonzero_idxs]
values = row.data

pd.Series(values, index=tokens).sort_values(ascending=False)

revenue    0.473376
nigeria    0.457874
opec       0.453730
boosts     0.446236
oil        0.321272
dtype: float64

In [7]:
news_df.loc[0, 'title']

'OPEC Boosts Nigeria&#39;s Oil Revenue By .82m Bpd'

In [8]:
feat

['000 jobs',
 '000 mortgage',
 '000 mortgage 690',
 '000 mortgage 730',
 '10 000',
 '10 killed',
 '10 million',
 '10 percent',
 '10 year',
 '10 years',
 '100 000',
 '100 million',
 '100m',
 '10th',
 '11th',
 '12 000',
 '12 percent',
 '12th',
 '13th',
 '14 ap',
 '15 years',
 '150 000',
 '150 000 mortgage',
 '175 000',
 '175 000 mortgage',
 '1bn',
 '1st',
 '20 years',
 '2008 race',
 '24 hours',
 '25 million',
 '25 percent',
 '25 years',
 '2nd',
 '30 days',
 '30 year',
 '36 billion',
 '36 million',
 '39 39',
 '39 afp',
 '39 best',
 '39 bid',
 '39 big',
 '39 death',
 '39 end',
 '39 future',
 '39 health',
 '39 husband',
 '39 iraq',
 '39 latest',
 '39 law',
 '39 life',
 '39 ll',
 '39 moon',
 '39 neal',
 '39 new',
 '39 nuclear',
 '39 plan',
 '39 return',
 '39 shark',
 '39 star',
 '39 win',
 '3bn',
 '3q profit',
 '3rd',
 '3rd quarter',
 '49ers',
 '4th',
 '50 barrel',
 '500 000',
 '500 jobs',
 '5th',
 '690 month',
 '6th',
 '70 000',
 '730 month',
 '76ers',
 'a380',
 'abandoned',
 'abbas',
 'abb

Number must be for sure managed in a different way...

In [9]:
titles_with_numbers = news_df[news_df["title"].str.contains(r"\d", regex=True, na=False)]["title"]

len(titles_with_numbers)

13445

In [10]:
for title in titles_with_numbers[:20]:
    print(title)
    print('='*30)

OPEC Boosts Nigeria&#39;s Oil Revenue By .82m Bpd
Yearender: Mideast peace roadmap reaches dead-end in 2004
Battleground Dispatches for Oct. 5 \
    (CQPolitics.com)\

Opera Star Robert Merrill Dies at 85
Â£800M PLEDGED FOR VIRGIN BOSS&#39;S SPACE TRIPS
Wells Leads Grizzlies Past Spurs 93-90 (AP)
 Double Bombing Kills At Least 42 in Hyderabad 
MmO2 signs i-mode deal
German Police to Drop 1970s Mustard/beige Uniforms (Reuters)
Australia&#39;s Voters to Choose Between Howard, Latham (Update1)
Thai 2004 growth forecast cut to 6.0-6.5 percent as Q2 growth slows (AFP)
Schroeder begins 3-day China visit
US IT Spending to Grow 7% Next Year
Downer: Hicks lobby &#39;if innocent&#39;
Tigers to resume &#39;&#39;freedom struggle&#39;&#39; if talks not resumed
Intel Chips to Shrink to 32-Nanometer Process
A. D. Lewis, Administrator Behind Conrail, Is Dead at 88
Pakistan Kills 50 Fighters on Afghan Border
Bobcats 94 Hornets 93, overtime
Eros announces 5-year deal with K Sera Sera 


Okay, maybe it's a big work but I want to categorize numbers:

- year: (1900-2025 also with an ending s) 
- percentage: (%number o number%)
- money: (\$number, number\$, or euros... )
- score: (number - number)

and I substitute with yeartoken, percentagetoken etc...

Now I need to validate this initial intuitive pattern with real data

In [11]:
for title in titles_with_numbers:
    print(title)
    print('='*30)

OPEC Boosts Nigeria&#39;s Oil Revenue By .82m Bpd
Yearender: Mideast peace roadmap reaches dead-end in 2004
Battleground Dispatches for Oct. 5 \
    (CQPolitics.com)\

Opera Star Robert Merrill Dies at 85
Â£800M PLEDGED FOR VIRGIN BOSS&#39;S SPACE TRIPS
Wells Leads Grizzlies Past Spurs 93-90 (AP)
 Double Bombing Kills At Least 42 in Hyderabad 
MmO2 signs i-mode deal
German Police to Drop 1970s Mustard/beige Uniforms (Reuters)
Australia&#39;s Voters to Choose Between Howard, Latham (Update1)
Thai 2004 growth forecast cut to 6.0-6.5 percent as Q2 growth slows (AFP)
Schroeder begins 3-day China visit
US IT Spending to Grow 7% Next Year
Downer: Hicks lobby &#39;if innocent&#39;
Tigers to resume &#39;&#39;freedom struggle&#39;&#39; if talks not resumed
Intel Chips to Shrink to 32-Nanometer Process
A. D. Lewis, Administrator Behind Conrail, Is Dead at 88
Pakistan Kills 50 Fighters on Afghan Border
Bobcats 94 Hornets 93, overtime
Eros announces 5-year deal with K Sera Sera 
The Workplace: It&

In [12]:
import re

def titles_by_regex(titles, pattern):
    regex = re.compile(pattern, flags=re.IGNORECASE)

    count = 0
    matches = []
    for t in titles:
        if regex.search(t):
            matches.append(t)


    print(f"\nTotal matches: {len(matches)}")
    return matches

In [13]:
year_pattern = r"\b(19|20)\d{2}s?\b"
matches = titles_by_regex(titles_with_numbers, year_pattern)



Total matches: 933


In [14]:
for title in matches[:50]:
    print(title)
    print("="*30)

Yearender: Mideast peace roadmap reaches dead-end in 2004
German Police to Drop 1970s Mustard/beige Uniforms (Reuters)
Thai 2004 growth forecast cut to 6.0-6.5 percent as Q2 growth slows (AFP)
Get Peachtree Pro Accounting 2008 free after rebate
Ford 4th-quarter loss narrows; forecasts 2008 loss
Best First Class 2004
2004 forecast raised on strength of &#39;The Incredibles&#39;
Forecaster predicts busy 2007 U.S. hurricane season \
    (Reuters)\

Bonds used steroids in 2003, trainer says on recording
2010 World Cup the target as Burley takes charge of Scotland \
    (AFP)\

2004 Presidential Endorsements (AP)
Lilly sees slower profit growth in 2007
Spaceflight to cost $40 million in 2009
World briefs - November 30, 2004
Sen. Brownback drops out of 2008 campaign \
    (Reuters)\

SideStep Named Momentum 2006 Company at the IBDNetwork Annual Momentum Growth Conference
Aug. 15, 2004
Go &quot;Dutch&quot; in 2005: Eat A Polymeal of Wine, Fish, Chocolate <b>...</b>
Whatever Happened To . . . 

In [15]:
pct_pattern = r"(\b\d+(\.\d+)?\s?%)|(%\s?\d+(\.\d+)?\b)|\b\d+(?:\.\d+)?\s?percent(?:age)?s?\b"
# pct_pattern2 = r"\b(?:\d+(?:\.\d+)?\s?%|%\s?\d+(?:\.\d+)?|\d+(?:\.\d+)?\s?percent(?:age)?s?)\b"
titles = titles_by_regex(titles_with_numbers, pct_pattern)
# titles.append(titles_by_regex(titles_with_numbers, pct_pattern2))
for title in titles:
    print(title)
    print('='*30)


Total matches: 452
Thai 2004 growth forecast cut to 6.0-6.5 percent as Q2 growth slows (AFP)
US IT Spending to Grow 7% Next Year
State-Owned Russian Bank Buys a 5% Stake in EADS
Earnings Rise 24% at Kimberly-Clark
CA cutting 5 percent of workforce
Households face 15% jump in fuel prices
CIT Group&#39;s Q3 profit up 25%, tops by a penny
Heineken 1H profit rises 26 percent \
    (AP)\

Grass biofuels 'cut CO2 by 94%'
Earnings Fall 32 Percent at Bank of America
 Freddie Mac Profit Down 45 Percent in 2Q 
A-level top grade will need 90%
Citi mulls cutting work force by 5 to 10 percent: report
Crude Oil Surges 33.6 Percent in 2004
 Citigroup Profits Jump 18 Percent in 2Q 
Fed raises rates in U.S. for a 6th time, to 2.5%
IPO Gives 17 Percent Hike to Shares at DivX
UPDATE 1-Perry acquires 9.89 percent stake in Mylan Labs
Smithfield 2nd-Qtr Net Soars 61% on Higher Hog Prices (Update3)
Durable goods orders slid 8.3 percent in Oct
TXU quarterly earnings rise 33 percent
Boston Scientific stock cl

In [16]:
money_pattern = r"[$€£]\s?\d+(\.\d+)?\w*\b"
titles_by_regex(titles_with_numbers, money_pattern)


Total matches: 1368


['Â£800M PLEDGED FOR VIRGIN BOSS&#39;S SPACE TRIPS',
 'Fattah Got a $100 Gift From Santa Street',
 'Freescale 1000-worker cull to cost $65m',
 'ADV: $175,000 Mortgage for Under $730/Month',
 'Mortgage insurer MGIC to buy Radian for $5 bln \\\n    (Reuters)\\\n',
 'Jaguar negotiates Â£534m Ford aid',
 'Update: Motorola to buy Symbol for $3.9 billion',
 'UN pledges help for Darfur refugees, Sudan wants $300 mln',
 'Everton reject Â£20m Rooney bid',
 'Basket Nears $8,800 With 5 Days to Go',
 'HP to pay $14.5m following probe into boardroom leaks',
 'PacifiCare buys AMS for $502 million',
 'General Mills sells Snack Ventures Europe stake for $750 million',
 'CA to acquire Netegrity for $430M',
 ' LSI to Buy Agere in $4B Stock Swap ',
 'Murder manhunt costs reach Â£1.5m',
 'ADV: $150,000 Mortgage for Under $690/Month',
 'Drug pipeline brings $80m more to Synta ',
 'Google boys to pocket Â£650m from share sale',
 'eBay snaps up Rent.com for $415m',
 'Spaceflight to cost $40 million in 2009',

In [17]:
score_pattern = r"\b\d+\s?[-–]\s?\d+\b"
titles_by_regex(titles_with_numbers, score_pattern)


Total matches: 501


['Wells Leads Grizzlies Past Spurs 93-90 (AP)',
 'Thai 2004 growth forecast cut to 6.0-6.5 percent as Q2 growth slows (AFP)',
 "Beltran's blast in 16th lifts Mets 9-8 \\\n    (AP)\\\n",
 'Bills Overpower Browns, 37-7 (AP)',
 'Colorado (3-0) At Missouri (2-1)',
 'Webber Leads Kings Past Clippers, 89-83 (AP)',
 'Knuble nets 2 as Flyers beat Caps 6-4 \\\n    (AP)\\\n',
 'VfB Stuttgart 3-0 Benfica: FT Report',
 "Packers lead Eagles 3-0 on Rayner's FG \\\n    (AP)\\\n",
 'Juventus stretches Serie A lead while Roma and Inter play to 3-3 <b>...</b>',
 'US, Brazil tied 0-0 in gold medal match',
 'Patriots, Now a Perfect 16-0, Go For 19',
 'Redskins Tied, 3-3',
 'Dodgers Lead Cardinals 1-0 After One',
 'Miami Upsets No. 22 Maryland, 75-73 (OT) (AP)',
 'Man Utd 2-0 Arsenal: FT Report',
 'Erat lifts Predators past Flyers 3-2 \\\n    (AP)\\\n',
 'Washington St. tops No. 18 Gonzaga 77-67 \\\n    (AP)\\\n',
 'Rick Majerus Named USC Men&#39;s Basketball Coach For 2005-06 Season',
 'Pena Homers Give R

In [71]:
year_pattern = r"\b(19|20)\d{2}s?\b"
pct_pattern = (r"(\b\d+(\.\d+)?\s?%)"
               r"|(%\s?\d+(\.\d+)?\b)"
               r"|\b\d+(?:\.\d+)?\s?percent(?:age)?s?\b"
                r"\b\d+(?:\.\d+)?\s*[-–]\s*\d+(?:\.\d+)?\s*(?:%|percent(?:age)?s?)\b"
                r"|\b\d+(?:\.\d+)?\s*%\b"
                r"|%\s*\d+(?:\.\d+)?\b"
                r"|\b\d+(?:\.\d+)?\s*percent(?:age)?s?\b"               
)
unit_pattern = r"\b\d+(?:\.\d+)?\s?(?:GB|MB|TB|MP|kg|g|km|m|cm|mm)\b"
money_pattern = r"((\d+((\.|,)\d*)?[$€£]+)|([$€£]+\d+((\.|,)\d*)?))(M|B)*"
score_pattern = r"\b\d+\s?[-–]\s?\d+\b"
quarter_pattern = r"\b(?:[1-4]Q|Q[1-4])\b"
ord_pattern = r"\b\d+(?:st|nd|rd|th)\b"
num_pattern = r"(?<![A-Za-z])\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b(?![A-Za-z])"
iso_pattern = r"\b\d{4,5}:\d{4}\b"


replacements = [
    (iso_pattern,     "ISOTOKEN"),       # 13485:2003
    (quarter_pattern, "QUARTERTOKEN"),   # 3Q, Q4
    (score_pattern,   "SCORETOKEN"),     # 3-1
    (pct_pattern,     "PCTTOKEN"),       # 6%, %6, 6 percent, 6.0-6.5 percent
    (money_pattern,   "MONEYTOKEN"),     # €100, $100.5, 100€ (vedi nota sotto)
    (unit_pattern,    "UNITTOKEN"),      # 8GB, 80kg, 100m
    (year_pattern,    "YEARTOKEN"),      # 1990, 1990s
    (ord_pattern,     "ORDTOKEN"),       # 3rd, 10th
    (num_pattern,     "NUMTOKEN"),       # numeri standalone rimanenti
]

import unicodedata
import html

news_df["title"] = news_df["title"].astype(str).apply(html.unescape)

s = news_df['title'].astype(str)
s = s.apply(lambda x: unicodedata.normalize("NFKC", x))

for pattern, token in replacements:
    s = s.str.replace(
    pattern,
    token,
    regex=True,
    flags=re.IGNORECASE
    )
news_df['title'] = s

In [72]:
titles_with_numbers = news_df[news_df["title"].str.contains(r"\d", regex=True, na=False)]["title"]

len(titles_with_numbers)

890

In [73]:
for title in titles_with_numbers[100:200]:
    print(title)
    print('='*30)

Cardinals Beat Astros to Force Game NUMTOKEN of NL Series (Update1)
SABMiller Venture to Buy Lion Nathan's China Brewers (Update1)
Seven US Soldiers Killed in Texas Helicopter Crash (Update1)
G7 fails to reach debt deal
Airbus sees A350 orders ahead
A âPG-13â Rating, Despite the Haze
Ferrari's Raikkonen storms to F1 title
Bono urges G8 nations to open purses in fight against AIDS \
    (AFP)\

Mayer Goes Nashville for VH1 Special (AP)
G7 warns markets against yen-depressing trades
F1 power struggle now in session.
UB40 singer Campbell quits group
Samsung Forecasts Global Chip Sales to Slow Next Year (Update2)
AWB to Supply AMONEYTOKEN Million of Milling Wheat to China (Update1)
Eight injured in M1 coach crash
Oracle Raises PeopleSoft Bid by PCTTOKEN to MONEYTOKEN Billion (Update5)
Bush Is Leading Kerry in Surveys by Newsweek, Time (Update2)
GSPDA M28 Palm OS Smartphone Hits Asia
Siemens SF65 Mobile Phone Digital Camera
UPS says to cancel order for NUMTOKEN Airbus A380s
Input inflat

In [62]:
pct_pattern = r"[3,4,5]G"
titles = titles_by_regex(titles_with_numbers, pct_pattern)
# titles.append(titles_by_regex(titles_with_numbers, pct_pattern2))
for title in titles:
    print(title)
    print('='*30)


Total matches: 55
SingTel poised to launch first 3G service in Southeast Asia (AFP)
Huawei wins 3G contract from Telfort
Take a Look at Proven Strategies for 3G & Broadband Mobile Data Services & Applications
Vodafone 'looking into' 3G femtocells
Euro iPhone launch will reveal 3G handset for Vodafone, T-Mobile
Vodafone enters 3G battle against Hutchison
Panasonic Unveils International 3G Cell Phone
T-Mobile USA to set up nationwide 3G network \
    (FT.com)\

3G kicks off in Norway
mmO2 to launch speedy 3G
Alienware's 4GHz Pentium
Japan's Docomo Unveils 3G Phones with 3D Sound
DoCoMo and Motorola seal 3G deal
Orange readies a 3G Christmas
Business jets get inflight 3G
HTC Touch Smart Phone Goes 3G
Orange 3G phones launched in UK
Japan to build 3G capacity in China
Motorola, DoCoMo to make 3G phones for business use
Nokia says row with Qualcomm may hurt 3G uptake \
    (Reuters)\

U.K. to test improved indoor 3G coverage by year-end
3GSM World Congress
Report: DoCoMo, Cingular to link 

In [105]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack
import unicodedata
import html

from src.config import *


class Preprocessor2:
    
    def __init__(self):
        self.df = None
        self.is_fit = False
        self.vectorizer = None
        self.ohe = None
        self.top_50 = None

    def timestamp_management(self):
        # 1) Parse timestamp: treat invalid placeholders as missing (NaT)
        ts = self.df["timestamp"].replace("0000-00-00 00:00:00", pd.NA)
        ts = pd.to_datetime(ts, errors="coerce")  # invalid -> NaT
        self.df["timestamp"] = ts

        # 2) Missingness flag (important!)
        self.df["timestamp_missing"] = self.df["timestamp"].isna().astype(int)

        # 3) Create time features ONLY where timestamp is valid
        valid = self.df["timestamp"].notna()

        # is_weekend: keep as 0/1; set to 0 for missing (or leave NaN if you prefer)
        self.df["is_weekend"] = 0
        self.df.loc[valid, "is_weekend"] = (
            self.df.loc[valid, "timestamp"].dt.day_of_week.isin([5, 6]).astype(int)
        )

        # hour sin/cos
        self.df["hour_sin"] = 0.0
        self.df["hour_cos"] = 0.0
        h = self.df.loc[valid, "timestamp"].dt.hour.astype(float)
        self.df.loc[valid, "hour_sin"] = np.sin(2 * np.pi * h / 24)
        self.df.loc[valid, "hour_cos"] = np.cos(2 * np.pi * h / 24)

        # month sin/cos
        self.df["month_sin"] = 0.0
        self.df["month_cos"] = 0.0
        m = self.df.loc[valid, "timestamp"].dt.month.astype(float)
        self.df.loc[valid, "month_sin"] = np.sin(2 * np.pi * m / 12)
        self.df.loc[valid, "month_cos"] = np.cos(2 * np.pi * m / 12)

        # year (numeric): set to -1 for missing (or keep NaN)
        self.df["year"] = -1
        self.df.loc[valid, "year"] = self.df.loc[valid, "timestamp"].dt.year.astype(int)

    def pagerank_manegement(self):
        self.df['page_rank'] = self.df['page_rank'].map({2:0, 3:0, 4:1, 5:2})

    def na_management(self): 
        null_title_idx = self.df[self.df['title'].isna()].index 
        self.df.drop(null_title_idx, inplace=True)

    @staticmethod
    def clean_number(s):
        year_pattern = r"\b(19|20)\d{2}s?\b"
        pct_pattern = (r"(\b\d+(\.\d+)?\s?%)"
                    r"|(%\s?\d+(\.\d+)?\b)"
                    r"|\b\d+(?:\.\d+)?\s?percent(?:age)?s?\b"
                        r"\b\d+(?:\.\d+)?\s*[-–]\s*\d+(?:\.\d+)?\s*(?:%|percent(?:age)?s?)\b"
                        r"|\b\d+(?:\.\d+)?\s*%\b"
                        r"|%\s*\d+(?:\.\d+)?\b"
                        r"|\b\d+(?:\.\d+)?\s*percent(?:age)?s?\b"               
        )
        unit_pattern = r"\b\d+(?:\.\d+)?\s?(?:GB|MB|TB|MP|kg|g|km|m|cm|mm)\b"
        money_pattern = r"((\d+((\.|,)\d*)?[$€£]+)|([$€£]+\d+((\.|,)\d*)?))(M|B)*"
        score_pattern = r"\b\d+\s?[-–]\s?\d+\b"
        quarter_pattern = r"\b(?:[1-4]Q|Q[1-4])\b"
        ord_pattern = r"\b\d+(?:st|nd|rd|th)\b"
        num_pattern = r"(?<![A-Za-z])\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b(?![A-Za-z])"
        iso_pattern = r"\b\d{4,5}:\d{4}\b"

        replacements = [
            (iso_pattern,     "ISOTOKEN"),       # 13485:2003
            (quarter_pattern, "QUARTERTOKEN"),   # 3Q, Q4
            (score_pattern,   "SCORETOKEN"),     # 3-1
            (pct_pattern,     "PCTTOKEN"),       # 6%, %6, 6 percent, 6.0-6.5 percent
            (money_pattern,   "MONEYTOKEN"),     # €100, $100.5, 100€ (vedi nota sotto)
            (unit_pattern,    "UNITTOKEN"),      # 8GB, 80kg, 100m
            (year_pattern,    "YEARTOKEN"),      # 1990, 1990s
            (ord_pattern,     "ORDTOKEN"),       # 3rd, 10th
            (num_pattern,     "NUMTOKEN"),       # numeri standalone rimanenti
        ]
        for pattern, token in replacements:
            s = s.str.replace(
            pattern,
            token,
            regex=True,
            flags=re.IGNORECASE
            )
        return s
    
    @staticmethod
    def clean_text(s):
        s = s.apply(html.unescape)
        s = s.apply(lambda x: unicodedata.normalize("NFKC", x))
        return s

    def title_management(self):
        self.df["title"] = Preprocessor2.clean_text(self.df['title'])
        self.df['title'] = Preprocessor2.clean_number(self.df['title'])

        titles = self.df["title"].fillna("")

        if self.is_fit:
            KEEP_2CHAR = {
                "uk","us","eu","un","ny","nj","nh","la","tv","ip","xp","hp","hq",
                "f1","g7","g8","u2","vw","gm","bp","ft","cd"
            }
            keep2 = "|".join(sorted(KEEP_2CHAR))
            token_pattern = rf"(?u)\b(?!\d+\b)(?:[A-Za-z]{{3,}}|(?:{keep2})|[A-Za-z]\w+)\b"

            self.vectorizer = TfidfVectorizer(stop_words="english", token_pattern=token_pattern, min_df=10, ngram_range=(1,3), sublinear_tf=True, strip_accents="unicode")
            return  self.vectorizer.fit_transform(titles), self.vectorizer.get_feature_names_out()
        else:
            return self.vectorizer.transform(titles)

    def source_management(self):
        df = self.df["source"].fillna("MISSING")

        if self.is_fit:
            self.top_50 = df.value_counts()[:50].index.to_list()
            self.ohe = OneHotEncoder(categories=[self.top_50], handle_unknown="ignore", sparse_output=True)
            X = df.where(df.isin(self.top_50), np.nan).to_frame()
            X_ohe = self.ohe.fit_transform(X)
        else:
            X = df.where(df.isin(self.top_50), np.nan).to_frame()
            X_ohe = self.ohe.transform(X)
        
        return X_ohe

    def full_prep(self,df):
        self.df = df
        self.na_management()
        self.timestamp_management()
        X_ohe = self.source_management()
        tfidf_matrix = self.title_management()
        output = self.df.copy()
        idxs = output.index
        self.df = None


        output.drop(columns=COLUMNS_TO_DROP, inplace=True, errors='ignore')
        output = csr_matrix(output.to_numpy(dtype=np.float32))
        X = hstack([tfidf_matrix, X_ohe, output], format="csr")
        return X, idxs


    def fit_transform(self, df):
        self.is_fit = True
        return self.full_prep(df)

    def transform(self, df):
        if self.vectorizer is None or self.ohe is None or self.top_50 is None:
            raise RuntimeError("Preprocessor.fit_transform called before fit.")
        
        self.is_fit = False
        return self.full_prep(df)

In [106]:
prep2 = Preprocessor2()
prep2.is_fit = True

prep2.df = news_df
prep2.na_management()
prep2.timestamp_management()
tfidf_matrix, feat = prep2.title_management()

In [107]:
for f in feat:
    if any(ch.isdigit() for ch in f):
        print(f)



a380
co2
f1
g7
g8
mp3
p2p
ps3
sp2
u2
update1
update2
update3
update4
update5
xp sp2
