In [5]:
import re
import html
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def text_cleaner_wrapper(df):
    """Clean the ``title`` and ``article`` columns of ``df`` and add length columns.

    The frame is modified in place and also returned. Missing values in both
    text columns are replaced with empty strings before each column is passed
    through ``clean_text``. Two columns, ``len_article`` and ``len_title``,
    are then added from ``.str.len()`` of the cleaned columns, and every
    article whose length is below 5 is overwritten with ``""``.

    NOTE(review): ``clean_text`` as defined in this notebook returns a list of
    token lists, so the lengths are presumably token counts rather than
    character counts, and the ``""`` sentinel mixes strings into a column of
    lists -- confirm this is intended.
    """
    text_columns = ("title", "article")

    # Fill the gaps first, then clean, keeping the two passes separate.
    for column in text_columns:
        df[column] = df[column].fillna("")
    for column in text_columns:
        df[column] = clean_text(df[column])

    df["len_article"] = df["article"].str.len()
    df["len_title"] = df["title"].str.len()

    # Articles that are too short after cleaning are blanked out.
    too_short = df["len_article"] < 5
    df.loc[too_short, "article"] = ""

    return df


# Whole words deleted from every document before tokenisation: URL/feed/HTML
# residue plus a few terms hand-picked for this corpus.
_HTML_NOISE_WORDS = (
    "http", "https", "www", "com",
    "rss", "feed", "feeds",
    "img", "src", "href",
    "reuters", "border", "said", "new",
    "yahoo", "yimg", "jpg", "jpeg", "png", "gif",
    "width", "height", "align", "alt",
    "photo", "clear", "left", "right",
    "sig",
    "dailynews", "csmonitor", "feedburner",
    "yeartoken",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "president", "people", "world",
)

# Built once when the cell runs instead of on every call.
_NOISE_WORD_PATTERN = r"(?u)\b(" + "|".join(map(re.escape, _HTML_NOISE_WORDS)) + r")\b"


def clean_text(s):
    """Normalise a Series of raw texts and return one token list per entry.

    Processing order:
      1. missing values become empty strings and every entry is cast to ``str``;
      2. HTML entities are unescaped and the text is lower-cased;
      3. whole-word occurrences of the noise vocabulary are blanked;
      4. whitespace is collapsed, then digit runs and every character that is
         neither ``\\w`` nor whitespace are replaced by a space;
      5. each text is tokenised with ``word_tokenize``, English stop words are
         dropped and the remaining tokens are lemmatised.

    Parameters
    ----------
    s : pandas.Series
        Raw texts. ``NaN``/``None`` entries are treated as empty documents.

    Returns
    -------
    list[list[str]]
        Lemmatised tokens for each entry of ``s``, in the same order.
    """
    # html.unescape raises on non-string values such as NaN, so sanitise here
    # rather than relying on every caller to call fillna first.
    s = s.fillna("").astype(str)

    s = s.apply(html.unescape).str.lower()

    s = s.str.replace(_NOISE_WORD_PATTERN, " ", regex=True)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    s = s.str.replace(r"\d+", " ", regex=True)
    # NOTE(review): "\w" matches "_", so underscore-joined identifiers
    # (e.g. "wl_nm" in the recorded output) survive this step -- confirm
    # whether they should be split or dropped.
    s = s.str.replace(r'[^\w\s]', " ", regex=True)

    # Loaded lazily because it needs the NLTK stopwords corpus to be installed.
    stop_words = set(stopwords.words('english'))

    return [
        [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(document)
            if token not in stop_words
        ]
        for document in s
    ]


In [2]:
import pandas as pd

from src.config import * 

In [3]:
# Load the development dataset from CSV.
# NOTE(review): DEVELOPMENT_PATH is presumably provided by the star import
# from src.config -- confirm, and consider importing it by name.
news_df = pd.read_csv(DEVELOPMENT_PATH)

In [6]:
# Tokenise the article column; missing articles are replaced by "" first so
# the cleaner only ever receives strings.
data = clean_text(news_df['article'].fillna(""))

In [7]:
# Number of cleaned documents (clean_text returns one token list per input entry).
len(data)

79997

In [18]:
# Preview only the first few token lists; displaying the whole corpus floods
# the notebook output and bloats the saved file.
data[:5]

[['organisation',
  'petroleum',
  'exporting',
  'country',
  'opec',
  'hiking',
  'official',
  'output',
  'one',
  'million',
  'barrel',
  'per',
  'day',
  'effective',
  'november',
  'nigeria',
  'getting',
  'barrel',
  'per',
  'day',
  'per',
  'cent',
  'quota'],
 ['looking',
  'back',
  'major',
  'event',
  'took',
  'place',
  'middle',
  'east',
  'death',
  'veteran',
  'palestinian',
  'leader',
  'yasser',
  'arafat',
  'undoubtedly',
  'shaker'],
 ['cqpolitics',
  'today',
  'battleground',
  'dispatch',
  'ã',
  'â',
  'â',
  'roundup',
  'going',
  'year',
  'hottest',
  'race',
  'based',
  'report',
  'local',
  'national',
  'medium'],
 ['air',
  'rather',
  'oxygen',
  'used',
  'resuscitate',
  'newborn',
  'baby',
  'may',
  'even',
  'save',
  'life',
  'research',
  'suggests'],
 ['p',
  'u',
  'rd',
  'europe',
  'news',
  'nm',
  'wl_nm',
  'transport_germany_accident_dc',
  'u',
  'p',
  'rids',
  'r',
  'x',
  'df',
  'yh',
  'mhxp',
  'zblqrj',
  'sm

In [8]:
from gensim.models import Word2Vec
import gensim

# Train 100-dimensional word vectors on the tokenised articles.
# sg=1 selects skip-gram according to the gensim documentation; words seen
# fewer than min_count=5 times are left out of the vocabulary.
# NOTE(review): training with workers > 1 is reportedly not bit-for-bit
# reproducible between runs -- confirm against the gensim docs if the
# downstream metrics need to be exactly repeatable.
model = Word2Vec(sentences=data, vector_size=100, window=5, sg=1, min_count=5, workers=4)


In [9]:
# Persist the trained model; the relative path resolves against the
# notebook's current working directory.
model.save("word2vec.model")

In [14]:
import numpy as np

def document_embedding(tokens, model):
    """Average the word vectors of ``tokens`` into a single document vector.

    Tokens that are not present in ``model.wv`` are ignored. When none of the
    tokens is known (or ``tokens`` is empty) a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]

    if not known_vectors:
        # Document with no in-vocabulary words.
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)


In [15]:
# Stack one averaged document vector per cleaned article into a 2-D matrix
# (rows follow the order of `data`).
X_embeddings = np.vstack([
    document_embedding(doc, model)
    for doc in data
])


In [17]:
# Sanity check: expected to be (number of documents, model.vector_size).
X_embeddings.shape

(79997, 100)

In [1]:
from src.evaluation import *

# Hyper-parameters handed to `performance` for the 'linear_svm' run.
# NOTE(review): presumably forwarded to a linear SVM estimator inside
# src.evaluation -- confirm there.
params = dict(
    C=0.01,
    class_weight="balanced",
    dual=False,
    max_iter=5000,
)
performance("linear_svm", params, is_w2v=True)

  from .autonotebook import tqdm as notebook_tqdm


(61302, 460)


{'precision_macro': 0.6874576374001654,
 'recall_macro': 0.7330887630466743,
 'f1-macro': 0.6994954808998938,
 'per_class': {'precision': array([0.75555062, 0.73006932, 0.7928506 , 0.63829787, 0.7771261 ,
         0.62283156, 0.49547739]),
  'recall': array([0.74725516, 0.8171678 , 0.79723502, 0.51807879, 0.94362018,
         0.45972739, 0.84853701]),
  'f1': array([0.75138   , 0.77116705, 0.79503676, 0.57193923, 0.85231841,
         0.5289924 , 0.62563452]),
  'support': array([4554, 2062, 2170, 1853, 1685, 2421,  581])},
 'confusion_matrix': array([[3403,  190,   77,  189,  117,  408,  170],
        [  77, 1685,  130,   49,   32,   38,   51],
        [ 129,  131, 1730,   70,   20,   25,   65],
        [ 261,  100,  136,  960,  136,  169,   91],
        [  22,   13,    4,   27, 1590,   23,    6],
        [ 584,  172,   93,  194,  146, 1113,  119],
        [  28,   17,   12,   15,    5,   11,  493]])}

In [1]:
from src.evaluation import *

# Hyper-parameters handed to `performance` for the "xgboost" run, grouped by
# the aspect of the model they configure.
params = dict(
    # Learning task
    objective="multi:softprob",
    num_class=7,
    # Boosting schedule
    n_estimators=800,
    learning_rate=0.05,
    # Tree shape
    max_depth=4,
    min_child_weight=3,
    # Row / column sampling
    subsample=0.85,
    colsample_bytree=0.75,
    # Regularisation
    reg_alpha=0.0,
    reg_lambda=1.0,
    gamma=0.0,
    # Runtime
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    # Evaluation
    eval_metric="mlogloss",
)

performance("xgboost", params, is_w2v=True)


  from .autonotebook import tqdm as notebook_tqdm


(61302, 260)


{'precision_macro': 0.6949148718147538,
 'recall_macro': 0.7462894908282619,
 'f1-macro': 0.714128464034166,
 'per_class': {'precision': array([0.80625161, 0.74892519, 0.808213  , 0.58052231, 0.81635802,
         0.57816712, 0.52596685]),
  'recall': array([0.68533158, 0.84481086, 0.82534562, 0.57582299, 0.94183976,
         0.53159851, 0.81927711]),
  'f1': array([0.74089021, 0.79398359, 0.81668947, 0.5781631 , 0.87462111,
         0.55390575, 0.64064603]),
  'support': array([4554, 2062, 2170, 1853, 1685, 2421,  581])},
 'confusion_matrix': array([[3121,  180,   95,  273,   67,  655,  163],
        [  58, 1742,  118,   68,   10,   30,   36],
        [  84,  129, 1791,   78,   13,   25,   50],
        [ 174,   96,  108, 1067,  135,  202,   71],
        [  12,    7,    3,   58, 1587,   13,    5],
        [ 398,  149,   83,  271,  129, 1287,  104],
        [  24,   23,   18,   23,    3,   14,  476]])}

In [1]:
from src.evaluation import *

# NOTE(review): this cell uses the same model name and hyper-parameters as the
# preceding XGBoost cell, yet the recorded f1-macro differs slightly
# (0.7141 vs 0.7139) despite random_state=42 -- something upstream appears
# not to be deterministic; investigate, or drop the duplicate run.
params = dict(
    # Learning task
    objective="multi:softprob",
    num_class=7,
    # Boosting schedule
    n_estimators=800,
    learning_rate=0.05,
    # Tree shape
    max_depth=4,
    min_child_weight=3,
    # Row / column sampling
    subsample=0.85,
    colsample_bytree=0.75,
    # Regularisation
    reg_alpha=0.0,
    reg_lambda=1.0,
    gamma=0.0,
    # Runtime
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    # Evaluation
    eval_metric="mlogloss",
)

performance("xgboost", params, is_w2v=True)


  from .autonotebook import tqdm as notebook_tqdm


(61302, 260)


{'precision_macro': 0.6943387699990728,
 'recall_macro': 0.746882745552507,
 'f1-macro': 0.7139289768955124,
 'per_class': {'precision': array([0.80596621, 0.74442539, 0.80357143, 0.58441558, 0.81659836,
         0.58610004, 0.51929438]),
  'recall': array([0.67039965, 0.84190107, 0.82949309, 0.58283864, 0.94599407,
         0.54688145, 0.81067126]),
  'f1': array([0.73195876, 0.79016841, 0.81632653, 0.58362605, 0.8765466 ,
         0.56581197, 0.63306452]),
  'support': array([4554, 2062, 2170, 1853, 1685, 2421,  581])},
 'confusion_matrix': array([[3053,  195,  102,  305,   76,  656,  167],
        [  67, 1736,  116,   59,   11,   38,   35],
        [  78,  129, 1800,   75,   10,   24,   54],
        [ 188,   91,  113, 1080,  126,  180,   75],
        [  12,    9,    2,   51, 1594,   12,    5],
        [ 366,  149,   91,  259,  132, 1324,  100],
        [  24,   23,   16,   19,    3,   25,  471]])}