In [1]:
import ast
import re
import string

import joblib
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [57]:
# Read the crypto-news dataset and print its schema (columns, dtypes, non-null counts).
csv_path = 'cryptonews.csv'
data = pd.read_csv(csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31037 entries, 0 to 31036
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       31037 non-null  object
 1   sentiment  31037 non-null  object
 2   source     31037 non-null  object
 3   subject    31037 non-null  object
 4   text       31037 non-null  object
 5   title      31037 non-null  object
 6   url        31037 non-null  object
dtypes: object(7)
memory usage: 1.7+ MB


In [58]:
# Preview the first 10 rows of the raw dataset.
data.head(10)

Unnamed: 0,date,sentiment,source,subject,text,title,url
0,2023-12-19 06:40:41,"{'class': 'negative', 'polarity': -0.1, 'subje...",CryptoNews,altcoin,Grayscale CEO Michael Sonnenshein believes the...,Grayscale CEO Calls for Simultaneous Approval ...,https://cryptonews.comhttps://cryptonews.com/n...
1,2023-12-19 06:03:24,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,"In an exclusive interview with CryptoNews, Man...",Indian Government is Actively Collaborating Wi...,https://cryptonews.comhttps://cryptonews.com/n...
2,2023-12-19 05:55:14,"{'class': 'positive', 'polarity': 0.05, 'subje...",CryptoNews,blockchain,According to the Federal Court ruling on Decem...,Judge Approves Settlement: Binance to Pay $1.5...,https://cryptonews.comhttps://cryptonews.com/n...
3,2023-12-19 05:35:26,"{'class': 'positive', 'polarity': 0.5, 'subjec...",CoinTelegraph,blockchain,Some suggest EVM inscriptions are the latest w...,Why a gold rush for inscriptions has broken ha...,https://cointelegraph.com/news/inscriptions-ev...
4,2023-12-19 05:31:08,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CoinTelegraph,ethereum,A decision by bloXroute Labs to start censorin...,‘Concerning precedent’ — bloXroute Labs' MEV r...,https://cointelegraph.com/news/concerning-prec...
5,2023-12-19 05:25:00,"{'class': 'negative', 'polarity': -0.01, 'subj...",CryptoPotato,bitcoin,Yonsei found that during BTC’s rally in early ...,Is This Why Bitcoin’s Price Rally Was Halted? ...,https://cryptopotato.com/is-this-why-bitcoins-...
6,2023-12-19 04:50:11,"{'class': 'positive', 'polarity': 0.3, 'subjec...",CryptoNews,bitcoin,Cathie Wood led ARK Invest fund sold around 80...,Cathie Wood’s Ark Invest Sells $27.6 Million i...,https://cryptonews.comhttps://cryptonews.com/n...
7,2023-12-19 04:10:00,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoPotato,bitcoin,Bitcoin's 150% surge pales in comparison to th...,Bitcoin Soared 150% in 2023 But These Companie...,https://cryptopotato.com/bitcoin-soared-150-in...
8,2023-12-19 04:00:01,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,The South Korean city of Busan is edging close...,South Korean City Busan Names Digital Exchange...,https://cryptonews.comhttps://cryptonews.com/n...
9,2023-12-19 02:59:59,"{'class': 'negative', 'polarity': -0.08, 'subj...",CoinTelegraph,bitcoin,The SEC has pushed back its decision on a rost...,"SEC delays several Ethereum ETFs, pushing fina...",https://cointelegraph.com/news/sec-delays-ethe...


Create the target label column

In [59]:

# Each `sentiment` cell holds the string repr of a dict, e.g.
# "{'class': 'negative', 'polarity': -0.1, ...}". Parse it with ast.literal_eval,
# which only accepts Python literals, rather than eval(), which would execute any
# code embedded in the CSV.
data['label'] = data['sentiment'].map(lambda raw: ast.literal_eval(raw)['class'])
data.head(5)

Unnamed: 0,date,sentiment,source,subject,text,title,url,label
0,2023-12-19 06:40:41,"{'class': 'negative', 'polarity': -0.1, 'subje...",CryptoNews,altcoin,Grayscale CEO Michael Sonnenshein believes the...,Grayscale CEO Calls for Simultaneous Approval ...,https://cryptonews.comhttps://cryptonews.com/n...,negative
1,2023-12-19 06:03:24,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,"In an exclusive interview with CryptoNews, Man...",Indian Government is Actively Collaborating Wi...,https://cryptonews.comhttps://cryptonews.com/n...,neutral
2,2023-12-19 05:55:14,"{'class': 'positive', 'polarity': 0.05, 'subje...",CryptoNews,blockchain,According to the Federal Court ruling on Decem...,Judge Approves Settlement: Binance to Pay $1.5...,https://cryptonews.comhttps://cryptonews.com/n...,positive
3,2023-12-19 05:35:26,"{'class': 'positive', 'polarity': 0.5, 'subjec...",CoinTelegraph,blockchain,Some suggest EVM inscriptions are the latest w...,Why a gold rush for inscriptions has broken ha...,https://cointelegraph.com/news/inscriptions-ev...,positive
4,2023-12-19 05:31:08,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CoinTelegraph,ethereum,A decision by bloXroute Labs to start censorin...,‘Concerning precedent’ — bloXroute Labs' MEV r...,https://cointelegraph.com/news/concerning-prec...,neutral


In [5]:
# Count rows per sentiment class to check the class balance.
data['label'].value_counts()

label
positive    13964
neutral     10555
negative     6518
Name: count, dtype: int64

## Create the Preprocessor

In [6]:
import spacy
import re
from sklearn.base import BaseEstimator, TransformerMixin

In [60]:
class LinguisticPreprocessor(BaseEstimator, TransformerMixin):
    """
    Linguistic preprocessor that normalizes whitespace, lemmatizes the text and
    drops punctuation and stopwords.

    Attributes
    ----------
    nlp : spacy.language.Language
        spaCy pipeline used to tokenize and lemmatize the texts.
    stopwords : set
        Stopwords taken from ``nlp.Defaults.stop_words``.
    """
    def __init__(self, nlp):
        self.nlp = nlp
        self.stopwords = nlp.Defaults.stop_words

    def lemmatize_and_remove_stopwords(self, doc):
        """
        Lemmatize a document, skipping punctuation and stopwords.

        Parameters
        ----------
        doc : spacy.tokens.doc.Doc
            Document already processed by spaCy.

        Returns
        -------
        str
            Space-joined lemmas of the tokens that are neither punctuation nor
            (case-insensitively) in ``self.stopwords``.
        """
        return " ".join(
            token.lemma_
            for token in doc
            if not token.is_punct and token.text.lower() not in self.stopwords
        )

    def fit(self, X, y=None):
        """
        No-op fit required by the scikit-learn transformer interface.

        Parameters
        ----------
        X : iterable
            Texts to preprocess (unused).
        y : None
            Ignored.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """
        Collapse whitespace, then lemmatize each text and remove punctuation and
        stopwords.

        Parameters
        ----------
        X : iterable of str
            Texts to preprocess.
        y : None
            Ignored.

        Returns
        -------
        list of str
            Preprocessed texts, in the same order as ``X``.
        """
        # Collapse every whitespace run into a single space and trim the ends.
        cleaned_texts = (re.sub(r'\s+', ' ', text).strip() for text in X)
        # nlp.pipe processes the texts in batches and yields the Docs in input
        # order, which is much faster than calling nlp(text) once per document.
        return [
            self.lemmatize_and_remove_stopwords(doc)
            for doc in self.nlp.pipe(cleaned_texts)
        ]

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline

In [62]:
# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Instantiate the preprocessor
preprocessor = LinguisticPreprocessor(nlp)

# Build the pipeline: linguistic cleanup -> bag-of-words counts -> linear SVM.
# Classifiers tried earlier, with the numbers the author noted next to each
# (their meaning is not recorded; presumably per-class scores - TODO confirm):
#   OneVsRestClassifier(LinearSVC(), n_jobs=1)   53 63 71
#   MultinomialNB()                              0.4 0.45 0.65
#   LinearSVC()                                  50 64 70
#   RandomForestClassifier(n_estimators=250, max_depth=4, random_state=0)
#   RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('vectorizer', CountVectorizer()),
    # LinearSVC uses a random number generator while fitting, so pin the seed to
    # make the reported metrics reproducible across runs.
    ('classifier', LinearSVC(random_state=42)),
])

## Preprocess the Label

In [63]:
# Show a single row, including the `label` column, before the labels are encoded.
data.head(1)

Unnamed: 0,date,sentiment,source,subject,text,title,url,label
0,2023-12-19 06:40:41,"{'class': 'negative', 'polarity': -0.1, 'subje...",CryptoNews,altcoin,Grayscale CEO Michael Sonnenshein believes the...,Grayscale CEO Calls for Simultaneous Approval ...,https://cryptonews.comhttps://cryptonews.com/n...,negative


In [64]:
# Encode the string sentiment classes as integers for the classifier.
label_dict = {'positive': 1, 'neutral': 0, 'negative': -1}

# Re-running a plain `.map(label_dict)` on labels that are already encoded would
# turn every value into NaN, so only encode while the column is not yet numeric.
if not data['label'].isin(list(label_dict.values())).all():
    encoded_label = data['label'].map(label_dict)
    if encoded_label.isna().any():
        # Fail loudly instead of silently carrying NaN labels into the split.
        unknown = sorted(data.loc[encoded_label.isna(), 'label'].astype(str).unique())
        raise ValueError(f"Unexpected sentiment classes in 'label': {unknown}")
    data['label'] = encoded_label.astype('int64')
data.head(5)

Unnamed: 0,date,sentiment,source,subject,text,title,url,label
0,2023-12-19 06:40:41,"{'class': 'negative', 'polarity': -0.1, 'subje...",CryptoNews,altcoin,Grayscale CEO Michael Sonnenshein believes the...,Grayscale CEO Calls for Simultaneous Approval ...,https://cryptonews.comhttps://cryptonews.com/n...,-1
1,2023-12-19 06:03:24,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,"In an exclusive interview with CryptoNews, Man...",Indian Government is Actively Collaborating Wi...,https://cryptonews.comhttps://cryptonews.com/n...,0
2,2023-12-19 05:55:14,"{'class': 'positive', 'polarity': 0.05, 'subje...",CryptoNews,blockchain,According to the Federal Court ruling on Decem...,Judge Approves Settlement: Binance to Pay $1.5...,https://cryptonews.comhttps://cryptonews.com/n...,1
3,2023-12-19 05:35:26,"{'class': 'positive', 'polarity': 0.5, 'subjec...",CoinTelegraph,blockchain,Some suggest EVM inscriptions are the latest w...,Why a gold rush for inscriptions has broken ha...,https://cointelegraph.com/news/inscriptions-ev...,1
4,2023-12-19 05:31:08,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CoinTelegraph,ethereum,A decision by bloXroute Labs to start censorin...,‘Concerning precedent’ — bloXroute Labs' MEV r...,https://cointelegraph.com/news/concerning-prec...,0


## Train, test and retrain splits

In [65]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 30% of the rows as the `retrain` set, stratified by label.
train_pool, retrain = train_test_split(
    data,
    test_size=0.3,
    shuffle=True,
    stratify=data['label'],
    random_state=42,
)

# Step 2: split the remaining 70% into train (80%) and test (20%), stratified again.
train, test = train_test_split(
    train_pool,
    test_size=0.2,
    shuffle=True,
    stratify=train_pool['label'],
    random_state=42,
)

# Report the size of each split.
for caption, split_frame in (("Train Shape:", train), ("Test Shape:", test), ("Retrain Shape:", retrain)):
    print(caption, split_frame.shape)

Train Shape: (17380, 8)
Test Shape: (4345, 8)
Retrain Shape: (9312, 8)


In [49]:
# Class counts over the full dataset, now with integer-encoded labels.
data['label'].value_counts()

label
 1    13964
 0    10555
-1     6518
Name: count, dtype: int64

In [50]:
# Class counts within the training split; the split is stratified on `label`,
# so the proportions should mirror those of the full dataset.
train['label'].value_counts()

label
 1    7819
 0    5910
-1    3651
Name: count, dtype: int64

In [66]:
# Model input is the headline followed by the article text, separated by a space.
title_col, text_col = train['title'], train['text']
X_train = title_col + " " + text_col
y_train = train['label']

# Sanity check: features and targets must have the same number of rows.
for caption, part in (("X Train Shape:", X_train), ("y Train Shape:", y_train)):
    print(caption, part.shape)

X Train Shape: (17380,)
y Train Shape: (17380,)


## Train the model

In [67]:
# Train the model: fits the preprocess, vectorizer and classifier steps of the
# pipeline on the training texts and labels.
pipeline.fit(X_train, y_train)



## Evaluate the model

In [68]:
from sklearn.metrics import classification_report

# Build the test inputs the same way as the training inputs: title + " " + text.
y_test = test['label']
X_test = test['title'] + " " + test['text']

# Predict on the test split and print per-class precision, recall and F1.
predictions = pipeline.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

          -1       0.52      0.50      0.51       912
           0       0.59      0.62      0.60      1478
           1       0.69      0.68      0.68      1955

    accuracy                           0.62      4345
   macro avg       0.60      0.60      0.60      4345
weighted avg       0.62      0.62      0.62      4345

