In [None]:
import os

import numpy as np
import pandas as pd
import spacy

# Cargar el Dataset y el modelo de lenguaje

In [None]:
# Dataset location. The MLABEL_TWEETS_CSV environment variable overrides the
# machine-specific default below, so the notebook can run elsewhere without
# editing this cell.
DATASET_PATH = os.environ.get(
    'MLABEL_TWEETS_CSV',
    '/Users/diego/UValencia/DataMining/project/dataset/mLabel_tweets.csv',
)
df = pd.read_csv(DATASET_PATH)

In [None]:
# Load a spaCy pipeline first; spaCy is the text-processing library used below.
nlp = spacy.load("en_core_web_md")

# Preprocesado del texto

In [None]:
stop_words = nlp.Defaults.stop_words

def normalizar_tweet(doc):
    '''Normalize a tweet into a space-separated string of lowercase tokens.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. A
    token is kept only when it has more than 3 characters, is not
    punctuation, does not start with "@", "#" or "http", and its lowercase
    form is not in ``stop_words``.

    Args:
        doc: Text to normalize. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text.

    Returns:
        The kept tokens, lowercased and joined by single spaces. The result
        is an empty string when no token survives or the input is missing.
    '''
    # A missing cell is not text; handing it to the pipeline would abort a
    # whole DataFrame.apply run. NaN is the only float that differs from itself.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ''
    # Split into tokens.
    tokens = nlp(doc)
    # Mentions, hashtags and links are dropped by prefix.
    skipped_prefixes = ("@", "#", "http")
    filtered_tokens = [t.lower_ for t in tokens if
                       len(t.text) > 3 and
                       not t.is_punct and
                       not t.lower_.startswith(skipped_prefixes) and
                       t.lower_ not in stop_words]
    # Join the surviving tokens back into one string.
    return ' '.join(filtered_tokens)

In [None]:
# Quick manual check of the normalizer on a single example tweet.
string = "@cath__kath AstraZeneca is made with the kidney cells of a little girl aborted back in the 70s"
normalizar_tweet(string)


In [None]:
# Store the normalized text of every tweet in a new column.
df['tweet_normalizado'] = df['tweet'].apply(normalizar_tweet)
# Split each 'labels' value on single spaces into a list of labels.
df['labels_list'] = df['labels'].apply(lambda x: x.split(' '))

In [None]:
# Print the number of tweets for each distinct label combination, one count per line.
label_combo_counts = df['labels_list'].value_counts()
for n_tweets in label_combo_counts:
    print(n_tweets)

Preparamos los labels

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode each tweet's label list as a binary indicator row, one column per distinct label.
binaryEncoder = MultiLabelBinarizer()
binary_labels = binaryEncoder.fit_transform(df['labels_list'])
# Cell output: shape of the indicator matrix.
binary_labels.shape


# Análisis del Dataset

In [None]:
# Count how often each label combination occurs.
frequency = df['labels_list'].value_counts()

# Show the first 30 rows of the count table.
print(frequency.head(30))


Matrices TFIDF

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df.tweet_normalizado, binary_labels, test_size=0.2, random_state=101)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only. Fitting
# on the whole corpus would leak test-set term statistics into the features.
# `tfidf_matrix` keeps its original name, but it holds the fitted vectorizer
# (fit() returns the estimator itself), not a matrix.
tfidf_matrix = tfidf_vectorizer.fit(X_train)

# Project both splits into the feature space learned from the training data.
x_train_tfidf = tfidf_matrix.transform(X_train)
x_test_tfidf = tfidf_matrix.transform(X_test)

Word Embeddings

In [421]:
from sentence_transformers import SentenceTransformer
import requests

model_id = "sentence-transformers/all-MiniLM-L6-v2"
# The access token is read from the environment so it is never stored in the
# notebook. The token that used to be hardcoded in this cell has been exposed
# and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token")

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts):
    """Request embeddings for ``texts`` from the hosted feature-extraction endpoint.

    Args:
        texts: JSON-serializable payload sent as "inputs" (the notebook
            passes a list of strings).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status. Without
            this check an error payload would be returned as if it were the
            embeddings.
    """
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
    response.raise_for_status()
    return response.json()

In [None]:
# Embed the training texts; list() turns the pandas Series into a plain list for the JSON payload.
x_train_embeddings = query(list(X_train))

# Embed the held-out texts the same way.
x_test_embeddings = query(list(X_test))

In [422]:
# NOTE(review): this repeats the x_test_embeddings request already made in the previous cell — confirm whether both calls are needed.
x_test_embeddings = query(list(X_test))

In [419]:
# Display the held-out normalized texts.
X_test

3426    question taken post blood test fact immune cel...
8452    funny shit comes cold hell marty makary neck n...
6854    people died taking pfizer vaccine according pf...
2119    bioedge governments sweeten vaccine pill incen...
6534    wants shitty vaccine research chinese vaccines...
                              ...                        
3960    astrazeneca manufacturing epic failure edsel l...
4853    answer lies result happens want vaccine vaccin...
887            okay pfizer women infertile want hell want
3854    telling sask people mounting deaths visious af...
3556    people choice untried gauranteed work safe vac...
Name: tweet_normalizado, Length: 1985, dtype: object

# Entrenamiento de modelos

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import classification_report
# Fit a one-vs-rest k-nearest-neighbours classifier on the sentence embeddings
# for each k in the range (currently only k=5) and hand its predictions to print_report.
# NOTE(review): in the cells visible here print_report is only defined further
# down, so this cell raises NameError on a fresh top-to-bottom run unless that
# definition is executed first.
for i in range(5,6):
    knn = OneVsRestClassifier(KNeighborsClassifier(i)).fit(x_train_embeddings,y_train)
    y_pred = knn.predict(x_test_embeddings)
    print_report(y_test,y_pred,f'{i}')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Convert the inputs once instead of on every use.
x_train_arr = np.array(x_train_embeddings)
x_test_arr = np.array(x_test_embeddings)
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

# Sizes are derived from the data so the network always matches it.
embedding_dim = x_train_arr.shape[1]
n_labels = y_train_arr.shape[1]

# Define the network architecture.
# NOTE(review): an Embedding layer looks up integer token ids, while
# x_train_embeddings come from a feature-extraction endpoint and are presumably
# float sentence vectors — confirm that this first layer is intended.
model = Sequential()
model.add(Embedding(input_dim=embedding_dim+1, output_dim=64, input_length=embedding_dim))
model.add(SimpleRNN(24, return_sequences=True))
model.add(SimpleRNN(24))
model.add(Dense(n_labels, activation='sigmoid')) # One sigmoid output per label (multilabel)


# Compile the model. binary_accuracy scores every label independently;
# categorical_accuracy only compares arg-max positions, which is misleading
# when a sample can carry several labels.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
# Train the model.
model.fit(x_train_arr, y_train_arr, epochs=5, batch_size=32, validation_data=(x_test_arr, y_test_arr))
# summary is a method: call it (the original printed the bound-method object).
# It runs after fit so the model is guaranteed to be built.
model.summary()


In [None]:
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
def print_report(y_test, y_pred,name):
    """Append a labelled classification report to 'classification_report.txt'.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, same layout as ``y_test``.
        name: Text written in the header line that precedes the report.
    """
    report = classification_report(y_test, y_pred)
    with open('classification_report.txt', 'a') as f:
        f.write(f"////////////////////{name}/////////////////////\n")
        f.write(report)


# Every display name sits next to the estimator it labels. The original kept
# two separate lists of different length and order, so zip() wrote each report
# under the wrong name. "Naive Bayes" had a name but no estimator in the list,
# so it is not part of the run.
named_classifiers = [
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("Nearest Neighbors", KNeighborsClassifier(5)),
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=42)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Neural Net", MLPClassifier()),
    ("AdaBoost", AdaBoostClassifier(algorithm="SAMME", random_state=42)),
]

# Kept as separate, now aligned, lists for any code that still reads them.
names = [name for name, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

# Loop-invariant conversions are done once.
x_train_arr = np.array(x_train_embeddings)
x_test_arr = np.array(x_test_embeddings)
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

for name, estimator in named_classifiers:
    # Wrap the estimator so one copy of it is fitted per label column.
    clf = MultiOutputClassifier(estimator)
    clf.fit(x_train_arr, y_train_arr)
    y_pred = clf.predict(x_test_arr)
    print_report(y_test_arr, np.array(y_pred), name)
    

Red Neuronal

In [None]:

# `keras` and `Dropout` are used below but not imported by the earlier cells
# visible in this notebook.
from tensorflow import keras
from tensorflow.keras.layers import Dropout

# One output unit per label column instead of a hardcoded width.
n_labels = np.array(y_train).shape[1]

# Define the network architecture.
model = keras.Sequential([

    Dropout(0.3),
    keras.layers.Dense(12, activation='relu'),
    Dropout(0.3),
    keras.layers.Dense(n_labels, activation='sigmoid')  # Output layer with sigmoid activation
])

# Compile the model. binary_accuracy scores every label independently;
# categorical_accuracy only compares arg-max positions, which is misleading
# when a sample can carry several labels.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# Train the model.
model.fit(np.array(x_train_embeddings), np.array(y_train), epochs=50, batch_size=32, validation_data=(np.array(x_test_embeddings), np.array(y_test)))

# summary is a method: call it (the original bare attribute access did nothing).
# It runs after fit because this model declares no input shape and is only
# built once it has seen data.
model.summary()



In [None]:
# Model outputs for the held-out embeddings; they are binarized with a threshold further down in this cell.
y_pred = model.predict(np.array(x_test_embeddings))
def threshold_array(array, threshold):
    """Binarize ``array``: 1 where a value is strictly greater than ``threshold``, 0 elsewhere."""
    above = np.asarray(array > threshold)
    return above.astype(int)

# Binarize all predictions in one vectorized call (np.where works element-wise
# on the whole 2-D array) instead of mapping a lambda over the rows.
# Scores above 0.3 count as a predicted label.
y_pred_normaliced = threshold_array(y_pred, 0.3)
print(classification_report(y_test, y_pred_normaliced))

# Métricas

In [None]:


from sklearn.metrics import classification_report

def print_report(y_test, y_pred,name):
    """Append a labelled classification report to 'classification_report.txt'.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, same layout as ``y_test``.
        name: Text written in the header line that precedes the report.
    """
    report = classification_report(y_test, y_pred)
    with open('classification_report.txt', 'a') as f:
        # End the header with a newline so the report starts on its own line,
        # matching the earlier definition of print_report in this notebook.
        f.write(f"////////////////////{name}/////////////////////\n")
        f.write(report)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

import matplotlib.pyplot as plt

# One confusion matrix per label column. Each figure is titled with its label;
# the columns of the binarized targets follow the order of
# binaryEncoder.classes_, so the untitled plots could not be told apart.
per_label_matrices = multilabel_confusion_matrix(y_test,y_pred_normaliced)
for label, cm in zip(binaryEncoder.classes_, per_label_matrices):
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    disp.ax_.set_title(f"Label: {label}")

In [None]:
from sklearn.metrics import hamming_loss
# Hamming loss between the true and thresholded predicted labels, rounded to 3 decimals.
print('Hamming Loss: ', round(hamming_loss(y_test, y_pred_normaliced),3))

In [None]:
def entreno(i):
    """Print the model's prediction for the i-th held-out sample, then its true labels.

    Args:
        i: Position of the sample in ``x_test_embeddings`` / ``y_test``.
    """
    # predict() works on batches, so the single embedding is wrapped into a
    # batch of size 1 rather than passed as a bare 1-D vector.
    sample = np.array([x_test_embeddings[i]])
    print(model.predict(sample))
    print(y_test[i])
    print()

for i in range(5):
    entreno(i)

# Side-by-side look at a few predictions and the labels of the same held-out
# rows. y_test is used because binary_labels is in the original, unsplit row
# order. Kept as the last expression so the notebook actually displays it.
y_pred[10:15], y_test[10:15]