In [1]:
import pandas as pd
import os
import json

In [2]:
def clean_dataframe(path_1, path_2):
    """Load a tweets file and its gold labels and join them into one frame.

    Parameters
    ----------
    path_1 : str
        Path to a JSON file containing an object that maps each id to an
        object of per-tweet fields. A ``tweet`` field is required by the
        final column selection.
    path_2 : str
        Path to a JSON file that ``pd.DataFrame`` can turn into a frame with
        at least ``id`` and ``value`` columns.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns ``id``, ``tweet`` and ``value``.
        Only ids present in both files are kept (default inner merge).
    """
    # Read explicitly as UTF-8: the tweets contain non-ASCII characters and
    # the platform's default text encoding is not guaranteed to be UTF-8.
    with open(path_1, 'r', encoding='utf-8') as file:
        tweets_by_id = json.load(file)

    # One row per tweet. The outer key is stored as 'id_EXIST'; a field of the
    # same name inside the record overrides it.
    flattened_data = [{'id_EXIST': key, **value} for key, value in tweets_by_id.items()]
    df = pd.DataFrame(flattened_data)

    with open(path_2, 'r', encoding='utf-8') as file:
        gold = pd.DataFrame(json.load(file))

    # Attach the gold label to each tweet by matching 'id_EXIST' with gold's 'id'.
    df = pd.merge(df, gold, left_on='id_EXIST', right_on='id')

    return df[['id', 'tweet', 'value']]

In [3]:
# Training split: path to the tweets JSON and to the matching task-1 gold
# file (per the file names), joined into a single id/tweet/value frame.
dpath_training = '/kaggle/input/exist-2024-task1/EXIST2024_training.json'
dpath_gold = '/kaggle/input/exist-2024-task1/EXIST2024_training_task1_gold_hard.json'
df = clean_dataframe(dpath_training, dpath_gold)
# Preview the first rows.
df.head()

Unnamed: 0,id,tweet,value
0,100001,"@TheChiflis Ignora al otro, es un capullo.El p...",YES
1,100002,@ultimonomada_ Si comicsgate se parece en algo...,NO
2,100003,"@Steven2897 Lee sobre Gamergate, y como eso ha...",NO
3,100005,@novadragon21 @icep4ck @TvDannyZ Entonces como...,YES
4,100006,@yonkykong Aaah sí. Andrew Dobson. El que se d...,NO


In [4]:
# Dev split: same loading as the training split, used later as the
# validation set (X_val / y_val).
dpath_dev = '/kaggle/input/exist-2024-task1/EXIST2024_dev.json'
dpath_gold_dev = '/kaggle/input/exist-2024-task1/EXIST2024_dev_task1_gold_hard.json'
df_dev = clean_dataframe(dpath_dev, dpath_gold_dev)
# Preview the first rows.
df_dev.head()

Unnamed: 0,id,tweet,value
0,300002,@anacaotica88 @MordorLivin No me acuerdo de lo...,YES
1,300003,@cosmicJunkBot lo digo cada pocos dias y lo re...,NO
2,300004,Also mientras les decia eso la señalaba y deci...,YES
3,300005,"And all people killed, attacked, harassed by ...",NO
4,300006,On this #WorldPressFreedomDay I’m thinking of ...,NO


In [5]:
# Encode the string labels as integers: 'YES' -> 1, 'NO' -> 0.
LABEL_TO_INT = {'YES': 1, 'NO': 0}
for frame in (df, df_dev):
    # .get(label, label) leaves values that are not in the mapping untouched,
    # so re-running this cell keeps the already-encoded 1/0 instead of
    # turning the whole column into NaN (which a plain dict .map() would do).
    frame['value'] = frame['value'].map(lambda label: LABEL_TO_INT.get(label, label))

In [6]:
# Function to remove words starting with '@'
# def remove_mentions(text):
#     words = text.split()
#     filtered_words = [word for word in words if not word.startswith('@')]
#     return ' '.join(filtered_words)

# # Apply the function to the 'tweet' column
# df['tweet'] = df['tweet'].apply(remove_mentions)
# df_dev['tweet'] = df_dev['tweet'].apply(remove_mentions)
# df.head()

In [9]:
pip install es-core-news-sm

Collecting es-core-news-sm
  Downloading es_core_news_sm-3.1.0-py3-none-any.whl.metadata (1.9 kB)
Collecting spacy<3.2.0,>=3.1.0 (from es-core-news-sm)
  Downloading spacy-3.1.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting thinc<8.1.0,>=8.0.12 (from spacy<3.2.0,>=3.1.0->es-core-news-sm)
  Downloading thinc-8.0.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wasabi<1.1.0,>=0.8.1 (from spacy<3.2.0,>=3.1.0->es-core-news-sm)
  Downloading wasabi-0.10.1-py3-none-any.whl.metadata (28 kB)
Collecting typer<0.5.0,>=0.3.0 (from spacy<3.2.0,>=3.1.0->es-core-news-sm)
  Downloading typer-0.4.2-py3-none-any.whl.metadata (12 kB)
Collecting pathy>=0.3.5 (from spacy<3.2.0,>=3.1.0->es-core-news-sm)
  Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 (from spacy<3.2.0,>=3.1.0->es-core-news-sm)
  Downloading pydantic-1.8.2-py3-none-any.whl.metadata (103 kB)
[2K     

In [10]:
import spacy
# spaCy pipeline "es_core_news_sm"; remove_punct calls it on every tweet.
nlp = spacy.load("es_core_news_sm")
# Global vocabulary: remove_punct adds every lemma it keeps to this set, and
# calculate_tf_idf later emits one score per entry.
unique_words = set()

def remove_punct(text):
    """Reduce ``text`` to the lemmas of its content tokens.

    The text is run through the global ``nlp`` pipeline. A token is dropped
    when its ``pos_`` is one of SPACE, X, PUNCT or AUX, or when it is flagged
    as a stop word; so despite the name, more than punctuation is removed.

    Side effect: every kept lemma is added to the global ``unique_words`` set.

    Returns the kept lemmas, each preceded by a single space (so a non-empty
    result starts with a space); an empty string when nothing is kept.
    """
    skipped_pos = ["SPACE", "X", "PUNCT", "AUX"]
    kept_lemmas = []
    for tok in nlp(text):
        if tok.pos_ in skipped_pos or tok.is_stop:
            continue
        unique_words.add(tok.lemma_)
        kept_lemmas.append(tok.lemma_)
    return "".join(" " + lemma for lemma in kept_lemmas)

# Replace each tweet by its lemmatised form, training frame first and then
# the dev frame. remove_punct also records every kept lemma in unique_words,
# so the vocabulary ends up containing lemmas from both splits.
for frame in (df, df_dev):
    frame['tweet'] = frame['tweet'].apply(remove_punct)



In [11]:
# Number of distinct lemmas collected by remove_punct; this is also the
# length of every vector produced by calculate_tf_idf.
len(unique_words)

39613

In [13]:
import math

def calculate_tf_idf(sentence):
    """Score every entry of the global ``unique_words`` against one sentence.

    For each vocabulary word ``w`` the score is::

        count(w) / total * log((1 + len(sentence)) / (present(w) + 1))

    where ``count(w)`` is ``sentence.count(w)``, ``total`` is the sum of all
    counts and ``present(w)`` is 1 when the count is non-zero, else 0.

    Parameters
    ----------
    sentence : str
        Text to score (the notebook passes the lemmatised tweet).

    Returns
    -------
    list of float
        One score per entry of ``unique_words``, in that set's iteration
        order. When no vocabulary word occurs in ``sentence`` the result is
        all zeros (previously this case raised ``ZeroDivisionError``).

    Notes
    -----
    NOTE(review): ``str.count`` counts substring occurrences, so a short
    lemma is also counted inside longer words, and the log term is built from
    the character length of this one sentence rather than from corpus-level
    document frequencies, so it is not a conventional IDF. Both are kept
    as-is so feature values match earlier runs; confirm whether intended.
    """
    counts = [sentence.count(word) for word in unique_words]
    total = sum(counts)
    if total == 0:
        # Nothing from the vocabulary occurs: avoid dividing by zero.
        return [0.0] * len(counts)

    # Every word that occurs has present == 1, so they all share this log
    # term; words that do not occur have count 0 and therefore score 0.0.
    present_weight = math.log((1 + len(sentence)) / 2)
    return [count / total * present_weight if count else 0.0 for count in counts]

# Store one score vector per tweet in a new 'tfidf' column. Each cell holds a
# plain Python list with one float per entry of unique_words.
# Assigning the result creates the column, so no placeholder column of None
# values is needed beforehand.
df['tfidf'] = df['tweet'].apply(calculate_tf_idf)
df_dev['tfidf'] = df_dev['tweet'].apply(calculate_tf_idf)

In [14]:
# Preview the training frame with the lemmatised tweets, integer labels and
# the new 'tfidf' column.
df.head()

Unnamed: 0,id,tweet,value,tfidf
0,100001,@TheChiflis Ignora capullo problema youtuber ...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,100002,@ultimonomado _ comicsgate a gamergate acoso ...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,100003,@steven2897 Lee Gamergate y cambiar comunicar...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,100005,@novadragon21 @icep4ck @tvdannyz mercado camb...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,100006,@yonkykong Aaah Andrew Dobson dedicar a echar...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
# Training inputs: one list of floats per tweet, taken from the 'tfidf'
# column, with the encoded labels as targets.
X = [[float(score) for score in row] for row in df['tfidf']]
y = df['value']

# Validation inputs and targets, built the same way from the dev frame.
X_val = [[float(score) for score in row] for row in df_dev['tfidf']]
y_val = df_dev['value']

In [16]:
# del unique_words
# del df
# del df_dev

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random forest baseline, fitted on the training vectors and scored on the
# dev split. The forest is randomised, so the seed is fixed to make the
# reported metrics reproducible from one run to the next.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
predictions = model.predict(X_val)

accuracy = accuracy_score(y_val, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_val, predictions, digits=4))

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_val, predictions)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

           0     0.6361    0.8831    0.7395       479
           1     0.7918    0.4681    0.5884       455

    accuracy                         0.6809       934
   macro avg     0.7140    0.6756    0.6640       934
weighted avg     0.7120    0.6809    0.6659       934

Confusion Matrix:
[[423  56]
 [242 213]]


In [18]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, fitted on the training
# vectors and scored on the dev split.
svm_model = SVC().fit(X, y)  # fit() returns the fitted estimator itself
svm_predictions = svm_model.predict(X_val)

# Compute all metrics first, then report them in the usual order.
svm_accuracy = accuracy_score(y_val, svm_predictions)
svm_conf_matrix = confusion_matrix(y_val, svm_predictions)

print(f"SVM Accuracy: {svm_accuracy:.2f}")
print("SVM Classification Report:")
print(classification_report(y_val, svm_predictions, digits=4))
print("SVM Confusion Matrix:")
print(svm_conf_matrix)

SVM Accuracy: 0.71
SVM Classification Report:
              precision    recall  f1-score   support

           0     0.6625    0.8810    0.7563       479
           1     0.8081    0.5275    0.6383       455

    accuracy                         0.7088       934
   macro avg     0.7353    0.7042    0.6973       934
weighted avg     0.7334    0.7088    0.6988       934

SVM Confusion Matrix:
[[422  57]
 [215 240]]


In [19]:
from xgboost import XGBClassifier

# Gradient-boosted trees with default settings, fitted on the training
# vectors and scored on the dev split.
xgb_model = XGBClassifier()
xgb_model.fit(X, y)
xgb_predictions = xgb_model.predict(X_val)

xgb_accuracy = accuracy_score(y_val, xgb_predictions)
xgb_conf_matrix = confusion_matrix(y_val, xgb_predictions)

# Each heading and its payload go out as two lines, as with separate prints.
print(f"XGBoost Accuracy: {xgb_accuracy:.2f}")
print("XGBoost Classification Report:",
      classification_report(y_val, xgb_predictions, digits=4),
      sep="\n")
print("XGBoost Confusion Matrix:", xgb_conf_matrix, sep="\n")

XGBoost Accuracy: 0.77
XGBoost Classification Report:
              precision    recall  f1-score   support

           0     0.7330    0.8539    0.7888       479
           1     0.8138    0.6725    0.7365       455

    accuracy                         0.7655       934
   macro avg     0.7734    0.7632    0.7626       934
weighted avg     0.7724    0.7655    0.7633       934

XGBoost Confusion Matrix:
[[409  70]
 [149 306]]
