### Imports

In [37]:
import re
from detoxify import Detoxify
import pandas as pd
import numpy as np
from typing import List
import hackathon_somos_nlp_2023.data.load as load
from pigeon import annotate

### Load data

In [2]:
data_raw = load.data_raw
data_processed = load.data_processed

### Read data

In [17]:
# Read 'csv' file as dataframe
df = pd.read_csv(f'{data_raw}/sample.csv')

# Show dataframe
df.head(5)

Unnamed: 0,id,conversation_id,referenced_tweets.replied_to.id,referenced_tweets.retweeted.id,referenced_tweets.quoted.id,author_id,in_reply_to_user_id,in_reply_to_username,retweeted_user_id,retweeted_username,...,geo.geo.bbox,geo.geo.type,geo.id,geo.name,geo.place_id,geo.place_type,matching_rules,__twarc.retrieved_at,__twarc.url,__twarc.version
0,1641555769178079232,1641555769178079232,,,,1590296247055060992,,,,,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
1,1641555769202974726,1641555769202974726,,1.640384e+18,,1486710311109337097,,,1.34332e+18,lycanthropexx,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
2,1641555769194692619,1641503325123235841,1.641544e+18,,,1576752300097421312,7.754652e+17,carbolover1967,,,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
3,1641555769186471936,1641555769186471936,,1.641526e+18,,1623575319637594113,,,1.57403e+18,suzuren921,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
4,1641555769194844161,1641555769194844161,,,,1592728927344025600,,,,,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0


In [18]:
# Filter data
df_es = df.loc[df['lang'] == 'es'].reset_index(drop=True)

# Show dataframe
df_es.head(5)

Unnamed: 0,id,conversation_id,referenced_tweets.replied_to.id,referenced_tweets.retweeted.id,referenced_tweets.quoted.id,author_id,in_reply_to_user_id,in_reply_to_username,retweeted_user_id,retweeted_username,...,geo.geo.bbox,geo.geo.type,geo.id,geo.name,geo.place_id,geo.place_type,matching_rules,__twarc.retrieved_at,__twarc.url,__twarc.version
0,1641555769194692619,1641503325123235841,1.641544e+18,,,1576752300097421312,7.754652e+17,carbolover1967,,,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
1,1641555769186295811,1641555054770507778,1.641555e+18,,,1498794390378070017,1.43388e+18,koicarpincho,,,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
2,1641555769198891008,1641555769198891008,,1.641446e+18,,1633549880256757760,,,1.485945e+18,flemitang,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
3,1641555769198796802,1641555769198796802,,,,1293944610180468742,,,,,...,,,,,,,,2023-03-30T21:39:34+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0
4,1641555773397381121,1641555773397381121,,1.641522e+18,,1507231284216549379,,,101822000.0,rochaperiodista,...,,,,,,,,2023-03-30T21:39:35+00:00,https://api.twitter.com/2/tweets/sample/stream...,2.13.0


### Process data

In [19]:
labels = []
def label_data(data: List) -> List:
    for x in data:
        labels.append(Detoxify('multilingual'.predict(x)))
    return labels

In [20]:
# Clean text
df_es['text'] = df_es['text'].apply(lambda x: re.sub(r'\\u[0-9A-Fa-f]+|\\n', '', x))

# Convert column values to list
texts = df_es['text'].to_list()

In [22]:
results = Detoxify('multilingual').predict(texts)

In [23]:
# Convert dict to dataframe
df_results = pd.DataFrame.from_dict(results)

# Round floats
df_results = df_results.round(2)

# Add new column
df_results['text'] = df_es['text']

# Show dataframe
df_results

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,text
0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,@carbolover1967 @Agusitooh Domado
1,0.12,0.0,0.02,0.0,0.02,0.0,0.0,@koicarpincho es lo más rico q hay dios Y LA M...
2,0.67,0.0,0.06,0.0,0.50,0.0,0.0,"@shitpostingMP4 nunca lo banque, nose q le veí..."
3,0.01,0.0,0.00,0.0,0.00,0.0,0.0,"Mk que frustrada me siento, a veces dar el 100..."
4,0.01,0.0,0.00,0.0,0.00,0.0,0.0,El INE obedeciendo a la queja de la senadora d...
...,...,...,...,...,...,...,...,...
633,0.00,0.0,0.00,0.0,0.00,0.0,0.0,Escrivá quiere que los autónomos aporten el 40...
634,0.00,0.0,0.00,0.0,0.00,0.0,0.0,Esta un poquito grueso pero te va hacerla bien...
635,0.00,0.0,0.00,0.0,0.00,0.0,0.0,@prospericarlos @hramosallup @PieroMaroun @con...
636,0.00,0.0,0.00,0.0,0.00,0.0,0.0,Que se vaya cuanto antes! #SVGala5 https://t.c...


In [24]:
# Create a new column
df_results['label'] = df_results['toxicity'].apply(lambda x: 0 if round(x) < 0.3 else 1)

# Show dataframe
df_results.head(5)

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,text,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,@carbolover1967 @Agusitooh Domado,0
1,0.12,0.0,0.02,0.0,0.02,0.0,0.0,@koicarpincho es lo más rico q hay dios Y LA M...,0
2,0.67,0.0,0.06,0.0,0.5,0.0,0.0,"@shitpostingMP4 nunca lo banque, nose q le veí...",1
3,0.01,0.0,0.0,0.0,0.0,0.0,0.0,"Mk que frustrada me siento, a veces dar el 100...",0
4,0.01,0.0,0.0,0.0,0.0,0.0,0.0,El INE obedeciendo a la queja de la senadora d...,0


In [25]:
# Filter data
df_negative = df_results.loc[df_results['label'] == 0]

# Show dataframe
df_negative

Unnamed: 0,toxicity,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,text,label
0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,@carbolover1967 @Agusitooh Domado,0
1,0.12,0.0,0.02,0.0,0.02,0.0,0.0,@koicarpincho es lo más rico q hay dios Y LA M...,0
3,0.01,0.0,0.00,0.0,0.00,0.0,0.0,"Mk que frustrada me siento, a veces dar el 100...",0
4,0.01,0.0,0.00,0.0,0.00,0.0,0.0,El INE obedeciendo a la queja de la senadora d...,0
6,0.00,0.0,0.00,0.0,0.00,0.0,0.0,"""Qué bonita la naturaleza, cuánto hemos de apr...",0
...,...,...,...,...,...,...,...,...,...
633,0.00,0.0,0.00,0.0,0.00,0.0,0.0,Escrivá quiere que los autónomos aporten el 40...,0
634,0.00,0.0,0.00,0.0,0.00,0.0,0.0,Esta un poquito grueso pero te va hacerla bien...,0
635,0.00,0.0,0.00,0.0,0.00,0.0,0.0,@prospericarlos @hramosallup @PieroMaroun @con...,0
636,0.00,0.0,0.00,0.0,0.00,0.0,0.0,Que se vaya cuanto antes! #SVGala5 https://t.c...,0


In [26]:
# Remove rows with less than 3 words in 'text' column 
df_tokens = df_negative.loc[df['text'].str.count(" ") >= 3]

# Filter dataframe
df_final = df_tokens[['text', 'label']]

list_text = df_final['text'].to_list()

# Show dataframe
df_final

Unnamed: 0,text,label
1,@koicarpincho es lo más rico q hay dios Y LA M...,0
4,El INE obedeciendo a la queja de la senadora d...,0
6,"""Qué bonita la naturaleza, cuánto hemos de apr...",0
7,Y pensar que Carabineros tiene que lidiar con ...,0
8,@gort_lucy George harris dice lo mismo con un...,0
...,...,...
630,-Hay que votar Milei lista completa.-y quien v...,0
634,Esta un poquito grueso pero te va hacerla bien...,0
635,@prospericarlos @hramosallup @PieroMaroun @con...,0
636,Que se vaya cuanto antes! #SVGala5 https://t.c...,0


In [29]:
annotations = annotate(list_text, options=['positivo', 'negativo'])

HTML(value='0 examples annotated, 483 examples left')

HBox(children=(Button(description='positivo', style=ButtonStyle()), Button(description='negativo', style=Butto…

Output()

Annotation done.


In [36]:
df_final['annotations'] = list(map(lambda x: x[1], annotations))

df_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['annotations'] = list(map(lambda x: x[1], annotations))


Unnamed: 0,text,label,annotations
1,@koicarpincho es lo más rico q hay dios Y LA M...,0,negativo
4,El INE obedeciendo a la queja de la senadora d...,0,positivo
6,"""Qué bonita la naturaleza, cuánto hemos de apr...",0,negativo
7,Y pensar que Carabineros tiene que lidiar con ...,0,positivo
8,@gort_lucy George harris dice lo mismo con un...,0,positivo
...,...,...,...
630,-Hay que votar Milei lista completa.-y quien v...,0,positivo
634,Esta un poquito grueso pero te va hacerla bien...,0,positivo
635,@prospericarlos @hramosallup @PieroMaroun @con...,0,positivo
636,Que se vaya cuanto antes! #SVGala5 https://t.c...,0,negativo


In [38]:
# Add column for label
conditions = [
    (df_final['annotations'] == "positivo"),
    (df_final['annotations'] == "negativo"),
    ]

values=[1,0]

df_final['new_label']=np.select(conditions,values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['new_label']=np.select(conditions,values)


In [41]:
df_final = df_final.loc[df_final['new_label'] == 0]

df_final = df_final[['text', 'label']]

df_final

Unnamed: 0,text,label
1,@koicarpincho es lo más rico q hay dios Y LA M...,0
6,"""Qué bonita la naturaleza, cuánto hemos de apr...",0
12,el dolorcito en el pecho cuando ves algo que n...,0
13,@arturocazal Sea lo que sea hay que recuperar ...,0
14,cómo odio matemática hace una hora con el mism...,0
...,...,...
620,Nominación directa para Alma #SVGala5,0
621,"Antes de emitir un Juicio sobre el Mechero, pi...",0
624,"Dónde me cuidan yo cuido el triple, y dónde me...",0
627,Les gusta mi corazón?🙈💗 https://t.co/vZrpiqHtbc,0


In [44]:
# Clean text
df_final['text'] = df_final['text'].apply(lambda x: re.sub(r'\\u[0-9A-Fa-f]+', '', x))

text_clean = df_final['text'].to_list()

text_clean

['@koicarpincho es lo más rico q hay dios Y LA MOSTAZA NATURA UFFFFF',
 '"Qué bonita la naturaleza, cuánto hemos de aprender de los animales 😍🥰🤩"-Los animales: https://t.co/CEWx06uLH0',
 'el dolorcito en el pecho cuando ves algo que no querías ver 📉📉📉',
 '@arturocazal Sea lo que sea hay que recuperar la UCV Patrimonio Inmaterial de la Humanidad...la casa que vence las sombras..Duele.!!!',
 'cómo odio matemática hace una hora con el mismo problema',
 'yo solo me hago mal jajaj 😣',
 '¡Qué emoción ir a Filbo!📌El 30 de abril charlaré sobre literatura y naturaleza (4:00 p.m.)📌El 1 de mayo lanzaré mi novela Escrito en la piel del jaguar (5:00 p.m)¡Acompáñenme! https://t.co/MG17m137FT',
 'Nacho dijo que todavía no había visto un vídeo tan hermoso 😭😭😭😭En el debate lo desplazaron tanto 🥺 los odioooooo hdmilp',
 'prefiero mil veces esto a salir de fiesta https://t.co/l1EEtch1iO',
 'Pues si los suricatos tenemos la mejor plantilla de CCs del amateur y parte del profesional, se dice y punto.Ojo! Y

In [42]:
df_final.to_csv(f'{data_processed}/negatives.csv', index=False)