In [128]:
from datasets import load_dataset
import pandas as pd

## Spanish Dataset

In [129]:
twitter_spanish = load_dataset("BrunoGR/Twitter_Sentiment_Analysis_Train_Corpus_in_Spanish")

We create a Dataframe for easier management.

In [130]:
df = pd.DataFrame(twitter_spanish['train'])

In [131]:
df

Unnamed: 0,etiqueta,texto
0,Sentimiento: Negativo,Consulta: < @USUARIO Mi pobre y pequeña bola d...
1,Sentimiento: Negativo,Consulta: < Me voy demasiado a la cama. Tengo ...
2,Sentimiento: Negativo,Consulta: < Todavía no he podido escucharlo. M...
3,Sentimiento: Negativo,Consulta: < ahora recuerda por qué resolver un...
4,Sentimiento: Negativo,"Consulta: < Comí demasiado, me siento mal>\n"
...,...,...
1082816,Sentimiento: Negativo,Consulta: < Hoy vi a mucha gente que extrañé.....
1082817,Sentimiento: Positivo,Consulta: < - Me gusta>\n
1082818,Sentimiento: Positivo,Consulta: < arando a través de las asignacione...
1082819,Sentimiento: Positivo,Consulta: < @USUARIO No veo la hora de ver a J...


In [132]:
# Balanced Dataset

df['etiqueta'].value_counts() 

etiqueta
Sentimiento: Negativo    543894
Sentimiento: Positivo    538927
Name: count, dtype: int64

We take 50k randomly selected data

In [133]:
df = df.sample(50000)

In [134]:
df['etiqueta']

387072     Sentimiento: Positivo
647122     Sentimiento: Negativo
1076575    Sentimiento: Positivo
842699     Sentimiento: Positivo
64125      Sentimiento: Negativo
                   ...          
714976     Sentimiento: Positivo
858689     Sentimiento: Negativo
1081849    Sentimiento: Positivo
179247     Sentimiento: Negativo
399629     Sentimiento: Negativo
Name: etiqueta, Length: 50000, dtype: object

Change the labels to 0 for Negative and 1 for Positive

In [135]:
df['etiqueta'] = df['etiqueta'].apply(lambda x: 0 if x == 'Sentimiento: Negativo' else 1).astype(int)

In [136]:
df['texto']

387072     Consulta: < Todos mis compañeros de escuela ca...
647122     Consulta: < se siente perdido que es lo que qu...
1076575    Consulta: <Bootleg: The Killers - En vivo en L...
842699          Consulta: < @USER pero Kiwein lo compensó>\n
64125      Consulta: < @USUARIO lamento escuchar que tu p...
                                 ...                        
714976     Consulta: < @USUARIO Déjame saber lo que piens...
858689     Consulta: < No creo que el cóctel de analgésic...
1081849    Consulta: < Sólo ï¿½18, eso es genial! Puedo v...
179247     Consulta: < @USUARIO oye, leí lo que pasó, lo ...
399629          Consulta: < @USUARIO pero acabo de entrar>\n
Name: texto, Length: 50000, dtype: object

Clean the 'texto' field taking just the sentence between '< >' and delete common innecesary things.

In [137]:
df['texto'] = df['texto'].str.extract(r'<(.*?)>')
df['texto'] = df['texto'].apply(lambda x: x.replace('@USER', '') )
df['texto'] = df['texto'].apply(lambda x: x.replace('@USUARIO', '') )

In [138]:
df['texto']

387072      Todos mis compañeros de escuela cantan feliz ...
647122              se siente perdido que es lo que quiero?!
1076575    Bootleg: The Killers - En vivo en Londres 2008...
842699                               pero Kiwein lo compensó
64125        lamento escuchar que tu perro murió. Eso sie...
                                 ...                        
714976       Déjame saber lo que piensas. Tengo una larga...
858689      No creo que el cóctel de analgésicos que tomé...
1081849     Sólo ï¿½18, eso es genial! Puedo ver mi foto ...
179247       oye, leí lo que pasó, lo siento, te amo, por...
399629                                  pero acabo de entrar
Name: texto, Length: 50000, dtype: object

Reset the index and delete the old one

In [139]:
df.reset_index(drop=True, inplace= True)

In [140]:
df

Unnamed: 0,etiqueta,texto
0,1,Todos mis compañeros de escuela cantan feliz ...
1,0,se siente perdido que es lo que quiero?!
2,1,Bootleg: The Killers - En vivo en Londres 2008...
3,1,pero Kiwein lo compensó
4,0,lamento escuchar que tu perro murió. Eso sie...
...,...,...
49995,1,Déjame saber lo que piensas. Tengo una larga...
49996,0,No creo que el cóctel de analgésicos que tomé...
49997,1,"Sólo ï¿½18, eso es genial! Puedo ver mi foto ..."
49998,0,"oye, leí lo que pasó, lo siento, te amo, por..."


Save our clean dataset in csv

In [141]:
dataset_test_es = df

In [142]:
dataset_test_es.to_csv('./src/dataset_test_es')

## English Dataset

In [143]:
twitter_eng = load_dataset("tweet_eval", "sentiment")

In [144]:
# 23k

df_train = pd.DataFrame(twitter_eng['train'])
df_test = pd.DataFrame(twitter_eng['test'])

In [145]:
twitter_eng = pd.concat([df_train, df_test])

In [146]:
negative_samples = twitter_eng[twitter_eng['label'] == 0]
positive_samples = twitter_eng[twitter_eng['label'] == 2].sample(11065)

In [147]:
twitter_eng = pd.concat([negative_samples, positive_samples])

In [148]:
twitter_eng['label'].value_counts()

label
0    11065
2    11065
Name: count, dtype: int64

In [149]:
twitter_eng

Unnamed: 0,text,label
7,So disappointed in wwe summerslam! I want to s...,0
13,That sucks if you have to take the SATs tomorrow,0
18,Amy Schumer sat down with The Hollywood Report...,0
42,@user how the hell does every one else get to ...,0
50,Thanks manager for putting me on the schedule ...,0
...,...,...
37104,@user love you too bro. Be safe on the road ba...,2
32888,"Just saw ""Soaked In Bleach"". All I can say is ...",2
29563,"""On this day, 44 yrs ago, Zeppelin played thei...",2
5376,I love the Grateful Dead. I do not love weed. ...,2


In [150]:
twitter_eng['label'] = twitter_eng['label'].apply(lambda x: 1 if x == 2 else 0)

In [151]:
twitter_eng['text'] = twitter_eng['text'].apply(lambda x: x.replace('@user', '') )
twitter_eng['text'] = twitter_eng['text'].apply(lambda x: x.replace('br', '') )
twitter_eng['text'] = twitter_eng['text'].apply(lambda x: x.replace('<', '') )
twitter_eng['text'] = twitter_eng['text'].apply(lambda x: x.replace('>', '') )

In [152]:
twitter_eng['text']

7        So disappointed in wwe summerslam! I want to s...
13        That sucks if you have to take the SATs tomorrow
18       Amy Schumer sat down with The Hollywood Report...
42        how the hell does every one else get to keep ...
50       Thanks manager for putting me on the schedule ...
                               ...                        
37104     love you too o. Be safe on the road back home...
32888    Just saw "Soaked In Bleach". All I can say is ...
29563    "On this day, 44 yrs ago, Zeppelin played thei...
5376     I love the Grateful Dead. I do not love weed. ...
21025    Ghoncheh Ghavami's day in court will take plac...
Name: text, Length: 22130, dtype: object

Long sentences

In [153]:
imdb = load_dataset("imdb")

In [154]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [155]:
df_imdb = df = pd.DataFrame(imdb['train'])

In [156]:
df_imdb['label'].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [157]:
df_imdb['text'] = df_imdb['text'].apply(lambda x: x.replace('\\', '') )
df_imdb['text'] = df_imdb['text'].apply(lambda x: x.replace('br', '') )
df_imdb['text'] = df_imdb['text'].apply(lambda x: x.replace('<', '') )
df_imdb['text'] = df_imdb['text'].apply(lambda x: x.replace('>', '') )

In [158]:
df_imdb['text']

0        I rented I AM CURIOUS-YELLOW from my video sto...
1        "I Am Curious: Yellow" is a risible and preten...
2        If only to avoid making this type of film in t...
3        This film was probably inspired by Godard's Ma...
4        Oh, other...after hearing about this ridiculou...
                               ...                        
24995    A hit at the time but now better categorised a...
24996    I love this movie like no other. Another time ...
24997    This film and it's sequel Barry Mckenzie holds...
24998    'The Adventures Of Barry McKenzie' started lif...
24999    The story centers around Barry McKenzie who mu...
Name: text, Length: 25000, dtype: object

In [159]:
twitter_eng

Unnamed: 0,text,label
7,So disappointed in wwe summerslam! I want to s...,0
13,That sucks if you have to take the SATs tomorrow,0
18,Amy Schumer sat down with The Hollywood Report...,0
42,how the hell does every one else get to keep ...,0
50,Thanks manager for putting me on the schedule ...,0
...,...,...
37104,love you too o. Be safe on the road back home...,1
32888,"Just saw ""Soaked In Bleach"". All I can say is ...",1
29563,"""On this day, 44 yrs ago, Zeppelin played thei...",1
5376,I love the Grateful Dead. I do not love weed. ...,1


In [160]:
dataset_test_en = pd.concat([df_imdb, twitter_eng])

In [161]:
dataset_test_en['label'].value_counts()

label
0    23565
1    23565
Name: count, dtype: int64

In [162]:
dataset_test_en.to_csv('./src/dataset_test_en')

In [166]:
dataset_test_es.rename(columns={'texto': 'text', 'etiqueta': 'label'}, inplace=True)

In [167]:
mixed_dataset = pd.concat([dataset_test_en, dataset_test_es])

In [171]:
mixed_dataset

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, other...after hearing about this ridiculou...",0
...,...,...
49995,Déjame saber lo que piensas. Tengo una larga...,1
49996,No creo que el cóctel de analgésicos que tomé...,0
49997,"Sólo ï¿½18, eso es genial! Puedo ver mi foto ...",1
49998,"oye, leí lo que pasó, lo siento, te amo, por...",0


In [170]:
mixed_dataset['label'].value_counts()

label
0    48788
1    48342
Name: count, dtype: int64

In [172]:
mixed_dataset.to_csv('./src/dataset_test_mixed')

Other things

In [174]:
from datasets import list_datasets

# List of datasets included in the 'dataset' library
datasets_list = list_datasets()

# Looking for 'spanish' in a data set name
spanish_datasets = [dataset_name for dataset_name in datasets_list if 'spanish' in dataset_name.lower()]

print(*spanish_datasets, sep='\n')

large_spanish_corpus
spanish_billion_words
PereLluis13/spanish_speech_text
fvillena/spanish_diagnostics
jhonparra18/spanish_billion_words_clean
jorge-henao/disco_poetry_spanish
IIC/spanish_biomedical_crawled_corpus
IIC/lfqa_spanish
josearangos/spanish-calls-corpus-Home
josearangos/spanish-calls-corpus-Caribbean
josearangos/spanish-calls-corpus-Friends
sayalaruano/FakeNewsCorpusSpanish
sayalaruano/FakeNewsSpanish_Kaggle1
sayalaruano/FakeNewsSpanish_Kaggle2
wesamhaddad14/spanishNLP
IIC/spanish_biomedical_crawled_corpus_splitted
andreamorgar/spanish_poetry
hackathon-pln-es/Axolotl-Spanish-Nahuatl
hackathon-pln-es/disco_spanish_poetry
hackathon-pln-es/spanish-poetry-dataset
hackathon-pln-es/spanish-to-quechua
mathigatti/spanish_imdb_synopsis
rockdrigoma/spanish-nahuatl-flagging
pysentimiento/spanish-targeted-sentiment-headlines
Nexdata/Spanish_Conversational_Speech_Data_by_Mobile_Phone
Nexdata/Spanish_Speech_Data_by_Mobile_Phone
Nexdata/Spanish_Speech_Data_by_Mobile_Phone_Reading
Nexdata/S

# Some Spanish Datasets

'BrunoGR/Twitter_Sentiment_Analysis_Train_Corpus_in_Spanish' # negative and positive --------- more than 130k row in total (train and test)

'pysentimiento/spanish-targeted-sentiment-headlines' # A few rows 0 NEG, 1 NEU and 2 POS --------- 2k

'sepidmnorozy/Spanish_sentiment' # A few rows (2k) and disbalanced class

'beltrewilton/punta-cana-spanish-reviews' # 1 to 5 rate

'large_spanish_corpus' # General sentences in spanish

'BrunoGR/emotional_response_spanish_dataset' # Not labels just emotions
