# Conexión a Google Drive

In [3]:
# edit this to the correct PATH
# %cd drive/MyDrive/my_folder/path

In [None]:
%cd drive/MyDrive/'Colab Notebooks/TTT-2020 1-16'

In [5]:
%pwd

'/content/drive/My Drive/Colab Notebooks/TTT-2020 1-16'

# Utils

## tweet_preprocessing()

In [6]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tweet_preprocessing_1(raw_tweet):
  """
  Lowercase a raw tweet, replace mentions, hashtags and URLs with placeholder
  tokens, strip unsupported symbols and tokenize the result.

  Placeholders:  @user -> '_usr' ; #topic -> '_htg' ; http(s)://... -> '_url'

  input:
  raw_tweet   - str, original tweet text

  output:
  tokens      - list of tokens, as returned by nltk's word_tokenize
  """
  import re

  preprocessed_tweet = raw_tweet.lower()
  preprocessed_tweet = re.sub(r"@\S+", "_usr", preprocessed_tweet)                # user_names handling
  preprocessed_tweet = re.sub(r"#\S+", "_htg", preprocessed_tweet)                # hashtag handling
  preprocessed_tweet = re.sub(r"https?://\S+", "_url", preprocessed_tweet)        # url's handling
  # Keep lowercase letters (text is already lowercased), accented vowels, 'ñ', digits,
  # whitespace, '?', '!', '.', ',' and the underscore used by the placeholders above.
  # The previous range 'A-z' also let '[', '\', ']', '^' and '`' through by accident.
  preprocessed_tweet = re.sub(r"[^a-z_\sáéíóúñ?!\.,\d]", '', preprocessed_tweet)
  preprocessed_tweet = word_tokenize(preprocessed_tweet)

  return preprocessed_tweet

def tweet_preprocessing(raw_tweet, format):
  """
  Normalize a raw tweet and split it into tokens.

  Steps, in order: lowercasing; laughter variants -> 'risa'; an '@' inside a
  word -> 'o'; @user -> 'nombre'; hashtag / URL replacement (see `format`);
  removal of every character that is not a-z, an accented vowel, 'ñ', '?',
  '!' or whitespace; collapsing of repeated characters; tokenization with
  nltk's word_tokenize.

  input:
  raw_tweet   - str, original tweet text
  format      - int, only affects how hashtags and URLs are handled
                  1:  #some_topic -> ' _htg '     ;  https://some_link.com -> ' _url '
                  2:  #some_topic -> whitespace   ;  https://some_link.com -> whitespace
                the symbol-removal step runs afterwards and strips the
                underscore, so with format 1 the final tokens are 'htg' / 'url'

  output:
  tokens      - list of tokens, as returned by word_tokenize

  raises:
  ValueError  - if format is neither 1 nor 2
  """
  import re

  if format == 1:
    hashtag_replace = ' _htg '
    url_replace = ' _url '
  elif format == 2:
    hashtag_replace = '  '
    url_replace = '  '
  else:
    # An unknown format used to surface only later, as an UnboundLocalError.
    raise ValueError("format must be 1 or 2, got %r" % (format,))

  # trailing padding, kept from the original implementation
  raw_tweet += "  "

  # lowercasing
  preprocessed_tweet = raw_tweet.lower()
  # laughing variants
  # NOTE(review): every pattern needs at least one character on each side of its
  # core, so short forms such as 'jaja' or 'haha' are left untouched, and the whole
  # whitespace-delimited chunk containing the match is replaced.
  preprocessed_tweet = re.sub(r"\S+j[aeiou]j\S+", ' risa ', preprocessed_tweet)    # jajajajajjaj, jejeje, jiijijijij
  preprocessed_tweet = re.sub(r"\S+jsj\S+", ' risa ', preprocessed_tweet)          # jsjsjsj
  preprocessed_tweet = re.sub(r"\S+hah\S+", ' risa ', preprocessed_tweet)          # hahahahah
  preprocessed_tweet = re.sub(r"\S+ksk\S+", ' risa ', preprocessed_tweet)          # ksksksksk
  # inclusive language (raw string: '\S' is not a valid escape in a plain literal)
  preprocessed_tweet = re.sub(r'(?<=\S)@(?=\S)', 'o', preprocessed_tweet)          # alumn@s -> alumnos
  # usernames
  preprocessed_tweet = re.sub(r"@\S+", " nombre ", preprocessed_tweet)             # @username -> nombre
  # hashtags
  preprocessed_tweet = re.sub(r"#\S+", hashtag_replace, preprocessed_tweet)        # #some_topic -> hashtag_replace
  # url's
  preprocessed_tweet = re.sub(r"https?://\S+", url_replace, preprocessed_tweet)    # https://some_link.com -> url_replace
  # delete digits and non alphabetic symbols (accented vowels and 'ñ' are kept)
  preprocessed_tweet = re.sub(r"[^a-z\sáéíóúñ?!]", '', preprocessed_tweet)         # 'juan ramón tiene 2 pesos $' -> 'juan ramón tiene  pesos '
  # repeated characters: anything except r, l, c, e is collapsed to a single occurrence
  preprocessed_tweet = re.sub(r'([^rlce])(?=\1)', '', preprocessed_tweet)          # 'repetttidooo' -> 'repetido'
  preprocessed_tweet = re.sub(r'[e]{3,}', 'e', preprocessed_tweet)                 # 'jodeeeer' -> 'joder'
  preprocessed_tweet = re.sub(r'[r]{3,}', 'rr', preprocessed_tweet)                # 'carrrrro' -> 'carro'
  preprocessed_tweet = re.sub(r'[l]{3,}', 'll', preprocessed_tweet)                # 'llllamada' -> 'llamada'
  preprocessed_tweet = re.sub(r'[c]{3,}', 'cc', preprocessed_tweet)                # 'cocccion' -> 'coccion'
  preprocessed_tweet = word_tokenize(preprocessed_tweet)

  return preprocessed_tweet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
tweet_preprocessing("Caricachupas 👏🏼👏🏼 Presenta 👏🏼👏🏼 Nombres de 👏🏼👏...", 2)

['caricachupas', 'presenta', 'nombres', 'de']

## mapToFiveClassesFormat()

In [8]:
def mapToFiveClassesFormat(labels):
  '''
  Collapse the labels of tasks A (HS), B1 (TR) and B2 (AG) into a single
  label in five_classes_format.

    [HS = 0                ]  ->  0   (TR and AG are not read)
    [HS = 1, TR = 0, AG = 0]  ->  1
    [HS = 1, TR = 0, AG = 1]  ->  2
    [HS = 1, TR = 1, AG = 0]  ->  3
    [HS = 1, TR = 1, AG = 1]  ->  4

  input:
  labels       - object indexable by the keys 'HS', 'TR' and 'AG'
                 (e.g. a dict or a DataFrame row)

  output
  label        - int, label in five_classes_format; None when a value that
                 is read is neither 0 nor 1

  '''
  hate = labels['HS']
  if hate == 0:
    return 0
  if hate != 1:
    return None

  # Non-aggressive label for the given target flag; the aggressive one is the next integer.
  target = labels['TR']
  if target == 0:
    base = 1
  elif target == 1:
    base = 3
  else:
    return None

  aggressive = labels['AG']
  if aggressive == 0:
    return base
  if aggressive == 1:
    return base + 1
  return None
      

# train-set

## Importar dataset desde archivos CSV

In [9]:
import pandas as pd
import csv

# Both splits are read with quoting disabled (csv.QUOTE_NONE), so quote characters
# inside the tweets are treated as ordinary text rather than as field quoting.
df_1 = pd.read_csv("./dataset_files/CSV/train_es.tsv", 
                   sep='\t', 
                   lineterminator='\n', 
                   quoting=csv.QUOTE_NONE)

df_2 = pd.read_csv("./dataset_files/CSV/dev_es.tsv", 
                   sep='\t', 
                   lineterminator='\n', 
                   quoting=csv.QUOTE_NONE)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
df_train = pd.concat([df_1, df_2], ignore_index=True)

In [None]:
# 5000 training instances should have been loaded
df_train.shape

(5000, 5)

In [None]:
df_train.head()

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0


In [None]:
df_train.AG.value_counts()

0    3322
1    1678
Name: AG, dtype: int64

In [None]:
df_train.loc[:5,'HS':'AG']

Unnamed: 0,HS,TR,AG
0,1,0,0
1,1,0,0
2,0,0,0
3,0,0,0
4,0,0,0
5,0,0,0


## Preprocesamiento y tokenización

In [None]:
# Tokenize every tweet once per hashtag/URL handling mode of tweet_preprocessing.
df_train['preprocessed_text_1'] = df_train['text'].apply(tweet_preprocessing, args=(1,))
df_train['preprocessed_text_2'] = df_train['text'].apply(tweet_preprocessing, args=(2,))

In [None]:
df_train.head()

Unnamed: 0,id,text,HS,TR,AG,preprocessed_text_1,preprocessed_text_2
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0,"[easyjet, quiere, duplicar, el, número, de, mu...","[easyjet, quiere, duplicar, el, número, de, mu..."
1,20002,El gobierno debe crear un control estricto de ...,1,0,0,"[el, gobierno, debe, crear, un, control, estri...","[el, gobierno, debe, crear, un, control, estri..."
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0,"[yo, veo, a, mujeres, destruidas, por, acoso, ...","[yo, veo, a, mujeres, destruidas, por, acoso, ..."
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0,"[yo, soy, respetuoso, con, los, demás, sólamen...","[yo, soy, respetuoso, con, los, demás, sólamen..."
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0,"[antonio, caballero, y, como, ser, de, mal, gu...","[antonio, caballero, y, como, ser, de, mal, gu..."


## Etiqueta para la tarea conjunta AB

In [None]:
# Collapse the three task labels of each row into one five-class label.
df_train['HTA'] = df_train[['HS', 'TR', 'AG']].apply(mapToFiveClassesFormat, axis=1)

In [None]:
df_train.head()

Unnamed: 0,id,text,HS,TR,AG,preprocessed_text_1,preprocessed_text_2,HTA
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0,"[easyjet, quiere, duplicar, el, número, de, mu...","[easyjet, quiere, duplicar, el, número, de, mu...",1
1,20002,El gobierno debe crear un control estricto de ...,1,0,0,"[el, gobierno, debe, crear, un, control, estri...","[el, gobierno, debe, crear, un, control, estri...",1
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0,"[yo, veo, a, mujeres, destruidas, por, acoso, ...","[yo, veo, a, mujeres, destruidas, por, acoso, ...",0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0,"[yo, soy, respetuoso, con, los, demás, sólamen...","[yo, soy, respetuoso, con, los, demás, sólamen...",0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0,"[antonio, caballero, y, como, ser, de, mal, gu...","[antonio, caballero, y, como, ser, de, mal, gu...",0


## Train folds

In [None]:
from sklearn import model_selection

n_folds = 7
# Fixed seed: without it the shuffle below, and therefore the fold assignment,
# differs on every fresh run of the notebook.
FOLD_SEED = 42

df_train['kfold'] = -1
df_train = df_train.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True)

# Stratify on the joint five-class label (alternative: df_train['HS'].values).
y = df_train['HTA'].values
skf = model_selection.StratifiedKFold(n_splits=n_folds)

# Each row is tagged with the index of the fold in which it belongs to the validation part.
for fold_id, (_, valid_idx) in enumerate(skf.split(X=df_train, y=y)):
    df_train.loc[valid_idx, "kfold"] = fold_id

#df_train.to_csv("./dataset_files/CSV/train_folds.csv", index=False)

In [None]:
df_train.head()

Unnamed: 0,id,text,HS,TR,AG,preprocessed_text_1,preprocessed_text_2,HTA,kfold
0,22083,"@pnique Inmigrante motorizado,eres una rata de...",1,1,1,"[nombre, inmigrante, motorizadoeres, una, rata...","[nombre, inmigrante, motorizadoeres, una, rata...",4,0
1,20095,la sacaron justo a l dia siguiente del documen...,0,0,0,"[la, sacaron, justo, a, l, dia, siguiente, del...","[la, sacaron, justo, a, l, dia, siguiente, del...",0,0
2,22568,@FioreSalo Callate vs puta,1,1,1,"[nombre, callate, vs, puta]","[nombre, callate, vs, puta]",4,0
3,22107,@Pontifex_es @Pontifex @valealazraki precisame...,0,0,0,"[nombre, nombre, nombre, precisamente, en, mi,...","[nombre, nombre, nombre, precisamente, en, mi,...",0,0
4,20895,QUE haces con cc mogolica pili — Puta tomi cal...,1,1,1,"[que, haces, con, cc, mogolica, pili, puta, to...","[que, haces, con, cc, mogolica, pili, puta, to...",4,0


### Verificar similitud en las distribuciones de los diferentes 'folds'

In [None]:
# Label counts per fold, used to compare the label distributions across folds.
HS_frequencies = []
TR_frequencies = []
AG_frequencies = []
HTA_frequencies = []

# n_folds is the fold count set when the folds were created; the name K that was
# used here is not defined in the visible notebook and raised a NameError on a fresh kernel.
for i in range(n_folds):
  fold_rows = df_train.loc[df_train.kfold == i]
  HS_frequencies.append(fold_rows.HS.value_counts())
  TR_frequencies.append(fold_rows.TR.value_counts())
  AG_frequencies.append(fold_rows.AG.value_counts())
  HTA_frequencies.append(fold_rows.HTA.value_counts())

In [None]:
HS_frequencies

[0    417
 1    298
 Name: HS, dtype: int64, 0    417
 1    298
 Name: HS, dtype: int64, 0    417
 1    297
 Name: HS, dtype: int64, 0    417
 1    297
 Name: HS, dtype: int64, 0    418
 1    296
 Name: HS, dtype: int64, 0    418
 1    296
 Name: HS, dtype: int64, 0    417
 1    297
 Name: HS, dtype: int64]

In [None]:
TR_frequencies

[0    533
 1    182
 Name: TR, dtype: int64, 0    533
 1    182
 Name: TR, dtype: int64, 0    533
 1    181
 Name: TR, dtype: int64, 0    533
 1    181
 Name: TR, dtype: int64, 0    534
 1    180
 Name: TR, dtype: int64, 0    534
 1    180
 Name: TR, dtype: int64, 0    534
 1    180
 Name: TR, dtype: int64]

In [None]:
AG_frequencies

In [None]:
HTA_frequencies

[0    417
 4    169
 2     71
 1     45
 3     13
 Name: HTA, dtype: int64, 0    417
 4    169
 2     71
 1     45
 3     13
 Name: HTA, dtype: int64, 0    417
 4    169
 2     71
 1     45
 3     12
 Name: HTA, dtype: int64, 0    417
 4    169
 2     71
 1     45
 3     12
 Name: HTA, dtype: int64, 0    418
 4    168
 2     71
 1     45
 3     12
 Name: HTA, dtype: int64, 0    418
 4    168
 2     71
 1     45
 3     12
 Name: HTA, dtype: int64, 0    417
 4    168
 2     72
 1     45
 3     12
 Name: HTA, dtype: int64]

## Guardar el dataset preprocesado

In [None]:
# Column order of the training frame that gets persisted.
columns_list = [
    'id', 'text',
    'preprocessed_text_1', 'preprocessed_text_2',
    'HS', 'TR', 'AG', 'HTA',
    'kfold',
]
df_train = df_train.loc[:, columns_list]

In [None]:
df_train.head(5)

Unnamed: 0,id,text,preprocessed_text_1,preprocessed_text_2,HS,TR,AG,HTA,kfold
0,22083,"@pnique Inmigrante motorizado,eres una rata de...","[nombre, inmigrante, motorizadoeres, una, rata...","[nombre, inmigrante, motorizadoeres, una, rata...",1,1,1,4,0
1,20095,la sacaron justo a l dia siguiente del documen...,"[la, sacaron, justo, a, l, dia, siguiente, del...","[la, sacaron, justo, a, l, dia, siguiente, del...",0,0,0,0,0
2,22568,@FioreSalo Callate vs puta,"[nombre, callate, vs, puta]","[nombre, callate, vs, puta]",1,1,1,4,0
3,22107,@Pontifex_es @Pontifex @valealazraki precisame...,"[nombre, nombre, nombre, precisamente, en, mi,...","[nombre, nombre, nombre, precisamente, en, mi,...",0,0,0,0,0
4,20895,QUE haces con cc mogolica pili — Puta tomi cal...,"[que, haces, con, cc, mogolica, pili, puta, to...","[que, haces, con, cc, mogolica, pili, puta, to...",1,1,1,4,0


In [None]:
# compression is passed by keyword: positional arguments other than the path
# are deprecated in recent pandas releases. None means no compression.
df_train.to_pickle('./dataset_files/preprocessed_train_dataset_7_folds.data', compression=None)

In [None]:
import pandas as pd

In [None]:
# Read the saved frame back, uncompressed, to check the round trip.
reloaded_df_train = pd.read_pickle(
    './dataset_files/preprocessed_train_dataset_7_folds.data',
    compression=None,
)

In [None]:
reloaded_df_train.head()

Unnamed: 0,id,text,preprocessed_text_1,preprocessed_text_2,HS,TR,AG,HTA,kfold
0,22083,"@pnique Inmigrante motorizado,eres una rata de...","[nombre, inmigrante, motorizadoeres, una, rata...","[nombre, inmigrante, motorizadoeres, una, rata...",1,1,1,4,0
1,20095,la sacaron justo a l dia siguiente del documen...,"[la, sacaron, justo, a, l, dia, siguiente, del...","[la, sacaron, justo, a, l, dia, siguiente, del...",0,0,0,0,0
2,22568,@FioreSalo Callate vs puta,"[nombre, callate, vs, puta]","[nombre, callate, vs, puta]",1,1,1,4,0
3,22107,@Pontifex_es @Pontifex @valealazraki precisame...,"[nombre, nombre, nombre, precisamente, en, mi,...","[nombre, nombre, nombre, precisamente, en, mi,...",0,0,0,0,0
4,20895,QUE haces con cc mogolica pili — Puta tomi cal...,"[que, haces, con, cc, mogolica, pili, puta, to...","[que, haces, con, cc, mogolica, pili, puta, to...",1,1,1,4,0


In [None]:
reloaded_df_train.AG.value_counts()

0    3322
1    1678
Name: AG, dtype: int64

In [None]:
reloaded_df_train.HTA.value_counts()

0    2921
4    1180
2     498
1     315
3      86
Name: HTA, dtype: int64

# test-set

## Importar dataset desde archivos CSV

In [None]:
import pandas as pd
import csv

# Column labels are supplied through names=, so the first line of the file is read as data.
df_test = pd.read_csv(
    "./dataset_files/CSV/test_es.tsv",
    sep='\t',
    lineterminator='\n',
    quoting=csv.QUOTE_NONE,
    names=['id', 'text', 'HS', 'TR', 'AG'],
)

In [None]:
# 1600 test instances should have been loaded
df_test.shape

(1600, 5)

In [None]:
df_test.head()

Unnamed: 0,id,text,HS,TR,AG
0,30344,#CadaMañana cállate la puta que me pario Kohan...,0,0,0
1,30466,Estas navidades mi polla mereces,0,0,0
2,31084,Si no aprobas te pego una cojida que no te la ...,1,1,1
3,34552,"@AlecitoGamer @falklands_utd A, no entendiste ...",0,0,0
4,32538,"@deborahhq1973 Es normal, en Cataluña los Inde...",1,0,1


In [None]:
# df_test has no 'HTA' column at this point of a top-to-bottom run, so the
# five-class label is derived on the fly for this distribution check.
df_test.loc[:, ['HS', 'TR', 'AG']].apply(mapToFiveClassesFormat, axis=1).value_counts()

0    940
4    371
1    134
2    103
3     52
Name: HTA, dtype: int64

## Preprocesamiento y tokenización

In [None]:
# Tokenize every tweet once per hashtag/URL handling mode of tweet_preprocessing.
df_test['preprocessed_text_1'] = df_test['text'].apply(tweet_preprocessing, args=(1,))
df_test['preprocessed_text_2'] = df_test['text'].apply(tweet_preprocessing, args=(2,))

In [None]:
df_test.head()

Unnamed: 0,id,text,HS,TR,AG,preprocessed_text_1,preprocessed_text_2
0,30344,#CadaMañana cállate la puta que me pario Kohan...,0,0,0,"[htg, cállate, la, puta, que, me, pario, kohan...","[cállate, la, puta, que, me, pario, kohan, vos..."
1,30466,Estas navidades mi polla mereces,0,0,0,"[estas, navidades, mi, polla, mereces]","[estas, navidades, mi, polla, mereces]"
2,31084,Si no aprobas te pego una cojida que no te la ...,1,1,1,"[si, no, aprobas, te, pego, una, cojida, que, ...","[si, no, aprobas, te, pego, una, cojida, que, ..."
3,34552,"@AlecitoGamer @falklands_utd A, no entendiste ...",0,0,0,"[nombre, nombre, a, no, entendiste, nada, ud, ...","[nombre, nombre, a, no, entendiste, nada, ud, ..."
4,32538,"@deborahhq1973 Es normal, en Cataluña los Inde...",1,0,1,"[nombre, es, normal, en, cataluña, los, indepe...","[nombre, es, normal, en, cataluña, los, indepe..."


## Etiqueta para la tarea conjunta AB

In [None]:
# Collapse the three task labels of each row into one five-class label.
df_test['HTA'] = df_test[['HS', 'TR', 'AG']].apply(mapToFiveClassesFormat, axis=1)

In [None]:
df_test.head()

Unnamed: 0,id,text,HS,TR,AG,preprocessed_text_1,preprocessed_text_2,HTA
0,30344,#CadaMañana cállate la puta que me pario Kohan...,0,0,0,"[htg, cállate, la, puta, que, me, pario, kohan...","[cállate, la, puta, que, me, pario, kohan, vos...",0
1,30466,Estas navidades mi polla mereces,0,0,0,"[estas, navidades, mi, polla, mereces]","[estas, navidades, mi, polla, mereces]",0
2,31084,Si no aprobas te pego una cojida que no te la ...,1,1,1,"[si, no, aprobas, te, pego, una, cojida, que, ...","[si, no, aprobas, te, pego, una, cojida, que, ...",4
3,34552,"@AlecitoGamer @falklands_utd A, no entendiste ...",0,0,0,"[nombre, nombre, a, no, entendiste, nada, ud, ...","[nombre, nombre, a, no, entendiste, nada, ud, ...",0
4,32538,"@deborahhq1973 Es normal, en Cataluña los Inde...",1,0,1,"[nombre, es, normal, en, cataluña, los, indepe...","[nombre, es, normal, en, cataluña, los, indepe...",2


## Guardar el dataset preprocesado

In [None]:
# Column order of the test frame that gets persisted.
columns_list = [
    'id', 'text',
    'preprocessed_text_1', 'preprocessed_text_2',
    'HS', 'TR', 'AG', 'HTA',
]
df_test = df_test.loc[:, columns_list]

In [None]:
df_test.head(5)

Unnamed: 0,id,text,preprocessed_text_1,preprocessed_text_2,HS,TR,AG,HTA
0,30344,#CadaMañana cállate la puta que me pario Kohan...,"[htg, cállate, la, puta, que, me, pario, kohan...","[cállate, la, puta, que, me, pario, kohan, vos...",0,0,0,0
1,30466,Estas navidades mi polla mereces,"[estas, navidades, mi, polla, mereces]","[estas, navidades, mi, polla, mereces]",0,0,0,0
2,31084,Si no aprobas te pego una cojida que no te la ...,"[si, no, aprobas, te, pego, una, cojida, que, ...","[si, no, aprobas, te, pego, una, cojida, que, ...",1,1,1,4
3,34552,"@AlecitoGamer @falklands_utd A, no entendiste ...","[nombre, nombre, a, no, entendiste, nada, ud, ...","[nombre, nombre, a, no, entendiste, nada, ud, ...",0,0,0,0
4,32538,"@deborahhq1973 Es normal, en Cataluña los Inde...","[nombre, es, normal, en, cataluña, los, indepe...","[nombre, es, normal, en, cataluña, los, indepe...",1,0,1,2


In [None]:
# compression is passed by keyword: positional arguments other than the path
# are deprecated in recent pandas releases. None means no compression.
df_test.to_pickle('./dataset_files/preprocessed_test_dataset.data', compression=None)

# Recursos y sidenotes

### csv.quoting (para cargar los archivos **.tsv** correctamente)

  1. https://stackoverflow.com/questions/35598249/rows-are-lost-when-reading-this-tab-separated-file-with-pandas-read-csv

  2. https://stackoverflow.com/questions/21147058/pandas-to-csv-output-quoting-issue

  3. https://stackoverflow.com/questions/43344241/quoting-parameter-in-pandas-read-csv
