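## Using translations

For each language (Arabic, Turkish, Greek, Danish) this notebook reads the original training TSV, loads the translation JSON files, attaches the `subtask_a` label to every translated row by id, and writes the combined result to `data/translations/<Language>/train.tsv`.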

In [1]:
%load_ext autoreload
%autoreload 2
import os
from datetime import datetime
import fire
import torch
import pandas as pd
from torchtext import data
import torch.nn as nn
from transformers import (
    AdamW, BertForSequenceClassification, BertTokenizer,
    get_constant_schedule_with_warmup
)
import csv

from offenseval.nn import (
    Tokenizer,
    train, evaluate, train_cycle, save_model, load_model, evaluate_dataset
)
from offenseval.datasets import datasets

# Notebook display limits: show up to 200 rows and up to 300 characters per
# cell when a DataFrame is rendered.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 300)

# Use the GPU when torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [34]:
import glob
import csv

# Original Arabic training set, indexed by its first column. QUOTE_NONE tells
# the parser to treat quote characters inside a field as ordinary text.
arabic_train = pd.read_table(
    "../../data/Arabic/offenseval-ar-training-v1.tsv",
    index_col=0,
    quoting=csv.QUOTE_NONE
)

# One labelled frame per translation file; they are concatenated later.
dfs = []
# glob.glob() yields matches in arbitrary, filesystem-dependent order. Sorting
# keeps the row order (and any ids assigned after concatenation) reproducible.
for file in sorted(glob.glob("../../data/translations/Arabic/*training*.json")):
    translation_df = pd.read_json(file).set_index("id")
    # The first non-id column is the one renamed to "tweet" below; presumably
    # it holds the translated text — confirm against the JSON schema.
    text_col = translation_df.columns[0]
    # Column assignment aligns on the index, so each translated row gets the
    # label of the original row with the same id (NaN when the id is absent).
    # NOTE(review): unlike the Turkish/Greek/Danish cells, rows are not
    # filtered to the ids present in arabic_train — confirm that is intended.
    translation_df["subtask_a"] = arabic_train["subtask_a"]
    translation_df = translation_df.reset_index().rename(
        columns={text_col: "tweet", "id": "original_id"}
    )

    dfs.append(translation_df)

In [46]:
# Stack the per-file translation frames and number the rows 1..N; that running
# number is written out as the "id" column of the combined TSV.
# NOTE(review): the source TSV is read with QUOTE_NONE while this write uses
# to_csv's default quoting — confirm downstream readers expect that.
arabic_df_train = pd.concat(dfs, ignore_index=True)
arabic_df_train.index = arabic_df_train.index + 1

arabic_df_train.to_csv(
    "../../data/translations/Arabic/train.tsv",
    sep="\t",
    index_label="id",
)

## Turkish

In [53]:
import glob
import csv

# Original Turkish train file, indexed by its first column. QUOTE_NONE tells
# the parser to treat quote characters inside a field as ordinary text.
turkish_train = pd.read_table(
    "../../data/Turkish/train.tsv",
    index_col=0,
    quoting=csv.QUOTE_NONE
)

# One labelled frame per translation file; they are concatenated later.
dfs = []
# glob.glob() yields matches in arbitrary, filesystem-dependent order. Sorting
# keeps the row order (and any ids assigned after concatenation) reproducible.
for file in sorted(glob.glob("../../data/translations/Turkish/*training*.json")):
    translation_df = pd.read_json(file).set_index("id")
    # Keep only the rows whose ids appear in turkish_train.
    translation_df = translation_df.loc[turkish_train.index]
    # The first non-id column is the one renamed to "tweet" below; presumably
    # it holds the translated text — confirm against the JSON schema.
    text_col = translation_df.columns[0]
    # Column assignment aligns on the index, so each translated row gets the
    # label of the original row with the same id.
    translation_df["subtask_a"] = turkish_train["subtask_a"]

    translation_df = translation_df.reset_index().rename(
        columns={text_col: "tweet", "id": "original_id"}
    )

    dfs.append(translation_df)

In [60]:
# Stack the per-file translation frames and number the rows 1..N; that running
# number is written out as the "id" column of the combined TSV.
# NOTE(review): the source TSV is read with QUOTE_NONE while this write uses
# to_csv's default quoting — confirm downstream readers expect that.
turkish_df_trans = pd.concat(dfs, ignore_index=True)
turkish_df_trans.index = turkish_df_trans.index + 1

turkish_df_trans.to_csv(
    "../../data/translations/Turkish/train.tsv",
    sep="\t",
    index_label="id",
)

## Greek

In [63]:
import glob
import csv

# Original Greek train file, indexed by its first column. QUOTE_NONE tells
# the parser to treat quote characters inside a field as ordinary text.
greek_train = pd.read_table(
    "../../data/Greek/train.tsv",
    index_col=0,
    quoting=csv.QUOTE_NONE
)

# One labelled frame per translation file; they are concatenated later.
dfs = []
# glob.glob() yields matches in arbitrary, filesystem-dependent order. Sorting
# keeps the row order (and any ids assigned after concatenation) reproducible.
for file in sorted(glob.glob("../../data/translations/Greek/*training*.json")):
    translation_df = pd.read_json(file).set_index("id")
    # Keep only the rows whose ids appear in greek_train.
    translation_df = translation_df.loc[greek_train.index]
    # The first non-id column is the one renamed to "tweet" below; presumably
    # it holds the translated text — confirm against the JSON schema.
    text_col = translation_df.columns[0]
    # Column assignment aligns on the index, so each translated row gets the
    # label of the original row with the same id.
    translation_df["subtask_a"] = greek_train["subtask_a"]

    translation_df = translation_df.reset_index().rename(
        columns={text_col: "tweet", "id": "original_id"}
    )

    dfs.append(translation_df)

In [69]:
# Stack the per-file translation frames and number the rows 1..N; that running
# number is written out as the "id" column of the combined TSV.
# NOTE(review): the source TSV is read with QUOTE_NONE while this write uses
# to_csv's default quoting — confirm downstream readers expect that.
greek_df_trans = pd.concat(dfs, ignore_index=True)
greek_df_trans.index = greek_df_trans.index + 1

greek_df_trans.to_csv(
    "../../data/translations/Greek/train.tsv",
    sep="\t",
    index_label="id",
)

## Danish

In [70]:
import glob
import csv

# Original Danish train file, indexed by its first column. QUOTE_NONE tells
# the parser to treat quote characters inside a field as ordinary text.
danish_train = pd.read_table(
    "../../data/Danish/train.tsv",
    index_col=0,
    quoting=csv.QUOTE_NONE
)

# One labelled frame per translation file; they are concatenated later.
dfs = []
# glob.glob() yields matches in arbitrary, filesystem-dependent order. Sorting
# keeps the row order (and any ids assigned after concatenation) reproducible.
for file in sorted(glob.glob("../../data/translations/Danish/*training*.json")):
    translation_df = pd.read_json(file).set_index("id")
    # Keep only the rows whose ids appear in danish_train.
    translation_df = translation_df.loc[danish_train.index]
    # The first non-id column is the one renamed to "tweet" below; presumably
    # it holds the translated text — confirm against the JSON schema.
    text_col = translation_df.columns[0]
    # Column assignment aligns on the index, so each translated row gets the
    # label of the original row with the same id.
    translation_df["subtask_a"] = danish_train["subtask_a"]

    translation_df = translation_df.reset_index().rename(
        columns={text_col: "tweet", "id": "original_id"}
    )

    dfs.append(translation_df)

In [72]:
# Stack the per-file translation frames and number the rows 1..N; that running
# number is written out as the "id" column of the combined TSV.
# NOTE(review): the source TSV is read with QUOTE_NONE while this write uses
# to_csv's default quoting — confirm downstream readers expect that.
danish_df_trans = pd.concat(dfs, ignore_index=True)
danish_df_trans.index = danish_df_trans.index + 1

danish_df_trans.to_csv(
    "../../data/translations/Danish/train.tsv",
    sep="\t",
    index_label="id",
)