In [None]:
# Install the sentencepiece package (presumably needed by MarianTokenizer — confirm).
!pip install sentencepiece

In [None]:
# Install transformers, which provides MarianMTModel / MarianTokenizer imported below.
# NOTE(review): this also installs a second package named 'command' — confirm it is intended.
!pip install transformers command

In [1]:
from transformers import MarianMTModel, MarianTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict
from csv import writer
import pandas as pd
import os

In [2]:
class DontPatronizeMe:
    """Loader for the tab-separated PCL training and test files.

    Each ``load_*`` method parses one file and stores the result on the
    instance (``train_task1_df``, ``train_task2_df`` or ``test_set``); the
    methods themselves return ``None``.
    """

    def __init__(self, train_path, test_path):
        """Remember the data locations; nothing is read until ``load_*`` is called.

        Args:
            train_path: Directory joined with the training file names.
            test_path: Path of the test file read by ``load_test``.
        """
        self.train_path = train_path
        self.test_path = test_path
        self.train_task1_df = None
        self.train_task2_df = None
        self.test_set = None

    def load_task1(self):
        """Load the task 1 training set and convert the tags into binary labels.

        Paragraphs whose original label is '0' or '1' are negative examples
        of PCL and get ``label`` 0; every other original label (2, 3 or 4 in
        the data) is a positive example and gets ``label`` 1.

        The result is stored in ``self.train_task1_df`` as a DataFrame with
        the columns par_id, art_id, keyword, country, text (lower-cased),
        label and orig_label. Every line of the file is parsed; no header
        lines are skipped.

        Raises:
            IndexError: If a line has fewer than five tab-separated fields.
        """
        rows = []
        # NOTE(review): the file name below is an absolute Windows path. On
        # Windows os.path.join() discards ``self.train_path`` when a later
        # component is absolute, so ``train_path`` has no effect here; on
        # other platforms the literal is appended as a single file name.
        # Kept unchanged because existing callers rely on this location.
        with open(os.path.join(self.train_path, 'C:\\Users\\Larisa\\Downloads\\machine_translation\\Project\\data\\dontpatronizeme_pcl.tsv')) as f:
            for line in f:
                # Split once per line instead of once per field.
                fields = line.strip().split('\t')
                orig_label = fields[-1]
                rows.append({
                    'par_id': fields[0],
                    'art_id': fields[1],
                    'keyword': fields[2],
                    'country': fields[3],
                    'text': fields[4].lower(),
                    'label': 0 if orig_label in ('0', '1') else 1,
                    'orig_label': orig_label,
                })
        self.train_task1_df = pd.DataFrame(
            rows,
            columns=['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label'],
        )

    def load_task2(self, return_one_hot=True):
        """Load the task 2 categories file, one row per distinct paragraph.

        The first four lines of ``dontpatronizeme_categories.tsv`` (looked up
        inside ``self.train_path``) are skipped. The remaining lines are
        grouped by (par_id, art_id, text, keyword, country) and the distinct
        category ids of each group are collected in order of first
        appearance.

        The result is stored in ``self.train_task2_df`` as a DataFrame with
        the columns par_id, art_id, text (lower-cased), keyword, country and
        label.

        Args:
            return_one_hot: When true, ``label`` holds a vector with seven
                positions (1 = category present, 0 = absent), one position
                per category id. When false, ``label`` holds the list of
                category ids.

        Raises:
            KeyError: If a line carries a category name that is not mapped.
        """
        tag2id = {
            'Unbalanced_power_relations': 0,
            'Shallow_solution': 1,
            'Presupposition': 2,
            'Authority_voice': 3,
            'Metaphors': 4,
            'Compassion': 5,
            'The_poorer_the_merrier': 6,
        }
        print('Map of label to numerical label:')
        print(tag2id)
        data = defaultdict(list)
        with open(os.path.join(self.train_path, 'dontpatronizeme_categories.tsv')) as f:
            for line in f.readlines()[4:]:
                fields = line.strip().split('\t')
                key = (fields[0], fields[1], fields[2].lower(), fields[3], fields[4])
                # The category name is the second-to-last field; the last
                # field (number of annotators) is not used.
                label_id = tag2id[fields[-2]]
                if label_id not in data[key]:
                    data[key].append(label_id)

        labels = list(data.values())
        if return_one_hot:
            # Fix the class list so the vector always has one position per
            # known category, even when a category never occurs in the file.
            binarizer = MultiLabelBinarizer(classes=sorted(tag2id.values()))
            labels = binarizer.fit_transform(labels)

        records = [key + (label,) for key, label in zip(data, labels)]
        self.train_task2_df = pd.DataFrame(
            records,
            columns=['par_id', 'art_id', 'text', 'keyword', 'country', 'label'],
        )

    def load_test(self):
        """Load the lower-cased test paragraphs into ``self.test_set``.

        The first four lines of ``self.test_path`` are skipped; from every
        remaining line the fourth tab-separated field is kept.

        Raises:
            IndexError: If a line has fewer than four tab-separated fields.
        """
        with open(self.test_path) as f:
            self.test_set = [
                line.strip().split('\t')[3].lower()
                for line in f.readlines()[4:]
            ]


In [3]:
def translate(model, tokenizer, text):
    """Translate a single string and return the decoded result.

    Only the first 512 characters of ``text`` are passed on. The clipped
    text is tokenized as a one-element batch, run through
    ``model.generate`` and decoded with special tokens skipped.

    Args:
        model: Object exposing ``generate(**encoded_batch)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        text: Source text to translate.

    Returns:
        The first (and only) decoded sequence.
    """
    encoded = tokenizer([text[:512]], return_tensors="pt")
    decoded = tokenizer.batch_decode(
        model.generate(**encoded),
        skip_special_tokens=True,
    )
    return decoded[0]

In [4]:
def model_tokenizer(language, src='en'):
    """Load the pretrained Marian model and tokenizer for one language pair.

    The checkpoint name is built as ``Helsinki-NLP/opus-mt-{src}-{language}``
    and passed to ``from_pretrained`` of both classes.

    Args:
        language: Target language code used in the checkpoint name.
        src: Source language code used in the checkpoint name. Defaults to
            'en', which is what existing single-argument callers get.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_name = f'Helsinki-NLP/opus-mt-{src}-{language}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    return model, tokenizer

In [5]:
def write_prediction(language, column_translate):
    """Translate one column of the task 1 data and write all rows to a TSV file.

    The task 1 DataFrame is loaded through ``DontPatronizeMe.load_task1``.
    For each row the value in ``column_translate`` is replaced by its
    translation and the whole row is written, tab-separated and without a
    header row, to ``translated_<language>.tsv`` in the current working
    directory.

    Args:
        language: Target language code; selects the model and names the
            output file.
        column_translate: Name of the DataFrame column to translate.
    """
    dpm = DontPatronizeMe('.', 'C:\\Users\\Larisa\\Downloads\\machine_translation\\Project\\data\\dontpatronizeme_pcl.tsv')
    # Load the subtask 1 data and take the resulting DataFrame.
    dpm.load_task1()
    train_df = dpm.train_task1_df
    # Model and tokenizer depend on the target language.
    model, tokenizer = model_tokenizer(language)
    pred_csv_path = "translated_{}.tsv".format(language)

    # newline='' lets the csv writer control line endings itself; without it
    # text mode on Windows turns the writer's '\r\n' into '\r\r\n', which
    # shows up as an empty line after every row.
    with open(pred_csv_path, 'wt', encoding="utf-8-sig", newline='') as write_obj:
        csv_writer = writer(write_obj, delimiter='\t')
        # iterrows() yields each row as a Series; translate one cell, then
        # write the row's values in column order.
        for _, row in train_df.iterrows():
            row[column_translate] = translate(model, tokenizer, row[column_translate])
            csv_writer.writerow(row)
 

In [6]:
# Translate the task 1 'text' column to target language 'de'; output: translated_de.tsv
write_prediction('de', 'text')

In [None]:
def write_prediction_encoding(language, column_translate):
    """Translate one column of the task 1 data and write all rows as UTF-8 TSV.

    The task 1 DataFrame is loaded through ``DontPatronizeMe.load_task1``.
    For each row the value in ``column_translate`` is replaced by its
    translation and the whole row is written, tab-separated and without a
    header row, to ``translated_<language>.tsv`` in the current working
    directory. The file is encoded as ``utf-8-sig`` (UTF-8 with a BOM).

    Args:
        language: Target language code; selects the model and names the
            output file.
        column_translate: Name of the DataFrame column to translate.
    """
    dpm = DontPatronizeMe('.', 'C:\\Users\\Larisa\\Downloads\\machine_translation\\Project\\data\\dontpatronizeme_pcl.tsv')
    # Load the subtask 1 data and take the resulting DataFrame.
    dpm.load_task1()
    train_df = dpm.train_task1_df
    # Model and tokenizer depend on the target language.
    model, tokenizer = model_tokenizer(language)
    pred_csv_path = "translated_{}.tsv".format(language)

    # newline='' lets the csv writer control line endings itself; without it
    # text mode on Windows turns the writer's '\r\n' into '\r\r\n', which
    # shows up as an empty line after every row.
    with open(pred_csv_path, 'wt', encoding="utf-8-sig", newline='') as write_obj:
        csv_writer = writer(write_obj, delimiter='\t')
        # iterrows() yields each row as a Series; translate one cell, then
        # write the row's values in column order.
        for _, row in train_df.iterrows():
            row[column_translate] = translate(model, tokenizer, row[column_translate])
            csv_writer.writerow(row)

In [None]:
# Translate the task 1 'text' column to target language 'fr'; output: translated_fr.tsv
write_prediction_encoding('fr', 'text')

In [None]:
# Translate the task 1 'text' column to target language 'it'; output: translated_it.tsv
write_prediction_encoding('it', 'text')

In [7]:
def write_prediction_task2(language, column_translate):
    """Translate one column of the task 2 categories file and write a TSV.

    The categories file is read with pandas (no header row, first ten
    columns, named as listed below). For each row the value in
    ``column_translate`` is replaced by its translation and the whole row is
    written, tab-separated and without a header row, to
    ``task2_translated_<language>.tsv`` in the current working directory.

    Args:
        language: Target language code; selects the model and names the
            output file.
        column_translate: One of the column names assigned below, e.g.
            'span_text'.
    """
    column_names = ['id', 'paragraph_id', 'paragraph', 'keyword', 'country_code', 'span_start', 'span_end', 'span_text', 'category_label', 'annotators']
    task2_df = pd.read_csv("C:\\Users\\Larisa\\Downloads\\machine_translation\\Project\\data\\dontpatronizeme_categories.tsv", sep='\t', usecols=list(range(10)), names=column_names, header=None)
    # Model and tokenizer depend on the target language.
    model, tokenizer = model_tokenizer(language)
    pred_csv_path = "task2_translated_{}.tsv".format(language)

    # newline='' lets the csv writer control line endings itself; without it
    # text mode on Windows turns the writer's '\r\n' into '\r\r\n', which
    # shows up as an empty line after every row.
    with open(pred_csv_path, 'wt', encoding="utf-8-sig", newline='') as write_obj:
        csv_writer = writer(write_obj, delimiter='\t')
        # iterrows() yields each row as a Series; translate one cell, then
        # write the row's values in column order.
        for _, row in task2_df.iterrows():
            row[column_translate] = translate(model, tokenizer, row[column_translate])
            csv_writer.writerow(row)
 

In [8]:
# Translate the task 2 'span_text' column to target language 'de'; output: task2_translated_de.tsv
write_prediction_task2('de', 'span_text')

In [None]:
def write_prediction_task2_encoding(language, column_translate):
    """Translate one column of the task 2 categories file and write UTF-8 TSV.

    The categories file is read with pandas (no header row, first ten
    columns, named as listed below). For each row the value in
    ``column_translate`` is replaced by its translation and the whole row is
    written, tab-separated and without a header row, to
    ``task2_translated_<language>.tsv`` in the current working directory.
    The file is encoded as ``utf-8-sig`` (UTF-8 with a BOM).

    Args:
        language: Target language code; selects the model and names the
            output file.
        column_translate: One of the column names assigned below, e.g.
            'span_text'.
    """
    column_names = ['id', 'paragraph_id', 'paragraph', 'keyword', 'country_code', 'span_start', 'span_end', 'span_text', 'category_label', 'annotators']
    task2_df = pd.read_csv("C:\\Users\\Larisa\\Downloads\\machine_translation\\Project\\data\\dontpatronizeme_categories.tsv", sep='\t', usecols=list(range(10)), names=column_names, header=None)
    # Model and tokenizer depend on the target language.
    model, tokenizer = model_tokenizer(language)
    pred_csv_path = "task2_translated_{}.tsv".format(language)

    # newline='' lets the csv writer control line endings itself; without it
    # text mode on Windows turns the writer's '\r\n' into '\r\r\n', which
    # shows up as an empty line after every row.
    with open(pred_csv_path, 'wt', encoding="utf-8-sig", newline='') as write_obj:
        csv_writer = writer(write_obj, delimiter='\t')
        # iterrows() yields each row as a Series; translate one cell, then
        # write the row's values in column order.
        for _, row in task2_df.iterrows():
            row[column_translate] = translate(model, tokenizer, row[column_translate])
            csv_writer.writerow(row)

In [None]:
# Translate the task 2 'span_text' column to target language 'fr'; output: task2_translated_fr.tsv
write_prediction_task2_encoding('fr', 'span_text')

In [None]:
# Translate the task 2 'span_text' column to target language 'it'; output: task2_translated_it.tsv
write_prediction_task2_encoding('it', 'span_text')