# Clone full repo to copy additional Python files if running on Colab

This cell downloads all the additional Python files that are used in the notebook.

In [4]:
# Only run if in colab
!RunningInCOLAB = 'google.colab' in str(get_ipython()) if hasattr(__builtins__,'__IPYTHON__') else False
# Continue only if running on Google Colab
![[ ! RunningInCOLAB ]] && exit

# clone repo and move to current working dir
!git clone https://github.com/evaluating-effectiveness-cloud-nlp/replication_package.git repo
!rsync -av repo/ .
!rm -rf repo

# Installing dependencies with pip

Install dependencies using the `requirements.txt` file downloaded by the previous cell.

In [2]:
# Install the dependencies listed in `requirements.txt` (looked up in the
# current working directory). `%pip` is used rather than `!pip` so the
# packages are installed into the environment of the running kernel.
%pip install -r requirements.txt

# Providers Credentials

Insert your Azure (Microsoft), AWS (Amazon) and Google credentials here. For Google, you need to obtain the `google-credentials.json` file containing all credentials and upload it when the cell executes.
This cell will create a `credentials.py` file in the same directory as the *notebook*.

> Note: If you don't have credentials and just want to test the experiment, you can run it using mock providers with random outputs (see section "Importing and Mocking MLaaS providers").

In [3]:
import os
from google.colab import files

# The values below are placeholders for the Colab form fields; replace them
# through the form and never commit real keys together with the notebook.

# @markdown Microsoft
azure_key_1 = '6565' # @param {type:"string"}
azure_key_2 = '656565' # @param {type:"string"}
azure_location = '6565656' # @param {type:"string"}
azure_endpoint = '6565656' # @param {type:"string"}

# @markdown Amazon
aws_access_key_id='6565656' # @param {type:"string"}
aws_secret_access_key='46556' # @param {type:"string"}


# @markdown Google
# @markdown >*You will need to upload `google-credentials.json` file on runtime as described in: https://developers.google.com/workspace/guides/create-credentials?hl=pt-br#create_credentials_for_a_service_account*
print('Please upload "google-credentials.json" file')
google_credentials_file = files.upload()

# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not google_credentials_file:
    raise ValueError('No file was uploaded; "google-credentials.json" is required.')

# The upload result is keyed by file name; the first uploaded file is used.
google_credentials_name = next(iter(google_credentials_file))
google_credentials_path = f"./{google_credentials_name}"

## write credentials to a credentials.py file
# `!r` emits every value as a valid Python string literal, so a quote or a
# backslash inside a key cannot corrupt the generated module.
file_content = f"""
import os

# Microsoft
azure_key_1 = {azure_key_1!r}
azure_key_2 = {azure_key_2!r}
azure_location = {azure_location!r}
azure_endpoint = {azure_endpoint!r}

# Amazon
aws_access_key_id = {aws_access_key_id!r}
aws_secret_access_key = {aws_secret_access_key!r}

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = {google_credentials_path!r}
"""

file_name = "credentials.py"

# Write the content to the file
with open(file_name, "w", encoding="utf-8") as f:
    f.write(file_content)


## Download the pre-trained ``glove.twitter`` word embedding model

This *word embedding* model is only used by the `WordEmbeddings` noise.
If you don't want to use it, just remove it from the noise list and don't run this cell.
> Note: The file has been placed in a personal repository just for ease of download; the original model is available as a *.zip file at: https://github.com/stanfordnlp/GloVe

In [4]:
!python -m pip install ipywidgets
import urllib.request
from os.path import exists
import ipywidgets as widgets
from IPython.display import display
import os

progress = None
def show_progress(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve`` that drives a progress bar.

    The progress widget is created lazily on the first call and stored in the
    module-level ``progress`` variable so that later calls update the same bar.

    Args:
        block_num: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size of the download in bytes; not positive when
            the size is unknown.
    """
    global progress
    size_known = total_size > 0

    # The last block is usually only partially filled, so the raw product can
    # exceed the real file size; clamp it when the size is known.
    downloaded = block_num * block_size
    if size_known:
        downloaded = min(downloaded, total_size)

    if progress is None:
        progress = widgets.FloatProgress(
            value=0,
            min=0,
            # A non-positive total would give the bar a max below its min.
            max=total_size if size_known else 1,
            step=0.1,
            description='Downloading',
            bar_style='info',
            orientation='horizontal'
        )
        display(progress)

    print(downloaded, "/", total_size,"\r", end="")

    # Without a known total there is no meaningful fraction to show.
    if size_known:
        progress.value = downloaded

model_path = "models/glove.twitter.27B.100d.txt"
word_embedding_url = "https://huggingface.co/anonymoususer/fault_injection_mlaas/resolve/main/glove.twitter.27B.100d.txt"

file_exists = exists(model_path)

if file_exists :
    print("file ", model_path, " already exists.")
else:
    model_dir = os.path.dirname(model_path)
    os.makedirs(model_dir, exist_ok=True)
    # Download to a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated file at `model_path` that
    # the `exists` check above would accept on the next run.
    partial_path = model_path + ".part"
    urllib.request.urlretrieve(word_embedding_url, partial_path, show_progress)
    os.replace(partial_path, model_path)
    print("File downloaded!")

Defaulting to user installation because normal site-packages is not writeable
file  models/glove.twitter.27B.100d.txt  already exists.


## Importing and Mocking MLaaS providers

This cell imports the module containing all provider implementations.

In [3]:
from mlaas_providers import providers as ml_providers

Run the cell below in addition to the previous one if you want to use simulated providers instead of real providers.
> Note: This will return random values for sentiment analysis.

In [7]:
# Replace each provider attribute on the module with the mock built from it
# by `return_mock_of`, in the order amazon, google, microsoft.
for _provider_name in ("amazon", "google", "microsoft"):
    _real_provider = getattr(ml_providers, _provider_name)
    setattr(ml_providers, _provider_name, ml_providers.return_mock_of(_real_provider))
# Do not leave the loop helpers behind in the notebook namespace.
del _provider_name, _real_provider

# `RQ1`: How effective are the Cloud NLP services when subjected to noise?

## Importing additional Python modules

In [4]:
from datetime import datetime
from typing import List
from mlaas_providers.providers import read_dataset
from noise_insertion.utils import save_data_to_file
from data_sampling.data_sampling import DataSampling
from noise_insertion.percent_insertion import noises
from noise_insertion import noise_insertion
from visualization import visualization
from progress import progress_manager
from metrics import metrics
import ipywidgets as widgets
# Shared sampler instance; the RQ1 `run_evaluation` calls
# `data_sampling.get_dataset_sample` on it to draw the dataset sample.
data_sampling = DataSampling()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

Choose the sample size, types of noise to be used and noise levels.

In [4]:
# Sample size handed to `data_sampling.get_dataset_sample` for a new run.
sample_size = 99

# Noise algorithms to evaluate; remove an entry to skip that noise.
noise_list = [
    noises.Keyboard, noises.OCR, noises.RandomCharReplace,
    noises.CharSwap, noises.WordSwap, noises.WordSplit,
    noises.Antonym, noises.Synonym, noises.Spelling,
    noises.TfIdfWord, noises.WordEmbeddings, noises.ContextualWordEmbs,
]

# Noise levels 0.1, 0.2, ..., 0.9.
noise_level = [tenth / 10 for tenth in range(1, 10)]

## Running the experiment

**TODO: ADD A DESCRIPTION OF WHAT IS DONE IN THIS RQ1, PERHAPS CITING THE PIPELINE AND SECTION**

The RQ1 experiment raw results are stored in a new directory inside outputs/experiment1 folder.

If for any reason an error occurs during execution, you can continue where you left off by entering the name of the directory created during execution below. Ex.: `size99_07-12-2022 09_34_29`.

To continue from previously ongoing progress, insert the name of an /outputs/experiment1 folder.

In [6]:
# @markdown ### Type the name of a /outputs/experiment1 folder to continue from:
# Leave empty to start a new run; a non-empty value is appended to
# './outputs/experiment1/' by `run_evaluation` to locate the run to resume.
continue_from = "" # @param {type:"string"}

def get_main_path(size):
    """Return the output directory path for a new RQ1 run.

    The path has the form ``./outputs/experiment1/size<size>_<timestamp>``,
    where the timestamp is the current local time formatted as
    ``MM-DD-YYYY HH_MM_SS``. Nothing is created on disk here.
    """
    stamp = datetime.now().strftime("%m-%d-%Y %H_%M_%S")
    return f'./outputs/experiment1/size{size!s}_{stamp}'

def run_evaluation(sample_size: int,
                  noise_levels: List[float] =[0.1, 0.15, 0.2, 0.25, 0.3],
                  noise_algorithms=[noises.no_noise, noises.RandomCharReplace, noises.Keyboard, noises.OCR],
                  mlaas_providers=[ml_providers.google],
                  continue_from=None):
    """Run the RQ1 pipeline: get data, add noise, query providers, score and plot.

    NOTE: the Experiment 2 section of this notebook defines another function
    with the same name; once that cell has run, it replaces this one.

    Args:
        sample_size: Sample size passed to ``data_sampling.get_dataset_sample``
            for a new run; ignored when ``continue_from`` is given.
        noise_levels: Noise levels registered with the progress manager and
            used (prefixed with 0.0) as the noise axis of the plots.
        noise_algorithms: Noise algorithms registered with the progress manager.
        mlaas_providers: Providers registered with the progress manager.
        continue_from: Name of an existing folder under
            ``./outputs/experiment1/`` to resume from; falsy starts a new run.

    The list defaults are never mutated in this function; they are only
    passed on to ``progress_manager.init_progress``.
    """
    if continue_from:
        main_path = './outputs/experiment1/'+continue_from
        # The return value is not kept: the progress used further down is the
        # one returned by the prediction step.
        progress_manager.load_progress(main_path)
        x_dataset = read_dataset(main_path + '/data' + "/dataset.xlsx")
        y_labels = read_dataset(main_path + '/data' + "/labels.xlsx")
    else:
        x_dataset, y_labels = data_sampling.get_dataset_sample('./Tweets_dataset.csv', sample_size)
        main_path = get_main_path(len(x_dataset))
        save_data_to_file(x_dataset, main_path + '/data', "dataset")
        save_data_to_file(y_labels, main_path + '/data', "labels")

        progress_manager.init_progress(main_path, noise_algorithms, noise_levels, mlaas_providers)
    print("Results will be stored at: ", main_path)
    print('Generating noise...')
    noise_insertion.generate_noised_data(x_dataset, main_path)

    print('Getting predictions from providers...')
    progress = ml_providers.get_prediction_results(main_path)

    print('Calculating metrics...')
    metrics_results = metrics.metrics(progress, y_labels, main_path)

    # Named differently from the notebook-level `noise_list` (the algorithm
    # list) so that it is not shadowed here.
    plotted_noise_levels = [0.0, *noise_levels]

    visualization.plot_results(metrics_results, main_path + '/results', plotted_noise_levels)

    print("Results were saved to:", main_path)

# Run RQ1 with the parameters chosen above against all three providers.
# Requires `ml_providers` from the "Importing and Mocking MLaaS providers"
# section; an empty `continue_from` starts a new run.
run_evaluation(
    sample_size,
    noise_levels=noise_level,
    noise_algorithms=noise_list,
    mlaas_providers=[ml_providers.amazon, ml_providers.microsoft, ml_providers.google],
    continue_from=continue_from
)

Results will be stored at:  ./outputs/experiment1/size99_07-12-2022 09_34_29
Generating noise...
- Keyboard
-- 
- OCR
-- 
- RandomCharReplace
-- 
- CharSwap
-- 
- WordSwap
-- 
- WordSplit
-- 
- Antonym
-- 
- Synonym
-- 
- Spelling
-- 
- TfIdfWord
-- 
- WordEmbeddings
-- 
- ContextualWordEmbs
-- 
Getting predictions from providers...
- google
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- microsoft
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- amazon
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbedd

  df.loc[:, noise_column_name] = pd.to_numeric(df[noise_column_name])


Results were saved to: ./outputs/experiment1/size99_07-12-2022 09_34_29


# Experiment 2

## Importing additional Python modules

In [1]:
from pathlib import Path
from typing import TypedDict, List
from datetime import datetime
from data_sampling.data_sampling import DataSampling
from progress import progress_manager
from noise_insertion.percent_insertion import noises
from noise_insertion import noise_insertion
from mlaas_providers.providers import read_dataset
from metrics import metrics
from visualization import visualization

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rocha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

In [2]:
# NOTE: `sample_size` and `noise_level` reuse the names from the RQ1
# parameters cell and overwrite those values.
sample_size = 100

# Noise levels 0.1, 0.2, ..., 0.9.
noise_level = [tenth / 10 for tenth in range(1, 10)]

# One entry per bucket; both bounds are passed to `get_by_word_count`.
# Presumably these are word-count limits per tweet - TODO confirm.
word_counts = [
    {"min_width": width, "max_width": width}
    for width in (12, 19, 23)
]

# Noise algorithms to evaluate; remove an entry to skip that noise.
noise_algo = [
    noises.Keyboard, noises.OCR, noises.RandomCharReplace,
    noises.CharSwap, noises.WordSwap, noises.WordSplit,
    noises.Antonym, noises.Synonym, noises.Spelling,
    noises.TfIdfWord, noises.WordEmbeddings, noises.ContextualWordEmbs,
]

## Running

To continue from previously ongoing progress, insert the name of an /outputs/experiment2 folder.

In [3]:
import ipywidgets as widgets

# Free-text input for the name of an existing run folder; the next cell reads
# `continue_widget.value`. Leave it empty to start a new run.
continue_widget = widgets.Textarea(
    value='',
    placeholder='Type the name of a /outputs/experiment2 folder to continue from',
    description='Continue from',
    disabled=False
)
# Bare last expression so that the notebook renders the widget.
continue_widget

Textarea(value='', description='Continue from', placeholder='Type the name of a /outputs/experiment2 folder to…

In [13]:
# A Textarea accepts line breaks; strip surrounding whitespace so that a stray
# newline or space does not end up inside the output path built from this value.
continue_from = continue_widget.value.strip()
print(continue_from)
class Size(TypedDict, total=True):
    """Shape of one entry of the ``sizes`` list given to ``prepare_execution``."""

    # Both keys are read by prepare_execution and forwarded to
    # DataSampling.get_by_word_count.
    min_width: int
    max_width: int
    
def create_main_path(timestamp, size):
    """Create and return the main output directory of an Experiment 2 run.

    The directory is ``./outputs/experiment2/size<size>_<timestamp>``; missing
    parents are created and an already existing directory is accepted.
    """
    run_dir = './outputs/experiment2/size{}_{}'.format(str(size), timestamp)
    Path(run_dir).mkdir(parents=True, exist_ok=True)
    return run_dir

def create_sub_path(main_path: str, min_width: int, max_width: int):
    """Create ``<main_path>/[<min_width>-<max_width>]`` with a ``data`` subfolder.

    Existing directories are accepted. Returns the path of the bucket
    directory (not of its ``data`` subfolder) as a string.
    """
    bucket_dir = f'{main_path}/[{min_width!s}-{max_width!s}]'
    for directory in (bucket_dir, bucket_dir + '/data'):
        Path(directory).mkdir(parents=True, exist_ok=True)
    return bucket_dir

def prepare_execution(
    continue_from: str,
    timestamp: str,
    sample_size: int,
    sizes: List[Size],
    noise_algorithms,
    noise_levels,
    mlaas_providers
):
    """Create the folder layout and input data for every word-count bucket.

    Args:
        continue_from: Name of an existing folder under
            ``./outputs/experiment2/`` to resume from; empty (or None) starts
            a new run folder.
        timestamp: Timestamp used in the name of a new run folder.
        sample_size: Sample size passed to ``get_by_word_count`` and used in
            the name of a new run folder.
        sizes: One ``{"min_width", "max_width"}`` entry per bucket.
        noise_algorithms, noise_levels, mlaas_providers: Forwarded unchanged
            to ``progress_manager.init_progress`` for every bucket.

    Returns:
        The list of bucket directories, in the order of ``sizes``.
    """
    sampler = DataSampling()
    if continue_from:
        main_path = './outputs/experiment2/' + continue_from
        print("continue_from:", main_path)
    else:
        main_path = create_main_path(timestamp, sample_size)

    sub_path_list = []
    for size in sizes:
        min_width = size['min_width']
        max_width = size['max_width']
        sub_path = create_sub_path(main_path, min_width, max_width)

        data, labels = sampler.get_by_word_count('Tweets_dataset.csv',
                                                 sample_size,
                                                 min_width,
                                                 max_width)

        # Existing files are kept so that a resumed run reuses its inputs.
        # `sheet_name` is passed by keyword: passing it positionally is
        # deprecated in recent pandas releases.
        for frame, file_stem in ((data, 'dataset'), (labels, 'labels')):
            target = f'{sub_path}/data/{file_stem}.xlsx'
            if not Path(target).is_file():
                frame.to_excel(target, sheet_name='data', index=False)

        sub_path_list.append(sub_path)
        # NOTE(review): this runs for resumed runs too; confirm that
        # init_progress does not reset progress that was already recorded.
        progress_manager.init_progress(sub_path, noise_algorithms, noise_levels, mlaas_providers)
    return sub_path_list

def run_evaluation(noise_levels_units: List[float],
                   continue_from: str,    
):
    """Run noise generation, provider predictions, metrics and plots for one bucket.

    NOTE: this definition shares its name with the RQ1 ``run_evaluation``
    defined earlier in the notebook and replaces it once this cell has run.
    It also relies on ``ml_providers`` from the "Importing and Mocking MLaaS
    providers" section.

    Args:
        noise_levels_units: Noise levels; prefixed with 0, they form the noise
            axis handed to the plots.
        continue_from: Bucket directory (as returned by ``prepare_execution``)
            holding ``data/dataset.xlsx``, ``data/labels.xlsx`` and the
            progress state.
    """
    main_path = continue_from
    # The return value is not kept: the progress used further down is the one
    # returned by the prediction step.
    progress_manager.load_progress(main_path)

    x_dataset = read_dataset(main_path + '/data/dataset.xlsx')
    y_labels = read_dataset(main_path + '/data/labels.xlsx')

    print('Generating noise...')
    noise_insertion.generate_noised_data(x_dataset, main_path, noise_package=noises)

    print('Getting predictions from providers...')
    progress = ml_providers.get_prediction_results(main_path)

    print('Calculating metrics...')
    metrics_results = metrics.metrics(progress, y_labels, main_path)

    plotted_noise_levels = [0, *noise_levels_units]

    visualization.plot_results(metrics_results, main_path + '/results', plotted_noise_levels, percent_noise=True)

    print(main_path)

# One timestamp for the whole run; prepare_execution only uses it when a new
# main folder is created (i.e. when `continue_from` is empty).
timestamp = datetime.now().strftime("%m-%d-%Y %H_%M_%S")

# Requires `ml_providers` from the "Importing and Mocking MLaaS providers"
# section. Returns one directory per entry of `word_counts`.
path_list = prepare_execution(continue_from,
                          timestamp, 
                          sample_size,
                          word_counts,
                          noise_algo,
                          noise_level,
                          [ml_providers.google, ml_providers.amazon, ml_providers.microsoft])
# Evaluate every word-count bucket in turn.
for path in path_list:
    run_evaluation(noise_level, 
                   continue_from=path)
print(path_list)

size100_12-21-2022 20_53_00
continue_from: ./outputs/experiment2/size100_12-21-2022 20_53_00
Generating noise...
- Keyboard
-- 
- OCR
-- 
- RandomCharReplace
-- 
- CharSwap
-- 
- WordSwap
-- 
- WordSplit
-- 
- Antonym
-- 
- Synonym
-- 
- Spelling
-- 
- TfIdfWord
-- 
- WordEmbeddings
-- 
- ContextualWordEmbs
-- 
Getting predictions from providers...
- google
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- amazon
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- microsoft
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--

  df.loc[:, noise_column_name] = pd.to_numeric(df[noise_column_name])


./outputs/experiment2/size100_12-21-2022 20_53_00/[12-12]
Generating noise...
- Keyboard
-- 
- OCR
-- 
- RandomCharReplace
-- 
- CharSwap
-- 
- WordSwap
-- 
- WordSplit
-- 
- Antonym
-- 
- Synonym
-- 
- Spelling
-- 
- TfIdfWord
-- 
- WordEmbeddings
-- 
- ContextualWordEmbs
-- 
Getting predictions from providers...
- google
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- amazon
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- microsoft
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- Contex

  df.loc[:, noise_column_name] = pd.to_numeric(df[noise_column_name])


./outputs/experiment2/size100_12-21-2022 20_53_00/[19-19]
Generating noise...
- Keyboard
-- 
- OCR
-- 
- RandomCharReplace
-- 
- CharSwap
-- 
- WordSwap
-- 
- WordSplit
-- 
- Antonym
-- 
- Synonym
-- 
- Spelling
-- 
- TfIdfWord
-- 
- WordEmbeddings
-- 
- ContextualWordEmbs
-- 
Getting predictions from providers...
- google
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- amazon
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 
-- WordEmbeddings
--- 
-- ContextualWordEmbs
--- 
- microsoft
-- Keyboard
--- 
-- OCR
--- 
-- RandomCharReplace
--- 
-- CharSwap
--- 
-- WordSwap
--- 
-- WordSplit
--- 
-- Antonym
--- 
-- Synonym
--- 
-- Spelling
--- 
-- TfIdfWord
--- 0.9 , 
-- WordEmbeddings
--- 0.1 

  df.loc[:, noise_column_name] = pd.to_numeric(df[noise_column_name])


./outputs/experiment2/size100_12-21-2022 20_53_00/[23-23]
['./outputs/experiment2/size100_12-21-2022 20_53_00/[12-12]', './outputs/experiment2/size100_12-21-2022 20_53_00/[19-19]', './outputs/experiment2/size100_12-21-2022 20_53_00/[23-23]']
