# control: b5 dataset

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import pandas as pd
import unicodedata
import string
import re
import random
import copy
from contra_qa.plots.functions  import simple_step_plot, plot_confusion_matrix
import  matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from nltk.translate.bleu_score import sentence_bleu


% matplotlib inline

### Preparing data

In [2]:
df2 = pd.read_csv("data/boolean5_train.csv")
df2_test = pd.read_csv("data/boolean5_test.csv")

df2["text"] = df2["sentence1"] + df2["sentence2"] 
df2_test["text"] = df2_test["sentence1"] + df2_test["sentence2"] 

all_sentences = list(df2.text.values) + list(df2_test.text.values)

df2train = df2

In [3]:
df2train.tail()

Unnamed: 0,sentence1,sentence2,and_A,and_B,label,text
9995,Kristin created a victorious and ashy work of art,Kristin didn't create a victorious work of art,Kristin created a victorious work of art,Kristin created an ashy work of art,1,Kristin created a victorious and ashy work of ...
9996,Doris created an ashy and easy work of art,Wendy didn't create an easy work of art,Doris created an ashy work of art,Doris created an easy work of art,0,Doris created an ashy and easy work of art Wen...
9997,Dawn created a shapely and plain work of art,Wilma didn't create a plain work of art,Dawn created a shapely work of art,Dawn created a plain work of art,0,Dawn created a shapely and plain work of art W...
9998,Gertrude created an ugly and eager work of art,Gertrude didn't create an ugly work of art,Gertrude created an ugly work of art,Gertrude created an eager work of art,1,Gertrude created an ugly and eager work of art...
9999,Elizabeth created a gifted and worried work of...,Elizabeth didn't create a gifted work of art,Elizabeth created a gifted work of art,Elizabeth created a worried work of art,1,Elizabeth created a gifted and worried work of...


In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


example = "ddddda'''~~çãpoeéééééÈ'''#$$##@!@!@AAS@#12323fdf"
print("Before:", example)
print()
print("After:", normalizeString(example))

Before: ddddda'''~~çãpoeéééééÈ'''#$$##@!@!@AAS@#12323fdf

After: ddddda capoeeeeeee ! ! aas fdf


In [5]:
df2train.and_A = df2train.and_A.map(normalizeString)
df2train.and_B = df2train.and_B.map(normalizeString)

In [6]:
df2train.tail()

Unnamed: 0,sentence1,sentence2,and_A,and_B,label,text
9995,Kristin created a victorious and ashy work of art,Kristin didn't create a victorious work of art,kristin created a victorious work of art,kristin created an ashy work of art,1,Kristin created a victorious and ashy work of ...
9996,Doris created an ashy and easy work of art,Wendy didn't create an easy work of art,doris created an ashy work of art,doris created an easy work of art,0,Doris created an ashy and easy work of art Wen...
9997,Dawn created a shapely and plain work of art,Wilma didn't create a plain work of art,dawn created a shapely work of art,dawn created a plain work of art,0,Dawn created a shapely and plain work of art W...
9998,Gertrude created an ugly and eager work of art,Gertrude didn't create an ugly work of art,gertrude created an ugly work of art,gertrude created an eager work of art,1,Gertrude created an ugly and eager work of art...
9999,Elizabeth created a gifted and worried work of...,Elizabeth didn't create a gifted work of art,elizabeth created a gifted work of art,elizabeth created a worried work of art,1,Elizabeth created a gifted and worried work of...


In [7]:
df2train["sentence1_p"] = df2train["and_A"] + " and " + df2train["and_B"]

In [8]:
df2train.head()

Unnamed: 0,sentence1,sentence2,and_A,and_B,label,text,sentence1_p
0,Jeffery created a silly and vast work of art,Jeffery didn't create a silly work of art,jeffery created a silly work of art,jeffery created a vast work of art,1,Jeffery created a silly and vast work of art J...,jeffery created a silly work of art and jeffer...
1,Hilda created a zealous and better work of art,Hilda didn't create a zealous work of art,hilda created a zealous work of art,hilda created a better work of art,1,Hilda created a zealous and better work of art...,hilda created a zealous work of art and hilda ...
2,Cheryl created an ugly and obedient work of art,Cheryl didn't create an ugly work of art,cheryl created an ugly work of art,cheryl created an obedient work of art,1,Cheryl created an ugly and obedient work of ar...,cheryl created an ugly work of art and cheryl ...
3,Hector created a plump and bald work of art,Hector didn't create a plump work of art,hector created a plump work of art,hector created a bald work of art,1,Hector created a plump and bald work of art He...,hector created a plump work of art and hector ...
4,Madeline created a lemon and important work of...,Madeline didn't create a careful work of art,madeline created a lemon work of art,madeline created an important work of art,0,Madeline created a lemon and important work of...,madeline created a lemon work of art and madel...


In [9]:
df2train_plus = df2train[["sentence1_p", "sentence2", "label"]]

df2train_plus.sentence2 = df2train_plus.sentence2.map(normalizeString)
df2train_plus.rename(columns={"sentence1_p": "sentence1"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [10]:
df2train_plus.head()

Unnamed: 0,sentence1,sentence2,label
0,jeffery created a silly work of art and jeffer...,jeffery didn t create a silly work of art,1
1,hilda created a zealous work of art and hilda ...,hilda didn t create a zealous work of art,1
2,cheryl created an ugly work of art and cheryl ...,cheryl didn t create an ugly work of art,1
3,hector created a plump work of art and hector ...,hector didn t create a plump work of art,1
4,madeline created a lemon work of art and madel...,madeline didn t create a careful work of art,0


In [11]:
df2train_plus.to_csv("data/boolean5_control_train.csv", index=False)

## generating new data for test

In [12]:
df2_test.and_A = df2_test.and_A.map(normalizeString)
df2_test.and_B = df2_test.and_B.map(normalizeString)

In [13]:
df2_test["sentence1_p"] = df2_test["and_A"] + " and " + df2_test["and_B"]

In [14]:
df2_test.head()

Unnamed: 0,sentence1,sentence2,and_A,and_B,label,text,sentence1_p
0,Jared created an ashy and witty work of art,Alexander didn't create an ashy work of art,jared created an ashy work of art,jared created a witty work of art,0,Jared created an ashy and witty work of art Al...,jared created an ashy work of art and jared cr...
1,Patricia created a calm and icy work of art,Patricia didn't create a calm work of art,patricia created a calm work of art,patricia created an icy work of art,1,Patricia created a calm and icy work of art Pa...,patricia created a calm work of art and patric...
2,Elsie created a beautiful and gorgeous work of...,Juana didn't create a beautiful work of art,elsie created a beautiful work of art,elsie created a gorgeous work of art,0,Elsie created a beautiful and gorgeous work of...,elsie created a beautiful work of art and elsi...
3,Herbert created a mysterious and worried work ...,Herbert didn't create a mysterious work of art,herbert created a mysterious work of art,herbert created a worried work of art,1,Herbert created a mysterious and worried work ...,herbert created a mysterious work of art and h...
4,Janice created an easy and agreeable work of art,Janice didn't create an easy work of art,janice created an easy work of art,janice created an agreeable work of art,1,Janice created an easy and agreeable work of a...,janice created an easy work of art and janice ...


In [15]:
df2_test_plus = df2_test[["sentence1_p", "sentence2", "label"]]

df2_test_plus.sentence2 = df2_test_plus.sentence2.map(normalizeString)
df2_test_plus.rename(columns={"sentence1_p": "sentence1"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [16]:
df2_test_plus.head()

Unnamed: 0,sentence1,sentence2,label
0,jared created an ashy work of art and jared cr...,alexander didn t create an ashy work of art,0
1,patricia created a calm work of art and patric...,patricia didn t create a calm work of art,1
2,elsie created a beautiful work of art and elsi...,juana didn t create a beautiful work of art,0
3,herbert created a mysterious work of art and h...,herbert didn t create a mysterious work of art,1
4,janice created an easy work of art and janice ...,janice didn t create an easy work of art,1


In [17]:
df2_test_plus.to_csv("data/boolean5_control_test.csv", index=False)