# control b9

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import pandas as pd
import unicodedata
import string
import re
import random
import copy
from contra_qa.plots.functions  import simple_step_plot, plot_confusion_matrix
import  matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from nltk.translate.bleu_score import sentence_bleu


% matplotlib inline

### Preparing data

In [2]:
df2 = pd.read_csv("data/boolean9_train.csv")
df2_test = pd.read_csv("data/boolean9_test.csv")

df2["text"] = df2["sentence1"] + df2["sentence2"] 
df2_test["text"] = df2_test["sentence1"] + df2_test["sentence2"] 

all_sentences = list(df2.text.values) + list(df2_test.text.values)

df2train = df2

In [3]:
df2train.tail()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text
9995,Belinda will lend me some money or swim out to...,Belinda will not swim out toward the orange boat,Belinda will lend me some money,Belinda will swim out toward the orange boat,0,Belinda will lend me some money or swim out to...
9996,Kimberly will burn some calories or sleep the ...,Kimberly will neither burn some calories nor s...,Kimberly will burn some calories,Kimberly will sleep the night,1,Kimberly will burn some calories or sleep the ...
9997,Barbara will wear a white hat or meet her,Barbara will neither wear a white hat nor meet...,Barbara will wear a white hat,Barbara will meet her,1,Barbara will wear a white hat or meet herBarba...
9998,Kevin will hit her or meet Lucy,Kevin will neither hit her nor meet Lucy,Kevin will hit her,Kevin will meet Lucy,1,Kevin will hit her or meet LucyKevin will neit...
9999,Ricardo will buy a watch or spend all my money,Ricardo will not buy a watch,Ricardo will buy a watch,Ricardo will spend all my money,0,Ricardo will buy a watch or spend all my money...


In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


example = "ddddda'''~~çãpoeéééééÈ'''#$$##@!@!@AAS@#12323fdf"
print("Before:", example)
print()
print("After:", normalizeString(example))

Before: ddddda'''~~çãpoeéééééÈ'''#$$##@!@!@AAS@#12323fdf

After: ddddda capoeeeeeee ! ! aas fdf


In [5]:
df2train.or_A = df2train.or_A.map(normalizeString)
df2train.or_B = df2train.or_B.map(normalizeString)

In [6]:
df2train.tail()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text
9995,Belinda will lend me some money or swim out to...,Belinda will not swim out toward the orange boat,belinda will lend me some money,belinda will swim out toward the orange boat,0,Belinda will lend me some money or swim out to...
9996,Kimberly will burn some calories or sleep the ...,Kimberly will neither burn some calories nor s...,kimberly will burn some calories,kimberly will sleep the night,1,Kimberly will burn some calories or sleep the ...
9997,Barbara will wear a white hat or meet her,Barbara will neither wear a white hat nor meet...,barbara will wear a white hat,barbara will meet her,1,Barbara will wear a white hat or meet herBarba...
9998,Kevin will hit her or meet Lucy,Kevin will neither hit her nor meet Lucy,kevin will hit her,kevin will meet lucy,1,Kevin will hit her or meet LucyKevin will neit...
9999,Ricardo will buy a watch or spend all my money,Ricardo will not buy a watch,ricardo will buy a watch,ricardo will spend all my money,0,Ricardo will buy a watch or spend all my money...


In [7]:
df2train["sentence1_p"] = df2train["or_A"] + " or " + df2train["or_B"]

In [8]:
df2train.head()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text,sentence1_p
0,Patsy will cut himself or hurt me,Patsy will neither cut himself nor hurt me,patsy will cut himself,patsy will hurt me,1,Patsy will cut himself or hurt mePatsy will ne...,patsy will cut himself or patsy will hurt me
1,Opal will pay me or give Lynn a lemon car,Opal will not pay me,opal will pay me,opal will give lynn a lemon car,0,Opal will pay me or give Lynn a lemon carOpal ...,opal will pay me or opal will give lynn a lemo...
2,Mark will fall to the floor or lend me Matthew...,Mark will not fall to the floor,mark will fall to the floor,mark will lend me matthew s icy bicycle,0,Mark will fall to the floor or lend me Matthew...,mark will fall to the floor or mark will lend ...
3,Andy will lay his hands on him or drink a bott...,Andy will neither come at night nor drink a bo...,andy will lay his hands on him,andy will drink a bottle of water,0,Andy will lay his hands on him or drink a bott...,andy will lay his hands on him or andy will dr...
4,Cecilia will fight the law or fall off,Cecilia will neither fight the law nor fall off,cecilia will fight the law,cecilia will fall off,1,Cecilia will fight the law or fall offCecilia ...,cecilia will fight the law or cecilia will fal...


In [9]:
df2train_plus = df2train[["sentence1_p", "sentence2", "label"]]

df2train_plus.sentence2 = df2train_plus.sentence2.map(normalizeString)
df2train_plus.rename(columns={"sentence1_p": "sentence1"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [10]:
df2train_plus.to_csv("data/boolean9_control_train.csv", index=False)

## generating new data for test

In [11]:
df2_test.or_A = df2_test.or_A.map(normalizeString)
df2_test.or_B = df2_test.or_B.map(normalizeString)

In [12]:
df2_test["sentence1_p"] = df2_test["or_A"] + " or " + df2_test["or_B"]

In [13]:
df2_test.head()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text,sentence1_p
0,Kristin will break the record or get the money,Kristin will not break the record,kristin will break the record,kristin will get the money,0,Kristin will break the record or get the money...,kristin will break the record or kristin will ...
1,William will understand what Oscar comment abo...,William will not understand what Oscar comment...,william will understand what oscar comment abo...,william will lay his hands on him,0,William will understand what Oscar comment abo...,william will understand what oscar comment abo...
2,Cory will beat Steve out of 20 dollars or brea...,Cory will not beat Steve out of 20 dollars and...,cory will beat steve out of dollars,cory will break the record,1,Cory will beat Steve out of 20 dollars or brea...,cory will beat steve out of dollars or cory wi...
3,Barry will lay his hands on him or teach math,Barry will neither choose this soccer team nor...,barry will lay his hands on him,barry will teach math,0,Barry will lay his hands on him or teach mathB...,barry will lay his hands on him or barry will ...
4,Marcia will put the phone down or read the news,Marcia will not read the news,marcia will put the phone down,marcia will read the news,0,Marcia will put the phone down or read the new...,marcia will put the phone down or marcia will ...


In [14]:
df2_test_plus = df2_test[["sentence1_p", "sentence2", "label"]]

df2_test_plus.sentence2 = df2_test_plus.sentence2.map(normalizeString)
df2_test_plus.rename(columns={"sentence1_p": "sentence1"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [15]:
df2_test_plus.head()

Unnamed: 0,sentence1,sentence2,label
0,kristin will break the record or kristin will ...,kristin will not break the record,0
1,william will understand what oscar comment abo...,william will not understand what oscar comment...,0
2,cory will beat steve out of dollars or cory wi...,cory will not beat steve out of dollars and co...,1
3,barry will lay his hands on him or barry will ...,barry will neither choose this soccer team nor...,0
4,marcia will put the phone down or marcia will ...,marcia will not read the news,0


In [16]:
df2_test_plus.to_csv("data/boolean9_control_test.csv", index=False)