# control b8

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import pandas as pd
import unicodedata
import string
import re
import random
import copy
from contra_qa.plots.functions  import simple_step_plot, plot_confusion_matrix
import  matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from nltk.translate.bleu_score import sentence_bleu


% matplotlib inline

### Preparing data

In [3]:
df2 = pd.read_csv("data/boolean8_train.csv")
df2_test = pd.read_csv("data/boolean8_test.csv")

df2["text"] = df2["sentence1"] + df2["sentence2"] 
df2_test["text"] = df2_test["sentence1"] + df2_test["sentence2"] 

all_sentences = list(df2.text.values) + list(df2_test.text.values)

df2train = df2

In [4]:
df2train.tail()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text
9995,Nigeria or Brazil are in the geopolitical posi...,Nigeria isn't in the geopolitical position of ...,Nigeria is in the geopolitical position of Russia,Brazil is in the geopolitical position of Russia,1,Nigeria or Brazil are in the geopolitical posi...
9996,Nigeria or Mexico got to the quarter finals la...,neither Nigeria nor Morocco got to the quarte...,Nigeria got to the quarter finals last year,Mexico got to the quarter finals last year,0,Nigeria or Mexico got to the quarter finals la...
9997,Mexico or England got to the quarter finals la...,neither Switzerland nor Mexico got to the qu...,Mexico got to the quarter finals last year,England got to the quarter finals last year,0,Mexico or England got to the quarter finals la...
9998,Denmark or Costa Rica are in the geopolitical ...,Denmark isn't in the geopolitical position of ...,Denmark is in the geopolitical position of Uru...,Costa Rica is in the geopolitical position of ...,1,Denmark or Costa Rica are in the geopolitical ...
9999,Gilbert has visited Dej or Arad,Gilbert visited neither Reghin nor Arad,Gilbert has visited Dej,Gilbert has visited Arad,0,Gilbert has visited Dej or AradGilbert visited...


In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


example = "ddddda'''~~çãpoeéééééÈ'''#$$##@!@!@AAS@#12323fdf"
print("Before:", example)
print()
print("After:", normalizeString(example))

Before: ddddda'''~~çãpoeéééééÈ'''#$$##@!@!@AAS@#12323fdf

After: ddddda capoeeeeeee ! ! aas fdf


In [6]:
df2train.or_A = df2train.or_A.map(normalizeString)
df2train.or_B = df2train.or_B.map(normalizeString)

In [7]:
df2train.tail()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text
9995,Nigeria or Brazil are in the geopolitical posi...,Nigeria isn't in the geopolitical position of ...,nigeria is in the geopolitical position of russia,brazil is in the geopolitical position of russia,1,Nigeria or Brazil are in the geopolitical posi...
9996,Nigeria or Mexico got to the quarter finals la...,neither Nigeria nor Morocco got to the quarte...,nigeria got to the quarter finals last year,mexico got to the quarter finals last year,0,Nigeria or Mexico got to the quarter finals la...
9997,Mexico or England got to the quarter finals la...,neither Switzerland nor Mexico got to the qu...,mexico got to the quarter finals last year,england got to the quarter finals last year,0,Mexico or England got to the quarter finals la...
9998,Denmark or Costa Rica are in the geopolitical ...,Denmark isn't in the geopolitical position of ...,denmark is in the geopolitical position of uru...,costa rica is in the geopolitical position of ...,1,Denmark or Costa Rica are in the geopolitical ...
9999,Gilbert has visited Dej or Arad,Gilbert visited neither Reghin nor Arad,gilbert has visited dej,gilbert has visited arad,0,Gilbert has visited Dej or AradGilbert visited...


In [9]:
df2train["sentence1_p"] = df2train["or_A"] + " or " + df2train["or_B"]

In [10]:
df2train.head()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text,sentence1_p
0,England or Japan got to the quarter finals las...,neither Peru nor England got to the quarter f...,england got to the quarter finals last year,japan got to the quarter finals last year,0,England or Japan got to the quarter finals las...,england got to the quarter finals last year or...
1,Glenda has visited Craiova or Dej,Glenda didn't visit Craiova and Glenda didn't ...,glenda has visited craiova,glenda has visited dej,1,Glenda has visited Craiova or DejGlenda didn't...,glenda has visited craiova or glenda has visit...
2,Poland or Russia won the last world cup,neither Poland nor Russia won the last world cup,poland won the last world cup,russia won the last world cup,1,Poland or Russia won the last world cup neithe...,poland won the last world cup or russia won th...
3,Kimberly traveled to Tulcea or Blaj,Kimberly didn't travel to Tulcea and Kimberly ...,kimberly has traveled to tulcea,kimberly has traveled to blaj,1,Kimberly traveled to Tulcea or BlajKimberly di...,kimberly has traveled to tulcea or kimberly ha...
4,Tunisia or Russia won the last world cup,Tunisia didn't win the last world cup and Russ...,tunisia won the last world cup,russia won the last world cup,1,Tunisia or Russia won the last world cupTunisi...,tunisia won the last world cup or russia won t...


In [11]:
df2train_plus = df2train[["sentence1_p", "sentence2", "label"]]

df2train_plus.sentence2 = df2train_plus.sentence2.map(normalizeString)
df2train_plus.rename(columns={"sentence1_p": "sentence1"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [12]:
df2train_plus.to_csv("data/boolean8_control_train.csv", index=False)

## generating new data for test

In [13]:
df2_test.or_A = df2_test.or_A.map(normalizeString)
df2_test.or_B = df2_test.or_B.map(normalizeString)

In [14]:
df2_test["sentence1_p"] = df2_test["or_A"] + " or " + df2_test["or_B"]

In [15]:
df2_test.head()

Unnamed: 0,sentence1,sentence2,or_A,or_B,label,text,sentence1_p
0,Jessica went to Reghin or Dej,Jessica didn't go to Reghin,jessica went to reghin,jessica went to dej,0,Jessica went to Reghin or DejJessica didn't go...,jessica went to reghin or jessica went to dej
1,Korea Republic or Argentina are in the geopoli...,neither Korea Republic nor Russia are in the ...,korea republic is in the geopolitical position...,argentina is in the geopolitical position of s...,0,Korea Republic or Argentina are in the geopoli...,korea republic is in the geopolitical position...
2,Victoria went to Giurgiu or Mangalia,Victoria didn't go to Giurgiu and Victoria did...,victoria went to giurgiu,victoria went to mangalia,1,Victoria went to Giurgiu or MangaliaVictoria d...,victoria went to giurgiu or victoria went to m...
3,Mexico or Iran are in the geopolitical positio...,neither Mexico nor Iran are in the geopolitic...,mexico is in the geopolitical position of serbia,iran is in the geopolitical position of serbia,1,Mexico or Iran are in the geopolitical positio...,mexico is in the geopolitical position of serb...
4,Panama or Nigeria are in the geopolitical posi...,neither Panama nor Nigeria are in the geopoli...,panama is in the geopolitical position of tunisia,nigeria is in the geopolitical position of tun...,1,Panama or Nigeria are in the geopolitical posi...,panama is in the geopolitical position of tuni...


In [16]:
df2_test_plus = df2_test[["sentence1_p", "sentence2", "label"]]

df2_test_plus.sentence2 = df2_test_plus.sentence2.map(normalizeString)
df2_test_plus.rename(columns={"sentence1_p": "sentence1"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [17]:
df2_test_plus.head()

Unnamed: 0,sentence1,sentence2,label
0,jessica went to reghin or jessica went to dej,jessica didn t go to reghin,0
1,korea republic is in the geopolitical position...,neither korea republic nor russia are in the g...,0
2,victoria went to giurgiu or victoria went to m...,victoria didn t go to giurgiu and victoria did...,1
3,mexico is in the geopolitical position of serb...,neither mexico nor iran are in the geopolitica...,1
4,panama is in the geopolitical position of tuni...,neither panama nor nigeria are in the geopolit...,1


In [18]:
df2_test_plus.to_csv("data/boolean8_control_test.csv", index=False)