In [1]:
# Importing libraries
import json
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
from configuration import Configuration
from configuration import CONSTANTS as C
# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction
from rich.table import Column, Table
from rich import box
from rich.console import Console
from tensorboardX import SummaryWriter
import time
from torch import cuda
import glob

In [2]:
class YourDataSetClass(Dataset):
    """
    Map-style dataset that tokenizes (source, target, answer) text triples
    taken from a dataframe so a DataLoader can batch them for fine-tuning.

    :param dataframe: Frame holding one example per row.
    :param tokenizer: Object exposing `batch_encode_plus` that returns
        'input_ids' and 'attention_mask' tensors of shape (1, max_length).
    :param source_len: Padded/truncated length of the source sequence.
    :param target_len: Padded/truncated length of the target sequence.
    :param answer_len: Padded/truncated length of the answer sequence.
    :param source_text: Name of the column holding the source text.
    :param target_text: Name of the column holding the target text.
    :param answer_text: Name of the column holding the answer text.
    """
    def __init__(self, dataframe, tokenizer, source_len, target_len,answer_len, source_text, target_text,answer_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.ans_len = answer_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]
        self.answer_text = self.data[answer_text]

    def __len__(self):
        """Number of examples (rows of the target column)."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one text and return (input_ids, attention_mask) as 1-D long tensors."""
        # str() guards against non-string cells (e.g. NaN); split/join collapses
        # every run of whitespace to a single space.
        cleaned = ' '.join(str(text).split())
        encoded = self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            truncation=True,
            padding="max_length",  # the deprecated pad_to_max_length flag is not passed any more
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a sequence axis of length 1.
        ids = encoded['input_ids'].squeeze(0).to(dtype=torch.long)
        mask = encoded['attention_mask'].squeeze(0).to(dtype=torch.long)
        return ids, mask

    def __getitem__(self, index):
        """
        Return the tokenized example at position `index`.

        The lookup is positional (`.iloc`), so frames whose index is not
        0..n-1 (for instance after filtering) work as well.

        :return: dict with 'source_ids', 'source_mask', 'target_ids',
            'target_ids_y' (same values as 'target_ids'), 'answer_ids'
            and 'answer_mask'.
        """
        source_ids, source_mask = self._encode(self.source_text.iloc[index], self.source_len)
        target_ids, _ = self._encode(self.target_text.iloc[index], self.summ_len)
        answer_ids, answer_mask = self._encode(self.answer_text.iloc[index], self.ans_len)
        return {
            'source_ids': source_ids,
            'source_mask': source_mask,
            'target_ids': target_ids,
            'target_ids_y': target_ids.to(dtype=torch.long),
            'answer_ids': answer_ids,
            'answer_mask': answer_mask
        }


def create_model_dir(experiment_main_dir, experiment_id, model_summary):
    """
    Return the directory for one experiment, creating it if necessary.

    The directory is named "<experiment_id>-<model_summary>" and lives under
    `experiment_main_dir`. An already existing directory is reused rather than
    treated as an error, so re-running with a fixed experiment ID keeps working.

    :param experiment_main_dir: Where all experiments are stored.
    :param experiment_id: The ID of this experiment.
    :param model_summary: A summary string of the model.
    :return: Path of the directory where model logs and files can be stored.
    """
    model_name = "{}-{}".format(experiment_id, model_summary)
    model_dir = os.path.join(experiment_main_dir, model_name)
    # exist_ok=True: callers may legitimately ask for the same directory twice.
    os.makedirs(model_dir, exist_ok=True)
    return model_dir

def train(epoch, tokenizer, model, device, loader, optimizer,writer,global_step,records,
          checkpoint_dir=None, eval_every=10, save_every=1000):
    """
    Train for one pass over `loader`. The loss is logged on every step; every
    `eval_every` steps a BLEU-2 estimate on the current batch is logged, and
    every `save_every` steps the model and tokenizer are checkpointed.

    :param epoch: Current epoch index; accepted for the caller but not used here.
    :param tokenizer: Tokenizer providing `pad_token_id`, `decode` and `save_pretrained`.
    :param model: Model to optimise. Its forward is called with the extra keyword
        arguments `answer_str` and `answer_mask`.
        NOTE(review): presumably a patched T5 that accepts them - confirm.
    :param device: Device the batch tensors are moved to.
    :param loader: Iterable of batches with the keys 'target_ids', 'source_ids',
        'source_mask', 'answer_ids' and 'answer_mask'.
    :param optimizer: Optimizer stepped once per batch.
    :param writer: Logger exposing `add_scalar(tag, value, step)`.
    :param global_step: Step counter value to start logging from.
    :param records: Dataframe with the columns 'text' and 'distractor' used as
        BLEU references.
    :param checkpoint_dir: Directory that receives "model_files"; when None the
        module-level `model_dir` is used, as before.
    :param eval_every: Log BLEU-2 every this many steps (positive int).
    :param save_every: Write a checkpoint every this many steps (positive int).
    :return: The step counter after the last processed batch.
    """
    model.train()
    smoothing = SmoothingFunction().method1
    # `step` counts the batches of this call. The previous hand-rolled counter was
    # never incremented, which made the BLEU pass and the checkpoint run on every batch.
    for step, data in enumerate(loader):
        y = data['target_ids'].to(device, dtype = torch.long)
        # NOTE(review): decoder inputs drop the last target token and the labels drop
        # the first one, so the first target token is only ever fed as input and never
        # predicted - confirm this matches how decoding is started at generation time.
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # -100 marks padding positions in the labels.
        # NOTE(review): presumably the loss ignores that value - confirm.
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        ans_str = data['answer_ids'].to(device, dtype = torch.long)
        ans_mask = data['answer_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids,
                        labels=lm_labels,answer_str=ans_str,answer_mask=ans_mask)
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        writer.add_scalar("loss", loss.item(), global_step)

        ### measure bleu on the current batch
        if step % eval_every == 0:
            model.eval()
            # Generation is evaluation only; make sure no autograd graph is built.
            with torch.no_grad():
                generated_ids = model.generate(
                  input_ids = ids,
                  attention_mask = mask,
                  max_length=150,
                  num_beams=2,
                  repetition_penalty=2.5,
                  length_penalty=1.0,
                  early_stopping=True
                  )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=False) for t in y]

            temp_df = pd.DataFrame({'Generated Text':preds,'Actual Text':target})

            val=records.rename(columns={'distractor':'Actual Text'})

            # Map each generated text back to its prompt through the decoded target string.
            gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text']]

            # All reference distractors of a prompt, each as a token list.
            distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

            dist_compare=distractors.merge(gen_dist,on=['text'])
            dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()
            # weights=(0, 1, 0, 0): only the bigram precision contributes to the score.
            aa=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0),smoothing_function=smoothing),axis=1)
            dist_compare=dist_compare.assign(bleu=aa)
            bleu_2=dist_compare.bleu.mean()

            model.train()
            writer.add_scalar("bleu2", bleu_2, global_step)

        if step % save_every == 0:
            # Falls back to the module-level `model_dir` when no directory was passed in.
            target_dir = model_dir if checkpoint_dir is None else checkpoint_dir
            path = os.path.join(target_dir, "model_files")
            model.save_pretrained(path)
            tokenizer.save_pretrained(path)

        global_step += 1
    return global_step


def validate(epoch, tokenizer, model, device, loader,writer, reference_df=None):
    """
    Generate a prediction for every batch of `loader`, log the mean BLEU-2
    against the reference distractors and return the decoded texts.

    :param epoch: Current validation epoch; accepted for the caller but not used here.
    :param tokenizer: Tokenizer providing `decode`.
    :param model: Model exposing `eval()` and `generate(...)`.
    :param device: Device the batch tensors are moved to.
    :param loader: Iterable of batches with the keys 'target_ids', 'source_ids'
        and 'source_mask'.
    :param writer: Logger exposing `add_scalar(tag, value, step)`.
    :param reference_df: Dataframe with the columns 'text' and 'distractor' to score
        against. When None the module-level `records` frame is used, as before.
        NOTE(review): in this notebook `records` holds the training split - confirm
        that is the intended reference set for validation.
    :return: (predictions, actuals), two lists of decoded strings in loader order.
    """
    # Resolve the reference frame up front so that a missing module-level `records`
    # fails immediately rather than after the whole generation loop has run.
    references = records if reference_df is None else reference_df
    smoothing = SmoothingFunction().method1

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch_no, data in enumerate(loader, 1):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask,
              max_length=150,
              num_beams=2,
              repetition_penalty=2.5,
              length_penalty=1.0,
              early_stopping=True
              )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]

            predictions.extend(preds)
            actuals.extend(target)
            # Lightweight progress indicator: number of batches processed so far.
            if batch_no%100==0:
                print(batch_no)
        temp_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})

        val=references.rename(columns={'distractor':'Actual Text'})

        # Map each generated text back to its prompt through the decoded target string.
        gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text']]

        # All reference distractors of a prompt, each as a token list.
        distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

        dist_compare=distractors.merge(gen_dist,on=['text'])
        dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()
        # weights=(0, 1, 0, 0): only the bigram precision contributes to the score.
        aa=dist_compare.apply(lambda x:sentence_bleu(x['Actual Text'],x['Generated Text'],weights=(0, 1, 0, 0),smoothing_function=smoothing),axis=1)
        dist_compare=dist_compare.assign(bleu=aa)
        bleu_2=dist_compare.bleu.mean()
        # The scalar is always written at step 1, whatever `epoch` is.
        writer.add_scalar("bleu2_val", bleu_2, 1)

    return predictions, actuals



In [3]:
def main(config):
    """
    End-to-end run: load the RACE distractor train/dev files, fine-tune the model,
    save it, then generate predictions for the dev split and write them to CSV.

    :param config: Parsed command-line configuration. It is currently not consulted;
        all hyper-parameters come from the `model_params` dict below.
    """
    # train() checkpoints into the module-level `model_dir` and validate() scores against
    # the module-level `records`. Publish this run's values so those functions do not
    # pick up stale (or missing) notebook state.
    global model_dir, records

    model_params={
        "MODEL":"t5-small",             # model_type: t5-small/t5-base/t5-large
        "TRAIN_BATCH_SIZE":4,          # training batch size
        "VALID_BATCH_SIZE":4,          # validation batch size
        "TRAIN_EPOCHS":2,              # number of training epochs
        "VAL_EPOCHS":1,                # number of validation epochs
        "LEARNING_RATE":1e-4,          # learning rate
        "MAX_SOURCE_TEXT_LENGTH":1800,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH":200,   # max length of target text
        # Required by YourDataSetClass. The value is copied from the interactive setup cell.
        # NOTE(review): that cell uses 900/901/900 for source/target/answer, which may mean
        # the model expects related lengths - confirm before running with 1800/200.
        "MAX_ANSWER_LENGTH":900,       # max length of answer text
        "SEED": 42                     # set seed for reproducibility

    }

    source_text='text'
    target_text='distractor'
    answer_text='answer_text'

    def _load_split(file_name):
        """Read one JSON-lines file under C.DATA_DIR and build the model input frame."""
        with open(os.path.join(C.DATA_DIR, file_name), 'r') as content_file:
            # One JSON object per line; blank lines (e.g. a trailing newline) are skipped.
            rows = [json.loads(line) for line in content_file if line.strip()]
        frame = pd.DataFrame(rows)
        # Each field is stored as a token list; join the tokens back into plain strings.
        frame = frame.assign(question=frame.question.str.join(' '))
        frame = frame.assign(distractor=frame.distractor.str.join(' '))
        frame = frame.assign(article=frame.article.str.join(' '))
        frame = frame.assign(answer_text=frame.answer_text.str.join(' '))
        frame = frame.assign(text="distraction passage: "+frame.article+" question: "+frame.question+" answer: "+frame.answer_text)
        # answer_text is kept: the dataset tokenizes it separately from the prompt.
        return frame.loc[:,['text','distractor','answer_text']]

    records = _load_split("distractor/race_train_original.json")

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"]) # pytorch random seed
    np.random.seed(model_params["SEED"]) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Load the pretrained checkpoint named in model_params["MODEL"] and move it to the
    # configured device.
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(C.DEVICE)

    records_test = _load_split("distractor/race_dev_original.json")

    # The train and dev files are used as-is; no further split is made here.
    val_dataset=records_test
    train_dataset = records

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], model_params["MAX_ANSWER_LENGTH"], source_text, target_text, answer_text)
    val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], model_params["MAX_ANSWER_LENGTH"], source_text, target_text, answer_text)

    # Defining the parameters for creation of dataloaders
    train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }

    val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }

    # Dataloaders for the training and validation stages.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])

    # Create Tensorboard logger; the timestamp gives every run its own directory.
    experiment_id = int(time.time())
    experiment_name = "name"
    model_dir = create_model_dir(os.path.join(C.DATA_DIR, "experiments/"), experiment_id, experiment_name)

    global_step = 0
    writer = SummaryWriter(os.path.join(model_dir, 'logs'))
    for epoch in range(model_params["TRAIN_EPOCHS"]):
        global_step=train(epoch, tokenizer, model, C.DEVICE, training_loader, optimizer,writer,global_step,records)

    #Saving the model after training
    path = os.path.join(model_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # evaluating test dataset
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, C.DEVICE, val_loader,writer)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(os.path.join(model_dir, 'predictions.csv'),index=False)




In [4]:
# Hyper-parameters for the interactive run.
model_params={
    "MODEL":"t5-small",             # model_type: t5-small/t5-base/t5-large
    "TRAIN_BATCH_SIZE":2,          # training batch size
    "VALID_BATCH_SIZE":2,          # validation batch size
    "TRAIN_EPOCHS":2,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":900,  # max length of source text
    # One longer than the source length.
    # NOTE(review): presumably because train() drops one token when building the
    # decoder inputs - confirm.
    "MAX_TARGET_TEXT_LENGTH":901,   # max length of target text
    "MAX_ANSWER_LENGTH":900,   # max length of answer text
    "SEED": 42                     # set seed for reproducibility

}


# Column names handed to YourDataSetClass.
source_text='text'
target_text='distractor'
answer_text='answer_text'


def load_race_records(path):
    """
    Read a JSON-lines file (one JSON object per line) and return a frame with the
    columns 'text' (the prompt fed to the model), 'distractor' and 'answer_text'.
    """
    with open(path, 'r') as content_file:
        # Parse line by line and skip blank lines, so the result does not depend on
        # whether the file ends with exactly one newline.
        rows = [json.loads(line) for line in content_file if line.strip()]
    frame = pd.DataFrame(rows)
    # Each field is stored as a token list; join the tokens back into plain strings.
    frame = frame.assign(question=frame.question.str.join(' '))
    frame = frame.assign(distractor=frame.distractor.str.join(' '))
    frame = frame.assign(article=frame.article.str.join(' '))
    frame = frame.assign(answer_text=frame.answer_text.str.join(' '))
    frame = frame.assign(text="distraction passage: "+frame.article+" question: "+frame.question+" answer: "+frame.answer_text)
    return frame.loc[:,['text','distractor','answer_text']]


records = load_race_records(os.path.join(C.DATA_DIR, "distractor/race_train_original.json"))

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(model_params["SEED"]) # pytorch random seed
np.random.seed(model_params["SEED"]) # numpy random seed
torch.backends.cudnn.deterministic = True


# tokenizer for encoding the text
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

# Load the pretrained checkpoint named in model_params["MODEL"] and move it to the
# configured device.
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
model = model.to(C.DEVICE)

records_test = load_race_records(os.path.join(C.DATA_DIR, "distractor/race_dev_original.json"))

# Creation of Dataset and Dataloader
# The train and dev files are used as-is; no further split is made here.
val_dataset=records_test
train_dataset = records


# Creating the Training and Validation dataset for further creation of Dataloader
training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"],model_params["MAX_ANSWER_LENGTH"], source_text, target_text,answer_text)
val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"],model_params["MAX_ANSWER_LENGTH"], source_text, target_text,answer_text)


# Defining the parameters for creation of dataloaders
train_params = {
  'batch_size': model_params["TRAIN_BATCH_SIZE"],
  'shuffle': True,
  'num_workers': 0
  }


val_params = {
  'batch_size': model_params["VALID_BATCH_SIZE"],
  'shuffle': False,
  'num_workers': 0
  }


# Dataloaders for the training and validation stages.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)


# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])

# Experiment identity used to name the output directory. The ID is fixed, so every
# re-run targets the same directory (the timestamp variant is left commented out).
experiment_id =55555 #int(time.time())
experiment_name = "name"



In [None]:
# Resolve the output directory path for this experiment under <DATA_DIR>/experiments/.
model_dir = create_model_dir(os.path.join(C.DATA_DIR, "experiments/"), experiment_id, experiment_name)

# Step counter shared across epochs so logged scalars keep advancing.
global_step = 0
# Event files are written to <model_dir>/logs.
writer = SummaryWriter(os.path.join(model_dir, 'logs'))
for epoch in range(model_params["TRAIN_EPOCHS"]):
    # train() returns the counter after its last batch; feed it into the next epoch.
    global_step=train(epoch, tokenizer, model, C.DEVICE, training_loader, optimizer,writer,global_step,records)




In [None]:
# Save the model and tokenizer after training, under <model_dir>/model_files.
path = os.path.join(model_dir, "model_files")
model.save_pretrained(path)
tokenizer.save_pretrained(path)


# Generate predictions with the validation loader and persist them next to the model.
for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, C.DEVICE, val_loader,writer)
    # One row per example: generated text alongside the decoded reference text.
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    # Each pass of the loop overwrites the same predictions.csv.
    final_df.to_csv(os.path.join(model_dir, 'predictions.csv'),index=False)


In [5]:
def get_model_dir(experiment_dir, model_id):
    """
    Find the experiment directory that belongs to `model_id`.

    :param experiment_dir: Directory that holds all experiment directories.
    :param model_id: ID whose string form prefixes the directory name ("<id>-...").
    :return: The first path matching "<experiment_dir>/<model_id>-*", or None when
        nothing matches.
    """
    pattern = os.path.join(experiment_dir, "{}-*".format(model_id))
    matches = glob.glob(pattern, recursive=False)
    if not matches:
        return None
    return matches[0]

def get_model_config(model_id):
    """
    Locate the experiment directory for `model_id` under <DATA_DIR>/experiments/.

    :param model_id: ID of the experiment to look up.
    :return: (model_config, model_dir). Loading the stored configuration is currently
        disabled, so model_config is always the placeholder 0; model_dir is whatever
        get_model_dir returns for this ID.
    """
    experiments_root = os.path.join(C.DATA_DIR, "experiments/")
    located_dir = get_model_dir(experiments_root, model_id)
    # Placeholder until configuration loading from <model_dir>/config.json is restored.
    placeholder_config = 0
    return placeholder_config, located_dir

def load_model(model_id):
    """
    Load the tokenizer and model saved for an earlier experiment.

    :param model_id: ID of the experiment whose "<model_dir>/model_files" to load.
    :return: (model, tokenizer, model_config, model_dir); the model has been moved
        to C.DEVICE.
    :raises FileNotFoundError: If no experiment directory matches `model_id`.
    """
    model_config, model_dir = get_model_config(model_id)
    if model_dir is None:
        # Without this guard os.path.join(None, ...) fails with an opaque TypeError.
        raise FileNotFoundError(
            "No experiment directory found for model id {}".format(model_id))
    path = os.path.join(model_dir, "model_files")
    tokenizer = T5Tokenizer.from_pretrained(path)

    model = T5ForConditionalGeneration.from_pretrained(path)

    model.to(C.DEVICE)

    return model,tokenizer, model_config, model_dir

In [6]:
# Reload a saved experiment by its hard-coded ID. This rebinds the notebook-level
# `model`, `tokenizer` and `model_dir`, so later cells use the reloaded checkpoint.
model,tokenizer, model_config, model_dir = load_model(1622411389)


In [7]:
# Display the module tree of the currently bound model.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [8]:
# Logger writing event files to <model_dir>/logs for the currently bound model_dir.
writer = SummaryWriter(os.path.join(model_dir, 'logs'))

In [9]:
# Generate predictions with whichever model/tokenizer are bound at this point and
# store them as <model_dir>/predictions.csv (overwritten on every pass of the loop).
for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, C.DEVICE, val_loader,writer)
    # One row per example: generated text alongside the decoded reference text.
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(model_dir, 'predictions.csv'),index=False)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000


In [10]:
# Reload the stored predictions from disk, so the analysis below can run without
# repeating generation.
final_df=pd.read_csv(os.path.join(model_dir, 'predictions.csv'))

In [11]:
# Keep only dev rows whose full prompt text does not also occur in the training frame.
records_test_fil=records_test[~records_test.text.isin(records.text)]

In [18]:
# Alias for the predictions frame (no copy is made).
temp_df = final_df

# Rename the reference column so it shares the join key used by the predictions frame.
val=records_test_fil.rename(columns={'distractor':'Actual Text'})

# Attach each generated text to its prompt by matching on the reference distractor string.
# NOTE(review): the join key is text, so predictions whose 'Actual Text' does not exactly
# equal a raw distractor are dropped, and equal distractor strings from different prompts
# match each other - confirm this is acceptable for the reported scores.
gen_dist=val.merge(temp_df,on=['Actual Text']).loc[:,['text','Generated Text']].drop_duplicates()

# For every prompt, collect all of its reference distractors as token lists.
distractors=val.groupby(['text']).agg({ 'Actual Text': lambda x: list(x.str.split())}).reset_index()

# One row per (prompt, generated text) pair, carrying the prompt's full reference list.
dist_compare=distractors.merge(gen_dist,on=['text'])

# Whitespace-tokenize the hypothesis so it has the same form as the references.
dist_compare['Generated Text']=dist_compare['Generated Text'].str.split()

In [20]:
# Smoothing callable shared by all rows and all n-gram orders.
_smoothing = SmoothingFunction().method1


def _individual_bleu(position):
    """Row-wise sentence BLEU with all weight on one n-gram order (position 0 = unigrams)."""
    weights = tuple(1 if slot == position else 0 for slot in range(4))
    return dist_compare.apply(
        lambda row: sentence_bleu(
            row['Actual Text'],
            row['Generated Text'],
            weights=weights,
            smoothing_function=_smoothing,
        ),
        axis=1,
    )


# Individual (not cumulative) scores for 1- to 4-grams, one Series per order.
b1, b2, b3, b4 = (_individual_bleu(position) for position in range(4))
dist_compare = dist_compare.assign(bleu1=b1, bleu2=b2, bleu3=b3, bleu4=b4)

# Corpus-level figures: plain mean of the per-row scores.
bleu_1, bleu_2, bleu_3, bleu_4 = (
    dist_compare[column].mean() for column in ('bleu1', 'bleu2', 'bleu3', 'bleu4')
)

In [21]:
# Mean of the per-row `bleu3` column.
bleu_3

0.03404204049848772

In [22]:
# All rows ordered by ascending `bleu2`, i.e. the weakest matches first.
dist_compare.sort_values('bleu2')

Unnamed: 0,text,Actual Text,Generated Text,bleu1,bleu2,bleu3,bleu4
1521,distraction passage: In all one 's lifetime it...,"[[not, to, be, too, proud], [not, to, go, down...","[one's, life, is, full, of, color, and, flavor]",0.0,0.0,0.00,0.000000
1611,distraction passage: Interactive television ad...,"[[showed, an, indifferent, attitude, toward, it]]","[a, new, advertising, agency, has, been, rolli...",0.0,0.0,0.00,0.000000
3566,"distraction passage: When we donate blood , ...",[[O]],"[AB, is, the, universal, receiver]",0.0,0.0,0.00,0.000000
1609,distraction passage: Instead of hitting the be...,"[[paid, for, their, research], [found, way, to...","[students'interest, in, science, and, agricult...",0.0,0.0,0.00,0.000000
1606,distraction passage: Inprefix = st1 /Kansas Ci...,"[[the, City, Hall], [the, centre, of, the, city]]","[900, fire, fighters]",0.0,0.0,0.00,0.000000
...,...,...,...,...,...,...,...
2049,distraction passage: Money and Happiness A Gui...,"[[Money, and, Happiness], [The, Happiness, Mak...","[The, Happiness, Makeover]",1.0,1.0,1.00,0.100000
1553,"distraction passage: In one study , college st...","[[substantive, talks, make, people, happier, t...","[small, talks, are, important, in, communication]",1.0,1.0,0.75,0.666667
779,"distraction passage: Decisions , decisions ! O...","[[emotions, are, the, enemy, of, decision, mak...","[emotions, are, the, enemy, of, decision, making]",1.0,1.0,1.00,1.000000
2591,distraction passage: School cleaning day Today...,"[[cleaning, the, windows], [sweeping, the, flo...","[cleaning, the, windows]",1.0,1.0,1.00,0.100000


In [24]:
# Only the rows whose `bleu2` exceeds 0.5, ordered by ascending `bleu2`.
dist_compare[dist_compare.bleu2>0.5].sort_values('bleu2')

Unnamed: 0,text,Actual Text,Generated Text,bleu1,bleu2,bleu3,bleu4
1288,distraction passage: I 've often had difficult...,"[[The, writer, had, difficulty, remembering, n...","[Billerica, was, the, name, of, a, town, in, B...",0.711767,0.500461,0.343173,0.266912
871,"distraction passage: Each Sunday , people can ...","[[Music, of, the, Mission, District], [The, Sp...","[the, Mission, District]",0.513417,0.513417,0.513417,0.051342
1200,distraction passage: High school could be a sc...,"[[how, to, take, art, classes], [the, fun, sid...","[high, school, life]",0.513417,0.513417,0.513417,0.051342
1072,distraction passage: Fruit salad is a deliciou...,"[[fruit, salad, is, very, delicious], [people,...","[eat, fruit, salad]",0.513417,0.513417,0.513417,0.051342
3210,"distraction passage: To be a good teacher , yo...","[[How, to, be, a, good, actor], [A, good, teac...","[a, good, teacher]",0.513417,0.513417,0.051342,0.051342
...,...,...,...,...,...,...,...
1038,distraction passage: For kids and many adults ...,"[[San, Diego, Zoo], [Maritime, Museum, of, San...","[Maritime, Museum, of, San, Diego]",1.000000,1.000000,1.000000,1.000000
779,"distraction passage: Decisions , decisions ! O...","[[emotions, are, the, enemy, of, decision, mak...","[emotions, are, the, enemy, of, decision, making]",1.000000,1.000000,1.000000,1.000000
457,distraction passage: Around the world there ar...,"[[the, world, championship, of, grimaces], [th...","[the, world, championship, of, grimaces]",1.000000,1.000000,1.000000,1.000000
2658,distraction passage: Singapore is a dynamic ci...,"[[9:00, am, to, 11:00, am], [11:30, am, to, 1:...","[2:30, pm, to, 4:30, pm]",1.000000,1.000000,1.000000,1.000000


In [101]:
# Inspect a single example by position.
# NOTE(review): the saved output shows a single `bleu` column and a prompt starting with
# 'dist q:', neither of which the code in this notebook produces - it looks like output
# from an earlier run; re-execute to refresh.
dist_compare.iloc[2869]

text              dist q: We can judge from the Deloitte study t...
Actual Text       [[the, French, are, less, willing, to, buy, ec...
Generated Text    [French, holiday, shoppers, are, choosing, mor...
bleu                                                       0.526316
Name: 2869, dtype: object

In [102]:
# Full prompt text of the inspected example.
dist_compare.iloc[2869]['text']

'dist q: We can judge from the Deloitte study that a: over a quarter of the French give second - hand Christmas gifts p: A used book or nearly - new kitchen gadget    may not be at the top of every Christmas wish list , but hard economic times coupled with a new green awareness are changing attitudes about gift - giving in France . French holiday shoppers are choosing larger numbers for " green " gifting this Christmas , studies show . About 30 percent of French consumers will give second - hand items as gifts to stretch out their tight budgets but also to do their little bit for recycling , according to a study by international consulting firm Deloitte . The survey of Christmas consumer behaviors in 18 European countries found the French were more than twice as likely as other Europeans to give second - hand items . Websites promoting re - gifting and green gifting are popular in France , with many reporting a rise in business . " Concerns about the ecology and the economy have come t

In [103]:
# Reference distractors of the inspected example, each as a token list.
dist_compare.iloc[2869]['Actual Text']

[['the',
  'French',
  'are',
  'less',
  'willing',
  'to',
  'buy',
  'eco',
  '-',
  'friendly',
  'gifts',
  'than',
  'other',
  'Europeans'],
 ['80',
  '%',
  'of',
  'French',
  'people',
  'are',
  'happy',
  'to',
  'receive',
  'second',
  '-',
  'hand',
  'gifts'],
 ['less',
  'than',
  '10',
  '%',
  'of',
  'European',
  'consumers',
  'are',
  'likely',
  'to',
  'give',
  'second',
  '-',
  'hand',
  'gifts']]

In [104]:
# Tokens of the generated distractor for the inspected example.
dist_compare.iloc[2869]['Generated Text']

['French',
 'holiday',
 'shoppers',
 'are',
 'choosing',
 'more',
 'than',
 'a',
 'quarter',
 'of',
 'the',
 'French',
 'consumers',
 'will',
 'give',
 'second',
 '-',
 'hand',
 'items']

## scores

In [25]:
# Mean of the per-row `bleu1` column.
bleu_1

0.18203008418568886

In [26]:
# Mean of the per-row `bleu2` column.
bleu_2

0.05949882910240971

In [27]:
# Mean of the per-row `bleu3` column.
bleu_3

0.03404204049848772

In [28]:
# Mean of the per-row `bleu4` column.
bleu_4

0.026776927356500054

In [None]:
# Script entry point: parse the command line and launch the full main() pipeline.
# NOTE(review): inside a notebook kernel __name__ is '__main__' as well, so executing
# this cell starts a complete training run - confirm that is intended.
if __name__ == '__main__':
    main(Configuration.parse_cmd())