In [1]:
!pip install wandb > /dev/null
!pip install pytorch_lightning > /dev/null
!pip install sacrebleu > /dev/null
!pip install rouge > /dev/null
!pip install datasets > /dev/null
!pip install rouge_score > /dev/null
!pip install bert_score > /dev/null
!pip install torch > /dev/null
!pip install tensorflow > /dev/null
!pip install wget > /dev/null
!pip install sentencepiece > /dev/null

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import wandb

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
    )

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!cp /content/drive/MyDrive/Paraphrasing\ API/src . -r

In [38]:
import sys
sys.path.append("./src")

from importlib import reload
import metrics
reload(metrics)
from metrics import Metrics, BleurtModelsLinks

In [39]:
the_metrics = Metrics()

In [12]:
# Installing the smallest
# That can take time...
the_metrics.install_bleurt_model(BleurtModelsLinks.BLEURT_20_D3)

Installing https://storage.googleapis.com/bleurt-oss-21/BLEURT-20-D3.zip to /content/src/models/bleurt_model...


In [36]:
# Should take several minutes if the connection is good
the_metrics.install_bert(verbose=False) # verbose=False, otherwise the output is too big

Computed heavy metrics with return code 0


In [40]:
# 128 sentences. Depending on model, it should take minutes/seconds on GPU. 
# On cpu, for the smaller 3 layer models, it should take several minutes.
# If it takes much more time, use smaller model or do not use it!
the_metrics.test_bleurt()

2021-11-13 12:07:58.746465: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Starting executing of compute_bleurt_for_every_sample.
Execution was succesful!
Computed heavy metrics with return code 0
Test was successful! Output value for same strings but terrible english is 0.9610422253608704


In [41]:
# 128 sentences. It should take several seconds on a good cpu, even less on GPU. 
# If it takes much more time, use GPU, or use super small sets, or do not use it.
the_metrics.test_bert()

Starting executing of compute_bert_for_every_sample.
Execution was succesful!
Computed heavy metrics with return code 0
Test was successful! Output value for same strings but terrible english is {'precision': 1.0000001192092896, 'recall': 1.0000001192092896, 'f1': 1.0000001192092896}


In [43]:
dummy_data = the_metrics.get_dummy_data()
the_metrics.compute_metrics(dummy_data, dummy_data, dummy_data)

Starting executing of compute_bert_for_every_sample.
Execution was succesful!
Computed heavy metrics with return code 0
2021-11-13 12:51:14.773519: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Starting executing of compute_bleurt_for_every_sample.
Execution was succesful!
Computed heavy metrics with return code 0


{'bert_f1': 1.0000001192092896,
 'bert_precision': 1.0000001192092896,
 'bert_recall': 1.0000001192092896,
 'bleu': 1.0000000000000004,
 'bleu_diversity': 1.0,
 'bleu_output_input': 1.0000000000000004,
 'bleurt': 0.9610422253608704,
 'char_ngram_overlap': 1.0,
 'gleu': 1.0,
 'gleu_output_input': 1.0,
 'intersection_over_union': 1.0,
 'rouge1_f1': 1.0,
 'rouge1_precision': 1.0,
 'rouge1_recall': 1.0,
 'rouge2_f1': 1.0,
 'rouge2_precision': 1.0,
 'rouge2_recall': 1.0,
 'rougeL_f1': 1.0,
 'rougeL_precision': 1.0,
 'rougeL_recall': 1.0}

In [None]:
# connection to wandb
# SECURITY: never commit an API key to a notebook. The key is read from the
# WANDB_API_KEY environment variable and, if that is not set, requested
# interactively (the input is not echoed and is not stored in the notebook).
# The key that used to be hard-coded in this cell must be treated as leaked
# and revoked in the W&B account settings.
YOUR_API_KEY = 'n/a'  # legacy placeholder, kept only so existing references do not break
if not os.environ.get("WANDB_API_KEY"):
    from getpass import getpass
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")
wandb.init(project="dyna_T5", entity='cfg2')
run_name = wandb.run.name

[34m[1mwandb[0m: Currently logged in as: [33mcfg2[0m (use `wandb login --relogin` to force relogin)


In [None]:
# get training data
train_path = "data/quora_train.csv"
val_path = "data/quora_eval.csv"

In [None]:
# Load the splits through the path variables defined in the previous cell so
# the file locations are declared in exactly one place.
train = pd.read_csv(train_path)
eval_df = pd.read_csv(val_path)


In [None]:
eval_df.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,375490,375496,506538,506539,What is new and noteworthy in the Marketing fi...,Are there any certifications and learnings onl...,0
1,195357,195363,295784,236283,Do girls with large breasts avoid hugs?,What are some bad things about having large br...,0
2,232791,232797,240315,342882,What does it mean when you dream of getting sh...,What does it mean to be shot in the back of th...,0
3,148747,148753,234468,234469,What are some examples of a flat character in ...,What are some examples of flat characters in l...,1
4,371036,371042,104329,474854,What do you do on weekends?,What do you do during weekends?,1


In [None]:
# setup model training
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed handed to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is actually available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)


class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 model on (source, target) text pairs.

    The module is written against the PyTorch Lightning 1.x hook API
    (`training_epoch_end`, `validation_epoch_end`, an overridden
    `optimizer_step`). Metrics are reported with `self.log`, so `train_loss`,
    `avg_train_loss` and `val_loss` are visible to loggers and to callbacks
    that read `trainer.callback_metrics`.

    Args:
        hparams: `argparse.Namespace` with the run configuration. Fields read
            by this class: `model_name_or_path`, `tokenizer_name_or_path`,
            `learning_rate`, `weight_decay`, `adam_epsilon`, `warmup_steps`,
            `train_batch_size`, `eval_batch_size`, `n_gpu`,
            `gradient_accumulation_steps`, `num_train_epochs`. The whole
            namespace is also forwarded to `get_dataset`.
    """

    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()

        # Copy the namespace fields into Lightning's own hparams container.
        self.hparams.update(vars(hparams))

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        """Return True on the process whose global rank is 0 (or lower)."""
        return self.trainer.global_rank <= 0

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Run the wrapped T5 model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch.

        Args:
            batch: Mapping with `source_ids`, `source_mask`, `target_ids` and
                `target_mask` tensors, as produced by `ParaphraseDataset`.

        Returns:
            The first element of the model output (the loss, since labels are
            supplied).
        """
        # Work on a copy: overwriting pad positions must not modify the tensor
        # that lives in the caller's batch.
        labels = batch["target_ids"].clone()
        # -100 marks positions that are excluded from the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)
        # `self.log` replaces the legacy {"log": ...} return dictionary.
        self.log("train_loss", loss)
        return {"loss": loss}

    def training_epoch_end(self, outputs):
        """Log the mean training loss of the epoch as `avg_train_loss`."""
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("avg_train_loss", avg_train_loss)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch."""
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss of the epoch as `val_loss`.

        Nothing is returned; the value is published through `self.log` so that
        callbacks monitoring `val_loss` can find it.
        """
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        self.log("val_loss", avg_loss, prog_bar=True)

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        # Parameters whose name contains one of these fragments get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # Kept on the module because `train_dataloader` builds the LR scheduler from it.
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):
        """Apply one optimizer update and advance the LR scheduler.

        The closure handed in by Lightning runs `training_step` and the
        backward pass for the current batch, so it has to be forwarded to
        `optimizer.step`. Without it the batch on which the optimizer steps is
        never trained on (and recent Lightning 1.x releases raise an error).
        """
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        # One scheduler tick per optimizer update, matching how `t_total` is
        # computed in `train_dataloader`.
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Return the running loss and current learning rate for the progress bar.

        NOTE(review): legacy progress-bar hook; confirm that the installed
        Lightning version still calls it and exposes `trainer.avg_loss`.
        """
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training DataLoader and, as a side effect, the LR scheduler.

        Relies on `configure_optimizers` having run first, because the
        scheduler is created from `self.opt`.
        """
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="quora_train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        # Number of optimizer updates over the whole run:
        # batches per epoch / accumulation steps * epochs.
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader over the `quora_eval` split."""
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="quora_eval", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """Callback that writes the trainer's callback metrics to the module logger.

    The bookkeeping keys "log" and "progress_bar" are skipped. After testing,
    the same lines are also written to `test_results.txt` inside
    `pl_module.hparams.output_dir`.
    """

    def on_validation_end(self, trainer, pl_module):
        """Log every callback metric once validation has finished."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return

        results = trainer.callback_metrics
        for name in sorted(results):
            if name in ["log", "progress_bar"]:
                continue
            logger.info("{} = {}\n".format(name, str(results[name])))

    def on_test_end(self, trainer, pl_module):
        """Log every callback metric after testing and save them to a text file."""
        logger.info("***** Test results *****")
        if not pl_module.is_logger():
            return

        results = trainer.callback_metrics
        report_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(report_path, "w") as report:
            for name in sorted(results):
                if name in ["log", "progress_bar"]:
                    continue
                line = "{} = {}\n".format(name, str(results[name]))
                logger.info(line)
                report.write(line)



# Default run configuration; individual entries are overridden further down
# before the dictionary is turned into an argparse.Namespace.
args_dict = {
    # --- locations ---
    "data_dir": "",    # path for data files
    "output_dir": "",  # path to save the checkpoints
    # --- model / tokenizer ---
    "model_name_or_path": 't5-small',
    "tokenizer_name_or_path": 't5-small',
    "max_seq_length": 64,
    # --- optimisation ---
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 64,
    "eval_batch_size": 64,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 2,
    # --- hardware / precision ---
    "n_gpu": 1,
    "early_stop_callback": False,
    "fp_16": False,       # if you want to enable 16-bit training then install apex and set this to true
    "opt_level": 'O1',    # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    # --- misc ---
    "seed": 42,
    "report_to": "wandb",
}



tokenizer = T5Tokenizer.from_pretrained('t5-small')



class ParaphraseDataset(Dataset):
    """Tokenized (source, target) question pairs read from a CSV file.

    The file `<data_dir>/<type_path>.csv` must contain the columns
    `question1` (source) and `question2` (target). Every row is tokenized
    once, at construction time, and padded/truncated to `max_len` tokens.

    Args:
        tokenizer: Object exposing `batch_encode_plus` (here a `T5Tokenizer`).
        data_dir: Directory that contains the CSV file.
        type_path: File name without the `.csv` extension.
        max_len: Fixed sequence length of every encoded example.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=64):
        self.path = os.path.join(data_dir, type_path + '.csv')

        self.source_column = "question1"
        self.target_column = "question2"
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Return the number of encoded pairs."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ids and attention masks of pair `index` as 1-D tensors."""
        # Each encoding was produced for a batch of one; drop only that batch
        # axis (a bare squeeze() would also collapse a length-1 sequence axis).
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _encode(self, text):
        """Tokenize one string into fixed-length tensors with a batch axis of 1."""
        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True` and the implicit truncation it relied on.
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every row of the CSV into `self.inputs` / `self.targets`."""
        # Iterating over the column values (instead of `.loc[idx]`) does not
        # depend on the frame having a 0..n-1 index.
        rows = zip(self.data[self.source_column], self.data[self.target_column])
        for input_, target in rows:
            # str() also covers non-string cells such as NaN.
            input_ = "paraphrase: "+ str(input_) + ' </s>'
            target = str(target) + " </s>"

            self.inputs.append(self._encode(input_))
            self.targets.append(self._encode(target))



# Smoke test: build the evaluation split and show one decoded source/target pair.
dataset = ParaphraseDataset(tokenizer=tokenizer, data_dir='data', type_path='quora_eval', max_len=64)
print("Val dataset: ",len(dataset))

data = dataset[3]
print(tokenizer.decode(data['source_ids']), tokenizer.decode(data['target_ids']), sep="\n")

# Make sure the checkpoint/output directory exists.
if not os.path.exists('t5_paraphrase'):
    os.makedirs('t5_paraphrase')

# Fill in the run-specific settings and freeze them into a namespace.
args_dict.update(data_dir='data', output_dir='t5_paraphrase', num_train_epochs=2, max_seq_length=64)
args = argparse.Namespace(**args_dict)
print(args_dict)



# Checkpoint callback that would keep the 5 checkpoints with the lowest
# `val_loss` in the output directory.
# NOTE(review): this object is created but never handed to the Trainer; the
# `callbacks` list below only contains LoggingCallback and the
# `checkpoint_callback=` argument is commented out. Monitoring also requires
# the module to publish a metric named "val_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=5
    #filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

# Keyword arguments for `pl.Trainer`, derived from the run configuration in `args`.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,  # NOTE(review): n_gpu is 1 in args_dict; presumably needs 0 on a CPU-only machine
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,  # 16-bit only when fp_16 is enabled
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    # checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)



def get_dataset(tokenizer, type_path, args):
    """Create the `ParaphraseDataset` for one split.

    Args:
        tokenizer: Tokenizer passed through to the dataset.
        type_path: CSV file name without extension (e.g. "quora_train").
        args: Object providing `data_dir` and `max_seq_length`.

    Returns:
        A `ParaphraseDataset` built from `<args.data_dir>/<type_path>.csv`.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "data_dir": args.data_dir,
        "type_path": type_path,
        "max_len": args.max_seq_length,
    }
    return ParaphraseDataset(**dataset_kwargs)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val dataset:  40429
paraphrase: What are some examples of a flat character in a piece of literature?</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
What are some examples of flat characters in literature?</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
{'data_dir': 'data', 'output_dir': 't5_paraphrase', 'model_name_or_path': 't5-small', 'tokenizer_name_or_path': 't5-small', 'max_seq_length': 64, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size'

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


In [None]:
# Build the Lightning module and the trainer from the configuration above.
print("Initialize model")
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params)

# Fine-tune; the module provides its own train/val dataloaders.
print(" Training model")
trainer.fit(model)
print("training finished")

# Save the wrapped Hugging Face model so it can be reloaded with from_pretrained.
print("Saving model")
model.model.save_pretrained('t5_paraphrase_quora')
print("Saved model")

Initialize model


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


 Training model



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]



Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

training finished
Saving model
Saved model


In [None]:
print("t")

t


In [None]:
model_final = T5ForConditionalGeneration.from_pretrained('t5_paraphrase_quora')
tokenizer = T5Tokenizer.from_pretrained(args.tokenizer_name_or_path)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model_final.to(device)

device  cuda


In [None]:
#sentence = "She was one of them, really, blithe and girlish in her manner and her tastes—video games, Harry Potter, the baffling pop music they listened to."
#sentence = "What are the ingredients required to bake a perfect cake?"
sentence = "What is the best possible approach to learn aeronautical engineering?"
#sentence = "Do apples taste better than oranges in general?"

# The prompt must use exactly the prefix the model was fine-tuned with
# ("paraphrase: ", see ParaphraseDataset._build); the previous "paraphase:"
# spelling did not match the training data.
text = "paraphrase: " + sentence + " </s>"

# Same length limit as during training (max_seq_length).
max_len = 64

# A single sentence needs no padding; truncate to the training length instead
# of padding to the tokenizer's maximum length.
encoding = tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

# Sample 5 candidates with top-k (k=120) and nucleus (p=0.98) sampling.
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=max_len,
    top_k=120,
    top_p=0.98,
    early_stopping=True,
    num_return_sequences=5
)

print ("\nOriginal Sentence::")
print (sentence)
print ("\n")
print ("Paraphrased Sentence: ")
# Keep each distinct candidate once and drop outputs that merely repeat the input.
final_outputs =[]
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))


Original Sentence::
What is the best possible approach to learn aeronautical engineering?


Paraphrased Sentence: 
0: Is there a way to learn aeronautical engineering?
1: Which is the best method to learn aeronautical engineering?
2: What is the best way to learn aeronautical engineering?
3: What is the best way to learn aeronautic engineering in real-world life?
4: What are some good options for studying aeronautical engineering?


In [None]:
df_test = pd.read_csv('data/quora_test.csv')

In [None]:
df_test

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,149643,149649,117395,58546,What hotel in Mysore would be safe for unmarri...,What hotel in Kochi would be safe for unmarrie...,0
1,274741,274747,303590,210228,What are the typical physical traits of an att...,Can a girl rule a boy by using her attractive ...,0
2,142010,142016,225316,225317,What some examples of workplace risks and haza...,What are workplace risks and hazards? What are...,1
3,162575,162581,253079,131365,In what scenario would the U S and China use m...,How would a war between the US and China play ...,0
4,401575,401581,49318,162037,How do you self publish a book?,What is the process of publishing a book?,1
...,...,...,...,...,...,...,...
40423,26034,26040,48504,48505,Which actors were screen-tested for the role o...,The Dark Knight Rises (2012 movie): The same d...,0
40424,292713,292719,414366,93356,Psychology says that if you dream about someon...,What does it mean when you dream about someone...,0
40425,126055,126061,203263,203264,Certain people of reserved category of college...,Fedora 24 how to prevent celestia from crashing?,0
40426,159769,159775,34003,249353,Which is the best question you've read on Quora?,What is the best question on Quora ever?,1


In [None]:
df_test_slice = df_test[0:5000]

In [None]:
def testing(sentence):
    """Return one sampled paraphrase of `sentence`.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        sentence: Text to paraphrase. Non-string values (for example NaN from
            an empty CSV cell) are converted with `str()`.

    Returns:
        The first sampled candidate that differs from the input
        (case-insensitively), or "" when every candidate repeats the input.
    """
    # Stringify once and use the same value for the prompt and the comparison;
    # calling `.lower()` on the raw value fails for non-string cells.
    source = str(sentence)

    # Same prefix as in training ("paraphrase: ", with the trailing space).
    text = "paraphrase: " + source + " </s>"

    # Same length limit as during training (max_seq_length).
    max_len = 64

    # A single sentence needs no padding; truncate to the training length.
    encoding = tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # Sample 5 candidates with top-k (k=120) and nucleus (p=0.98) sampling.
    sampled_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=max_len,
        top_k=120,
        top_p=0.98,
        early_stopping=True,
        num_return_sequences=5
    )

    # Keep each distinct candidate once and drop outputs that repeat the input.
    final_outputs = []
    for sampled_output in sampled_outputs:
        sent = tokenizer.decode(sampled_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if sent.lower() != source.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    return final_outputs[0] if final_outputs else ""
    

In [None]:
df_test

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,149643,149649,117395,58546,What hotel in Mysore would be safe for unmarri...,What hotel in Kochi would be safe for unmarrie...,0
1,274741,274747,303590,210228,What are the typical physical traits of an att...,Can a girl rule a boy by using her attractive ...,0
2,142010,142016,225316,225317,What some examples of workplace risks and haza...,What are workplace risks and hazards? What are...,1
3,162575,162581,253079,131365,In what scenario would the U S and China use m...,How would a war between the US and China play ...,0
4,401575,401581,49318,162037,How do you self publish a book?,What is the process of publishing a book?,1
...,...,...,...,...,...,...,...
40423,26034,26040,48504,48505,Which actors were screen-tested for the role o...,The Dark Knight Rises (2012 movie): The same d...,0
40424,292713,292719,414366,93356,Psychology says that if you dream about someon...,What does it mean when you dream about someone...,0
40425,126055,126061,203263,203264,Certain people of reserved category of college...,Fedora 24 how to prevent celestia from crashing?,0
40426,159769,159775,34003,249353,Which is the best question you've read on Quora?,What is the best question on Quora ever?,1


In [1]:
# Generate predictions for the slice that the following cells display and save.
# Writing the column to `df_test` would leave `df_test_slice` (created earlier)
# without a `prediction` column on a fresh run. `assign` returns a new frame,
# so re-running this cell is safe and `df_test` stays untouched.
df_test_slice = df_test_slice.assign(prediction=df_test_slice["question1"].apply(testing))

In [None]:
df_test_slice

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,prediction
0,149643,149649,117395,58546,What hotel in Mysore would be safe for unmarri...,What hotel in Kochi would be safe for unmarrie...,0,What hotel in San Francisco would be safe for ...
1,274741,274747,303590,210228,What are the typical physical traits of an att...,Can a girl rule a boy by using her attractive ...,0,What are the typical physical traits of a keen...
2,142010,142016,225316,225317,What some examples of workplace risks and haza...,What are workplace risks and hazards? What are...,1,What are some examples of workplace risk or ha...
3,162575,162581,253079,131365,In what scenario would the U S and China use m...,How would a war between the US and China play ...,0,Are China able to use their military force aga...
4,401575,401581,49318,162037,How do you self publish a book?,What is the process of publishing a book?,1,How to self publish a book?
...,...,...,...,...,...,...,...,...
4995,5099,5105,10059,10060,What political orientation does India follow?,Which political orientation does India follow?,1,What is your biggest objective in India?
4996,93244,93250,94629,155954,I'm in college how do I start my own hedge fund?,How do you start your own hedge fund? How do y...,1,Why do hedge funds work? Why are banks so used...
4997,300399,300405,423188,91511,How do I retrieve a WhatsApp message from some...,How can I hack someone else's WhatsApp account...,0,How can I retrieve a WhatsApp message from the...
4998,57165,57171,100502,100503,Can Plan B cause an ectopic pregnancy? Why or ...,Can you get your period with an ectopic pregna...,0,Why does CPA cause the fetal ectopic pregnancy?


In [None]:
df_test_slice.to_csv('data/quora_preds_slice.csv')