In [1]:
!pip install sentencepiece
from google.colab import drive
drive.flush_and_unmount()

from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
#Emma's directory:
%cd gdrive/MyDrive/6.8610\ Research\ Project/experiments
#David's directory:
#%%cd gdrive/MyDrive/Colab Notebooks

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Drive not mounted, so nothing to flush and unmount.
Mounted at /content/gdrive/
/content/gdrive/MyDrive/6.8610 Research Project/experiments


In [2]:
import time
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

import sentencepiece

class DipperParaphraser(object):
    """Wrapper around a DIPPER T5 checkpoint that paraphrases text window by window."""

    def __init__(self, model="kalpeshk2011/dipper-paraphraser-xxl-no-context", verbose=True):
        """Load the tokenizer and the model, then put the model on the GPU in eval mode.

        Args:
            model (str): Checkpoint name or path handed to
                ``T5ForConditionalGeneration.from_pretrained``.
            verbose (bool): When true, print the time (in seconds) spent loading
                the tokenizer and the model weights.
        """
        time1 = time.time()
        self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
        self.model = T5ForConditionalGeneration.from_pretrained(model)
        if verbose:
            print(f"{model} model loaded in {time.time() - time1}")
        self.model.cuda()
        self.model.eval()

    def paraphrase(self, input_text, lex_diversity, order_diversity, sent_interval=3, **kwargs):
        """Paraphrase a text using the DIPPER model.

        The text is whitespace-normalised, split into sentences with
        ``sent_tokenize`` and paraphrased in consecutive windows of
        ``sent_interval`` sentences. Each window is sent to the model on its own,
        prefixed with the lexical/order control codes.

        Args:
            input_text (str): The text to paraphrase.
            lex_diversity (int): The lexical diversity of the output, choose multiples of 20 from 0 to 100. 0 means no diversity, 100 means maximum diversity.
            order_diversity (int): The order diversity of the output, choose multiples of 20 from 0 to 100. 0 means no diversity, 100 means maximum diversity.
            sent_interval (int): Number of sentences paraphrased per model call. Must be at least 1.
            **kwargs: Additional keyword arguments like top_p, top_k, max_length.
                They are forwarded unchanged to ``self.model.generate``.

        Returns:
            str: The paraphrased windows in order, each preceded by a single
            space (so a non-empty result starts with a space). Empty when no
            sentences are found in ``input_text``.

        Raises:
            AssertionError: If ``lex_diversity`` or ``order_diversity`` is not
                one of 0, 20, 40, 60, 80, 100.
            ValueError: If ``sent_interval`` is smaller than 1.
        """
        assert lex_diversity in [0, 20, 40, 60, 80, 100], "Lexical diversity must be one of 0, 20, 40, 60, 80, 100."
        assert order_diversity in [0, 20, 40, 60, 80, 100], "Order diversity must be one of 0, 20, 40, 60, 80, 100."
        # A negative step would make the window loop below run zero times and
        # silently return "", and a zero step fails inside range() with an
        # unrelated-looking message, so reject both up front.
        if sent_interval < 1:
            raise ValueError(f"sent_interval must be at least 1, got {sent_interval}.")

        # The model is conditioned on similarity codes, i.e. 100 minus the requested diversity.
        lex_code = int(100 - lex_diversity)
        order_code = int(100 - order_diversity)

        # Collapse all runs of whitespace (including newlines) into single spaces.
        input_text = " ".join(input_text.split())
        sentences = sent_tokenize(input_text)
        paraphrased_windows = []

        for sent_idx in range(0, len(sentences), sent_interval):
            curr_sent_window = " ".join(sentences[sent_idx:sent_idx + sent_interval])
            final_input_text = f"lexical = {lex_code}, order = {order_code} {curr_sent_window}"

            final_input = self.tokenizer([final_input_text], return_tensors="pt")
            final_input = {k: v.cuda() for k, v in final_input.items()}

            with torch.inference_mode():
                outputs = self.model.generate(**final_input, **kwargs)
            outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # Only one sequence is submitted per call, so take the first decoded string.
            paraphrased_windows.append(outputs[0])

        # Every window keeps its leading space, matching the established output format.
        return "".join(" " + window for window in paraphrased_windows)

# if __name__ == "__main__":
#     device = torch.device('cuda')
#     dp = DipperParaphraser(model="kalpeshk2011/dipper-paraphraser-xxl-no-context")  # __init__ already calls .cuda(); the class has no .to() method

#     input_text = "They have never been known to mingle with humans. Today, it is believed these unicorns live in an unspoilt environment which is surrounded by mountains. Its edge is protected by a thick wattle of wattle trees, giving it a majestic appearance. Along with their so-called miracle of multicolored coat, their golden coloured feather makes them look like mirages. Some of them are rumored to be capable of speaking a large amount of different languages. They feed on elk and goats as they were selected from those animals that possess a fierceness to them, and can \"eat\" them with their long horns."

#     print(f"Input = {input_text}\n")
#     output_l60_sample = dp.paraphrase(input_text, lex_diversity=60, order_diversity=0, do_sample=True, top_p=0.75, top_k=None, max_length=512)
#     print(f"Output (Lexical diversity = 60, Sample p = 0.75) = {output_l60_sample}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# All paraphrasing below runs on the GPU.
device = torch.device("cuda")

# The tokenizer and the model weights are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = model.to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrase candidates for ``question``.

    The text is prefixed with ``paraphrase: ``, tokenized with the module-level
    ``tokenizer`` (truncated to ``max_length``), moved to ``device`` and passed
    to ``model.generate``. ``max_length`` is used both as the tokenizer
    truncation limit and as the generation ``max_length``; every other
    parameter is forwarded to ``model.generate`` under the same name.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences,
        with special tokens skipped.
    """
    encoded_prompt = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoded_prompt.input_ids.to(device)

    # Collect the generation settings in one place before calling the model.
    generation_settings = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    candidate_ids = model.generate(prompt_ids, **generation_settings)

    return tokenizer.batch_decode(candidate_ids, skip_special_tokens=True)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
import pandas as pd
import numpy as np
import torch
import os

# Folder with the Yang-watermarked CSVs. The name deliberately avoids `dir`,
# which would shadow the Python builtin for every later cell.
watermarked_dir = "/content/gdrive/MyDrive/6.8610 Research Project/experiments/watermarked/yang"
# Paraphrased copies are written next to the other experiment files.
output_dir = "/content/gdrive/MyDrive/6.8610 Research Project/experiments"
# NOTE(review): never filled in this cell — confirm whether anything else relies on it.
all_csvs = []
# Sort the listing so files are processed in a reproducible order.
for file in sorted(os.listdir(watermarked_dir)):
    stem, extension = os.path.splitext(file)
    if extension != ".csv":
        continue

    df = pd.read_csv(os.path.join(watermarked_dir, file))
    df = df.head(500)   # Limit dataframe since the entire dataframe would be really large

    # The Yang watermarked text sits in the third column. paraphrase() returns
    # several candidates per text; only the first one is kept.
    watermarked_col = df.columns[2]
    df[watermarked_col] = df[watermarked_col].map(lambda text: paraphrase(text)[0])
    df = df.rename(columns={'easy on [yang on kirch] watermarked text': 'EYK-Watermarked_And_Paraphrased'})

    save_location = os.path.join(output_dir, stem + "newparaphrased.csv")
    df.to_csv(save_location, index=False)
    print("Done:", stem, save_location)



Done: 500_easy_on_yang_on_kirch_watermarked_texts /content/gdrive/MyDrive/6.8610 Research Project/experiments/500_easy_on_yang_on_kirch_watermarked_textsnewparaphrased.csv


In [None]:
import pandas as pd
import numpy as np
import torch

# Read the "easy" set and keep at most its first 500 rows, then display the frame.
df = pd.read_csv(
    '/content/gdrive/MyDrive/6.8610 Research Project/experiments/easy_set1.csv'
).head(500)
df

Unnamed: 0.1,Unnamed: 0,water_easy
0,0,Sexhow railway station was a railway station l...
1,1,"In Finnish folklore, all places and things, an..."
2,2,"In mathematics, specifically differential calc..."
3,3,is a Japanese shōjo manga series written and i...
4,4,"Robert Milner ""Rob"" Bradley, Jr. (born August ..."
...,...,...
495,495,"The 2016–17 National League season, known as t..."
496,496,CAINE Linux (Computer Aided INvestigative Envi...
497,497,"The meat ant (Iridomyrmex purpureus), also kno..."
498,498,"Caradog, Caradoc, Caradawg, or Cradawg, Latini..."


In [None]:
# Overwrite the second column with the first paraphrase candidate of each entry.
df[df.columns[1]] = [paraphrase(text)[0] for text in df[df.columns[1]]]

# Quick manual check of the paraphraser:
#x=paraphrase('Going to class is essential to getting good grades')




In [None]:
# Write the paraphrased frame to Drive. The path uses the "MyDrive" mount point
# that every other path in this notebook uses (the mount output also reports
# "MyDrive"), instead of "My Drive". A bare `df` before this call displayed
# nothing because it was not the last expression of the cell, so it was dropped.
df.to_csv('/content/gdrive/MyDrive/6.8610 Research Project/experiments/paraphrased/easy_set1_paraphrased.csv', index=False)

yang watermarked texts

In [9]:
import pandas as pd
import numpy as np
import torch

# Read the Yang-watermarked texts and keep at most the first 500 rows, then display the frame.
df = pd.read_csv(
    '/content/gdrive/MyDrive/6.8610 Research Project/experiments/watermarked/yang/500_easy_on_yang_on_kirch_watermarked_texts.csv'
).head(500)
df

Unnamed: 0.1,Unnamed: 0,original text,easy on [yang on kirch] watermarked text
0,0,Sexhow railway station was a railway station l...,Sexhow railway station was a railway station l...
1,1,"In Finnish folklore , all places and things , ...","In Finnish folklore , all places and things , ..."
2,2,"In mathematics , particularly differential cal...","In mathematics , particularly differential cal..."
3,3,is a Japanese shōjo manga series penned and il...,is a Japanese shōjo manga series penned and il...
4,4,"Robert Milner "" Rob "" Bradley , Jr . ( born Au...","Robert Milner "" Rob "" Bradley , Jr . ( born Au..."
...,...,...,...
495,495,"The 2016 – 17 National League season , known a...","The 2016 – 17 National League season , known a..."
496,496,CAINE Linux ( Computer Aided INvestigative Env...,CAINE Linux ( Computer Aided INvestigative Env...
497,497,"The meat ant ( Iridomyrmex purpureus ) , somet...","The meat ant ( Iridomyrmex purpureus ) , somet..."
498,498,"Caradog , Caradoc , Caradawg , or Cradawg , La...","Caradog , Caradoc , Caradawg , or Cradawg , La..."
