In [2]:
import pandas as pd
import torch
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ModuleNotFoundError: No module named 'torch'

In [2]:
# !pip install pandarallel

In [1]:
from pandarallel import pandarallel
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False, progress_bar=True)

Available CPUs: 4
INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## Define checkpoints

In [4]:
cache_dir = './cache_mod'
checkpoint_t5 = "google/flan-t5-large"
checkpoint_dolly = "databricks/dolly-v2-2-8b"


In [5]:
def generate_text(model, tokenizer, prompt, is_pipeline=False, max_new_tokens=100):
    if is_pipeline:
        return model(prompt)[0]['generated_text']
    else:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        return "\n".join(result)

## Read sample data file

In [6]:
df = pd.read_csv("./Text_Simplification/trial_smpl_medium.csv")
df.head(2)

Unnamed: 0,data_source,source_level_og,target_level_og,Unnamed: 3,source,target,source_level_og.1,target_level_og.1,data_source.1,data_type,source_level_cefr,target_level_cefr,id
0,BreakingNewsEnglish,2.0,1.0,1587,Donald Trump is interested in buying Greenland...,Donald Trump is interested in buying Greenland...,2.0,1.0,BreakingNewsEnglish,text_simplification,,,TS000001588
1,BreakingNewsEnglish,2.0,1.0,1749,Everyone knows that children don't like eating...,Everyone knows children don't like eating gree...,2.0,1.0,BreakingNewsEnglish,text_simplification,,,TS000001750


In [5]:
df1 = df.head(10)

### Run simplification for various prompts

In [None]:
%%time

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

prompts = ["Simplify","Simplify to elementary level","Simplify to CEFR A1","Simplify to intermediate level", "Simplify to CEFR B1"]

model_checkpoints = {
    'dolly': checkpoint_dolly,
    'flant5': checkpoint_t5
}

    
for model_name, checkpoint in model_checkpoints.items():
    if checkpoint == checkpoint_dolly:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        print("Tokenizer is loaded for Dolly")
        model = pipeline(model=checkpoint, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
        print("Model is loaded for Dolly")
        is_pipeline = True
        print("Running dolly...")
    else:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        print("Tokenizer is loaded for T5")
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir=cache_dir, device_map="auto")
        print("Model is loaded for T5")
        is_pipeline = False
        print("Running t5...")

    for inst in prompts:
        df[f"{model_name}-{inst}"] = df['source'].apply(lambda x: generate_text(model, tokenizer, inst + ': ' + x, is_pipeline=is_pipeline))
        
    print("Done!\n")

Tokenizer is loaded for Dolly
Model is loaded for Dolly
Running dolly...


### Run the model in parallelized workers

In [7]:
from helper.worker import worker
import torch.multiprocessing as mp
import os

In [8]:
import time
time.sleep(600)

In [8]:
from helper.worker import worker
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# mp.set_start_method('spawn')

device_ids = [0, 1]  # GPU IDs
prompts = ["Simplify","Simplify to elementary level","Simplify to CEFR A1","Simplify to intermediate level", "Simplify to CEFR B1"]

model_checkpoints = {
    'dolly': 'databricks/dolly-v2-2-8b',
    'flant5': 'google/flan-t5-large'
}

models = {}
tokenizers = {}

for model_name, checkpoint in model_checkpoints.items():
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    if model_name == 'dolly':
        model = pipeline(model=checkpoint, torch_dtype=torch.bfloat16, trust_remote_code=True)
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, cache_dir='./cache_mod')
    models[model_name] = model
    tokenizers[model_name] = tokenizer

processes = []
for i, (model_name, checkpoint) in enumerate(model_checkpoints.items()):
    device = device_ids[i % len(device_ids)]
    for inst in prompts:
        p = mp.Process(target=worker, args=(device, models[model_name], 
                                            tokenizers[model_name], 
                                            inst, 
                                            df, 
                                            model_name, 
                                            True if model_name=='dolly' else False))
        p.start()
        processes.append(p)

for p in processes:
    p.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
df.to_csv("./Text_Simplification/simplified_df_new.csv")

## Add CEFR labels and evaluate


In [1]:
import pandas as pd
from pandarallel import pandarallel
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

Available CPUs: 4
INFO: Pandarallel will run on 3 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### Load data

In [2]:
df = pd.read_csv("gs://XXX/Text_Simplification/simplified_df_new.csv")

## Make sure all the correct cols are selected below. Ow the pivot logic will fail
df = df[["source","target","source_level_og","target_level_og","data_source","id","source_level_cefr","target_level_cefr","flant5-Simplify","flant5-Simplify to elementary level","flant5-Simplify to CEFR A1","flant5-Simplify to intermediate level","flant5-Simplify to CEFR B1","dolly-Simplify","dolly-Simplify to elementary level","dolly-Simplify to CEFR A1","dolly-Simplify to intermediate level","dolly-Simplify to CEFR B1"]]
df.head(2)

Unnamed: 0,source,target,source_level_og,target_level_og,data_source,id,source_level_cefr,target_level_cefr,flant5-Simplify,flant5-Simplify to elementary level,flant5-Simplify to CEFR A1,flant5-Simplify to intermediate level,flant5-Simplify to CEFR B1,dolly-Simplify,dolly-Simplify to elementary level,dolly-Simplify to CEFR A1,dolly-Simplify to intermediate level,dolly-Simplify to CEFR B1
0,Donald Trump is interested in buying Greenland...,Donald Trump is interested in buying Greenland...,2.0,1.0,BreakingNewsEnglish,TS000001588,,,Donald Trump has said he would be interested i...,Donald Trump is interested in buying Greenland.,Donald Trump is interested in buying Greenland.,Donald Trump is interested in buying Greenland.,Donald Trump is interested in buying Greenland.,Buy Greenland: Mr Trump said he was interested...,This is simplified to elementary level as all ...,"Denmark does not own Greenland, although it ha...",Denmark owns Greenland. The future President o...,CEFR B1: Donald Trump is interested in buying ...
1,Everyone knows that children don't like eating...,Everyone knows children don't like eating gree...,2.0,1.0,BreakingNewsEnglish,TS000001750,,,Children dislike greens.,Children don't like eating greens. Parents can...,CEFR A1: Everyone knows that children don't li...,The study looked at the eating habits of young...,CEFR B1:,No wonder children don't like vegetables. Gene...,"In short, children will not eat greens because...",CEFR A1: Everyone knows that children do not l...,Children do not like eating greens because the...,Children's' dislike of greens might be gene re...


In [3]:
df_long = df.melt(id_vars=["source","target","source_level_og","target_level_og","data_source","id","source_level_cefr","target_level_cefr"], var_name="model_prompt", value_name="generated_text")
df_long[['model', 'prompt']] = df_long['model_prompt'].str.split('-', expand=True)
df_long = df_long[["source","target","source_level_og","target_level_og","data_source","id","source_level_cefr","target_level_cefr", "model","prompt","generated_text"]]

df_long.head(2)

Unnamed: 0,source,target,source_level_og,target_level_og,data_source,id,source_level_cefr,target_level_cefr,model,prompt,generated_text
0,Donald Trump is interested in buying Greenland...,Donald Trump is interested in buying Greenland...,2.0,1.0,BreakingNewsEnglish,TS000001588,,,flant5,Simplify,Donald Trump has said he would be interested i...
1,Everyone knows that children don't like eating...,Everyone knows children don't like eating gree...,2.0,1.0,BreakingNewsEnglish,TS000001750,,,flant5,Simplify,Children dislike greens.


### Preprocess data

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


def preprocess_text(text):
    """
    Note: low-income -> low income
    """
    # Convert text to lowercase
    text = text.lower()
    
    # Replace special characters with whitespace
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Remove leading and trailing whitespace
    text = text.strip()
    
    # Tokenize text into individual words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords from the text
    # stop_words = set(stopwords.words('english'))
    # words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words using WordNetLemmatizer
    # lemmatizer = WordNetLemmatizer()
    # words = [lemmatizer.lemmatize(word) for word in words]
    
    # Perform snowball stemming on the words
    # stemmer = SnowballStemmer("english")
    # words = [stemmer.stem(word) for word in words]
    
    # Join the preprocessed words back into a single string
    preprocessed_text = ' '.join(words)
    
    # Remove leading and trailing whitespace
    preprocessed_text = preprocessed_text.strip()
    
    
    return preprocessed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df_long["generated_text"] = df_long["generated_text"].parallel_apply(lambda t: preprocess_text(t))

### Downloading the Model

In [6]:
BUCKET_NAME = "XXX"

MODEL_BLOB_NAME = "CEFR/models/cefr_ktrain_bert/tf_model.h5"
MODEL_FILE_NAME = "./model/tf_model.h5"

PREPROC_BLOB_NAME = "CEFR/models/cefr_ktrain_bert/tf_model.preproc"
PREPROC_FILE_NAME = "./model/tf_model.preproc"

model_load_path = "gs://XXX/CEFR/models/cefr_ktrain_bert/"

In [7]:
from google.cloud import storage

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from COS bucket."""
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

In [8]:
%ls -al

total 230388
drwxr-xr-x 5 jupyter jupyter      4096 May 25 05:38 [0m[01;34m.[0m/
drwxr-xr-x 6 jupyter jupyter      4096 May 17 20:14 [01;34m..[0m/
drwxr-xr-x 2 jupyter jupyter      4096 May 24 17:36 [01;34m.ipynb_checkpoints[0m/
drwxr-xr-x 2 jupyter jupyter      4096 May 17 20:53 [01;34m__pycache__[0m/
-rw-r--r-- 1 jupyter jupyter     12225 May 17 20:48 data_prep.py
-rw-r--r-- 1 jupyter jupyter  16126579 May 18 14:56 huggingface_cefr_labeled.csv
drwxr-xr-x 2 jupyter jupyter      4096 May 17 21:07 [01;34mmodel[0m/
-rw-r--r-- 1 jupyter jupyter    183859 May 24 17:52 predict_cefr_label.ipynb
-rw-r--r-- 1 jupyter jupyter     85481 May 25 05:38 simp_multmod.ipynb
-rw-r--r-- 1 jupyter jupyter   5942659 May 25 05:31 simplified_df_cefr_labeled.csv
-rw-r--r-- 1 jupyter jupyter 213539545 May 19 05:50 wiki_cefr_labeled.csv


In [9]:
download_blob(bucket_name = BUCKET_NAME, 
              source_blob_name = MODEL_BLOB_NAME, 
              destination_file_name = MODEL_FILE_NAME)

In [10]:
download_blob(bucket_name = BUCKET_NAME, 
              source_blob_name = PREPROC_BLOB_NAME, 
              destination_file_name = PREPROC_FILE_NAME)

### CEFR Inference

In [11]:
import ktrain

KeyboardInterrupt: 

In [None]:
predictor = ktrain.load_predictor("model")

In [None]:
labels = predictor.predict(list(df_long["generated_text"]))
df_long["cefr_labels"] = [label.split("_")[1] for label in labels]

In [None]:
## NOTE: Run this cell only if source-target cefr labels are not generated in raw df 

no_raw_cefr = True

if(no_raw_cefr):
    
    ## Source
    source_labels = predictor.predict(list(df_long["source"]))
    df_long["source_level_cefr"] = [label.split("_")[1] for label in source_labels]
    
    ## Target
    target_labels = predictor.predict(list(df_long["target"]))
    df_long["target_level_cefr"] = [label.split("_")[1] for label in target_labels]
    
df_long.head()

In [None]:
if(1):
    
    ## Save to .csv
    df_long.to_csv("simplified_df_cefr_labeled.csv", index=False)
    
    ## Save to GC bucket
    !gsutil cp -r simplified_df_cefr_labeled.csv gs://XXX/Text_Simplification/simplified_df_cefr_labeled.csv

Copying file://simplified_df_cefr_labeled.csv [Content-Type=text/csv]...
/ [1 files][  5.7 MiB/  5.7 MiB]                                                
Operation completed over 1 objects/5.7 MiB.                                      


In [None]:
df_long.columns

Index(['source', 'target', 'source_level_og', 'target_level_og', 'data_source',
       'id', 'source_level_cefr', 'target_level_cefr', 'model', 'prompt',
       'generated_text', 'cefr_labels'],
      dtype='object')

### Evaluate

### Scoring rules

* "Simplify": If source_level_og is 3 or 2, and the cefr_labels level is less than source_level_og, consider it a correct simplification.
* "Simplify to elementary level", "Simplify to CEFR A1": If source_level_og is 3 or 2, and the cefr_labels level is 1, consider it a correct simplification.
* "Simplify to intermediate level", "Simplify to CEFR B1": If source_level_og is 3, and the cefr_labels level is 2, consider it a correct simplification.
* If source_level_og is 3 or 2, and cefr_labels is equal to source_level_og, consider it an incorrect simplification, but keep a count of such instances.

In [12]:
df_long = pd.read_csv("simplified_df_cefr_labeled.csv")

In [15]:
df_long = df_long.dropna(subset=['source_level_og'])

df_long['cefr_labels'] = df_long['cefr_labels'].astype(int)
df_long['source_level_og'] = df_long['source_level_og'].astype(int)
# df_long['source_level_cefr'] = df_long['source_level_cefr'].astype(int)

def check_correct_simplification(row):
    source_level = row['source_level_og']
    cefr_level = row['cefr_labels']
    prompt = row['prompt']

    # Level mapping for different prompts
    level_mapping = {
        "Simplify": source_level - 1, 
        "Simplify to elementary level": 1, 
        "Simplify to CEFR A1": 1, 
        "Simplify to intermediate level": 2, 
        "Simplify to CEFR B1": 2,
    }

    if source_level in [3, 2]:
        if prompt == "Simplify":
            return (source_level == 3 and cefr_level == 2) or cefr_level == 1
        elif prompt in level_mapping:
            # Check if correct simplification
            return cefr_level == level_mapping[prompt]
        elif cefr_level == source_level:
            # For the case when level stays the same
            return False
    return None

# Add 'correct_simplification' column
df_long['correct_simplification'] = df_long.apply(check_correct_simplification, axis=1)


In [16]:
# Compute scores
scores = df_long.groupby(['model', 'prompt', 'source_level_og'])['correct_simplification'].mean().unstack().reset_index()

# Rename columns for clarity
scores.columns = ['Model', 'Prompt', 'Score for source level 2', 'Score for source level 3']

# Count of instances where CEFR level stays the same as source level
same_level_count = df_long[df_long['cefr_labels'] == df_long['source_level_og']].groupby(['model', 'prompt']).size()

# Print results
print(scores)
print(f"\nCount of instances where CEFR level is the same as source level out of {df.shape[0]}:")
print(same_level_count)

    Model                          Prompt  Score for source level 2  \
0   dolly                        Simplify                    0.6500   
1   dolly             Simplify to CEFR A1                    0.5750   
2   dolly             Simplify to CEFR B1                    0.1750   
3   dolly    Simplify to elementary level                    0.6500   
4   dolly  Simplify to intermediate level                    0.1625   
5  flant5                        Simplify                    0.6375   
6  flant5             Simplify to CEFR A1                    0.6125   
7  flant5             Simplify to CEFR B1                    0.0500   
8  flant5    Simplify to elementary level                    0.5750   
9  flant5  Simplify to intermediate level                    0.1625   

   Score for source level 3  
0                   0.92500  
1                   0.48750  
2                   0.37500  
3                   0.63750  
4                   0.30000  
5                   0.87500  
6       

#### Using source_level_cefr

In [82]:
def check_correct_simplification(row):
    source_level = row['source_level_cefr']
    cefr_level = row['cefr_labels']
    prompt = row['prompt']

    # Level mapping for different prompts
    level_mapping = {
        "Simplify": source_level - 1, 
        "Simplify to elementary level": 1, 
        "Simplify to CEFR A1": 1, 
        "Simplify to intermediate level": 2, 
        "Simplify to CEFR B1": 2,
    }

    if source_level in [3, 2]:
        if prompt in level_mapping:
            # Check if correct simplification
            return cefr_level == level_mapping[prompt]
        elif cefr_level == source_level:
            # For the case when level stays the same
            return False
    return None

# Add 'correct_simplification' column
df_long['correct_simplification'] = df_long.apply(check_correct_simplification, axis=1)


In [83]:
# Compute scores
scores = df_long.groupby(['model', 'prompt', 'source_level_cefr'])['correct_simplification'].mean().unstack().reset_index()

# Rename columns for clarity
# scores.columns = ['Model', 'Prompt', 'Score for source level 2', 'Score for source level 3']

# Count of instances where CEFR level stays the same as source level
same_level_count = df_long[df_long['cefr_labels'] == df_long['source_level_cefr']].groupby(['model', 'prompt']).size()

# Print results
print(scores)
print("\nCount of instances where CEFR level is the same as source level:")
print(same_level_count)

source_level_cefr   model                        prompt   1         2
0                   dolly                      Simplify NaN  0.769231
1                   dolly           Simplify to CEFR A1 NaN  0.743590
2                   dolly  Simplify to elementary level NaN  0.794872
3                  flant5                      Simplify NaN  0.512821
4                  flant5           Simplify to CEFR A1 NaN  0.461538
5                  flant5           Simplify to CEFR B1 NaN  0.461538
6                  flant5  Simplify to elementary level NaN  0.589744

Count of instances where CEFR level is the same as source level:
model   prompt                      
dolly   Simplify                        27
        Simplify to CEFR A1             22
        Simplify to elementary level    24
flant5  Simplify                        30
        Simplify to CEFR A1             27
        Simplify to CEFR B1             33
        Simplify to elementary level    25
dtype: int64


In [73]:
df_long['source_level_cefr'].value_counts()

2    273
1    147
Name: source_level_cefr, dtype: int64