# RoBERTa Embeddings 

This note book was run in Google Colab because of the coputational requirements.

https://colab.research.google.com/drive/1Al8zLJqciXpRxKIj3eg3UowpgE7IpdzN#scrollTo=pwUnqb5xgrKE

In [None]:
from transformers import RobertaModel, RobertaTokenizer

def get_word_vector(sentence, word):
    # Load pretrained RoBERTa model and tokenizer
    model = RobertaModel.from_pretrained('roberta-base')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Tokenize the sentence and convert tokens to their corresponding IDs
    input_ids = tokenizer.encode(sentence, add_special_tokens=True)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Prepare the word token, append leading whitespace if not the first word in the sentence
    word_token = word if sentence.startswith(word) else 'Ġ' + word

    # Wrap input IDs tensor in another tensor for batch dimension
    input_ids = torch.tensor([input_ids])

    # Get RoBERTa output, hidden states has shape [1, seq_len, emb_dim]
    with torch.no_grad():
        hidden_states = model(input_ids)[0]

    # Initialize variables to store word vectors and count
    word_vectors = []
    token_count = 0

    # Iterate over tokens, add hidden state to word_vectors if token matches word_token
    for i, token in enumerate(tokens):
        if token == word_token:
            word_vectors.append(hidden_states[0, i].tolist())
            token_count += 1

    # Return average of word_vectors if word is found, else return None
    if token_count > 0:
        average_word_vector = [sum(col)/token_count for col in zip(*word_vectors)]
        print(torch.tensor(average_word_vector))  # Print the word vector
        return torch.tensor(average_word_vector)
    else:
        print(word)  # Print the word if not found in the sentence
        return None



In [None]:
# Code uses to get predictions based on contextual infomation
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics.pairwise import cosine_similarity

# Define the device for training (use GPU if available, otherwise use CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




# Define a custom dataset class
class ComplexWordDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        phrase = self.data.loc[index, 'phrase']
        sentence = self.data.loc[index, 'sentence']
        label = self.data.loc[index, 'complex_binary']
        
        encoding = self.tokenizer.encode_plus(
            phrase,
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Function to train the model
def train_model(train_data, tokenizer, max_length, num_epochs=5, batch_size=32, learning_rate=2e-5):
    # Prepare the dataset and dataloader
    train_dataset = ComplexWordDataset(train_data, tokenizer, max_length)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Load the pre-trained RoBERTa model
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    model.to(device)
    
    # Set up the optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * num_epochs
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)
    
    # Training loop
    model.train()
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        total_loss = 0.0
        
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            scheduler.step()
        
        average_loss = total_loss / len(train_dataloader)
        print(f'Average Loss: {average_loss:.4f}')
    
    return model

# Function to predict word complexity using the trained model
def predict_complexity(test_data, model, tokenizer, max_length, batch_size=32):
    test_dataset = ComplexWordDataset(test_data, tokenizer, max_length)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
    
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)
            
            predictions.extend(predicted_labels.cpu().numpy())
    
    return predictions

    
    


In [None]:
# Set the file paths for training and test sets
import pandas as pd

train_data = pd.read_csv('/content/drive/MyDrive/CWI_data/WikiNews_Train_pp.csv')
test_data = pd.read_csv('/content/drive/MyDrive/CWI_data/WikiNews_Dev_pp.csv')

# Set the maximum sentence length and batch size
max_length = 128
batch_size = 32

# Create a tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

model = train_model(train_data, tokenizer, max_length)



# Tune the model and predict word complexity on the test data
predictions_df = tune_and_predict(train_file, test_file, tokenizer, max_length, num_epochs=5, batch_size=batch_size)

# Print or analyze the dataframe with the predicted complexity values
print(predictions_df)


In [None]:
# Save the trained model
model.save_pretrained('/content/drive/MyDrive/model_directory')

In [None]:
# Load the saved model
loaded_model = RobertaForSequenceClassification.from_pretrained('roberta-base')
loaded_model.load_state_dict(torch.load('/content/drive/MyDrive/model_directory/'))

In [None]:


# call the predict_complexity function to make predictions using the loaded model on the new data.



# Load the new data for prediction
new_data = pd.read_csv('new_data.csv')

# Prepare the new data for prediction (similar to the training and test data)
new_dataset = ComplexWordDataset(new_data, tokenizer, max_length)
new_dataloader = DataLoader(new_dataset, batch_size=batch_size)

# Put the loaded model in evaluation mode
loaded_model.eval()

# Make predictions on the new data
predictions = predict_complexity(new_data, loaded_model, tokenizer, max_length)

In [None]:
# Load the saved model
loaded_model = RobertaForSequenceClassification.from_pretrained('roberta-base')
loaded_model.load_state_dict(torch.load('model.pt'))

In [None]:
# Set the file paths for training and test sets
import pandas as pd

train_data = pd.read_csv('/content/drive/MyDrive/CWI_data/WikiNews_Train_pp.csv')
test_data = pd.read_csv('/content/drive/MyDrive/CWI_data/WikiNews_Dev_pp.csv')

# Set the maximum sentence length and batch size
max_length = 128
batch_size = 32

# Create a tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

model = train_model(train_data, tokenizer, max_length)



# Tune the model and predict word complexity on the test data
predictions_df = tune_and_predict(train_file, test_file, tokenizer, max_length, num_epochs=5, batch_size=batch_size)

# Print or analyze the dataframe with the predicted complexity values
print(predictions_df)


In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


In [3]:
predictions_data = pd.read_csv("predictions.csv")
wikinews_data = pd.read_csv("WikiNews_Dev_pp.csv")


In [4]:
test_targets = wikinews_data['complex_binary'].values


In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
training_data = test_targets
train_targets = training_data['complex_binary'].values
        
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(training_data)
        
model = AdaBoostClassifier(n_estimators=5000, random_state=67)
pipeline = Pipeline([
    ('features',feats),
    ('classifier', model),
    ])

pipeline.fit(training_data, train_targets)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [13]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import pandas as pd

# Load the CSV files
predictions_data = pd.read_csv("predictions.csv")
wikinews_data = pd.read_csv("WikiNews_Dev_pp.csv")

# Extract the predicted labels and ground truth labels
predicted_labels = predictions_data['Predicted_Complexity'].values
ground_truth_labels = wikinews_data['complex_binary'].values

# Calculate evaluation metrics
accuracy = accuracy_score(ground_truth_labels, predicted_labels)
precision = precision_score(ground_truth_labels, predicted_labels)
recall = recall_score(ground_truth_labels, predicted_labels)
f1 = f1_score(ground_truth_labels, predicted_labels)

# Define and store the evaluation metrics in the model_stats dataframe
model_stats = pd.DataFrame(columns=['Data', 'Classifier', 'Precision', 'Recall', 'F-Score'])
model_stats.loc[len(model_stats)] = [0, "Custom Model", precision, recall, f1]


In [14]:
# Print out the evaluation scores
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.8377445339470656
Precision: 0.784366576819407
Recall: 0.8267045454545454
F1-Score: 0.8049792531120332


In [None]:
import torch
from transformers import RobertaForSequenceClassification

# Set the device for training (use GPU if available, otherwise use CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model first (replace with your actual model)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Now load the state_dict
model.load_state_dict(torch.load('complexity_model.pth', map_location=device))

# Move the model to the appropriate device
model = model.to(device)



In [17]:
import torch

model= ('Corpus/complexity_model.pth')
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to GPU if available
model = model.to(device)
input_data = input_data.to(device)


AttributeError: 'str' object has no attribute 'to'

In [None]:
# Load the model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.load_state_dict(torch.load(model_path))
model.to(device)  # Don't forget to move the model to GPU if available

In [5]:
import os
import pickle

input_folder = "All_features_pp"
output_file_dev = "Combined_Dev_pp.pkl"
output_file_train = "Combined_Train_pp.pkl"

data_dev = []
data_train = []

for filename in os.listdir(input_folder):
    if filename.endswith("Dev_pp.pkl"):
        file_path = os.path.join(input_folder, filename)
        with open(file_path, "rb") as file:
            data = pickle.load(file)
            data_dev.extend(data)
    elif filename.endswith("Train_pp.pkl"):
        file_path = os.path.join(input_folder, filename)
        with open(file_path, "rb") as file:
            data = pickle.load(file)
            data_train.extend(data)

with open(output_file_dev, "wb") as file:
    pickle.dump(data_dev, file)

with open(output_file_train, "wb") as file:
    pickle.dump(data_train, file)


In [None]:
from transformers import RobertaModel, RobertaTokenizer
#import torch

In [None]:
import pandas as pd
import numpy as np
from transformers import RobertaTokenizerFast, RobertaModel
import torch


def get_word_embedding(row):
    sentence = row['clean sentence']
    target_word = row['phrase']

    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    # Tokenize the sentence and the target word
    inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
    target_word_tokens = tokenizer.tokenize(' ' + target_word)

    # Get the position of the target word in the sentence
    start_index = sentence.lower().find(target_word.lower())
    if start_index == -1:
        print(f"Target word '{target_word}' not found in the sentence.")
        return None

    end_index = start_index + len(target_word)
    token_positions = []

    for i, (start, end) in enumerate(inputs["offset_mapping"][0]):
        if start >= start_index and end <= end_index:
            token_positions.append(i)

    # Get the model's output
    with torch.no_grad():
        outputs = model(**{k: v for k, v in inputs.items() if k in ['input_ids', 'attention_mask']})

    # Get the embeddings of the target word
    embeddings = outputs.last_hidden_state[0, token_positions, :]

    # Average the embeddings if the target word has been split into several tokens
    word_embedding = embeddings.mean(dim=0).numpy()

    return word_embedding

# load your dataframe from a .pkl file
df = pd.read_pickle('/content/drive/MyDrive/CWI_data/features_NEW/final_camb_feats/News_Dev_actual')


# apply the function to each row and store the result in the 'Embedding' column
df['Embedding'] = df.apply(get_word_embedding, axis=1)

# Find words that have no embeddings
df_no_embeddings = df[df['Embedding'].isnull()]

print(df_no_embeddings['phrase'])




# write the dataframe to a new pkl file
df.to_pickle('/content/drive/MyDrive/CWI_data/features_NEW/final_camb_feats/News_Dev_actual_RB')


In [5]:
import pandas as pd
df = pd.read_pickle('News_Dev_actual_RB')

In [6]:
df


Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,sub_imdb,google frequency,KFCAT,FAM,KFSMP,KFFRQ,AOA,NPHN,T-LFRQ,Embedding
0,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",0,6,syrian,10,10,0,...,0,3.513071,1,0,1,1,0,6,0,"[-0.048358764, 0.04159808, -0.11586902, -0.025..."
1,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",14,21,shelled,10,10,6,...,1,19.559029,7,524,10,22,0,0,218,"[0.21271735, 0.07035407, -0.047027998, -0.2875..."
2,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",24,34,rebel-held,10,10,4,...,0,0.000000,0,0,0,0,0,0,0,"[0.09429837, 0.1951797, -0.17190206, -0.031182..."
3,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",51,59,sparking,10,10,5,...,1,4.740991,6,505,9,12,0,4,75,"[0.29501337, 0.35755643, -0.1997622, -0.034754..."
4,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",35,39,town,10,10,0,...,1,90.138295,14,589,103,212,0,0,1607,"[-0.027314782, 0.31355807, -0.08525312, -0.164..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",149,156,guindos,10,10,0,...,0,0.004614,0,0,0,0,0,0,0,"[0.13226731, 0.075516224, 0.1255717, -0.027254..."
1497,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",88,95,rescues,10,10,2,...,1,0.490457,9,532,11,15,367,6,105,"[-0.0639202, 0.048246473, 0.14042006, -0.05273..."
1498,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",111,118,enacted,10,10,6,...,0,6.731140,5,0,6,7,0,0,33,"[-0.01024009, 0.020139743, 0.09040777, 0.00749..."
1499,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",157,166,estimated,10,10,2,...,0,27.054409,9,505,27,39,503,7,160,"[-0.026576916, 0.060671188, 0.09911453, -0.017..."


In [None]:
#This works on Google Colab.

from transformers import RobertaTokenizerFast, RobertaModel

def get_word_embedding(sentence, target_word):
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    # Tokenize the sentence and the target word
    inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
    target_word_tokens = tokenizer.tokenize(' ' + target_word)

    # Get the position of the target word in the sentence
    start_index = sentence.lower().find(target_word.lower())
    if start_index == -1:
        print(f"Target word '{target_word}' not found in the sentence.")
        return None

    end_index = start_index + len(target_word)
    token_positions = []

    for i, (start, end) in enumerate(inputs["offset_mapping"][0]):
        if start >= start_index and end <= end_index:
            token_positions.append(i)

    # Get the model's output
    with torch.no_grad():
        outputs = model(**{k: v for k, v in inputs.items() if k in ['input_ids', 'attention_mask']})

    # Get the embeddings of the target word
    embeddings = outputs.last_hidden_state[0, token_positions, :]

    # Average the embeddings if the target word has been split into several tokens
    word_embedding = embeddings.mean(dim=0)
    
    return word_embedding


In [None]:
#Adapted from above working code to re .pkl file 

import pandas as pd
import numpy as np
import pickle
from transformers import RobertaTokenizerFast, RobertaModel

def get_word_embedding(row):
    sentence = row['clean sentence']
    target_word = row['original phrase']

    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    # Tokenize the sentence and the target word
    inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
    target_word_tokens = tokenizer.tokenize(' ' + target_word)

    # Get the position of the target word in the sentence
    start_index = sentence.lower().find(target_word.lower())
    if start_index == -1:
        print(f"Target word '{target_word}' not found in the sentence.")
        return None

    end_index = start_index + len(target_word)
    token_positions = []

    for i, (start, end) in enumerate(inputs["offset_mapping"][0]):
        if start >= start_index and end <= end_index:
            token_positions.append(i)

    # Get the model's output
    with torch.no_grad():
        outputs = model(**{k: v for k, v in inputs.items() if k in ['input_ids', 'attention_mask']})

    # Get the embeddings of the target word
    embeddings = outputs.last_hidden_state[0, token_positions, :]

    # Average the embeddings if the target word has been split into several tokens
    word_embedding = embeddings.mean(dim=0).numpy()

    return word_embedding

# load your dataframe from a .pkl file
df = pd.read_pickle('/content/drive/MyDrive/CWI_data/features_NEW/final_camb_feats/News_Dev_actual')

# apply the function to each row and store the result in the 'embed' column
df['embed'] = df.apply(get_word_embedding, axis=1)

# write the dataframe to a new pkl file
df.to_pickle('/content/drive/MyDrive/CWI_data/features_NEW/final_camb_feats/News_Dev_actual_RB')


In [95]:
import pandas as pd

# Replace 'your_file.pkl' with the path to your .pkl file
file_path = 'RoBERTa/Original_RoB/Wikipedia_Dev_actual_RoB'

# Read the pickled DataFrame from the .pkl file
df = pd.read_pickle(file_path)


In [96]:
df

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,sub_imdb,google frequency,KFCAT,FAM,KFSMP,KFFRQ,AOA,NPHN,T-LFRQ,Embedding
0,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",12,25,epidexipteryx,10,10,6,...,0,0.000000,0,0,0,0,0,0,0,"[-0.0022872959, 0.11304215, 0.0045243786, -0.2..."
1,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",31,35,bore,10,10,0,...,0,12.019915,9,543,21,24,0,0,309,"[0.1064047, 0.16773231, -0.00042687453, 0.2381..."
2,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",36,43,unusual,10,10,0,...,1,25.006854,15,0,52,63,0,8,273,"[0.027525533, -0.045535043, -0.040021315, 0.42..."
3,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",44,53,vertebrae,10,10,7,...,0,2.126906,0,0,0,0,0,7,7,"[-0.025541285, -0.042219978, -0.030374907, 0.0..."
4,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",54,61,towards,10,10,0,...,1,0.000000,14,0,39,64,0,0,44,"[0.03312819, -0.039484713, -0.028750507, 0.205..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",100,109,knowledge,10,10,0,...,1,153.343727,13,575,103,145,477,0,465,"[0.13245635, -0.13719822, 0.014618974, -0.0564..."
601,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",74,80,person,10,10,0,...,1,162.817686,15,620,119,175,0,0,978,"[0.06428637, 0.038197752, 0.18205002, -0.33014..."
602,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",95,99,true,10,10,0,...,0,162.715074,15,605,155,231,0,0,1711,"[-0.019195676, -0.16097079, 0.23393556, 0.1112..."
603,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",113,122,purifying,10,10,7,...,0,1.378423,2,0,2,2,0,7,7,"[0.013223473, -0.033989564, 0.096128985, -0.01..."


In [48]:
# Funtion to Flatten RoBERta Embedding 

def flatten_embeddings(df, column_name="Embedding"):
    # Assuming `df` is your dataframe and "Embedding" is the column with embeddings
    embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array
    flattened = pd.DataFrame(embeddings)  # Convert 2D numpy array into dataframe

    # Rename the columns
    flattened.columns = [f"{column_name}_{i}" for i in range(flattened.shape[1])]

    # Drop the original embedding column and concat the flattened dataframe
    df = pd.concat([df.drop(columns=[column_name]), flattened], axis=1)
    
    return df

df = flatten_embeddings(df)


  embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array


In [97]:
# This now handles Missing Embedding and Embeddings that are of different length.

import numpy as np
import pandas as pd

def flatten_embeddings(df, column_name="Embedding"):
    embeddings = df[column_name].to_list()

    # Filter out None values
    embeddings = [embedding for embedding in embeddings if embedding is not None]

    # Check if any non-zero value is present in the embeddings
    if any(np.any(np.array(embedding) != 0) for embedding in embeddings):
        max_embedding_length = max(len(embedding) for embedding in embeddings)
        embeddings_padded = [
            np.pad(embedding, (0, max_embedding_length - len(embedding)), mode='constant')
            if len(embedding) < max_embedding_length else embedding
            for embedding in embeddings
        ]
        embeddings = np.array(embeddings_padded)
    else:
        # If all embeddings are either None or all zeros, return a DataFrame of zeros
        num_rows = len(df)
        num_columns = len(embeddings[0]) if embeddings else 0
        embeddings = np.zeros((num_rows, num_columns))
    
    flattened = pd.DataFrame(embeddings)

    flattened.columns = [f"{column_name}_{i}" for i in range(flattened.shape[1])]

    df = pd.concat([df.drop(columns=[column_name]), flattened], axis=1)
    
    return df

df = flatten_embeddings(df)


In [98]:
df

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,Embedding_758,Embedding_759,Embedding_760,Embedding_761,Embedding_762,Embedding_763,Embedding_764,Embedding_765,Embedding_766,Embedding_767
0,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",12,25,epidexipteryx,10,10,6,...,0.051877,0.136058,-0.089933,0.035493,-0.022071,0.066447,0.153555,0.012447,-0.042098,0.043038
1,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",31,35,bore,10,10,0,...,-0.157775,0.069935,-0.168181,-0.111752,-0.164044,0.221702,-0.257107,-0.337937,0.001747,0.157806
2,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",36,43,unusual,10,10,0,...,0.045495,0.044459,-0.265047,-0.253186,-0.183772,0.047291,-0.256799,0.161694,0.224941,-0.002305
3,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",44,53,vertebrae,10,10,7,...,-0.065760,0.086544,0.305740,-0.159293,-0.214867,0.160937,0.101733,-0.127275,-0.012944,-0.013192
4,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",54,61,towards,10,10,0,...,0.200160,-0.005600,0.167409,-0.365396,-0.074997,-0.064576,-0.397419,-0.083067,-0.010616,-0.043380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",100,109,knowledge,10,10,0,...,-0.130004,-0.184000,-0.176769,-0.122766,-0.045485,-0.097645,0.019090,-0.230233,0.100706,0.218293
601,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",74,80,person,10,10,0,...,0.297072,-0.004166,-0.337320,-0.025556,-0.144697,-0.060201,-0.065093,-0.236418,0.055958,0.181881
602,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",95,99,true,10,10,0,...,-0.366248,-0.085556,-0.015672,0.024762,0.183477,-0.179918,-0.064177,-0.171885,-0.021939,0.231173
603,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",113,122,purifying,10,10,7,...,0.136446,-0.092753,-0.035713,0.096410,0.167309,0.181306,-0.008599,-0.491306,0.167582,0.353482


In [99]:
df
df.to_pickle("Wikipedia_Dev_FLAT.pkl")

In [43]:
# This should now handle any empty rows

def flatten_embeddings(df, column_name="Embedding"):
    # Convert string representations of lists to actual lists of numbers
    df[column_name] = df[column_name].apply(ast.literal_eval)
    
    embeddings = df[column_name].to_list()  # Convert column of arrays into a list

    # Initialize an empty list to store flattened embeddings
    flattened_embeddings = []

    # Get the length of the first non-empty embedding
    nonzero_length = len(embeddings[0]) if embeddings and len(embeddings[0]) > 0 else 0

    # Iterate through the list of embeddings
    for embedding in embeddings:
        if len(embedding) == 0:
            # If the embedding is empty, append a row of zeros
            flattened_embeddings.append([0] * nonzero_length)
        else:
            flattened_embeddings.append(embedding)

    # Convert the list of flattened embeddings to a DataFrame
    flattened = pd.DataFrame(flattened_embeddings)

    # Rename the columns
    flattened.columns = [f"{column_name}_{i}" for i in range(flattened.shape[1])]

    # Drop the original embedding column and concat the flattened dataframe
    df = pd.concat([df.drop(columns=[column_name]), flattened], axis=1)

    return df

In [None]:
def process_all_files_in_directory(directory_path="RoBERTa/Original_RoB"):
    # List all files in the specified directory that end with "actual_Rob"
    files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f)) and f.endswith('actual_RoB')]
    
    for file in files:
        try:
            # Load each file into a dataframe
            file_path = os.path.join(directory_path, file)
            df = pd.read_pickle(file_path)
            
            # Flatten the embeddings in the dataframe
            df_transformed = flatten_embeddings(df)
            
            # Save the transformed dataframe back to a new file with "_Flattened" added before the ending
            new_file_name = file.replace('actual_RoB', 'RoB_Flattened')
            new_file_path = os.path.join(directory_path, new_file_name)
            df_transformed.to_pickle(new_file_path)
        except Exception as e:
            print(f"Failed to process {file}. Error: {e}")

# Call the function on a specific directory
process_all_files_in_directory()

In [42]:
print(df["Embedding"].isnull().sum())

0


In [13]:
import os
import pandas as pd
import numpy as np

def split_embeddings(df, column_name="Embedding"):
    # Assuming `df` is your dataframe and "Embedding" is the column with embeddings
    embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array

    # Split the embeddings into separate columns
    num_columns = embeddings.shape[1]
    columns = [f"{column_name}_{i}" for i in range(num_columns)]
    flattened = pd.DataFrame(embeddings, columns=columns)

    # Drop the original embedding column and concat the flattened dataframe
    df = pd.concat([df.drop(columns=[column_name]), flattened], axis=1)

    return df

def process_all_files_in_directory(directory_path="RoBERTa/Original_RoB"):
    # List all files in the specified directory that end with "actual_Rob"
    files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f)) and f.endswith('actual_RoB')]
    
    for file in files:
        try:
            # Load each file into a dataframe
            file_path = os.path.join(directory_path, file)
            df = pd.read_pickle(file_path)
            
            # Split the embeddings in the dataframe
            df_transformed = split_embeddings(df)
            
            # Save the transformed dataframe back to a new file with "_Split" added before the ending
            new_file_name = file.replace('actual_RoB', 'RoB_Split')
            new_file_path = os.path.join(directory_path, new_file_name)
            df_transformed.to_pickle(new_file_path)
        except Exception as e:
            print(f"Failed to process {file}. Error: {e}")

# Call the function
process_all_files_in_directory()



Failed to process News_Train_actual_RoB. Error: tuple index out of range
Failed to process News_Dev_actual_RoB. Error: tuple index out of range
Failed to process WikiNews_Dev_actual_RoB. Error: tuple index out of range
Failed to process WikiNews_Train_actual_RoB. Error: tuple index out of range


  embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array
  embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array
  embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array
  embeddings = np.array(df[column_name].to_list())  # Convert column of arrays into a 2D numpy array


In [18]:
df = pd.read_pickle('Wikipedia_Train_FLAT')
df

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,Embedding_758,Embedding_759,Embedding_760,Embedding_761,Embedding_762,Embedding_763,Embedding_764,Embedding_765,Embedding_766,Embedding_767
0,"Normally , the land will be passed down to fut...",3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...","{\n ""sentences"": [\n {\n ""index"": 0,\...",28,34,passed,10,10,0,...,-0.021684,-0.189329,-0.111865,0.115084,0.049065,0.378842,-0.451135,0.141652,0.066050,0.010311
1,"Normally , the land will be passed down to fut...",3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...","{\n ""sentences"": [\n {\n ""index"": 0,\...",15,19,land,10,10,0,...,0.068303,-0.123243,-0.118669,0.014359,0.053657,0.055099,0.112175,-0.111757,-0.031310,0.128363
2,"Normally , the land will be passed down to fut...",3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...","{\n ""sentences"": [\n {\n ""index"": 0,\...",43,49,future,10,10,1,...,0.013729,0.039722,-0.191531,0.211703,-0.009092,0.295454,-0.086659,-0.092444,-0.017178,-0.018810
3,"Normally , the land will be passed down to fut...",3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...","{\n ""sentences"": [\n {\n ""index"": 0,\...",50,61,generations,10,10,3,...,-0.070143,-0.087471,0.118887,0.000095,-0.136134,0.094078,0.374389,0.505341,-0.217758,0.020302
4,"Normally , the land will be passed down to fut...",3XU9MCX6VODXPI3L8I02CM94TFB2R7,"Normally , the land will be passed down to fut...","{\n ""sentences"": [\n {\n ""index"": 0,\...",76,86,recognizes,10,10,2,...,0.025573,-0.265114,-0.267120,0.173528,-0.169962,0.050043,0.183141,0.085481,0.232601,0.057799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4827,An actor ( sometimes actress for female ; see ...,3ZUE82NE0A2B8701X4995O8OCRS8FL,An actor ( sometimes actress for female ; see ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",33,39,female,10,10,0,...,-0.128167,-0.105799,0.273845,-0.153273,-0.304040,-0.197205,0.611910,-0.210247,-0.065629,0.007488
4828,An actor ( sometimes actress for female ; see ...,3ZUE82NE0A2B8701X4995O8OCRS8FL,An actor ( sometimes actress for female ; see ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",139,149,television,10,10,0,...,-0.401176,-0.072458,-0.207919,-0.266866,0.031897,0.334576,0.775077,0.205625,0.116589,-0.307944
4829,An actor ( sometimes actress for female ; see ...,3ZUE82NE0A2B8701X4995O8OCRS8FL,An actor ( sometimes actress for female ; see ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",46,57,terminology,10,10,7,...,-0.077243,0.177842,0.315260,-0.184363,-0.355212,-0.201826,0.576690,-0.027611,-0.233612,0.099904
4830,An actor ( sometimes actress for female ; see ...,3ZUE82NE0A2B8701X4995O8OCRS8FL,An actor ( sometimes actress for female ; see ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",86,94,dramatic,10,10,4,...,-0.335189,-0.132308,0.169282,-0.079048,-0.103670,0.078668,0.351139,0.203331,0.006614,0.060178


In [64]:
import os
import pandas as pd
import numpy as np
import ast

def flatten_embeddings(df, column_name="Embedding"):
    # Convert string representations of lists to actual lists of numbers
    df[column_name] = df[column_name].apply(ast.literal_eval)
    
    embeddings = df[column_name].to_list()  # Convert column of arrays into a list

    # Initialize an empty list to store flattened embeddings
    flattened_embeddings = []

    # Get the length of the first non-empty embedding
    nonzero_length = len(embeddings[0]) if embeddings and len(embeddings[0]) > 0 else 0

    # Iterate through the list of embeddings
    for embedding in embeddings:
        if len(embedding) == 0:
            # If the embedding is empty, append a row of zeros
            flattened_embeddings.append([0] * nonzero_length)
        else:
            flattened_embeddings.append(embedding)

    # Convert the list of flattened embeddings to a DataFrame
    flattened = pd.DataFrame(flattened_embeddings)

    # Rename the columns
    flattened.columns = [f"{column_name}_{i}" for i in range(flattened.shape[1])]

    # Drop the original embedding column and concat the flattened dataframe
    df = pd.concat([df.drop(columns=[column_name]), flattened], axis=1)

    return df

def process_all_files_in_directory(directory_path="RoBERTa/Original_RoB"):
    # List all files in the specified directory that end with "actual_Rob"
    files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f)) and f.endswith('actual_RoB')]
    
    for file in files:
        try:
            # Load each file into a dataframe
            file_path = os.path.join(directory_path, file)
            df = pd.read_pickle(file_path)
            
            # Flatten the embeddings in the dataframe
            df_transformed = flatten_embeddings(df)
            
            # Save the transformed dataframe back to a new file with "_Flattened" added before the ending
            new_file_name = file.replace('actual_RoB', 'RoB_Flattened')
            new_file_path = os.path.join(directory_path, new_file_name)
            df_transformed.to_pickle(new_file_path)
        except Exception as e:
            print(f"Failed to process {file}. Error: {e}")

# Call the function
process_all_files_in_directory()


Failed to process News_Train_actual_RoB. Error: malformed node or string: array([-8.54269713e-02,  1.86173752e-01,  4.47292998e-02, -3.04733247e-01,
       -2.17045277e-01, -3.71072963e-02, -1.46049112e-01, -5.06227196e-04,
        1.93845421e-01,  2.48477936e-01, -2.31155261e-01, -4.49660659e-01,
       -7.86063299e-02, -7.16831088e-02,  1.06376730e-01,  1.86835937e-02,
       -4.12007630e-01,  1.95053846e-01, -6.66261613e-02,  1.04581669e-01,
       -6.89331219e-02,  4.03025568e-01,  1.47398070e-01, -1.47963434e-01,
       -1.53158590e-01, -1.03969507e-01,  3.66694778e-02, -1.13471515e-01,
       -1.26741201e-01, -1.09966420e-01,  1.54288158e-01,  2.39740163e-01,
        1.82974130e-01, -1.51510492e-01, -1.00982944e-02,  5.09956852e-02,
        1.05064400e-01, -6.58721700e-02,  2.61656344e-01, -8.11863020e-02,
       -1.33111835e-01, -3.66329972e-04, -3.13771516e-02,  2.46391281e-01,
       -2.39195712e-02, -1.08330073e-02,  5.31696342e-03,  1.02727255e-02,
       -6.10033758e-02, -3

In [12]:
df = pd.read_pickle('RoBERTa/Original_RoB/News_Dev_RoB_Flattened')
df

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,sub_imdb,google frequency,KFCAT,FAM,KFSMP,KFFRQ,AOA,NPHN,T-LFRQ,Embedding_0
0,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",0,6,syrian,10,10,0,...,0,3.513071,1,0,1,1,0,6,0,"[-0.048358764, 0.04159808, -0.11586902, -0.025..."
1,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",14,21,shelled,10,10,6,...,1,19.559029,7,524,10,22,0,0,218,"[0.21271735, 0.07035407, -0.047027998, -0.2875..."
2,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",24,34,rebel-held,10,10,4,...,0,0.000000,0,0,0,0,0,0,0,"[0.09429837, 0.1951797, -0.17190206, -0.031182..."
3,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",51,59,sparking,10,10,5,...,1,4.740991,6,505,9,12,0,4,75,"[0.29501337, 0.35755643, -0.1997622, -0.034754..."
4,Syrian troops shelled a rebel-held town on Mon...,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",35,39,town,10,10,0,...,1,90.138295,14,589,103,212,0,0,1607,"[-0.027314782, 0.31355807, -0.08525312, -0.164..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",149,156,guindos,10,10,0,...,0,0.004614,0,0,0,0,0,0,0,"[0.13226731, 0.075516224, 0.1255717, -0.027254..."
1497,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",88,95,rescues,10,10,2,...,1,0.490457,9,532,11,15,367,6,105,"[-0.0639202, 0.048246473, 0.14042006, -0.05273..."
1498,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",111,118,enacted,10,10,6,...,0,6.731140,5,0,6,7,0,0,33,"[-0.01024009, 0.020139743, 0.09040777, 0.00749..."
1499,The state will put less than 15 billion euros ...,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",157,166,estimated,10,10,2,...,0,27.054409,9,505,27,39,503,7,160,"[-0.026576916, 0.060671188, 0.09911453, -0.017..."


In [20]:
df = pd.read_pickle('RoBERTa/Original_RoB/Wikipedia_Dev_RoB_Split')
df

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,Embedding_758,Embedding_759,Embedding_760,Embedding_761,Embedding_762,Embedding_763,Embedding_764,Embedding_765,Embedding_766,Embedding_767
0,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",12,25,epidexipteryx,10,10,6,...,0.051877,0.136058,-0.089933,0.035493,-0.022071,0.066447,0.153555,0.012447,-0.042098,0.043038
1,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",31,35,bore,10,10,0,...,-0.157775,0.069935,-0.168181,-0.111752,-0.164044,0.221702,-0.257107,-0.337937,0.001747,0.157806
2,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",36,43,unusual,10,10,0,...,0.045495,0.044459,-0.265047,-0.253186,-0.183772,0.047291,-0.256799,0.161694,0.224941,-0.002305
3,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",44,53,vertebrae,10,10,7,...,-0.065760,0.086544,0.305740,-0.159293,-0.214867,0.160937,0.101733,-0.127275,-0.012944,-0.013192
4,The tail of Epidexipteryx also bore unusual ve...,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",54,61,towards,10,10,0,...,0.200160,-0.005600,0.167409,-0.365396,-0.074997,-0.064576,-0.397419,-0.083067,-0.010616,-0.043380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",100,109,knowledge,10,10,0,...,-0.130004,-0.184000,-0.176769,-0.122766,-0.045485,-0.097645,0.019090,-0.230233,0.100706,0.218293
601,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",74,80,person,10,10,0,...,0.297072,-0.004166,-0.337320,-0.025556,-0.144697,-0.060201,-0.065093,-0.236418,0.055958,0.181881
602,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",95,99,true,10,10,0,...,-0.366248,-0.085556,-0.015672,0.024762,0.183477,-0.179918,-0.064177,-0.171885,-0.021939,0.231173
603,Devotion ( Bhakti ) will cancel the effects of...,3W0KKJIARRAMOTSFYF06L10TKN9K8Z,Devotion ( Bhakti ) will cancel the effects of...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",113,122,purifying,10,10,7,...,0.136446,-0.092753,-0.035713,0.096410,0.167309,0.181306,-0.008599,-0.491306,0.167582,0.353482
