In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import networkx as nx
import spacy
import torch
from transformers import BertTokenizer, BertModel
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [None]:
# Read Twitter_Augmented.csv and discard every row that has a missing value
# in any column. dropna() keeps the surviving rows' original index labels, so
# the index may contain gaps afterwards.
DATA_PATH = "../../../Data/Augmenting Datsets/Twitter_Augmented.csv"
data = pd.read_csv(DATA_PATH).dropna()
data

### LSA and Edit Distance

In [None]:
# Number of latent dimensions kept by the SVD, and a fixed seed so the
# randomized SVD solver gives the same features on every kernel restart.
N_LSA_COMPONENTS = 5
LSA_RANDOM_STATE = 42

sentences = data['sentence'].tolist()
paraphrases = data['paraphrase'].tolist()

# Fit TF-IDF and LSA on sentences and paraphrases together so both kinds of
# text share one vocabulary and one latent space.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(sentences + paraphrases)
lsa = TruncatedSVD(n_components=N_LSA_COMPONENTS, random_state=LSA_RANDOM_STATE)
lsa_features = lsa.fit_transform(tfidf_matrix)

# `lsa_features` has 2 * len(data) rows: the first len(data) rows belong to the
# sentences, the remaining rows to the paraphrases. Only the sentence half is
# attached to `data`, and it is given `data`'s own index. Concatenating the
# full matrix with a fresh 0..2N-1 index would misalign rows (the index of
# `data` can have gaps after dropna) and append N extra all-NaN rows.
lsa_columns = [f"lsa_{i}" for i in range(N_LSA_COMPONENTS)]
lsa_features_df = pd.DataFrame(
    lsa_features[:len(data)],
    columns=lsa_columns,
    index=data.index,
)
# Drop any lsa_* columns left over from a previous run so that re-executing
# this cell does not create duplicate columns.
data = pd.concat([data.drop(columns=lsa_columns, errors='ignore'), lsa_features_df], axis=1)

# NOTE: despite the column name, fuzz.token_set_ratio is a 0-100 similarity
# score (higher means more similar), not a distance.
data['edit_distance'] = [
    fuzz.token_set_ratio(sentence, paraphrase)
    for sentence, paraphrase in zip(sentences, paraphrases)
]

### Embeddings (Cosine Similarity)

In [None]:
from tqdm import tqdm
# The tokenizer and the encoder are both loaded from the same pretrained
# checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embedding(text, max_length=512):
    """
    Encode `text` with the module-level BERT `tokenizer`/`model` and return
    the mean of the last hidden layer over all token positions.

    Parameters
    ----------
    text : str
        The text to embed.
    max_length : int, default 512
        Maximum number of tokens fed to the model, counting the special
        [CLS] and [SEP] tokens. Longer inputs are truncated.

    Returns
    -------
    numpy.ndarray
        1-D array with one value per hidden dimension of the model.
    """
    # Let the tokenizer add [CLS]/[SEP] and truncate. Slicing the token list
    # by hand after appending [SEP] would cut the closing [SEP] off any
    # over-long input.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # Inference only: skip gradient tracking to save time and memory.
    with torch.no_grad():
        model_out = model(**encoded)
    # Batch size is 1, so index 0 selects the (tokens, hidden) matrix.
    token_embeddings = model_out['last_hidden_state'][0]
    # Mean-pool across tokens (special tokens included).
    return torch.mean(token_embeddings, dim=0).numpy()

# Register tqdm's `progress_apply` on pandas objects so the slow per-row
# BERT calls below show a progress bar.
tqdm.pandas()

# One embedding vector per row for each of the two text columns.
sentence_vectors = data['sentence'].progress_apply(get_bert_embedding)
data['sentence_embedding'] = sentence_vectors
paraphrase_vectors = data['paraphrase'].progress_apply(get_bert_embedding)
data['paraphrase_embedding'] = paraphrase_vectors


def _row_cosine(row):
    """Cosine similarity between the two embedding vectors stored in `row`."""
    # cosine_similarity works on 2-D inputs, so each vector becomes a 1-row matrix.
    sentence_vec = row['sentence_embedding'].reshape(1, -1)
    paraphrase_vec = row['paraphrase_embedding'].reshape(1, -1)
    # The result is a 1x1 matrix; take out its single entry.
    return cosine_similarity(sentence_vec, paraphrase_vec)[0][0]


data['cosine_similarity'] = data.progress_apply(_row_cosine, axis=1)

In [None]:
def unpack_embeddings(dataframe, embedding_column_name):
    """
    Unpack an embedding column into separate scalar feature columns.

    Each cell of `embedding_column_name` is expected to hold a 1-D vector
    (numpy array or list) of the same length. The column is replaced by one
    column per vector position, named "<embedding_column_name>_<position>".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the embedding column. It is not modified.
    embedding_column_name : str
        Name of the column whose vectors are unpacked.

    Returns
    -------
    pandas.DataFrame
        A new frame with the embedding column removed and the unpacked
        columns appended on the right, keeping the input's index.

    Raises
    ------
    ValueError
        If the vectors do not all have the same length.
    """
    vectors = dataframe[embedding_column_name].tolist()
    # Build the whole matrix in one step. `apply(pd.Series)` creates one
    # Series object per row, which is very slow for high-dimensional
    # embeddings and does not work on an empty frame.
    matrix = np.stack(vectors) if vectors else np.empty((0, 0))
    # Reuse the input's index so the column-wise concat below lines up row by row.
    embeddings = pd.DataFrame(matrix, index=dataframe.index)
    embeddings = embeddings.rename(columns=lambda x: f"{embedding_column_name}_{x}")
    remaining = dataframe.drop([embedding_column_name], axis=1)
    return pd.concat([remaining, embeddings], axis=1)

# Replace each vector-valued embedding column with one scalar column per
# position, one embedding column at a time.
embedding_column_names = [
    "sentence_embedding",
    "paraphrase_embedding",
]
for column in embedding_column_names:
    data = unpack_embeddings(dataframe=data, embedding_column_name=column)

In [None]:
# `features` is another name for `data`, not a copy.
features = data
# Predictors: every column except the 'mi' target.
# NOTE(review): any raw text columns still present in `data` remain in X;
# confirm they are removed before a numeric-only model is fitted.
X = features.drop('mi', axis=1)
# Target: the 'mi' column.
y = features.loc[:, 'mi']