In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import polars as pl
import polars.selectors as cs
import string
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import utils
import os

# Location of the Stanford sentiment parquet file. Set the
# STANFORD_SENTIMENT_PATH environment variable to point the notebook at another
# copy; when it is unset, the original machine-specific location is used.
path = os.environ.get(
    'STANFORD_SENTIMENT_PATH',
    r'C:\Users\faune\Desktop\thesis\stanford_sentiment.parquet',
)

## Init

In [None]:
print('Downloading necessary resources...')
# NLTK data packages requested by this notebook: the 'punkt' tokenizer data
# and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Full dataset as a pandas frame; later cells read its `sentence` and `label`
# columns.
df = pd.read_parquet(path)

print('Initialize the stemmer and stop words')
stemmer = PorterStemmer()
# NOTE(review): `stop_words` is built here but no cell shown in this notebook
# reads it — confirm whether stop-word removal was intended.
stop_words = set(stopwords.words('english'))

In [None]:
def process_text(text):
    """Turn a raw sentence into a list of stemmed tokens.

    Every character listed in ``string.punctuation`` is deleted, the remaining
    text is split with ``word_tokenize`` and each token is passed through the
    module-level ``stemmer``.

    Note: no stop-word filtering happens here.

    Args:
        text: The sentence to process (a ``str``).

    Returns:
        A list with one stem per token, in token order.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(punctuation_remover)
    return list(map(stemmer.stem, word_tokenize(cleaned)))

## Matching sentences

In [None]:
# Stem every sentence; `processed` holds one list of stems per row.
df['processed'] = df['sentence'].apply(process_text)

print('Create a vocabulary of all unique words')
vocab = sorted(set(word for tokens in df['processed'].values for word in tokens))

print('One-hot encode the words using MultiLabelBinarizer')
# The class list is fixed to the full vocabulary so that every matrix encoded
# with `mlb` shares the same columns.
mlb = MultiLabelBinarizer(classes=vocab)


print('Compute quantiles and get the index for the selected quantile')
df['len'] = df['processed'].apply(len)
q=10
df['quantile_len'] = pd.qcut(df['len'],q=q,labels=[f'q{i}' for i in range(1,q+1)])

quantile = 'q10'

# Row selectors for the chosen length quantile, computed once and reused below.
in_quantile = df['quantile_len'] == quantile
positive_mask = (df['label'] == 1) & in_quantile
negative_mask = (df['label'] == 0) & in_quantile

# Positional index of each selected row inside its own (label, quantile)
# subset; rows outside the subset keep -1.
df['positive_idx'] = -1
df['negative_idx'] = -1
df.loc[negative_mask, 'negative_idx'] = np.arange(negative_mask.sum())
df.loc[positive_mask, 'positive_idx'] = np.arange(positive_mask.sum())

print('Compute the gram matrix over the quantile (x positive, y negative)')
# Fit once, then reuse the fitted binarizer for the negative sentences.
X = mlb.fit_transform(df.loc[positive_mask, 'processed'])
Y = mlb.transform(df.loc[negative_mask, 'processed'])
# matrix[i, j] = number of vocabulary entries shared by positive sentence i and
# negative sentence j.
matrix = X@Y.T


print('Merging the output to make the correspondance between positive and neg sentences')
# Explicit copy: a column is added below, and writing into a slice of `df`
# would raise SettingWithCopyWarning.
positive_quantile_df = df.loc[positive_mask, :].copy()
negative_quantile_df = df.loc[negative_mask, :]
# For each positive sentence, position (within the negative subset) of the
# negative sentence with the largest overlap; ties resolve to the first one.
positive_quantile_df['neg_closest_idx'] = np.argmax(matrix,axis=1)
merged = positive_quantile_df.merge(negative_quantile_df[['negative_idx','sentence','processed']].rename({'sentence':'neg_sentence','processed':'neg_processed'},axis=1),
                            how='left',
                            left_on='neg_closest_idx',
                            right_on='negative_idx')

## Final computations

In [None]:
# Inspect the positive sentences paired with their best-overlapping negative sentence.
merged

In [None]:
# Load the same parquet file with polars this time (pandas already read it into
# `df`); the joins further down use its `sentence` and `embeddings` columns.
embeddings = pl.read_parquet(path)

In [None]:
# List the available columns before joining on them.
embeddings.columns

In [None]:
# Columns of `merged` that describe a (positive, negative) sentence pair.
pair_columns = ['sentence', 'processed', 'neg_sentence', 'neg_processed']

# Same embeddings table, keyed by `neg_sentence` so it can be joined onto the
# negative side of each pair.
negative_embeddings = embeddings.rename({'sentence': 'neg_sentence'})

# Attach the embeddings row of the positive sentence, then the one of the
# negative sentence (overlapping column names get the `_neg` suffix), and drop
# the bookkeeping columns.
matched_sentences = (
    pl.DataFrame(merged)
    .select(pair_columns)
    .join(embeddings, on='sentence')
    .drop('idx')
    .join(negative_embeddings, on='neg_sentence', suffix='_neg')
    .select(pl.exclude('idx', 'label', 'label_neg'))
)

In [None]:
# Stems present in both the positive and the negative sentence of a pair.
shared_stems = pl.col('processed').list.set_intersection(pl.col('neg_processed'))
# Keep only pairs whose mean overlap ratio lies strictly between 0.2 and 0.8.
in_ratio_band = (pl.col('ratio') > 0.2) & (pl.col('ratio') < 0.8)

overlap_report = (
    matched_sentences
    .unique()
    .with_columns(matching_words=shared_stems)
    .with_columns(nb=pl.col('matching_words').list.len())
    # Share of each sentence's stems that also appear in its counterpart.
    .with_columns(
        ratio_pos=pl.col('nb') / pl.col('processed').list.len(),
        ratio_neg=pl.col('nb') / pl.col('neg_processed').list.len(),
    )
    .with_columns(ratio=pl.mean_horizontal('ratio_pos', 'ratio_neg'))
    .sort('ratio', descending=True)
    .filter(in_ratio_band)
)

utils.display_polars(overlap_report)

In [None]:
# Inspect the sentence pairs together with both embedding columns.
matched_sentences

In [None]:
# Width of the stored sentence embeddings; the reshape below fails loudly if
# the data does not match it.
EMBEDDING_DIM = 1024

# Deduplicate once and compute every metric from this same frame, so that each
# metric value stays on the row of the pair it was computed from. (Computing
# the metrics from a differently ordered frame and attaching them by position
# would pair values with the wrong sentences.)
matched_pairs = matched_sentences.unique(maintain_order=True)

# One row per pair, one column per embedding dimension.
pos_vectors = np.asarray(
    matched_pairs['embeddings'].to_list(), dtype=float
).reshape(-1, EMBEDDING_DIM)
neg_vectors = np.asarray(
    matched_pairs['embeddings_neg'].to_list(), dtype=float
).reshape(-1, EMBEDDING_DIM)

pos_norms = np.linalg.norm(pos_vectors, axis=1)
neg_norms = np.linalg.norm(neg_vectors, axis=1)
# Row-wise dot product between each positive vector and its negative match.
dot_products = np.einsum('ij,ij->i', pos_vectors, neg_vectors)

# Cosine similarity = dot / (|pos| * |neg|); pairs involving a zero-norm vector
# get 0 instead of NaN.
norm_products = pos_norms * neg_norms
cosine_similarities = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products),
    where=norm_products > 0,
)

euclidean = np.linalg.norm(pos_vectors - neg_vectors, axis=1)

with_metrics = (
    matched_pairs
    .with_columns(
        pl.Series('cosine_similarity', cosine_similarities),
        pl.Series('dot_product', dot_products),
        pl.Series('euclidean_distance', euclidean),
        # Signed difference: positive-sentence norm minus negative-sentence norm.
        pl.Series('norm_difference', pos_norms - neg_norms),
    )
    .sort('cosine_similarity')
)


In [None]:
# Inspect the pairs with their cosine similarity, dot product, Euclidean
# distance and norm difference.
with_metrics

In [None]:
import seaborn as sns

# Histogram (with a KDE overlay) of the Euclidean distance between the
# embeddings of each matched pair.
metrics_pd = with_metrics.to_pandas()
sns.histplot(data=metrics_pd, x='euclidean_distance', kde=True)

In [None]:
# Histogram (with a KDE overlay) of the cosine similarity of each matched pair.
metrics_pd = with_metrics.to_pandas()
sns.histplot(data=metrics_pd, x='cosine_similarity', kde=True)

In [None]:
# Histogram (with a KDE overlay) of the dot product of each matched pair.
metrics_pd = with_metrics.to_pandas()
sns.histplot(data=metrics_pd, x='dot_product', kde=True)

In [None]:
# Histogram (with a KDE overlay) of the embedding-norm difference of each
# matched pair.
metrics_pd = with_metrics.to_pandas()
sns.histplot(data=metrics_pd, x='norm_difference', kde=True)

In [None]:
import mistralai
import mistralai.async_client
import mistralai.client
# The API key is read from a local `api_key` module rather than written in the
# notebook. NOTE(review): make sure that module is kept out of version control.
from api_key import API_KEY

# Request an embedding for one short sentence from the "mistral-embed" model;
# the response is kept in `test` and inspected in the next cell.
client = mistralai.client.MistralClient(api_key=API_KEY)
test = client.embeddings(model="mistral-embed", input="hello there, how are you")

In [None]:
# Length of the first embedding returned by the probe request.
returned_embeddings = [item.embedding for item in test.data]
first_embedding = returned_embeddings[0]
len(first_embedding)

In [None]:
from IPython.display import display_latex
# Summary statistics of the metrics table, converted to pandas and handed to
# IPython's LaTeX display function.
# NOTE(review): with raw=False the output depends on the pandas object exposing
# a LaTeX representation — confirm that something is actually rendered.
display_latex(with_metrics.describe().to_pandas(), raw = False)

In [None]:
# Plain-text summary statistics for three of the pairwise metrics.
summary_columns = ['cosine_similarity', 'euclidean_distance', 'norm_difference']
metric_summary = with_metrics.select(summary_columns).describe()
print(metric_summary)