In [None]:
import os

import numpy as np
import polars as pl
import polars.selectors as cs
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import get_data
import utils

In [None]:
# Input parquet file; later cells read its `idx`, `sentence` and `label` columns.
# The original machine-specific location stays as the default so existing runs
# are unaffected; set the SENTENCES_PARQUET environment variable to run the
# notebook on another machine without editing this cell.
PARQUET_PATH = os.environ.get("SENTENCES_PARQUET", r"C:\Users\faune\Downloads\0000.parquet")
frame = pl.read_parquet(PARQUET_PATH)

In [None]:
# Preview only (nothing is assigned): drop the row id and show the integer
# labels under readable names.
label_names = {0: 'negative', 1: 'positive'}
frame.drop('idx').with_columns(pl.col('label').replace(label_names))

In [None]:
# Pair each positive sentence with a negative one and score their word overlap.
#
# Resulting columns, in this order (later cells index rows by position, so the
# order must not change):
#   positive, negative, similar_words, nb_similar_words, nb_positive_words,
#   nb_negative_words, ratio_positive, ratio_negative, ratio_mean, ratio_diff

# The descending sort on the label puts every 'positive' row before the
# 'negative' ones, so unstacking with a step equal to the number of positive
# rows gives exactly two columns: the positives, then the negatives padded with
# nulls. The step is counted from the data instead of being hard-coded, and the
# assumption that makes the two-column layout valid is checked explicitly.
n_positive = int((frame['label'] == 1).sum())
n_negative = int((frame['label'] == 0).sum())
assert n_positive >= n_negative >= 1, (
    "pairing by unstack needs at least as many positive as negative sentences, "
    f"got {n_positive} positive / {n_negative} negative"
)

similar_sentences = (
    frame
    .drop('idx')
    .with_columns(pl.col('label').replace({0: 'negative', 1: 'positive'}))
    .sort('label', descending=True)
    .unstack(step=n_positive, how="vertical", columns='sentence')
    .rename({'sentence_0': 'positive', 'sentence_1': 'negative'})
    # Tokenise on single spaces so the sentences can be compared as word lists.
    .with_columns(pl.all().str.strip_chars().str.split(' '))
    .with_columns(pl.col('positive').list.set_intersection('negative').alias('similar_words'))
    .with_columns(
        pl.col('similar_words').list.len().alias('nb_similar_words'),
        pl.col('positive').list.len().alias('nb_positive_words'),
        pl.col('negative').list.len().alias('nb_negative_words'),
    )
    # Share of each sentence's words that also occur in its counterpart.
    .with_columns(
        pl.col('nb_similar_words').truediv(pl.col('nb_positive_words')).alias('ratio_positive'),
        pl.col('nb_similar_words').truediv(pl.col('nb_negative_words')).alias('ratio_negative'),
    )
    .with_columns(
        pl.mean_horizontal(pl.col('ratio_positive'), pl.col('ratio_negative')).alias('ratio_mean'),
        pl.col('ratio_positive').sub(pl.col('ratio_negative')).abs().alias('ratio_diff'),
    )
    # One explicit multi-key sort instead of several successive single-key
    # sorts: only the last of those is guaranteed to be honoured, whereas this
    # spells out the primary key and every tie-breaker.
    .sort(
        ['ratio_diff', 'ratio_mean', 'nb_similar_words', 'nb_positive_words', 'nb_negative_words'],
        descending=[False, True, True, False, False],
    )
    # Back to plain strings for display and for the embedding step.
    .with_columns(pl.col('positive', 'negative').list.join(' '))
)

In [None]:
# Keep only the pairs that share more than one word.
shares_several_words = pl.col('nb_similar_words') > 1
similar_sentences = similar_sentences.filter(shares_several_words)

In [None]:
# Embed both sides of every pair and attach the vectors as new columns.
# NOTE(review): the second argument of `_get_embeddings_by_chunks` is presumably
# a chunk size — confirm against the `get_data` module.
positive_texts = similar_sentences.get_column('positive').to_list()
positive_vectors = get_data._get_embeddings_by_chunks(positive_texts, 50)

negative_texts = similar_sentences.get_column('negative').to_list()
negative_vectors = get_data._get_embeddings_by_chunks(negative_texts, 50)

similar_sentences = similar_sentences.with_columns(
    pl.Series(name='positive_embeddings', values=positive_vectors),
    pl.Series(name='negative_embeddings', values=negative_vectors),
)

In [None]:
# Row-wise cosine similarity and dot product between the two embeddings of each
# pair, appended as `cosine_similarity` and `dot_product` (in that order), then
# the frame is sorted by ascending cosine similarity.
#
# Both metrics are computed from the same, not-yet-sorted frame and attached in
# a single step. Computing one of them after the frame has been re-sorted, from
# the unsorted variable, would attach values to the wrong rows.
# Columns are addressed by name, not by tuple position, and no embedding width
# is assumed.
positive_matrix = np.asarray(similar_sentences['positive_embeddings'].to_list(), dtype=np.float64)
negative_matrix = np.asarray(similar_sentences['negative_embeddings'].to_list(), dtype=np.float64)

pair_dot = (positive_matrix * negative_matrix).sum(axis=1)
pair_norms = np.linalg.norm(positive_matrix, axis=1) * np.linalg.norm(negative_matrix, axis=1)
# A zero vector gets similarity 0 rather than a division by zero, matching what
# sklearn's cosine_similarity returns for all-zero inputs.
pair_norms[pair_norms == 0.0] = 1.0
pair_cosine = pair_dot / pair_norms

similar_sentences = (
    similar_sentences
    .with_columns(
        pl.Series(name='cosine_similarity', values=pair_cosine),
        pl.Series(name='dot_product', values=pair_dot),
    )
    .sort('cosine_similarity')
)

In [None]:
# Summary statistics of every Float64 column.
float_columns = similar_sentences.select(cs.by_dtype(pl.Float64))
float_columns.describe()

In [None]:
# Show the first row only.
similar_sentences.limit(1)

In [None]:
# 2-D t-SNE projection of the positive-sentence embeddings; `tsne` ends up
# holding the projected coordinates, one row per sentence.
positive_matrix = np.array(similar_sentences.get_column('positive_embeddings').to_list())
tsne = TSNE(n_components=2, random_state=0).fit_transform(positive_matrix)

In [None]:
# Attach the coordinates currently held in `tsne` as the `positive_tsne` column.
positive_tsne_column = pl.Series(name='positive_tsne', values=tsne)
similar_sentences = similar_sentences.with_columns(positive_tsne_column)

In [None]:
# Project positive AND negative embeddings into one shared 2-D t-SNE space.
#
# t-SNE learns a fresh layout on every fit and has no way to project new points
# into an existing one, so coordinates from two separate fits are not
# comparable. The later cells do compare them (cosine / dot product between
# `positive_tsne` and `negative_tsne`, and a single scatter plot of both), so
# both sets are fitted together here. `positive_tsne` is rewritten from the same
# fit so the two columns are guaranteed to live in the same space.
positive_matrix = np.array(similar_sentences['positive_embeddings'].to_list())
negative_matrix = np.array(similar_sentences['negative_embeddings'].to_list())
n_pairs = positive_matrix.shape[0]

joint_tsne = TSNE(n_components=2, random_state=0).fit_transform(
    np.concatenate([positive_matrix, negative_matrix], axis=0)
)

# `tsne` keeps holding the negative-sentence coordinates, as this cell has
# always left it.
tsne = joint_tsne[n_pairs:]
similar_sentences = similar_sentences.with_columns(
    pl.Series(name='positive_tsne', values=joint_tsne[:n_pairs]),
    pl.Series(name='negative_tsne', values=tsne),
)

In [None]:
# Cosine similarity between the two t-SNE points of each pair.
# The columns are read by name rather than by tuple position, so the result no
# longer depends on how many columns precede them.
# Each value is kept as a one-element list because the following cell explodes
# `cosine_similarity_tsne` into plain floats.
tsne_pairs = similar_sentences.select('positive_tsne', 'negative_tsne')
cosine_tsne = [
    cosine_similarity(
        X=np.array(positive_point).reshape(1, -1),
        Y=np.array(negative_point).reshape(1, -1),
    ).tolist()[0]
    for positive_point, negative_point in tsne_pairs.iter_rows()
]
similar_sentences = similar_sentences.with_columns(
    pl.Series(name='cosine_similarity_tsne', values=cosine_tsne, dtype=pl.List(pl.Float64))
)

In [None]:
# Dot product between the two t-SNE points of each pair, stored as
# `dot_product_tsne`. It is computed from the t-SNE columns; using the raw
# embedding columns here would merely duplicate `dot_product`.
# The one-element lists in `cosine_similarity_tsne` are then flattened to floats
# and the frame is sorted by that column.
positive_points = np.asarray(similar_sentences['positive_tsne'].to_list(), dtype=np.float64)
negative_points = np.asarray(similar_sentences['negative_tsne'].to_list(), dtype=np.float64)
tsne_dot = (positive_points * negative_points).sum(axis=1)

similar_sentences = (
    similar_sentences
    .with_columns(pl.Series(name='dot_product_tsne', values=tsne_dot))
    .explode('cosine_similarity_tsne')
    .sort('cosine_similarity_tsne')
)


In [None]:
# Preview only: pairs whose mean overlap ratio is at least 0.3.
high_overlap = pl.col('ratio_mean') >= 0.3
similar_sentences.filter(high_overlap)

In [None]:
# Pull the two embedding columns out of the frame as numpy arrays.
positive_array = similar_sentences.get_column('positive_embeddings').to_numpy()
negative_array = similar_sentences.get_column('negative_embeddings').to_numpy()

In [None]:
# Cross-check: recompute the cosine similarity pair by pair straight from the
# numpy arrays and show it next to the stored `cosine_similarity` column.
recomputed = [
    cosine_similarity(X=positive_vec.reshape(1, -1), Y=negative_vec.reshape(1, -1)).tolist()
    for positive_vec, negative_vec in zip(positive_array, negative_array)
]
check_column = pl.Series(name='cos_sim_test', values=recomputed)

# Each recomputed value is a 1x1 nested list, hence the two explodes.
(
    similar_sentences
    .with_columns(check_column)
    .explode('cos_sim_test')
    .explode('cos_sim_test')
    .select('positive', 'negative', 'nb_similar_words', 'ratio_mean', 'cosine_similarity', 'cos_sim_test')
)

In [None]:
# Spot check: cosine similarity of the first three pairs, printed on one line.
print(*(
    cosine_similarity(
        X=positive_array[row].reshape(1, -1),
        Y=negative_array[row].reshape(1, -1),
    )
    for row in range(3)
))

In [None]:
# Summary statistics of every Float64 column, now including the similarity
# metrics added above.
float_metrics = similar_sentences.select(cs.by_dtype(pl.Float64))
float_metrics.describe()

In [None]:
# Reshape the t-SNE columns into a long table for plotting: one row per point
# with its `label` ('positive' / 'negative', taken from the column name) and its
# `x` / `y` coordinates.
tsne_long = (
    similar_sentences
    .select('positive', 'negative', 'positive_tsne', 'negative_tsne')
    .melt(id_vars=['positive', 'negative'], value_vars=['positive_tsne', 'negative_tsne'])
)

point_label = pl.col('variable').str.split('_').list.first().alias('label')
point_coords = pl.col('value').list.to_struct()

to_plot = (
    tsne_long
    .select(point_label, point_coords)
    .unnest('value')
    .rename({'field_0': 'x', 'field_1': 'y'})
)

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the 2-D t-SNE points, coloured by sentence polarity.
# An explicit figure/axes pair is used so the plot carries a title and axis
# labels and can be read on its own.
fig, ax = plt.subplots()
sns.scatterplot(data=to_plot, x='x', y='y', hue='label', ax=ax)
ax.set(
    title='t-SNE projection of sentence embeddings',
    xlabel='t-SNE dimension 1',
    ylabel='t-SNE dimension 2',
)
plt.show()