In [1]:
import pandas as pd

# Location of the cleaned dataset (relative path, resolved from the working directory).
CLEAN_DATA_CSV = "../data/Processed/clean_data.csv"

# Read it with pandas' default CSV options.
data = pd.read_csv(CLEAN_DATA_CSV)

In [2]:
# Preview the first 10 rows of the freshly loaded frame.
data.head(10)

Unnamed: 0.1,Unnamed: 0,clean_text,airline_sentiment,airline
0,0,what said,neutral,Virgin America
1,1,plus youve added commercials to the experience...,positive,Virgin America
2,2,i didnt today must mean i need to take another...,neutral,Virgin America
3,3,its really aggressive to blast obnoxious enter...,negative,Virgin America
4,4,and its a really big bad thing about it,negative,Virgin America
5,5,seriously would pay a flight for seats that di...,negative,Virgin America
6,6,yes nearly every time i fly vx this ear worm w...,positive,Virgin America
7,7,really missed a prime opportunity for men with...,neutral,Virgin America
8,8,well i didntbut now i do d,positive,Virgin America
9,9,it was amazing and arrived an hour early youre...,positive,Virgin America


In [3]:
from sentence_transformers import  SentenceTransformer
# Identifier of the sentence-transformers model used to embed the texts.
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm





In [4]:
# Pull the cleaned texts out as a plain Python list, in row order, for the encoder.
clean_text_column = data["clean_text"]
texts = clean_text_column.to_list()

In [5]:
# Encode every text with the loaded model, showing a progress bar while it runs.
embeddings = model.encode(texts, show_progress_bar=True)

Batches: 100%|██████████| 451/451 [01:12<00:00,  6.22it/s]


In [6]:
# Store each element of `embeddings` as one cell of a new "embeddings" column.
data["embeddings"] = [vector for vector in embeddings]

In [7]:
# Remove the "Unnamed: 0" column read from the CSV (in the preview it simply
# repeats the row index - presumably an index saved with the file).
# errors="ignore" keeps this cell re-runnable: `data` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-removed column.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [9]:
# Preview the first rows to check the new "embeddings" column and the dropped column.
data.head()

Unnamed: 0,clean_text,airline_sentiment,airline,embeddings
0,what said,neutral,Virgin America,"[0.09349196, 0.22166443, -0.10611222, 0.148594..."
1,plus youve added commercials to the experience...,positive,Virgin America,"[0.044986486, -0.303321, 0.066817686, -0.23257..."
2,i didnt today must mean i need to take another...,neutral,Virgin America,"[0.26666784, -0.032983124, 0.24450608, -0.0832..."
3,its really aggressive to blast obnoxious enter...,negative,Virgin America,"[0.38574392, -0.18187732, -0.023512818, -0.142..."
4,and its a really big bad thing about it,negative,Virgin America,"[0.026368542, 0.030634426, -0.10443704, -0.042..."


**Création des identifiants**


In [10]:
# Give every row a string identifier of the form "review_<position>",
# where <position> is the row's 0-based position in the frame.
row_count = len(data)
data["id"] = ["review_" + str(position) for position in range(row_count)]


**Split train / test**

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split. Stratifying on the sentiment
# label keeps the class proportions aligned between the two splits, and the
# fixed random_state makes the split repeatable.
sentiment_labels = data["airline_sentiment"]
data_train, data_test = train_test_split(
    data,
    stratify=sentiment_labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Preview the first rows of the training split.
data_train.head()

Unnamed: 0,clean_text,airline_sentiment,airline,embeddings,id
7624,no not boarding order someone boarded with ser...,negative,Delta,"[0.37817588, -0.05232914, -0.06892034, -0.0444...",review_7624
9783,sucks delayed my moms flight x cancelled fligh...,negative,US Airways,"[0.068017125, -0.14787968, 0.101590306, 0.0563...",review_9783
7088,rt our fleets on fleek,neutral,Delta,"[-0.28331494, 0.019435078, 0.06429436, -0.2808...",review_7088
2157,when i read it say in some cases can you pleas...,negative,United,"[0.19091901, 0.022858623, -0.012275932, 0.1965...",review_2157
12118,i am dealing with the reflight booking problem...,neutral,American,"[0.07791889, -0.235583, -0.21146771, 0.0376822...",review_12118


In [13]:
# Preview the first rows of the test split.
data_test.head()

Unnamed: 0,clean_text,airline_sentiment,airline,embeddings,id
5868,why doesnt your terminal b in lga have prechec...,negative,Southwest,"[0.3199375, 0.28107005, -0.08896221, -0.276634...",review_5868
1125,b boarding this attendant took his shoe ampsoc...,negative,United,"[-0.18641071, 0.21389282, 0.15308303, -0.32157...",review_1125
5765,link doesnt work,negative,Southwest,"[-0.33605865, -0.60862505, 0.17004745, 0.06273...",review_5765
710,it was ua flight now im currently stuck in por...,negative,United,"[0.21065933, 0.00740448, 0.07825934, -0.082407...",review_710
3582,i forgot that intl flights out of lax dont go ...,positive,United,"[0.26278704, -0.20168234, 0.06297917, -0.19465...",review_3582


In [14]:
# Report the size of both splits, then the normalized sentiment distribution
# of each one (train first, test second).
print(data_train.shape, data_test.shape)
for split_frame in (data_train, data_test):
    sentiment_share = split_frame["airline_sentiment"].value_counts(normalize=True)
    print(sentiment_share)

(11541, 5) (2886, 5)
airline_sentiment
negative    0.629408
neutral     0.211853
positive    0.158738
Name: proportion, dtype: float64
airline_sentiment
negative    0.629245
neutral     0.212058
positive    0.158697
Name: proportion, dtype: float64


**Initialisation ChromaDB**

In [26]:
import chromadb
from chromadb.config import Settings

In [27]:
# Directory handed to chromadb.PersistentClient as its `path` (relative path).
CHROMA_PATH = "../chroma_db"


In [28]:
# Single persistent ChromaDB client rooted at CHROMA_PATH; the collections below are created from it.
clients = chromadb.PersistentClient(path=CHROMA_PATH)

**Créer les collections**

In [30]:
# Both collections are configured identically: "hnsw:space" is set to "cosine".
cosine_metadata = {"hnsw:space": "cosine"}

# get_or_create_collection is called once per split, each with its own copy
# of the metadata dict.
train_collection = clients.get_or_create_collection(
    name="airline_train_embeddings",
    metadata=dict(cosine_metadata),
)
test_collection = clients.get_or_create_collection(
    name="airline_test_embeddings",
    metadata=dict(cosine_metadata),
)

**Stockage TRAIN/TEST dans ChromaDB**

In [31]:
from tqdm import tqdm

def add_to_chroma(collection, df, batch_size=2000):
    """Write the embeddings of a DataFrame into a ChromaDB collection, batch by batch.

    Rows are written with ``collection.upsert`` rather than ``collection.add``:
    ``add`` does not overwrite records whose ids already exist, so re-running
    the notebook against the same persistent collection would leave stale
    vectors in place. ``upsert`` inserts new ids and replaces existing ones.

    Parameters
    ----------
    collection
        Target collection. Must expose a ``name`` attribute (used as the
        progress-bar label) and an ``upsert`` method.
    df : pandas.DataFrame
        Must contain the columns ``id``, ``embeddings`` (each cell an object
        with a ``tolist()`` method, e.g. a NumPy array), ``airline``,
        ``airline_sentiment`` and ``clean_text``.
    batch_size : int, default 2000
        Maximum number of rows sent per ``upsert`` call.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (a negative step would otherwise
        produce an empty range and silently write nothing).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Columns stored alongside each vector as metadata.
    metadata_columns = ["airline", "airline_sentiment", "clean_text"]

    for start in tqdm(range(0, len(df), batch_size), desc=f"{collection.name}"):
        # .iloc makes the row slice explicitly positional, whatever the index
        # of `df` looks like (e.g. the shuffled index of a train/test split).
        batch = df.iloc[start:start + batch_size]

        collection.upsert(
            ids=batch["id"].tolist(),
            embeddings=[e.tolist() for e in batch["embeddings"]],
            metadatas=batch[metadata_columns].to_dict("records"),
        )

# Store each split in its own collection: train first, then test.
for target_collection, split_frame in (
    (train_collection, data_train),
    (test_collection, data_test),
):
    add_to_chroma(target_collection, split_frame)

airline_train_embeddings: 100%|██████████| 6/6 [00:02<00:00,  2.21it/s]
airline_test_embeddings: 100%|██████████| 2/2 [00:00<00:00,  2.83it/s]
