In [1]:
import pandas as pd

# Read the processed CSV into a DataFrame; the path is relative to the notebook's folder.
data = pd.read_csv(filepath_or_buffer="../Data/Processed/clean_data.csv")

In [2]:
# Preview the first ten rows to check which columns were loaded.
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,clean_text,airline_sentiment,airline
0,0,what said,neutral,Virgin America
1,1,plus youve added commercials to the experience...,positive,Virgin America
2,2,i didnt today must mean i need to take another...,neutral,Virgin America
3,3,its really aggressive to blast obnoxious enter...,negative,Virgin America
4,4,and its a really big bad thing about it,negative,Virgin America
5,5,seriously would pay a flight for seats that di...,negative,Virgin America
6,6,yes nearly every time i fly vx this ear worm w...,positive,Virgin America
7,7,really missed a prime opportunity for men with...,neutral,Virgin America
8,8,well i didntbut now i do d,positive,Virgin America
9,9,it was amazing and arrived an hour early youre...,positive,Virgin America


In [6]:
from sentence_transformers import  SentenceTransformer
# This checkpoint is not packaged as a sentence-transformers model, so the library
# builds one around it with mean pooling (see the warning printed by this cell).
# NOTE(review): the checkpoint name suggests a sentiment classifier rather than a
# dedicated embedding model -- confirm this is the intended encoder.
model = SentenceTransformer(model_name_or_path="cardiffnlp/twitter-roberta-base-sentiment")

No sentence-transformers model found with name cardiffnlp/twitter-roberta-base-sentiment. Creating a new one with mean pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Plain Python list of the cleaned tweets, in DataFrame row order.
texts = data.loc[:, "clean_text"].to_list()

In [8]:
# Encode every tweet; the progress bar is kept because this is the slow step
# of the notebook (the recorded run took about four minutes).
embeddings = model.encode(texts, show_progress_bar=True)

Batches: 100%|██████████| 451/451 [04:10<00:00,  1.80it/s]


In [9]:
# Store one embedding vector per row as a single cell value.
# Assumes `embeddings` yields one vector per tweet, in the same order as `data`.
data["embeddings"] = [row_vector for row_vector in embeddings]

In [10]:
# Remove the leftover 'Unnamed: 0' column (presumably an index saved into the CSV -- confirm).
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

[0.09349196, 0.22166443, -0.10611222, 0.148594...]


In [11]:
# Check the frame after dropping the extra column and adding the embeddings.
data.head(n=5)

Unnamed: 0,clean_text,airline_sentiment,airline,embeddings
0,what said,neutral,Virgin America,"[-0.14999603, -0.53225005, -0.14324617, -0.822..."
1,plus youve added commercials to the experience...,positive,Virgin America,"[-0.025072118, -0.5276757, -0.16144873, -0.449..."
2,i didnt today must mean i need to take another...,neutral,Virgin America,"[-0.13152163, -0.5167499, 0.09803597, -0.75049..."
3,its really aggressive to blast obnoxious enter...,negative,Virgin America,"[0.0893467, -0.4533559, 0.019523002, -0.772465..."
4,and its a really big bad thing about it,negative,Virgin America,"[-0.10262265, -0.60638845, 0.27071747, -0.6140..."


**Création des identifiants**


In [12]:
# Give every row a string identifier derived from its position: review_0, review_1, ...
data["id"] = ["review_{}".format(position) for position in range(len(data))]


**Split train / test**

In [13]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the sentiment label keeps
# the class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
data_train, data_test = train_test_split(
    data,
    stratify=data["airline_sentiment"],
    random_state=42,
    test_size=0.2,
)

In [14]:
# Preview the first rows of the training split.
data_train.head(n=5)

Unnamed: 0,clean_text,airline_sentiment,airline,embeddings,id
7624,no not boarding order someone boarded with ser...,negative,Delta,"[-0.030747607, -0.48066524, -0.099700734, -0.8...",review_7624
9783,sucks delayed my moms flight x cancelled fligh...,negative,US Airways,"[-0.024749225, -0.19862664, -0.079210095, -0.7...",review_9783
7088,rt our fleets on fleek,neutral,Delta,"[-0.13899958, -0.6026296, -0.037226222, -0.307...",review_7088
2157,when i read it say in some cases can you pleas...,negative,United,"[-0.029725853, -0.9434178, -0.061759353, -0.50...",review_2157
12118,i am dealing with the reflight booking problem...,neutral,American,"[-0.18219015, -0.33504093, 0.011790585, -0.755...",review_12118


In [15]:
# Preview the first rows of the test split.
data_test.head(n=5)

Unnamed: 0,clean_text,airline_sentiment,airline,embeddings,id
5868,why doesnt your terminal b in lga have prechec...,negative,Southwest,"[-0.012143357, -0.062115833, -0.18422022, -0.8...",review_5868
1125,b boarding this attendant took his shoe ampsoc...,negative,United,"[0.038747367, -0.77709734, -0.3220472, -1.3697...",review_1125
5765,link doesnt work,negative,Southwest,"[-0.31305444, -0.20537896, -0.17505193, -0.825...",review_5765
710,it was ua flight now im currently stuck in por...,negative,United,"[-0.38896766, -0.113024004, -0.099490106, -0.6...",review_710
3582,i forgot that intl flights out of lax dont go ...,positive,United,"[0.06962187, -0.40473652, -0.25463948, -0.7262...",review_3582


In [16]:
# Sanity check: split sizes first, then the label proportions of each split
# (train, then test), which should be nearly identical after stratification.
print(data_train.shape, data_test.shape)
for _split_frame in (data_train, data_test):
    print(_split_frame["airline_sentiment"].value_counts(normalize=True))

(11541, 5) (2886, 5)
airline_sentiment
negative    0.629408
neutral     0.211853
positive    0.158738
Name: proportion, dtype: float64
airline_sentiment
negative    0.629245
neutral     0.212058
positive    0.158697
Name: proportion, dtype: float64


**Initialisation ChromaDB**

In [17]:
import chromadb
from chromadb.config import Settings

In [18]:
# Directory passed to the persistent ChromaDB client; relative to the notebook's folder.
CHROMA_PATH = "../chroma_db"


In [19]:
# Open a ChromaDB client backed by the on-disk store at CHROMA_PATH.
clients = chromadb.PersistentClient(
    path=CHROMA_PATH,
)

**Créer les collections**

In [20]:
# One collection per split, both configured for cosine distance.
# get_or_create_collection reuses a collection that already exists under that name.
train_collection, test_collection = (
    clients.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space" : "cosine"},
    )
    for collection_name in ("airline_train_embeddings", "airline_test_embeddings")
)

**Stockage TRAIN/TEST dans ChromaDB**

In [21]:
from tqdm import tqdm

def add_to_chroma(collection, df, batch_size=2000):
    """Write a DataFrame's embeddings and metadata to a ChromaDB collection in batches.

    Records are written with ``upsert`` rather than ``add`` so that re-running the
    notebook against the persistent store refreshes existing ids instead of
    leaving stale vectors behind.

    Args:
        collection: Target ChromaDB collection; its ``name`` labels the progress bar.
        df: DataFrame with the columns ``id``, ``embeddings``, ``airline``,
            ``airline_sentiment`` and ``clean_text``. Each ``embeddings`` cell
            must expose ``.tolist()`` (e.g. a NumPy array).
        batch_size: Maximum number of rows sent per call. Must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive (a negative step would
            otherwise silently write nothing).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in tqdm(range(0, len(df), batch_size), desc=f"{collection.name}"):
        # Explicit positional slice: the split frames keep their original
        # (shuffled) index labels, so rows are selected by position, not label.
        batch = df.iloc[start:start + batch_size]

        collection.upsert(
            ids=batch["id"].tolist(),
            # Convert each vector to a plain list of Python floats.
            embeddings=[e.tolist() for e in batch["embeddings"]],
            metadatas=batch[["airline", "airline_sentiment", "clean_text"]].to_dict("records"),
        )

# Fill the train collection first, then the test collection.
for _target_collection, _split_frame in (
    (train_collection, data_train),
    (test_collection, data_test),
):
    add_to_chroma(_target_collection, _split_frame)

airline_train_embeddings: 100%|██████████| 6/6 [00:06<00:00,  1.07s/it]
airline_test_embeddings: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]
