In [1]:
import chromadb
client = chromadb.PersistentClient(path='./chromadb_data')
collection_train = client.get_or_create_collection(name='news_train')
collection_test = client.get_or_create_collection(name='news_test')

In [2]:
import pandas as pd

df_train = pd.read_csv('cleaned_ag_news_train.csv')
df_test = pd.read_csv('cleaned_ag_news_test.csv')

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

embeddings_train = model.encode(df_train['cleaned_text'].tolist(), show_progress_bar=True)
embeddings_test = model.encode(df_test['cleaned_text'].tolist(), show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 3750/3750 [01:41<00:00, 36.92it/s]

Batches: 100%|██████████| 238/238 [00:06<00:00, 37.11it/s]



In [4]:
embeddings_train
embeddings_test

array([[ 0.0666526 ,  0.55323184, -0.10718946, ...,  0.03817198,
         0.30036014,  0.22507279],
       [-0.17459098,  0.43940595, -0.11578467, ...,  0.02132498,
        -0.40950644,  0.04040216],
       [-0.4353456 , -0.03354342, -0.22912   , ...,  0.09854171,
         0.32382354,  0.22689036],
       ...,
       [-0.05324285, -0.00198379,  0.20873637, ..., -0.00847722,
         0.23526411, -0.14617495],
       [-0.25789988,  0.10252697,  0.08102289, ..., -0.4463923 ,
         0.11971577,  0.30117202],
       [-0.00880264,  0.03725415,  0.15476961, ..., -0.22879522,
         0.05177697, -0.1803632 ]], shape=(7600, 384), dtype=float32)

In [5]:
batch_size = 5000

datasets = [
    (df_train, embeddings_train, collection_train, 'train'),
    (df_test, embeddings_test, collection_test, 'test')
]

for df, embeddings, collection, name in datasets:

    embeddings_list = embeddings.tolist()
    documents_list = df['cleaned_text'].tolist()
    metadatas_list = [{'label': label} for label in df['label']]
    ids_list = [str(i) for i in range(len(df))]

    total_items = len(df)


    for i in range(0, total_items, batch_size):
        collection.add(
            embeddings=embeddings_list[i:i+batch_size],
            documents=documents_list[i:i+batch_size],
            metadatas=metadatas_list[i:i+batch_size],
            ids=ids_list[i:i+batch_size]
        )

    print(f"✓ Added {total_items} items to '{name}' collection")


✓ Added 120000 items to 'train' collection
✓ Added 7600 items to 'test' collection
✓ Added 7600 items to 'test' collection


In [6]:

collection = client.get_collection(name="news_train")


In [7]:
all_data = collection.get(
    
    include=['embeddings', 'metadatas', 'documents']
)

In [8]:
all_data
#Ashf6kdDBQbX2bqr

{'ids': ['0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  '23',
  '24',
  '25',
  '26',
  '27',
  '28',
  '29',
  '30',
  '31',
  '32',
  '33',
  '34',
  '35',
  '36',
  '37',
  '38',
  '39',
  '40',
  '41',
  '42',
  '43',
  '44',
  '45',
  '46',
  '47',
  '48',
  '49',
  '50',
  '51',
  '52',
  '53',
  '54',
  '55',
  '56',
  '57',
  '58',
  '59',
  '60',
  '61',
  '62',
  '63',
  '64',
  '65',
  '66',
  '67',
  '68',
  '69',
  '70',
  '71',
  '72',
  '73',
  '74',
  '75',
  '76',
  '77',
  '78',
  '79',
  '80',
  '81',
  '82',
  '83',
  '84',
  '85',
  '86',
  '87',
  '88',
  '89',
  '90',
  '91',
  '92',
  '93',
  '94',
  '95',
  '96',
  '97',
  '98',
  '99',
  '100',
  '101',
  '102',
  '103',
  '104',
  '105',
  '106',
  '107',
  '108',
  '109',
  '110',
  '111',
  '112',
  '113',
  '114',
  '115',
  '116',
  '117',
  '118',
  '119',
  '120',
  '121',
  '12

In [9]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score , classification_report

svc = LinearSVC()

svc.fit(embeddings_train, df_train['label'])

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , classification_report

lr = LogisticRegression()

lr.fit(embeddings_train, df_train['label'])

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [11]:
preds_lr = lr.predict(embeddings_test)

print("Accuracy:", accuracy_score(df_test['label'], preds_lr))
print(classification_report(df_test['label'], preds_lr))

Accuracy: 0.8840789473684211
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1900
           1       0.94      0.96      0.95      1900
           2       0.83      0.85      0.84      1900
           3       0.86      0.85      0.86      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600



In [12]:

preds = svc.predict(embeddings_test)

print("Accuracy:", accuracy_score(df_test['label'], preds))
print(classification_report(df_test['label'], preds))

Accuracy: 0.8848684210526315
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1900
           1       0.94      0.96      0.95      1900
           2       0.83      0.85      0.84      1900
           3       0.86      0.85      0.86      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600



In [13]:
def predict_text(text):
    emb = model.encode([text])
    return svc.predict(emb)[0]

print(predict_text("cristiono is the best footballer in the world"))


1


In [14]:
import joblib

joblib.dump(svc, 'models/vc_model.pkl')
joblib.dump(model, 'models/sentence_transformer.pkl')

['models/sentence_transformer.pkl']