**Step-by-Step Guide**
1. load and view data
2. split train-eval
3. text converting
4. vectorstore
5. validation

# 1. load and view data

In [1]:
import pandas as pd

In [2]:
# Read the training CSV; the path is relative to the notebook's working directory.
train_df = pd.read_csv('data/train.csv')
# Bare last expression so Jupyter renders the frame as a table.
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Read the test CSV; the path is relative to the notebook's working directory.
# NOTE(review): `test_df` is only displayed here and is not used by any later cell in this notebook.
test_df = pd.read_csv('data/test.csv')
# Bare last expression so Jupyter renders the frame as a table.
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# 2. split train-eval

In [4]:
# Fraction of the loaded rows kept for training; the remainder becomes the eval set.
train_ratio = 0.8

# Shuffle all rows with a fixed seed (reproducible order) and renumber the index from 0.
df_shuffled = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Row count for the training side, truncated to an integer.
train_size = int(train_ratio * len(train_df))

# NOTE: `train_df` is rebound here, so re-running this cell without first
# re-running the load cell splits the already reduced frame again.
train_df, eval_df = df_shuffled[:train_size], df_shuffled[train_size:]
for part in (train_df, eval_df):
    print(part.shape)

(712, 12)
(179, 12)


# 3. text converting

In [5]:
# Column names of the training frame, as a plain Python list
columns = list(train_df.columns)
print(columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [6]:
# Everything except the target column `Survived` is used as input text
input_columns = [name for name in columns if name != 'Survived']
for label, names in (('input_columns=', input_columns), ('columns=', columns)):
    print(label, names)

input_columns= ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
columns= ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [7]:
# Convert each frame into a list of row dicts (one {column: value} dict per passenger).
# The orient must be spelled out as 'records': the abbreviated 'record' only
# triggers a deprecation warning on older pandas and is rejected by pandas 2.x.
list_train_dict = train_df.to_dict(orient='records')
list_eval_dict = eval_df.to_dict(orient='records')

  list_train_dict = train_df.to_dict('record')
  list_eval_dict = eval_df.to_dict('record')


In [8]:
# Turn every training row into a payload: the label plus a "col: value " text description
train_data = []
for record in list_train_dict:
    # One "name: value " fragment per input column, in `input_columns` order
    fragments = [col + ': ' + str(record[col]) + ' ' for col in input_columns]
    train_data.append({'survived': record['Survived'], 'description': ''.join(fragments)})

print(train_data[0])
print(train_data[1])

{'survived': 1, 'description': 'PassengerId: 710 Pclass: 3 Name: Moubarek, Master. Halim Gonios ("William George") Sex: male Age: nan SibSp: 1 Parch: 1 Ticket: 2661 Fare: 15.2458 Cabin: nan Embarked: C '}
{'survived': 0, 'description': 'PassengerId: 440 Pclass: 2 Name: Kvillner, Mr. Johan Henrik Johannesson Sex: male Age: 31.0 SibSp: 0 Parch: 0 Ticket: C.A. 18723 Fare: 10.5 Cabin: nan Embarked: S '}


In [9]:
# Turn every eval row into a payload: the label plus a "col: value " text description
eval_data = []
for record in list_eval_dict:
    # One "name: value " fragment per input column, in `input_columns` order
    fragments = [col + ': ' + str(record[col]) + ' ' for col in input_columns]
    eval_data.append({'survived': record['Survived'], 'description': ''.join(fragments)})

print(eval_data[0])

{'survived': 0, 'description': 'PassengerId: 620 Pclass: 2 Name: Gavey, Mr. Lawrence Sex: male Age: 26.0 SibSp: 0 Parch: 0 Ticket: 31028 Fare: 10.5 Cabin: nan Embarked: S '}


In [10]:
def convert_dict_to_text(records=None, input_columns=None, target_column='Survived'):
    """Convert row dicts into ``{'survived', 'description'}`` payloads.

    This is the shared form of the per-split conversion loops in this
    notebook: each row becomes a text description made of ``"name: value "``
    fragments (note the trailing space after every fragment) together with
    the row's label.

    Args:
        records: Iterable of row dicts, e.g. ``df.to_dict(orient='records')``.
            ``None`` or an empty iterable yields an empty list.
        input_columns: Names of the columns to render, in output order.
            When ``None``, every key of the row except ``target_column``
            is used, in the row's own key order.
        target_column: Key holding the label. Rows without this key
            (unlabelled data) get ``None`` as their ``'survived'`` value.

    Returns:
        A list with one ``{'survived': label, 'description': text}`` dict
        per input row, in input order.
    """
    if not records:
        return []

    payloads = []
    for record in records:
        if input_columns is None:
            names = [key for key in record if key != target_column]
        else:
            names = input_columns
        description = ''.join(name + ': ' + str(record[name]) + ' ' for name in names)
        payloads.append({'survived': record.get(target_column), 'description': description})
    return payloads

# 4. vectorstore

In [11]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [12]:
# Load the sentence-embedding model used for both indexing and querying.
# The same `encoder` must be used on both sides so the vectors are comparable.
# NOTE(review): the model is referenced by name; presumably it is fetched on
# first use — confirm for offline runs.
encoder = SentenceTransformer('all-MiniLM-L6-v2')

In [13]:
# Create the vector database client.
# The instance lives in memory only, so the collection and its points must be
# rebuilt (re-run the cells below) after every kernel restart.
qdrant = QdrantClient(":memory:") # create in-memory Qdrant instance

In [14]:
# Vector space for the collection: dimensionality comes from the embedding
# model, similarity is measured with cosine distance.
vector_params = models.VectorParams(
    size=encoder.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)

# (Re)create the collection that will hold one point per training passenger.
qdrant.recreate_collection(collection_name="Titanic_Passenger", vectors_config=vector_params)

True

In [15]:
# Embed every training description and wrap it as a point.
# The position in `train_data` is the point id; the whole record is the payload.
points = []
for idx, doc in enumerate(train_data):
    embedding = encoder.encode(doc["description"]).tolist()
    points.append(models.PointStruct(id=idx, vector=embedding, payload=doc))

# Write all points to the collection in a single call.
qdrant.upsert(collection_name="Titanic_Passenger", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
# Try to search: use the first eval record as a sample query.
query = eval_data[0]
# Bare last expression so Jupyter displays the record (label + description).
query

{'survived': 0,
 'description': 'PassengerId: 620 Pclass: 2 Name: Gavey, Mr. Lawrence Sex: male Age: 26.0 SibSp: 0 Parch: 0 Ticket: 31028 Fare: 10.5 Cabin: nan Embarked: S '}

In [17]:
# Embed the sample query and fetch its single closest stored passenger.
query_vector = encoder.encode(query['description']).tolist()
hits = qdrant.search(
    collection_name="Titanic_Passenger",
    query_vector=query_vector,
    limit=1,
)
# Show the matched payload together with its similarity score.
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'survived': 0, 'description': 'PassengerId: 811 Pclass: 3 Name: Alexander, Mr. William Sex: male Age: 26.0 SibSp: 0 Parch: 0 Ticket: 3474 Fare: 7.8875 Cabin: nan Embarked: S '} score: 0.9160768679777078


# 5. validation

In [18]:
# 1-nearest-neighbour baseline: predict the label of the single closest
# training passenger and count how often it matches the eval label.
count_correct = 0
for query in eval_data:
    nearest = qdrant.search(
        collection_name="Titanic_Passenger",
        query_vector=encoder.encode(query['description']).tolist(),
        limit=1,
    )
    # `nearest` is the result list; element 0 is the best match.
    count_correct += int(nearest[0].payload['survived'] == query['survived'])

print('Accuracy:', count_correct / len(eval_data))

Accuracy: 0.7262569832402235


In [19]:
import statistics

In [21]:
# k-nearest-neighbour vote: take the most common label among the `limits`
# closest training passengers. An odd `limits` avoids ties for a two-valued label
# as long as that many hits are returned.
limits = 5
count_correct = 0
for query in eval_data:
    hits = qdrant.search(
        collection_name="Titanic_Passenger",
        query_vector=encoder.encode(query['description']).tolist(),
        limit=limits,
    )
    votes = [hit.payload['survived'] for hit in hits]
    mode_value = statistics.mode(votes)
    if mode_value == query['survived']:
        count_correct += 1

print('Accuracy:', count_correct / len(eval_data))

Accuracy: 0.7932960893854749


# references

[Vector Search Basics](https://qdrant.tech/documentation/overview/vector-search/)