In [74]:
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
from sentence_transformers import SentenceTransformer
import pandas as pd
import time

In [75]:
connections.connect(host='127.0.0.1', port='19530')

if utility.has_collection('movies_db'):
    utility.drop_collection('movies_db')

fields = [
    FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name='title', dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, dim=384)
]
schema = CollectionSchema(fields=fields)
collection = Collection(name='movies_db', schema=schema)

index_params = {
    'metric_type': 'L2',
    'index_type': "IVF_FLAT",
    'params': {'nlist': 1536}
}
collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

In [76]:
df = pd.read_csv('moiver_plots.csv')
df = df.iloc[:, [1, 7]]
df

Unnamed: 0,Title,Plot
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
2,The Martyred Presidents,"The film, just over a minute long, is composed..."
3,"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
4,Jack and the Beanstalk,The earliest known adaptation of the classic f...
...,...,...
34881,The Water Diviner,"The film begins in 1919, just after World War ..."
34882,Çalgı Çengi İkimiz,"Two musicians, Salih and Gürkan, described the..."
34883,Olanlar Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,Non-Transferable,The film centres around a young woman named Am...


In [77]:
transformer = SentenceTransformer('all-MiniLM-L6-v2')


def embed_insert(data):
    embeds = transformer.encode(data[1])  # Plot列的句子向量
    ins = [data[0],
           [x for x in embeds]]
    collection.insert(ins)


# 共插入2000条数据,每次插入512条数据
index_list = list(range(0, 2000, 512))

for i in range(len(index_list) - 1):
    section_df = df.iloc[index_list[i]: index_list[i + 1]]
    data_batch = [section_df['Title'].tolist(), section_df['Plot'].tolist()]
    embed_insert(data_batch)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [78]:
search_terms = ['A movie about cars', 'A movie about monsters']
search_data = [x for x in transformer.encode(search_terms)]

start = time.time()
res = collection.search(
    data=search_data,
    anns_field="embedding",
    param={},
    limit=5,
    output_fields=['title']
)
end = time.time()

for hits_i, hits in enumerate(res):
    print('Title:', search_terms[hits_i])
    print('Search Time:', end - start)
    print('Results:')
    for hit in hits:
        print(hit.entity.get('title'), '----', hit.distance)
    print()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Title: A movie about cars
Search Time: 0.01274418830871582
Results:
Youth's Endearing Charm ---- 1.095449447631836
From Leadville to Aspen: A Hold-Up in the Rockies ---- 1.1019386053085327
Gentlemen of Nerve ---- 1.1331942081451416
Hot Water ---- 1.143002986907959
High and Dizzy ---- 1.1749171018600464

Title: A movie about monsters
Search Time: 0.01274418830871582
Results:
The Suburbanite ---- 1.0666424036026
Youth's Endearing Charm ---- 1.1072256565093994
The Godless Girl ---- 1.1511220932006836
The Shriek of Araby ---- 1.1951016187667847
The Musketeers of Pig Alley ---- 1.1963965892791748



In [79]:
collection.release()