In [10]:
print("hello world!")

hello world!


In [1]:
# read local .env file
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

True

In [2]:
import os
from langchain_openai import OpenAIEmbeddings

embed = OpenAIEmbeddings(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
    base_url=os.environ.get("OPENAI_BASE_URL"),  # This is the default and can be omitted
    model="gpt-4",
    # model="BAAI/bge-m3",
    # model="multilingual-e5-large",
    # model="multilingual-e5-large-instruct",
    # dimensions=1536,
)

In [3]:
input_text = "The meaning of life is 42"
vector = embed.embed_query(input_text)
print(vector[:3])
print(len(vector))

[-0.01183319091796875, 0.062469482421875, -0.0821533203125]
1024


In [4]:
import json
import pandas as pd
from urllib.request import urlopen
import os
from pathlib import Path

def download_and_load_dataset():
    # データセットのURL
    dataset_url = "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/datasets/jsts-v1.1/valid-v1.1.json"
    
    # ローカルファイルパスの設定
    local_file = "valid-v1.1.json"
    
    # ファイルが存在しない場合はダウンロード
    if not os.path.exists(local_file):
        print(f"Downloading dataset from {dataset_url}...")
        try:
            with urlopen(dataset_url) as response:
                content = response.read()
            
            # ダウンロードしたデータをローカルに保存
            with open(local_file, 'wb') as f:
                f.write(content)
            print(f"Dataset downloaded and saved to {local_file}")
        except Exception as e:
            print(f"Error downloading file: {e}")
            return None
    else:
        print(f"Loading dataset from local file: {local_file}")
    
    # ローカルファイルからデータを読み込む
    try:
        with open(local_file, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]
        
        # データフレームを作成
        df = pd.DataFrame(data)
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# データセットを読み込む
df = download_and_load_dataset()
# データフレームの中身を確認
if df is not None:
    print("\nDataset loaded successfully:")
    print(df)
else:
    print("Failed to load dataset")

Loading dataset from local file: valid-v1.1.json

Dataset loaded successfully:
     sentence_pair_id               yjcaptions_id  \
0                   0  100312_421853-104611-31624   
1                   1        100371-104675-104678   
2                   2        100668-104946-104949   
3                   3        100958-105177-105178   
4                   4        101401-105530-105533   
...               ...                         ...   
1452             1452         98940-103167-103171   
1453             1453         99222-103520-103521   
1454             1454         99421-103771-103773   
1455             1455         99453-103814-103815   
1456             1456         99597-103941-103943   

                                 sentence1                         sentence2  \
0              レンガの建物の前を、乳母車を押した女性が歩いています。                厩舎で馬と女性とが寄り添っています。   
1                         山の上に顔の白い牛が2頭います。             曇り空の山肌で、牛が２匹草を食んでいます。   
2                     バナナを持った人が道路を通行しています。  

In [5]:
from tqdm import tqdm
import torch
import os
import concurrent.futures

# データセットからsentence1とsentence2を抜き出して連結
set_sentence = set(df["sentence1"]).union(set(df["sentence2"]))
print(len(set_sentence))

2808


In [6]:
# データセットをembedding(concurrent.futuresで並列処理)
dict_sentence = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    # set_sentenceの各要素に対して、embed_query関数を並行に適用
    future_to_sentence = {
        executor.submit(embed.embed_query, sentence): sentence
        for sentence in set_sentence
    }
    for future in concurrent.futures.as_completed(future_to_sentence):
        sentence = future_to_sentence[future]
        try:
            dict_sentence[sentence] = future.result()
        except Exception as exc:
            print("%r generated an exception: %s" % (sentence, exc))

# ベクトル化したデータセット(ディクショナリ)を確認
print(len(dict_sentence))

2808


In [7]:
# コサイン類似度を使うためcosine_similarityをimport
from torch.nn.functional import cosine_similarity

# コサイン類似度を算出し配列に入力
similarities = []
for i, row in tqdm(df.iterrows()):
    embed_sentence1 = dict_sentence[row["sentence1"]]
    embed_sentence2 = dict_sentence[row["sentence2"]]
    similarity = cosine_similarity(
        torch.tensor(embed_sentence1).unsqueeze(0), torch.tensor(embed_sentence2).unsqueeze(0)
    )
    similarities.append(similarity.item())

1457it [00:00, 7121.32it/s]


In [8]:
from scipy.stats import pearsonr, spearmanr

# ピアソン相関係数の算出
pearson_corr, _ = pearsonr(similarities, df["label"])
print(f'Pearson correlation: {pearson_corr}')

# スピアマン相関係数の算出
spearman_corr, _ = spearmanr(similarities, df["label"])
print(f'Spearman correlation: {spearman_corr}')

Pearson correlation: 0.20309295991098175
Spearman correlation: 0.25997977887797685


## BAAI/bge-m3
- Pearson correlation: 0.20309295991098175
- Spearman correlation: 0.25997977887797685

- Pearson correlation: 0.8466529839655326
- Spearman correlation: 0.8022972686208227

## BAAI/bge-multilingual-gemma2
- Pearson correlation: 0.32889489844536185
- Spearman correlation: 0.3410501003671425

## Alibaba-NLP/gte-Qwen2-7B-instruct
- Pearson correlation: 0.5205752281045795
- Spearman correlation: 0.5510142337149282

## intfloat/multilingual-e5-large-instruct
- Pearson correlation: 0.5228528008921871
- Spearman correlation: 0.540391660332006

- Pearson correlation: 0.8634430697527975
- Spearman correlation: 0.8187526901653888

## intfloat/multilingual-e5-large
- Pearson correlation: 0.5447470095324037
- Spearman correlation: 0.5628063344366232

- Pearson correlation: 0.850360223135657
- Spearman correlation: 0.8098701611897999

# Cohere embed-multilingual-v3.0
- Pearson correlation: 0.8689620805910541
- Spearman correlation: 0.8218902671771843