In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras

from sentence_transformers import SentenceTransformer

In [2]:
# Relative directory that holds the notebook's input data.
data_path = "../data/"
# Read the chatbot question/answer CSV into a DataFrame.
df = pd.read_csv(f"{data_path}chat_bot_data.csv")
# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


# Modeling

In [3]:
# Load the pretrained sentence-embedding model by its hub name; the later cells
# call model.encode(...) to turn text into vectors.
# NOTE(review): the checkpoint name suggests a multilingual XLM-R model -- confirm
# it covers the language of the dataset.
model = SentenceTransformer(
    "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"
)

In [4]:
# cosine similarity
def cos_sim(A, B):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(A, B) / (||A|| * ||B||)``.

    Parameters
    ----------
    A, B : array-like
        Vectors of the same length (anything ``np.dot`` / ``np.linalg.norm``
        accept).

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either vector has zero norm.
        A zero vector has no direction, so the ratio is undefined; returning
        0.0 avoids the NaN/inf (and RuntimeWarning) a plain division would
        produce.
    """
    dot_product = np.dot(A, B)
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)

    # Guard the division: at least one input is the zero vector.
    if norm_product == 0:
        return 0.0

    return dot_product / norm_product

In [5]:
# Encode every question with ONE batched call instead of one model call per row:
# encode() accepts a list of sentences, batches them internally, and returns one
# vector per sentence in input order.
# list(...) splits the result into one 1-D vector per row, so the column keeps
# the same "one array per cell" layout that a row-wise apply would produce.
df['embedding'] = pd.Series(list(model.encode(df['Q'].tolist())), index=df.index)
df.head(5)

Unnamed: 0,Q,A,label,embedding
0,12시 땡!,하루가 또 가네요.,0,"[0.20179577, -0.03443793, 1.5395724, 0.0106974..."
1,1지망 학교 떨어졌어,위로해 드립니다.,0,"[0.07716593, -0.03427811, 0.86244196, 0.026360..."
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0,"[0.10445253, -0.012432268, 1.0132881, 0.022501..."
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0,"[0.09760731, -0.046716906, 0.89369446, 0.02104..."
4,PPL 심하네,눈살이 찌푸려지죠.,0,"[-0.07002919, 0.03196142, 1.4915429, 4.3293196..."


In [6]:
def answer(question):
    """Return the stored answer whose question is most similar to ``question``.

    The query is embedded with the notebook-level ``model`` and compared, by
    cosine similarity, against every vector in ``df['embedding']``. All
    similarities are computed with a single matrix-vector product rather than
    one Python call per row.

    Parameters
    ----------
    question : str
        The user's input text, passed to ``model.encode``.

    Returns
    -------
    The value of column ``'A'`` in the row with the highest similarity score.

    Side effects
    ------------
    Overwrites ``df['score']`` with the similarity of each row to this query
    (kept so the scores can still be inspected after a call).
    """
    # assumes model.encode returns a 1-D vector for a single string -- TODO confirm
    q_embedding = np.asarray(model.encode(question))

    # One row per stored question: shape (n_rows, dim).
    embeddings = np.stack(df['embedding'].tolist())

    dots = embeddings @ q_embedding
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(q_embedding)

    # Rows (or a query) with zero norm have no direction; score them 0 instead
    # of dividing by zero and producing NaN/inf.
    scores = np.divide(
        dots,
        norms,
        out=np.zeros_like(dots, dtype=float),
        where=norms != 0,
    )

    df['score'] = scores
    # Single .loc lookup (row label, column) instead of chained indexing.
    return df.loc[df['score'].idxmax(), 'A']

### df.loc
 - access a group of rows and columns by label(s) or a boolean array

### df.idxmax()
 - return the row label of the maximum value.

```python
df = pd.DataFrame([['a','b',3], ['c','d',6], ['e','f',9]], columns=['col1', 'col2', 'col3'])
print(df)
"""
col1 col2  col3
0    a    b     3
1    c    d     6
2    e    f     9
"""

print(df.col3.idxmax()) # 2
print(df.loc[2])
"""
col1    e
col2    f
col3    9
"""

print(df.loc[2]['col1']) # e
```

In [7]:
# Ask the bot a sample question; the returned answer text is shown as the cell output.
answer(question='결혼하고싶어')

'좋은 사람이랑 결혼할 수 있을 거예요.'