**Download Snowflake Embedding Data**

In [2]:
# Load every row of SNOWFLAKE_EMBEDDING_TRIAGE_FINAL into a pandas DataFrame.
cur = conn.cursor()
try:
    cur.execute("""
select * from SNOWFLAKE_EMBEDDING_TRIAGE_FINAL
""")

    # Fetch all the results from the query
    results = cur.fetchall()

    # Column names are the first field of each cursor.description entry;
    # read them before the cursor is closed
    column_names = [desc[0] for desc in cur.description]
finally:
    # Release the cursor even when the query or the fetch fails
    cur.close()

# Convert the results into a pandas DataFrame
df = pd.DataFrame(results, columns=column_names)

In [4]:
# Write a small preview of the downloaded table to CSV.
# NOTE(review): only the rows returned by df.head() are written, not the whole table — confirm a sample is intended.
preview_rows = df.head()
preview_rows.to_csv('snowflake_embedding.csv', index=False)

**Data Preparation**

- Use only rows with a validated category (`VALIDATED_CATEGORY`), because many of the original labels could be incorrect

In [27]:
# Load CONVERSATION_ID and VALIDATED_CATEGORY from conversation_dataset.
# Only these two columns are requested, so columns that would be discarded are never fetched.
cur = conn.cursor()
try:
    cur.execute("""
    select CONVERSATION_ID, VALIDATED_CATEGORY
    from class_shared_data.class_dataset.conversation_dataset
    """)

    # Fetch all the results from the query
    results = cur.fetchall()

    # Column names are the first field of each cursor.description entry;
    # read them before the cursor is closed
    column_names = [desc[0] for desc in cur.description]
finally:
    # Release the cursor even when the query or the fetch fails
    cur.close()

# Convert the results into a pandas DataFrame
conversation_df = pd.DataFrame(results, columns=column_names)

# Pin the column set and order explicitly
conversation_df = conversation_df[['CONVERSATION_ID', 'VALIDATED_CATEGORY']]

In [None]:
# Attach VALIDATED_CATEGORY to df with a left merge on 'CONVERSATION_ID'.
# Any VALIDATED_CATEGORY column left over from an earlier run of this cell is dropped first;
# otherwise the merge would produce VALIDATED_CATEGORY_x / VALIDATED_CATEGORY_y and the
# filter below would raise KeyError.
# NOTE(review): if conversation_df holds repeated CONVERSATION_IDs, the left merge duplicates
# the matching rows of df — confirm the key is unique.
df = pd.merge(
    df.drop(columns=['VALIDATED_CATEGORY'], errors='ignore'),
    conversation_df,
    on='CONVERSATION_ID',
    how='left',
)

# Keep only the rows with VALIDATED_CATEGORY == True or SIMPLIFIED_CATEGORY == 'OTHER'
is_validated = df['VALIDATED_CATEGORY'] == True
is_other = df['SIMPLIFIED_CATEGORY'] == 'OTHER'
df = df[is_validated | is_other]

print(len(df))

205099


- Only use messages with little masking (fewer than 20% of the characters are `*`)

In [30]:
# Drop heavily masked messages: a row is kept only when the number of '*'
# characters in 'MESSAGE_PLUS_TRIAGE' is below 20% of the text length
messages = df['MESSAGE_PLUS_TRIAGE']
asterisk_counts = messages.str.count(r'\*')
asterisk_limits = 0.2 * messages.str.len()
df = df[asterisk_counts < asterisk_limits]

print(len(df))

187144


In [31]:
# Number of rows per SIMPLIFIED_CATEGORY label, most frequent first
category_counts = df['SIMPLIFIED_CATEGORY'].value_counts()
category_counts

SIMPLIFIED_CATEGORY
OTHER                                              161517
password_reset                                       8736
add_user OR remove_user                              4691
os_update                                            4648
shipping_request                                     2676
add_user_to_channel OR remove_user_from_channel      2169
add_license OR remove_license                        1668
reset_mfa                                            1039
Name: count, dtype: int64

In [None]:
# Summarise the class balance: rows labelled 'OTHER' versus all remaining labels.
# value_counts() is computed once and reused, and .get() reports 0 instead of
# raising KeyError when no 'OTHER' rows are present.
category_counts = df['SIMPLIFIED_CATEGORY'].value_counts()
total_count = category_counts.sum()
other_count = category_counts.get('OTHER', 0)

print('Other:', other_count)
print('Non-Other:', total_count - other_count)
print('Total:', total_count)

Other: 161517
Non-Other: 25627
Total: 187144


In [None]:
# Inspect the first embedding: show its type and its length
first_embedding = df['MULTILINGUAL_E5LARGE_EMBEDDING'].iloc[0]
print(type(first_embedding), len(first_embedding))

<class 'list'> 1024


In [38]:
# Preview the first five rows of the filtered DataFrame
df.head(n=5)

Unnamed: 0,CONVERSATION_ID,SIMPLIFIED_CATEGORY,CONVERSATION_CHANNEL,MESSAGE_PLUS_TRIAGE,MULTILINGUAL_E5LARGE_EMBEDDING,VALIDATED_CATEGORY
0,fc6bf2cb796de4a21c69a47d7cb1b7c816ce2c39,add_user OR remove_user,chat,"Hello, we had a new onboarding of ****** -- h...","[0.010034076869487762, -0.007666395977139473, ...",True
1,311db9f15234e58b0223e34d0b8d43b2e5ee5601,OTHER,email,I am going to take ****’s laptop and replace ...,"[0.026209674775600433, 0.01610388420522213, -0...",False
2,8dca317f750535d770136a489edf16f761f71d7e,OTHER,email,"Hi, I just got a new phone and don’t know my...","[0.015950312837958336, -0.027684127911925316, ...",False
4,c3497b05d9b68df428aa47e133dada46d41fc523,OTHER,chat,"My request type is APPLICATION, There are some...","[0.034831006079912186, -0.0027348522562533617,...",False
11,c91e986c8008cbc2d81dfe8b29ee4b1a26307016,OTHER,chat,"My request type is NETWORK, There are some add...","[0.024984337389469147, -0.020768102258443832, ...",False


In [None]:
# Convert every MULTILINGUAL_E5LARGE_EMBEDDING value to a float32 numpy.ndarray.
# assign() returns a new DataFrame instead of writing a column into `df`, which at this
# point is the result of boolean filtering; in-place column assignment on such a frame
# triggers pandas' SettingWithCopyWarning.
df = df.assign(
    MULTILINGUAL_E5LARGE_EMBEDDING=df['MULTILINGUAL_E5LARGE_EMBEDDING'].apply(
        lambda x: np.array(x, dtype=np.float32)
    )
)

In [42]:
# Re-check the first embedding after conversion: show its type and its length
converted_embedding = df['MULTILINGUAL_E5LARGE_EMBEDDING'].iloc[0]
print(type(converted_embedding), len(converted_embedding))

<class 'numpy.ndarray'> 1024


In [43]:
# Stack the per-row embeddings into a single array and store it as .npy,
# which keeps the ndarray format that a CSV cell would not
embedding_matrix = np.stack(df['MULTILINGUAL_E5LARGE_EMBEDDING'].values)
np.save('multilingual_embeddings.npy', embedding_matrix)

# Write the remaining columns to CSV, leaving the embeddings column out
frame_without_embeddings = df.drop(columns=['MULTILINGUAL_E5LARGE_EMBEDDING'])
frame_without_embeddings.to_csv('snowflake_embedding_less_asterisk_cleaned.csv', index=False)