In [1]:
import pandas as pd
import numpy as np
import open_ended_tools

# Run configuration: every value a reader may want to tune is set in this cell.

# Input locations (the directory string ends with '/' because paths are built by concatenation)
directory = './Data_Training/'
file_answers = 'open_ended_answers.csv'
file_metrics = 'metrics.csv'

# Embedding settings passed on to the open_ended_tools helpers.
# NOTE(review): False appears to reuse previously saved embeddings rather than
# requesting new ones — confirm against open_ended_tools.
embedding_model = 'text-embedding-ada-002'
generate_embeddings = False

# Number of clusters to use
n_clusters = 3

# Seed for reproducible sampling; set to None for a truly random run.
# NOTE(review): the earlier note cited 42 as the seed that reproduces the
# OpenAI docs samples, but the value set here is 40 — confirm which is intended.
random_state = 40

In [2]:
# Load the answers table (first CSV column becomes the index) and keep only
# the columns listed below, in this order.
answer_columns = [
    'Question_ID', 'Type', 'Question', 'Answer', 'Correct_answer',
    'Curiosity', 'Hunger', 'Smarts', 'Relevance',
    'Curiosity_optimum', 'Hunger_optimum', 'Smarts_optimum', 'Relevance_optimum',
]
df = pd.read_csv(f'{directory}{file_answers}', index_col=0)
df = df[answer_columns]

# Load the metric definitions; the metric names are taken from the file
# itself rather than being hard-coded in the notebook.
metric_columns = ['Metric', 'Category_term_pos', 'Category_term_neg']
df_metrics = pd.read_csv(f'{directory}{file_metrics}', index_col=0)
df_metrics = df_metrics[metric_columns]
metrics = df_metrics['Metric'].unique()

# Display the distinct metric names as the cell output
metrics

array(['Curiosity', 'Hunger', 'Smarts'], dtype=object)

In [3]:
#%% Create open_ended_answer object and embeddings
# For every distinct Question_ID: record the question text, wrap that
# question's rows in an OpenEndedAnswer, then generate/load its embeddings
# through a per-question CSV path and print the object's summary.
question = []
ans = []

# Hoisted out of the loop: the set of question IDs does not change while iterating.
question_ids = df['Question_ID'].unique()
# file_answers[:-4] drops the last four characters (the '.csv' suffix) so the
# question ID can be inserted before the extension.
embeddings_stem = directory + file_answers[:-4]

for i, q_ID in enumerate(question_ids):
    df_question = df[df['Question_ID'] == q_ID]
    # Positional access to the first matching row; unlike a lookup by index
    # label this still yields a single value when index labels repeat.
    question.append(df_question['Question'].iloc[0])
    ans.append(open_ended_tools.OpenEndedAnswer(df_question, metrics))

    ans[i].generate_answer_embeddings(f'{embeddings_stem}_{q_ID}.csv',
                                      random_state=random_state,
                                      generate_embeddings=generate_embeddings,
                                      embedding_model=embedding_model)
    print(ans[i])

Embeddings file read.
Question: 1, Describe a lightweight concept for an aircraft fuselage longitudinal stringer, with manufacturing methods that could be used. Its function is to make a stiffened skin monocoque structure. The stringer will take axial loads and prevent skin buckling. There are no cost or schedule constraints.
# of Answers in Model: 34
Metrics: ['Curiosity' 'Hunger' 'Smarts']
# of Clusters: None

Embeddings file read.
Question: 2, Identify the pattern in these numbers: 854976320
# of Answers in Model: 20
Metrics: ['Curiosity' 'Hunger' 'Smarts']
# of Clusters: None



In [4]:
#%% Create metric embeddings
# Build one OpenEndedMetric per distinct metric name (in order of first
# appearance in df_metrics) and generate its embeddings.
met = []

# Hoisted out of the loop: the metric names do not change while iterating.
metric_names = df_metrics['Metric'].unique()

for i, met_name in enumerate(metric_names):
    met.append(open_ended_tools.OpenEndedMetric(df_metrics[df_metrics['Metric'] == met_name]))

    # NOTE(review): generate_embeddings is hard-coded to True here, so the
    # `generate_embeddings` run parameter is not honoured for metrics — confirm
    # this is intentional.
    # NOTE(review): the path passed is the metrics input file itself; verify
    # how generate_metric_embeddings uses it.
    met[i].generate_metric_embeddings(directory+file_metrics,
                                      generate_embeddings=True,
                                      embedding_model=embedding_model)

Embeddings created.
Embeddings created.
Embeddings created.


In [5]:
# Inspect the table held by the third metric object (index 2 in `met`),
# including the embedding columns added when the metric embeddings were generated.
met[2].df

Unnamed: 0_level_0,Metric,Category_term_pos,Category_term_neg,embedding_pos,embedding_neg
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,Smarts,Intelligence,A stupid engineer,"[-0.00781696941703558, -0.00983723346143961, 0...","[-0.011999057605862617, 0.004398558754473925, ..."
5,Smarts,An engineer who is always correct,An engineer who make a lot of mistakes,"[0.0018882370786741376, 0.0017914898926392198,...","[-0.016456840559840202, 0.001226898399181664, ..."
6,Smarts,An engineer who develops feasible concepts,An engineer who develops infeasible concepts,"[-0.006476408336311579, -0.009969864971935749,...","[-0.011681289412081242, -0.019320610910654068,..."


In [6]:
# Inspect the answers table held by the first question's object (index 0 in `ans`),
# including its `embedding` column.
ans[0].df

Unnamed: 0,ID,Question_ID,Type,Question,Answer,Correct_answer,Curiosity,Hunger,Smarts,Relevance,Curiosity_optimum,Hunger_optimum,Smarts_optimum,Relevance_optimum,embedding
0,1,1,Multiple,Describe a lightweight concept for an aircraft...,A minimum weight concept for an aircraft fusel...,,0,0,1,1,1,0,1,1,"[-0.004058697260916233, 0.03430645912885666, -..."
1,2,1,Multiple,Describe a lightweight concept for an aircraft...,One possible concept for a minimum weight airc...,,0,0,1,1,1,0,1,1,"[2.1099383957334794e-05, 0.033611960709095, -0..."
2,3,1,Multiple,Describe a lightweight concept for an aircraft...,Use a composite material such as carbon fiber ...,,0,0,1,1,1,0,1,1,"[0.007572090718895197, 0.04471009597182274, -0..."
3,4,1,Multiple,Describe a lightweight concept for an aircraft...,The stringer could be manufactured with carbon...,,0,0,1,1,1,0,1,1,"[-0.004457200411707163, 0.010434738360345364, ..."
4,5,1,Multiple,Describe a lightweight concept for an aircraft...,Rather than make a new component and add a str...,,1,0,1,1,1,0,1,1,"[-0.00869241077452898, 0.04867749661207199, 0...."
5,6,1,Multiple,Describe a lightweight concept for an aircraft...,Material: The stringer could be made of a high...,,0,0,1,1,1,0,1,1,"[0.0014477220829576254, 0.03042137622833252, -..."
6,7,1,Multiple,Describe a lightweight concept for an aircraft...,There is not enough information provided to an...,,-1,-1,-1,-1,1,0,1,1,"[0.017032714560627937, 0.0015377630479633808, ..."
7,8,1,Multiple,Describe a lightweight concept for an aircraft...,"Given enough time, I could answer this questio...",,-1,-1,-1,-1,1,0,1,1,"[-0.004106028471142054, -0.023306598886847496,..."
8,9,1,Multiple,Describe a lightweight concept for an aircraft...,Material: The stringer could be made of a ligh...,,1,0,1,1,1,0,1,1,"[0.006133295129984617, 0.026939189061522484, -..."
9,10,1,Multiple,Describe a lightweight concept for an aircraft...,Material: The stringer could be made of a ligh...,,1,0,1,1,1,0,1,1,"[-0.0046913716942071915, 0.015704255551099777,..."


In [7]:
# Print the positive and then the negative category terms of the first metric
for term_column in ('Category_term_pos', 'Category_term_neg'):
    print(met[0].df[term_column])

ID
0    An innovative engineer
1       A creative engineer
2        A curious engineer
Name: Category_term_pos, dtype: object
ID
0    An engineer who has stagnated
1            A complacent engineer
2         A disinterested engineer
Name: Category_term_neg, dtype: object


In [8]:
#%% Test out embeddings scoring
# Score the first question's answers against the third metric; the result is
# left as the last expression so the notebook displays it.
# NOTE(review): the result presumably has one row per answer — confirm in
# open_ended_tools.metric_score.
metric_under_test = met[2]
answers_under_test = ans[0]
open_ended_tools.metric_score(metric_under_test, answers_under_test)

array([[-0.06465412,  0.18611255,  0.67848847],
       [-0.25286785,  0.19149497,  0.62590204],
       [-0.35708739,  0.0732384 ,  0.66926473],
       [-0.333259  ,  0.30889701,  0.52984237],
       [-0.34754031, -0.0051174 ,  0.47098311],
       [-0.07895855,  0.11996127,  0.72775403],
       [ 0.47939182, -0.17766709,  0.19181427],
       [ 0.65840811,  0.13610027, -0.05586348],
       [-0.23827979,  0.04062822,  0.6681309 ],
       [-0.11171168,  0.11735523,  0.78324401],
       [-0.11245923,  0.16407078,  0.60867754],
       [-0.09624665,  0.10978699,  0.81557632],
       [-0.25906309,  0.18509625,  0.86504715],
       [-0.20979168,  0.22517888,  0.72197291],
       [-0.04735452,  0.18928703,  0.55464548],
       [-0.62463192,  0.0774737 ,  0.46734215],
       [ 0.09568799, -0.10034069,  0.42170389],
       [-0.21383478,  0.17970768,  0.68944588],
       [-0.22168808,  0.27418799,  0.72085643],
       [-0.83222042, -0.30488588,  0.2229768 ],
       [-0.56663252,  0.2581097 ,  0.522

In [9]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible up front.
from openai.embeddings_utils import cosine_similarity, get_embedding

# Probe: embed an arbitrary word with the configured model and compare it
# with the stored embedding of the row at position 15 of the first question's table.
test = 'tortillas'
test_embedding = get_embedding(test, engine=embedding_model)
answer_embedding = ans[0].df['embedding'].iloc[15]
cosine_similarity(answer_embedding, test_embedding)

0.7009626651772836