In [7]:
!pip install openai

Collecting openai
  Downloading openai-1.3.3-py3-none-any.whl (220 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/220.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m143.4/220.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.3/220.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any

In [8]:
import os
import warnings

import numpy as np
import openai
import pandas as pd
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [56]:
def load_your_dataset(file_path, text_col="sentence", label_col="score", placeholder="idk"):
    """Load a labelled-text CSV and return its texts and labels as arrays.

    Missing or whitespace-only entries in the text column are replaced by
    ``placeholder`` so that every row carries a non-blank value.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        text_col: Name of the column holding the response text.
        label_col: Name of the column holding the score.
        placeholder: Value substituted for missing / blank texts.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays in file order.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is not a column of the file.
    """
    data = pd.read_csv(file_path)

    raw_text = data[text_col]
    # Vectorised blank detection: NaN/None cells, or cells whose string form
    # consists only of whitespace. An empty string is *not* whitespace and is
    # kept, exactly as str.isspace() reports it.
    is_blank = raw_text.isna() | raw_text.astype(str).str.isspace()
    data[text_col] = raw_text.mask(is_blank, placeholder)

    texts = data[text_col].values
    labels = data[label_col].values
    return texts, labels

In [57]:
texts, labels = load_your_dataset("falling_weights.csv")

# Map the raw scores to contiguous class ids, then to one-hot rows.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
categorical_labels = to_categorical(encoded_labels)

# A fixed seed keeps the split identical across re-runs. This matters because
# the teacher labels are generated once for X_train and are matched to it by
# row position; an unseeded re-split would silently misalign them.
X_train, X_test, y_train, y_test = train_test_split(
    texts, categorical_labels, test_size=0.2, random_state=42
)

In [163]:
import tensorflow as tf
from tensorflow import keras

class Distiller(keras.Model):
    """Train a student network on ground-truth labels plus precomputed teacher labels.

    The teacher is never called during training. ``teacher_predictions`` holds
    one row per training sample and ``train_step`` walks through it with a
    running cursor, so row ``i`` must belong to the ``i``-th training sample
    and batches must arrive in dataset order. For that reason ``fit`` defaults
    to ``shuffle=False`` (Keras itself defaults to ``True``).
    """

    # Zero-based position of ``shuffle`` among ``keras.Model.fit``'s positional
    # parameters: x, y, batch_size, epochs, verbose, callbacks,
    # validation_split, validation_data, shuffle.
    _SHUFFLE_ARG_POSITION = 8

    def __init__(self, student, teacher_predictions):
        """Store the student model and the per-sample teacher targets.

        Args:
            student: Model called as ``student(x, training=...)``; only its
                trainable variables are updated.
            teacher_predictions: Array-like whose first axis indexes training
                samples, in the same order as the data passed to ``fit``.
        """
        super().__init__()
        self.teacher_predictions = teacher_predictions
        self.student = student
        # Index of the next unread teacher row; advanced by train_step.
        self.processed_samples = tf.Variable(0, dtype=tf.int64, trainable=False)

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        alpha=0.1,
        temperature=3,
    ):
        """Configure optimizer, metrics and the two loss terms.

        Args:
            optimizer: Optimizer applied to the student's trainable variables.
            metrics: Metrics updated with ``(y, student_predictions)``.
            student_loss_fn: Loss between ground truth and student output.
            distillation_loss_fn: Loss between teacher rows and student output.
            alpha: Weight of ``student_loss_fn``; the distillation term gets
                ``1 - alpha``.
            temperature: Stored on the instance. NOTE: neither ``train_step``
                nor ``test_step`` reads it, so it currently has no effect.
        """
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.alpha = alpha
        self.temperature = temperature

    def fit(self, *args, **kwargs):
        """Run ``keras.Model.fit`` with ``shuffle`` defaulting to ``False``.

        ``train_step`` matches batches to teacher rows purely by position, so
        shuffled batches would be paired with the wrong teacher targets. An
        explicit truthy ``shuffle`` is honoured but triggers a warning. The
        teacher-row cursor is rewound before training starts so that a
        previously interrupted run cannot leave it mid-dataset.
        """
        if len(args) > self._SHUFFLE_ARG_POSITION:
            shuffle = args[self._SHUFFLE_ARG_POSITION]
        else:
            shuffle = kwargs.setdefault("shuffle", False)
        if shuffle:
            warnings.warn(
                "Distiller pairs batches with teacher_predictions by position; "
                "shuffling the training data misaligns the teacher targets.",
                stacklevel=2,
            )
        self.processed_samples.assign(0)
        return super().fit(*args, **kwargs)

    def train_step(self, data):
        """Run one optimisation step on the student.

        Args:
            data: ``(x, y)`` batch; ``y`` is the ground-truth target.

        Returns:
            Dict of metric results plus ``student_loss`` and
            ``distillation_loss`` for this batch.
        """
        x, y = data

        batch_size = tf.cast(tf.shape(x)[0], dtype=tf.int64)
        total_size = tf.cast(tf.shape(self.teacher_predictions)[0], dtype=tf.int64)

        # Window of teacher rows belonging to this batch, clamped so a short
        # final batch never reads past the last row.
        batch_start = tf.minimum(self.processed_samples, total_size)
        batch_end = tf.minimum(batch_start + batch_size, total_size)

        # Advance the cursor, wrapping to 0 once the last teacher row is
        # consumed (the end of an epoch when there is one row per sample).
        next_position = tf.where(batch_end >= total_size, tf.cast(0, tf.int64), batch_end)
        self.processed_samples.assign(next_position)

        indices = tf.range(batch_start, batch_end)
        teacher_predictions = tf.gather(self.teacher_predictions, indices, axis=0)

        with tf.GradientTape() as tape:
            student_predictions = self.student(x, training=True)

            student_loss = self.student_loss_fn(y, student_predictions)
            distillation_loss = self.distillation_loss_fn(teacher_predictions, student_predictions)

            # Convex combination of the supervised and the distillation term.
            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss

        # Only the student is updated.
        trainable_vars = self.student.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(y, student_predictions)

        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss, "distillation_loss": distillation_loss})
        return results

    def test_step(self, data):
        """Evaluate the student on one batch; teacher rows are not used.

        Args:
            data: ``(x, y)`` batch.

        Returns:
            Dict of metric results plus ``student_loss`` for this batch.
        """
        x, y = data

        y_prediction = self.student(x, training=False)

        student_loss = self.student_loss_fn(y, y_prediction)
        self.compiled_metrics.update_state(y, y_prediction)

        results = {m.name: m.result() for m in self.metrics}
        results.update({"student_loss": student_loss})
        return results

In [92]:
import re

# Read the key from the environment so no credential is ever stored in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell")

def extract_score(text):
    """Extract the integer score from a reply such as 'The score is 3'.

    Args:
        text: Reply to parse. ``None`` and other non-string values are
            accepted and treated as "no score", since a chat completion's
            message content may be absent.

    Returns:
        The digits that follow the first occurrence of the literal phrase
        "The score is " as an ``int``, or ``None`` when the phrase is not
        found or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None
    match = re.search(r"The score is (\d+)", text)
    return int(match.group(1)) if match else None

def get_teacher_predictions(
    texts,
    model="ft:gpt-3.5-turbo-0613:ai4stem::8LPJZ4oN",  # Fine-tuned model on falling weights dataset
    system_prompt="This is a question about falling weights from ETS chemistry uncategorized data",
):
    """Ask the teacher chat model for a score for every text.

    One chat-completion request is issued per text, sequentially.

    Args:
        texts: Iterable of texts to score; each item is sent as the user message.
        model: Identifier of the chat model to query.
        system_prompt: Content of the system message sent with every request.

    Returns:
        NumPy array with one entry per text, in input order. An entry is the
        integer parsed by ``extract_score``, or ``None`` when the reply
        contained no parseable score (the array then has object dtype).
    """
    # The system message is identical for every request, so build it once.
    system_message = {"role": "system", "content": system_prompt}

    predictions = []
    for text in texts:
        response = openai.chat.completions.create(
            model=model,
            messages=[
                system_message,
                # str() guards against non-string cells (e.g. a numeric answer).
                {"role": "user", "content": str(text)},
            ],
        )
        predictions.append(extract_score(response.choices[0].message.content))
    return np.array(predictions)

# Query the teacher once per training text. Despite the name, these are hard
# labels (one integer score per text), not probability distributions.
soft_labels = get_teacher_predictions(X_train)

# extract_score yields None for replies it cannot parse; fail loudly here
# instead of letting to_categorical raise an opaque error.
unparsed = [i for i, score in enumerate(soft_labels) if score is None]
if unparsed:
    raise ValueError(
        f"Teacher returned no parseable score for {len(unparsed)} of {len(soft_labels)} "
        f"training texts (first positions: {unparsed[:5]})"
    )

# Route the scores through the fitted encoder so the one-hot columns line up
# with those of categorical_labels even when scores are not exactly 0..K-1.
soft_labels_one_hot = to_categorical(
    label_encoder.transform(soft_labels), num_classes=len(label_encoder.classes_)
)

In [94]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def preprocess_your_data(X, sequence_length, tokenizer=None, num_words=5000):
    """Turn raw texts into fixed-length rows of token ids.

    Args:
        X: Iterable of texts.
        sequence_length: Number of ids per row; shorter texts are zero-padded
            at the end, longer ones are truncated at the end.
        tokenizer: Already-fitted ``Tokenizer`` to reuse. When ``None`` a new
            tokenizer is created and fitted on ``X`` itself.
        num_words: Vocabulary cap for a newly created tokenizer; ignored when
            ``tokenizer`` is given.

    Returns:
        ``float32`` array with one row of ``sequence_length`` ids per text.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=num_words, oov_token="<UNK>")
        tokenizer.fit_on_texts(X)

    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(X)

    # Pad / truncate to the specified length
    padded_sequences = pad_sequences(sequences, maxlen=sequence_length, padding='post', truncating='post')
    return padded_sequences.astype(np.float32)


sequence_length = 100

# Fit the vocabulary on the training split only and reuse it for the test
# split. Fitting a separate tokenizer per split would assign different ids to
# the same word, making test inputs meaningless to a model trained on X_train.
text_tokenizer = Tokenizer(num_words=5000, oov_token="<UNK>")
text_tokenizer.fit_on_texts(X_train)

X_train_processed = preprocess_your_data(X_train, sequence_length, tokenizer=text_tokenizer)
X_test_processed = preprocess_your_data(X_test, sequence_length, tokenizer=text_tokenizer)

In [166]:
# Student network: the padded token-id vector feeds one 128-unit hidden layer,
# followed by a softmax over the encoded classes.
student = keras.Sequential()
student.add(keras.layers.InputLayer(input_shape=(sequence_length,)))
student.add(keras.layers.Dense(128, activation='relu'))
student.add(keras.layers.Dense(len(label_encoder.classes_), activation='softmax'))

In [None]:
# Initialize and compile the distiller
distiller = Distiller(student=student, teacher_predictions=soft_labels_one_hot)
distiller.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.CategoricalAccuracy()],
    # The student ends in a softmax layer, so the loss receives probabilities,
    # not logits.
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=False),
    distillation_loss_fn=keras.losses.KLDivergence(),
    alpha=0.1,
    temperature=10,
)

# Train the student. Distiller.train_step pairs each batch with teacher rows
# by position, so the data must be fed in its original order: no shuffling.
distiller.fit(X_train_processed, y_train, batch_size=32, epochs=32, shuffle=False)

# Evaluate the student model
distiller.evaluate(X_test_processed, y_test, batch_size=32)

**HERE IS A VANILLA STUDENT MODEL FOR TRAINING**

In [46]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

def create_student_model(sequence_length, num_classes, vocab_size=5000, embedding_dim=128, lstm_units=128):
    """Build and compile a simple Embedding -> LSTM -> softmax text classifier.

    Args:
        sequence_length: Number of token ids per input row.
        num_classes: Size of the softmax output layer.
        vocab_size: ``input_dim`` of the embedding layer. Defaults to 5000,
            the same value this notebook passes to ``Tokenizer(num_words=...)``.
        embedding_dim: Width of each embedding vector.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy, the
        Adam optimizer and an accuracy metric.
    """
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length),
        LSTM(lstm_units),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Instantiate the baseline student with one output unit per encoded class.
student_model = create_student_model(
    sequence_length=sequence_length,
    num_classes=len(label_encoder.classes_),
)

In [None]:
# Fit the baseline student directly on the teacher's one-hot labels.
student_model.fit(
    x=X_train_processed,
    y=soft_labels_one_hot,
    batch_size=32,
    epochs=32,
)

In [None]:
# X_test_processed was already built in the preprocessing cell; reuse it here
# rather than tokenising the test split again with another freshly fitted
# tokenizer.
loss, accuracy = student_model.evaluate(X_test_processed, y_test)
print(f"Test Accuracy: {accuracy}")

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
