In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LSTM, Bidirectional
from keras.utils import to_categorical

In [None]:
# Returns X_train, X_test, y_train, y_test that works with model
def get_training_data(X, y, labels):
  """One-hot encode the labels and split samples/labels into train and test sets.

  Args:
    X: Sequence of samples (for example raw query strings).
    y: Sequence of label values, one per sample. Every value must appear in
      `labels`. Lists, NumPy arrays and pandas Series are all accepted.
    labels: Ordered collection of every possible label. A label's position in
      this collection becomes its class index in the one-hot encoding.

  Returns:
    The four-element result of `train_test_split(X, y_one_hot, ...)`:
    X_train, X_test, y_train, y_test, where the y arrays are one-hot rows of
    width `len(labels)`. The split holds out 20% and uses random_state=30.

  Raises:
    KeyError: If `y` contains a value that is not present in `labels`.
  """
  label_to_index = {label: index for index, label in enumerate(labels)}
  # Iterate over the values instead of indexing y[i]: positional indexing
  # breaks on a pandas Series whose index has gaps (e.g. after dropna).
  y_indices = np.array([label_to_index[label] for label in y], dtype=int)
  y_one_hot = to_categorical(y_indices, num_classes=len(labels))
  return train_test_split(X, y_one_hot, test_size=0.2, random_state=30)

In [None]:
# Takes in training and testing data, outputs tokenized and padded
def standardize_data(X_train, X_test, word_count):
  """Tokenize both text collections and pad them to a common length.

  The tokenizer is fitted on `X_train` only, so the vocabulary never sees the
  test texts; words it does not know are mapped to the "<OOV>" token.

  Args:
    X_train: Training texts; used to fit the tokenizer and to pick the length.
    X_test: Test texts; converted with the tokenizer fitted on `X_train`.
    word_count: Passed to the tokenizer as `num_words`.

  Returns:
    A tuple (X_train_padded, X_test_padded, max_length) where `max_length` is
    the length of the longest tokenized training text and both arrays are
    post-padded to that length by `pad_sequences`.
  """
  text_tokenizer = Tokenizer(num_words=word_count, oov_token="<OOV>")
  text_tokenizer.fit_on_texts(X_train)

  train_sequences, test_sequences = [
      text_tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
  ]

  # The longest *training* sequence fixes the width used for both sets.
  max_length = max(map(len, train_sequences))

  def pad_to_max(sequences):
    return pad_sequences(sequences, maxlen=max_length, padding='post')

  return pad_to_max(train_sequences), pad_to_max(test_sequences), max_length

In [None]:
# Loading the data

# Main dataset: keep only the rows that have both an intensity and a classification.
df = pd.read_csv('final data.csv').dropna(subset=['Intensity', 'Classification'])
print(df)

# Load intensity data, filtered the same way as the main dataset.
intensity_df = pd.read_csv('/content/Intensity Database - Attempt 2.csv').dropna(
    subset=['Intensity', 'Classification']
)
intensity_df

                                                  Query Classification  \
0     Are there any general debate teams around campus?           Arts   
1     Hello, I'm new to campus and I'm looking to fi...           Arts   
2     I am interested in art and want to learn more ...           Arts   
3     I am really looking to design and build new th...           Arts   
4     I like drawing in my leisure time, is there a ...           Arts   
...                                                 ...            ...   
2919  Strong public speaking skills and passionate a...        Service   
2920  Experienced fundraiser and passionate about en...        Service   
2921  Data analyst student with a strong work ethic ...        Service   
2922  Experienced grant writer and passionate about ...        Service   
2923  Strong research skills and passionate about fo...        Service   

      Intensity  
0             1  
1             1  
2             1  
3             1  
4             1  
...

Unnamed: 0,Query,Classification,Intensity
0,I have a strong background in computer science...,STEM,3
1,I'm passionate about exploring the intersectio...,STEM,3
2,I want to participate in cutting-edge research...,STEM,3
3,I'm interested in joining a robotics club that...,STEM,3
4,I'm eager to contribute to blockchain research...,STEM,3
...,...,...,...
840,I recently wanted to get into breakdancing,Arts,1
841,I want to get out of my comfort zone by joinin...,Arts,1
842,I want to pick up painting as a hobby.,Arts,1
843,I want to start getting into Anime and learnin...,Arts,1


In [None]:
# Experimenting with the priority model
dict_word_count = 2500

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling repeat on a fresh "Restart & Run All" (the data split is already seeded).
tf.keras.utils.set_random_seed(30)

# Getting relevant data
queries = df['Query'].values
intensity = df['Intensity'].values
intensity_labels = [1, 2, 3]

# Morphing data to form into model
X_train, X_test, y_train_intensity, y_test_intensity = get_training_data(queries, intensity, intensity_labels)
X_train_padded, X_test_padded, max_length = standardize_data(X_train, X_test, dict_word_count)

# Create model
intensity_model = Sequential([
    # mask_zero=True: the inputs are post-padded with 0, so without a mask the
    # LSTM's final state would be computed over the trailing padding steps.
    Embedding(input_dim=dict_word_count, output_dim=4, input_length=max_length, mask_zero=True),
    LSTM(units=5, dropout=0.6),
    # One output unit per intensity label, kept in sync with the one-hot width.
    Dense(len(intensity_labels), activation='softmax')
])

intensity_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
intensity_model.summary()

# Fit model and show how it went
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported test accuracy is not independent of any tuning done against it.
history = intensity_model.fit(X_train_padded, y_train_intensity, epochs=20, validation_data=(X_test_padded, y_test_intensity))
test_loss, test_acc = intensity_model.evaluate(X_test_padded, y_test_intensity, verbose=2)

print('\nTest accuracy:', test_acc)

predictions = intensity_model.predict(X_test_padded)
# argmax yields class indices (0-based positions in intensity_labels),
# not the raw 1-3 intensity values.
predicted_priorities = np.argmax(predictions, axis=1)



Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 35, 4)             10000     
                                                                 
 lstm_23 (LSTM)              (None, 5)                 200       
                                                                 
 dense_23 (Dense)            (None, 3)                 18        
                                                                 
Total params: 10218 (39.91 KB)
Trainable params: 10218 (39.91 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
19/19 - 0s - loss: 0.8490 - accuracy: 0.6034 - 98m

In [None]:
# Experimenting with category model
dict_word_count = 10000

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling repeat on a fresh "Restart & Run All" (the data split is already seeded).
tf.keras.utils.set_random_seed(30)

# Getting needed data
# Re-derive the queries here so this cell does not depend on the priority-model
# cell having been run first.
queries = df['Query'].values
classifications = df['Classification'].values
classification_labels = ["Arts","Athletics","Business","Culture","Government","Professional","Religion","Service","Social","STEM"]

# Morphing data to fit into model
X_train, X_test, y_train_category, y_test_category = get_training_data(queries, classifications, classification_labels)
X_train_padded, X_test_padded, max_length = standardize_data(X_train, X_test, dict_word_count)

# Create model
category_model = Sequential([
    # mask_zero=True: the inputs are post-padded with 0, so without a mask the
    # forward LSTM would end on padding and the backward LSTM would start on it.
    Embedding(input_dim=dict_word_count, output_dim=32, input_length=max_length, mask_zero=True),
    # NOTE(review): dropout=0.9 is an unusually high rate — confirm it is intentional.
    Bidirectional(LSTM(20, dropout=0.9)),
    # One output unit per category, kept in sync with the one-hot width.
    Dense(len(classification_labels), activation='softmax')
])

category_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
category_model.summary()

# Fit model and show how it went
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported test accuracy is not independent of any tuning done against it.
history = category_model.fit(X_train_padded, y_train_category, epochs=70, validation_data=(X_test_padded, y_test_category))
test_loss, test_acc = category_model.evaluate(X_test_padded, y_test_category, verbose=2)

print('\nTest accuracy:', test_acc)

predictions = category_model.predict(X_test_padded)
# argmax yields class indices (positions in classification_labels), not label names.
predicted_categories = np.argmax(predictions, axis=1)



Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (None, 35, 32)            320000    
                                                                 
 bidirectional (Bidirection  (None, 40)                8480      
 al)                                                             
                                                                 
 dense_24 (Dense)            (None, 10)                410       
                                                                 
Total params: 328890 (1.25 MB)
Trainable params: 328890 (1.25 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19