In [48]:
!pip install tensorflow==2.9.0



In [49]:
import pandas as pd
from tensorflow import keras
import sklearn.metrics as metrics
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense, LSTM, Bidirectional
from keras.utils import to_categorical
import joblib

In [50]:
# Show the version of the Keras module imported from tensorflow, so the
# environment the results were produced with is recorded in the output.
print(keras.__version__)

2.9.0


In [51]:
def get_training_data(X, y, labels):
  """Encode the targets as one-hot vectors and split into train/test sets.

  Each entry of `y` is replaced by its position in `labels`; the positions
  are then one-hot encoded with `len(labels)` classes. The split keeps 20%
  of the samples for testing and uses a fixed random_state.

  Args:
    X: model inputs, one entry per sample.
    y: target label for each sample; every value must appear in `labels`.
    labels: ordered class labels; the order defines the one-hot column order.

  Returns:
    X_train, X_test, y_train, y_test as produced by train_test_split.
  """
  position_of = dict(zip(labels, range(len(labels))))

  # Float buffer with the same shape as y, filled with the class positions.
  class_ids = np.zeros_like(y, dtype=float)
  class_ids[:] = [position_of[y[row]] for row in range(class_ids.shape[0])]

  one_hot = to_categorical(class_ids, num_classes=len(labels))
  return train_test_split(X, one_hot, test_size=0.2, random_state=30)

In [52]:
def standardize_data(X_train, X_test, word_count):
  """Tokenize the training and testing texts and pad them to one length.

  A Tokenizer limited to `word_count` words (with "<OOV>" as the
  out-of-vocabulary token) is fitted on X_train only and then applied to
  both sets. The fitted tokenizer is written with joblib to
  'tokenizer_<word_count>.joblib' so it can be restored later with
  joblib.load.

  Args:
    X_train: training texts.
    X_test: testing texts.
    word_count: vocabulary size passed to the Tokenizer as num_words.

  Returns:
    (X_train_padded, X_test_padded, max_length), where max_length is the
    length of the longest tokenized training text and is used as maxlen
    (with padding='post') for both sets.
  """
  text_tokenizer = Tokenizer(num_words=word_count, oov_token="<OOV>")
  text_tokenizer.fit_on_texts(X_train)
  train_sequences, test_sequences = [
      text_tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
  ]

  # Persist the fitted tokenizer next to the notebook for later inference.
  joblib.dump(text_tokenizer, f"tokenizer_{word_count!s}.joblib")

  max_length = max(map(len, train_sequences))
  print(max_length)

  def pad(sequences):
    return pad_sequences(sequences, maxlen=max_length, padding='post')

  return pad(train_sequences), pad(test_sequences), max_length

In [53]:
# Loading the data: keep only rows that have both an Intensity and a
# Classification value, since both columns are used as training targets.
df = pd.read_csv('final data.csv')
df = df.dropna(subset=['Intensity', 'Classification'])
print(df)

# Load intensity data
# intensity_df = pd.read_csv('/content/Intensity Database - Attempt 2.csv')
# intensity_df.dropna(subset=['Intensity'], inplace=True)
# intensity_df.dropna(subset=['Classification'], inplace=True)
# intensity_df

                                                  Query Classification  \
0     Are there any general debate teams around campus?           Arts   
1     Hello, I'm new to campus and I'm looking to fi...           Arts   
2     I am interested in art and want to learn more ...           Arts   
3     I am really looking to design and build new th...           Arts   
4     I like drawing in my leisure time, is there a ...           Arts   
...                                                 ...            ...   
2919  Strong public speaking skills and passionate a...        Service   
2920  Experienced fundraiser and passionate about en...        Service   
2921  Data analyst student with a strong work ethic ...        Service   
2922  Experienced grant writer and passionate about ...        Service   
2923  Strong research skills and passionate about fo...        Service   

      Intensity  
0             1  
1             1  
2             1  
3             1  
4             1  
...

In [54]:
# Experimenting with the priority model
dict_word_count = 2500

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
keras.utils.set_random_seed(30)

# Getting relevant data
queries = df['Query'].values
intensity = df['Intensity'].values
intensity_labels = [1, 2, 3]

# Morphing data to form into model: one-hot targets + train/test split,
# then tokenized and post-padded inputs.
X_train, X_test, y_train_intensity, y_test_intensity = get_training_data(queries, intensity, intensity_labels)
X_train_padded, X_test_padded, max_length = standardize_data(X_train, X_test, dict_word_count)

# Create model: small embedding -> single LSTM -> softmax over the 3 intensity levels
intensity_model = Sequential([
    Embedding(input_dim=dict_word_count, output_dim=4, input_length=max_length),
    LSTM(units=5, dropout=0.6),
    Dense(3, activation='softmax')
])

intensity_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
intensity_model.summary()

# Fit model and show how it went.
# NOTE: the test split is also passed as validation_data, so the accuracy
# reported below is measured on the same data shown during training.
history = intensity_model.fit(X_train_padded, y_train_intensity, epochs=10, validation_data=(X_test_padded, y_test_intensity))
test_loss, test_acc = intensity_model.evaluate(X_test_padded, y_test_intensity, verbose=2)

print('\nTest accuracy:', test_acc)

# Index of the most probable class for every test query
predictions = intensity_model.predict(X_test_padded)
predicted_priorities = np.argmax(predictions, axis=1)

# save model
intensity_model.save("intensity_model.keras")


35
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 35, 4)             10000     
                                                                 
 lstm_4 (LSTM)               (None, 5)                 200       
                                                                 
 dense_4 (Dense)             (None, 3)                 18        
                                                                 
Total params: 10,218
Trainable params: 10,218
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
19/19 - 0s - loss: 0.8861 - accuracy: 0.5538 - 101ms/epoch - 5ms/step

Test accuracy: 0.5538461804389954


In [55]:
# Confusion matrix to see what's going on.
# Collapse the softmax outputs and the one-hot targets to class indices.
y_prediction_conf = intensity_model.predict(X_test_padded).argmax(axis=1)
y_test_conf = y_test_intensity.argmax(axis=1)

# normalize='pred' normalizes over the predicted-class columns.
intensity_confusion = metrics.confusion_matrix(
    y_true=y_test_conf,
    y_pred=y_prediction_conf,
    normalize='pred',
)
print(intensity_confusion)

[[0.5310559  0.36363636 0.09139785]
 [0.36645963 0.32467532 0.22043011]
 [0.10248447 0.31168831 0.68817204]]


In [57]:
# Experimenting with category model
dict_word_count = 10000

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
keras.utils.set_random_seed(30)

# Getting needed data
classifications = df['Classification'].values
classification_labels = ["Arts","Athletics","Business","Culture","Government","Professional","Religion","Service","Social","STEM"]

# Morphing data to fit into model: one-hot targets + train/test split,
# then tokenized and post-padded inputs.
X_train, X_test, y_train_category, y_test_category = get_training_data(queries, classifications, classification_labels)
X_train_padded, X_test_padded, max_length = standardize_data(X_train, X_test, dict_word_count)

# Create model: embedding -> bidirectional LSTM -> softmax over the 10 categories
category_model = Sequential([
    Embedding(input_dim=dict_word_count, output_dim=32, input_length=max_length),
    Bidirectional(LSTM(20, dropout=0.9)),
    Dense(10, activation='softmax')
])

category_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
category_model.summary()

# Fit model and show how it went.
# NOTE: the test split is also passed as validation_data, so the accuracy
# reported below is measured on the same data shown during training.
history = category_model.fit(X_train_padded, y_train_category, epochs=50, validation_data=(X_test_padded, y_test_category))
test_loss, test_acc = category_model.evaluate(X_test_padded, y_test_category, verbose=2)

print('\nTest accuracy:', test_acc)

# Index of the most probable class for every test query
predictions = category_model.predict(X_test_padded)
predicted_categories = np.argmax(predictions, axis=1)

# save model
category_model.save("category_model.keras")

35
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 35, 32)            320000    
                                                                 
 bidirectional_3 (Bidirectio  (None, 40)               8480      
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 10)                410       
                                                                 
Total params: 328,890
Trainable params: 328,890
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


In [58]:
# Inspect the padded training matrix and its (samples, max_length) shape.
print(X_train_padded, X_train_padded.shape, sep="\n")

[[  7  18   2 ...   0   0   0]
 [  6  12   2 ...   0   0   0]
 [  6  60  55 ...   0   0   0]
 ...
 [ 14   5  40 ...   0   0   0]
 [ 14   5 158 ...   0   0   0]
 [  6  52   2 ...   0   0   0]]
(2339, 35)


In [63]:
# Testing loading and using the models

# Must equal the padded length the models were trained with (the value
# printed by standardize_data during training).
max_length = 35

# NOTE: joblib.load unpickles the file; only load tokenizer files that this
# notebook wrote itself, never ones from an untrusted source.
intensity_tokenizer = joblib.load('tokenizer_2500.joblib')
loaded_intensity_model = keras.models.load_model("intensity_model.keras")
category_tokenizer = joblib.load('tokenizer_10000.joblib')
loaded_category_model = keras.models.load_model("category_model.keras")

# get query
query = "I want to play some basket and soccer for fun, low commitment"

# intensity: tokenize and pad query
query_seq = intensity_tokenizer.texts_to_sequences([query])
query_padded = pad_sequences(query_seq, maxlen=max_length, padding='post')
# intensity: get prediction from the model restored from disk, so the saved
# artefact (not the in-memory training object) is what gets exercised
intensity_prediction = loaded_intensity_model.predict(query_padded)

# category: tokenize and pad query
query_seq = category_tokenizer.texts_to_sequences([query])
query_padded = pad_sequences(query_seq, maxlen=max_length, padding='post')
# category: get prediction from the model restored from disk
category_prediction = loaded_category_model.predict(query_padded)



In [64]:
# Raw class probabilities from each model.
print(intensity_prediction, category_prediction, sep="\n")

# Translate the most probable class index back into its label.
print(intensity_labels[intensity_prediction.argmax()])
print(classification_labels[category_prediction.argmax()])

[[0.46818143 0.4264311  0.10538749]]
[[1.59279990e-03 6.15818858e-01 5.53428035e-05 2.87453859e-05
  1.20340796e-04 9.29758389e-05 1.34425126e-02 2.38510314e-03
  3.66401494e-01 6.18156482e-05]]
1
Athletics
