In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import re
import nltk
nltk.download('punkt')

# Read the two CSV inputs into DataFrames. The QA file is decoded as
# ISO-8859-1; the training file is read with pandas' default encoding.
train_csv_path = 'train.csv'
qa_csv_path = 'QA_data.csv'
train_df = pd.read_csv(train_csv_path)
qa_df = pd.read_csv(qa_csv_path, encoding='ISO-8859-1')

# Define a function to normalize and tokenize the text
def normalize_and_tokenize(text):
    """Strip punctuation, lowercase, and split ``text`` into word tokens.

    Args:
        text: The raw value to process. ``None`` and float NaN (what pandas
            stores for an empty CSV cell) are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The list of tokens that ``word_tokenize`` produces for the cleaned,
        lowercased text, or an empty list when there is no text.
    """
    # re.sub() raises TypeError on non-strings, so deal with missing cells
    # up front. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)
    # Keep only ASCII letters, digits and whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    return word_tokenize(text)

# Tokenize each free-text column in place: every cell is replaced by the
# token list that normalize_and_tokenize returns for it.
for _frame, _column in ((train_df, 'text'), (qa_df, 'text'), (qa_df, 'answer')):
    _frame[_column] = _frame[_column].apply(normalize_and_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip glove.6B.300d.txt

--2023-05-19 17:54:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-05-19 17:54:36--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-05-19 17:54:36--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np

# Build the token -> vector lookup from the GloVe text file. Each line is
# split on whitespace: the first field is used as the key and the remaining
# fields are parsed into a float32 array.
embeddings_index = {}
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as glove_file:
    for fields in map(str.split, glove_file):
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Define a function to get the embedding for a given word
def get_embedding(word):
    """Look up the vector stored for ``word`` in ``embeddings_index``.

    Args:
        word: Token to look up.

    Returns:
        The stored vector when ``word`` is present; otherwise a new all-zero
        float32 array of length 300.
    """
    embedding = embeddings_index.get(word)
    if embedding is None:
        # Unknown token: build the fallback as float32 so it has the same
        # dtype as the loaded vectors instead of NumPy's float64 default.
        return np.zeros(300, dtype='float32')
    return embedding

# Define a function to get the embeddings for a given text
def get_text_embeddings(text):
    """Return the embedding of every token in ``text``.

    Args:
        text: Iterable of tokens.

    Returns:
        A list holding ``get_embedding(token)`` for each token, in the
        original order (empty when ``text`` is empty).
    """
    return [get_embedding(token) for token in text]

# For each tokenized column, store the per-token embedding lists in a new
# companion column on the same DataFrame.
for _frame, _source, _target in (
    (train_df, 'text', 'text_embeddings'),
    (qa_df, 'text', 'text_embeddings'),
    (qa_df, 'answer', 'answer_embeddings'),
):
    _frame[_target] = _frame[_source].apply(get_text_embeddings)

In [None]:
from keras.models import Model
from keras.layers import Input, Bidirectional, LSTM, Dense
from keras.utils import to_categorical
!pip install keras_preprocessing
from keras_preprocessing.sequence import pad_sequences

# Set the number of classes
num_coarse_classes = 6
num_fine_classes = 50

# Convert the 'label-coarse' and 'label-fine' columns to one-hot encoded labels
# (assumes both columns hold integer ids in [0, num_classes) -- TODO confirm)
y_coarse = to_categorical(qa_df['label-coarse'], num_classes=num_coarse_classes)
y_fine = to_categorical(qa_df['label-fine'], num_classes=num_fine_classes)

# Maximum number of tokens kept per question; one extra timestep is reserved
# for the '<PAD>' marker appended below, hence the "+ 1" further down
max_seq_length = 100

# Add a <PAD> token at the end of each text sequence
qa_df['padded_text'] = qa_df['text'].apply(lambda x: x + ['<PAD>'])

# Get the embeddings for the padded text sequences
X_text = qa_df['padded_text'].apply(get_text_embeddings)

# Bring every sequence to the same length. 'pre' is the library default for
# both options and is spelled out here so it is visible: short sequences get
# zero vectors in front, long ones lose their earliest timesteps.
X_text = pad_sequences(
    X_text,
    maxlen=max_seq_length + 1,
    dtype='float32',
    padding='pre',
    truncating='pre',
)

# Define the input shape: (timesteps, embedding size)
input_shape = (max_seq_length + 1, 300)

# Create the input layer
inputs = Input(shape=input_shape)

# Summarise the whole question with a bidirectional LSTM. With
# return_sequences left at its default (False) the layer returns the forward
# LSTM's final output concatenated with the backward LSTM's final output, so
# both halves have read the entire sequence. Slicing timesteps -1 / -2 out of
# per-timestep outputs would instead give backward features that have only
# read the last one or two inputs, the last of which is the '<PAD>' marker.
lstm_outputs = Bidirectional(LSTM(100))(inputs)

# Create the coarse and fine output layers on the shared sentence summary
coarse_outputs = Dense(num_coarse_classes, activation='softmax')(lstm_outputs)
fine_outputs = Dense(num_fine_classes, activation='softmax')(lstm_outputs)

# Create the model
model = Model(inputs=inputs, outputs=[coarse_outputs, fine_outputs])

# Compile the model (the same loss is applied to both outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): no validation data is held out, so the reported accuracy is
# training accuracy only.
history = model.fit(X_text, [y_coarse, y_fine], epochs=50, batch_size=32)

questions = ['How many people speak French?', 'What day is today?', 'Who will win the war?', 'Who is Italian first minister?', 'When World War II ended?', 'When Gandhi was assassinated?']

# For each question: predict its coarse and fine labels, then print a stored
# answer picked at random from the qa_df rows that carry both labels.
for question in questions:
    # Normalize and tokenize the question
    tokens = normalize_and_tokenize(question)
    # Add a <PAD> token at the end of the sequence
    tokens.append('<PAD>')
    # Get the embeddings for the tokens
    embeddings = get_text_embeddings(tokens)
    # Pad or truncate the sequence to have the same length as the input shape
    embeddings = pad_sequences([embeddings], maxlen=max_seq_length + 1, dtype='float32')
    # Predict the coarse and fine labels for the question
    coarse_pred, fine_pred = model.predict(embeddings)
    # Get the index of the predicted coarse and fine labels
    coarse_index = np.argmax(coarse_pred)
    fine_index = np.argmax(fine_pred)
    # Find all rows in qa_df where 'label-coarse' == coarse_index and 'label-fine' == fine_index
    rows = qa_df[(qa_df['label-coarse'] == coarse_index) & (qa_df['label-fine'] == fine_index)]
    # Always echo the question so one without a match is not dropped silently
    print(f"Q: {question}")
    if len(rows) > 0:
        # Select a random row from rows as an answer to this question.
        answer_row = rows.sample(n=1)
        answer_text = answer_row.iloc[0]['answer']
        print(f"A: {' '.join(answer_text)}")
    else:
        # The two labels are predicted independently, so the pair may not
        # occur together in qa_df; report that instead of printing nothing.
        print(f"A: (no stored answer for coarse label {coarse_index}, fine label {fine_index})")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Ep

In [None]:
questions = ['How many people speak French?', 'What day is today?', 'Who will win the war?', 'Who is Italian first minister?', 'When World War II ended?', 'When Gandhi was assassinated?']

# For each question: predict its coarse and fine labels, then print a stored
# answer picked at random from the qa_df rows that carry both labels.
for question in questions:
    # Normalize and tokenize the question
    tokens = normalize_and_tokenize(question)
    # Add a <PAD> token at the end of the sequence
    tokens.append('<PAD>')
    # Get the embeddings for the tokens
    embeddings = get_text_embeddings(tokens)
    # Pad or truncate the sequence to have the same length as the input shape
    embeddings = pad_sequences([embeddings], maxlen=max_seq_length + 1, dtype='float32')
    # Predict the coarse and fine labels for the question
    coarse_pred, fine_pred = model.predict(embeddings)
    # Get the index of the predicted coarse and fine labels
    coarse_index = np.argmax(coarse_pred)
    fine_index = np.argmax(fine_pred)
    # Find all rows in qa_df where 'label-coarse' == coarse_index and 'label-fine' == fine_index
    rows = qa_df[(qa_df['label-coarse'] == coarse_index) & (qa_df['label-fine'] == fine_index)]
    # Always echo the question so one without a match is not dropped silently
    print(f"Q: {question}")
    if len(rows) > 0:
        # Select a random row from rows as an answer to this question.
        answer_row = rows.sample(n=1)
        answer_text = answer_row.iloc[0]['answer']
        print(f"A: {' '.join(answer_text)}")
    else:
        # The two labels are predicted independently, so the pair may not
        # occur together in qa_df; report that instead of printing nothing.
        print(f"A: (no stored answer for coarse label {coarse_index}, fine label {fine_index})")

Q: What day is today?
A: measures temperature
Q: Who will win the war?
A: chris haney and scott abbott
Q: Who is Italian first minister?
A: the bard
Q: When World War II ended?
A: 1789
Q: When Gandhi was assassinated?
A: 1939
