In [24]:
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()

my_secret = user_secrets.get_secret("wandb_api") 

wandb.login(key=my_secret)



True

In [25]:
!pip install python-Levenshtein



In [26]:
!pip install -U accelerate



In [27]:
!pip install sacrebleu



In [28]:
!pip install rouge_score



### Import Libraries & Modules

In [29]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import json
import re
import requests
import string
import matplotlib.pyplot as plt

import spacy
from textblob import TextBlob
tokens = spacy.load("en_core_web_sm")
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from functools import reduce
from collections import Counter
from itertools import chain

import nltk
nltk.download("stopwords")
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import norm
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.metrics import edit_distance

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Using device: cuda


### Import Data

In [30]:
datapath = '/kaggle/input/final-data/final_data-2.csv'
data = pd.read_csv(datapath)
data = data.dropna(axis=0)
print(data.shape)
data.sample(10)

(38547, 2)


Unnamed: 0,Target,Clues
25069,astrophysical,astrophysical science
38162,padre amaro,father amaro mexico father benito year old gir...
35681,punisher,undercover fbi agent frank castle wife punishe...
32010,lorentz force,lorentz point charge magnetic field right angl...
10141,sinistral,sinistral individual exhibit dominance leave hand
7601,fond,fond mother
38641,baghead,horror film
3894,conventional,conventional wisdom
15939,whisk,wire whisk
623,basseterre saint kitts nevis,america


### Import Wordnet Data

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
noun_lemmas_in_wordnet = set(chain(*[ss.lemma_names() for ss in wn.all_synsets(pos='n')]))

def get_word_relations(word):
    synsets = wn.synsets(word, pos='n')
    antonyms = set()
    hyponyms = set()
    synonyms = set()
    hypernyms = set()
    polysems = set()
    metonyms = set()
    part_meronyms = set()
    member_meronyms = set()
    substance_meronyms = set()
    part_holonyms = set()
    member_holonyms = set()
    substance_holonyms = set()
    troponyms = set()
    entails = set()

    for synset in synsets:
        antonyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas() if lemma.antonyms())
        hyponyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas() if lemma.hyponyms())
        synonyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas())
        hypernyms.update(hypernym.name().split('.')[0] for hypernym in synset.hypernyms())
        polysems.update(syn.name().split('.')[0] for syn in wn.synsets(word))
        metonyms.update(related.name().split('.')[0] for related in synset.part_meronyms())
        part_meronyms.update(related.name().split('.')[0] for related in synset.part_meronyms())
        member_meronyms.update(related.name().split('.')[0] for related in synset.member_meronyms())
        substance_meronyms.update(related.name().split('.')[0] for related in synset.substance_meronyms())
        part_holonyms.update(related.name().split('.')[0] for related in synset.part_holonyms())
        member_holonyms.update(related.name().split('.')[0] for related in synset.member_holonyms())
        substance_holonyms.update(related.name().split('.')[0] for related in synset.substance_holonyms())
        troponyms.update(lemma.name().split('.')[0] for lemma in synset.lemmas() if lemma.derivationally_related_forms())
        entails.update(related.name().split('.')[0] for related in synset.entailments())

    return {
        'Target': word,
        'Clues': ', '.join(set(chain(member_meronyms, member_holonyms, part_meronyms, part_holonyms, hypernyms, hyponyms)))
    }

word_relations_list = [get_word_relations(word) for word in noun_lemmas_in_wordnet]
wordnet_words = pd.DataFrame(word_relations_list)
print(wordnet_words.shape)
wordnet_words.sample(10)

In [None]:
data = pd.concat([data, wordnet_words], ignore_index=True)
data = data[data['Target'].apply(lambda x: len(x) > 0)]
data = data[data['Clues'].apply(lambda x: len(x) > 0)]
data = data.groupby('Target', as_index=False)['Clues'].agg(', '.join)
data.to_csv('final_data.csv')
print(data.shape)
data.sample(10)

### Word2Vec Model with Skip-Gram

In [None]:
def tokenizer(sentence):
    sentence = sentence.replace('_', ' ')
    return word_tokenize(sentence)

tokenized_data_target = [tokenizer(str(sentence)) for sentence in data['Target']]
tokenized_data_clues = [tokenizer(str(sentence)) for sentence in data['Clues']]
tokenized_data_combined = tokenized_data_target + tokenized_data_clues

w2v_model = Word2Vec(sentences=tokenized_data_combined, vector_size=100, window=5, min_count=1, sg=1, workers=4)

In [None]:
def common_words(words, top_n=10):
    similar_words = []
    similar_score = []
    for word in words:
        try:
            if word in w2v_model.wv:
                similar_words += [similar_word for similar_word, _ in w2v_model.wv.most_similar(word, topn=top_n)]
                similar_score += [similar_score for _, similar_score in w2v_model.wv.most_similar(word, topn=top_n)]
        except KeyError:
            pass

    data = pd.DataFrame({'Similar_Words': similar_words, 'Similar_Scores': similar_score})
    ranked_data = data.groupby("Similar_Words")["Similar_Scores"].mean()
    ranked_data = ranked_data.sort_values(ascending=False)

    return ranked_data

In [None]:
input_words = ['laptop', 'pc', 'smartphone']
common_words_result = common_words(input_words)
print(common_words_result)

In [None]:
input_words = ['apple', 'banana', 'grape', 'mango']
common_words_result = common_words(input_words)
print(common_words_result)

In [None]:
input_words = ['spoon', 'fork', 'butterknife']
common_words_result = common_words(input_words)
print(common_words_result)

### Neural Network Model

In [None]:
input_tokens = [word.split(', ') for word in data['Clues']]
output_tokens = [word.split('_') for word in data['Target']]

all_tokens = [token for sublist in input_tokens + output_tokens for token in sublist]

unique_tokens = list(set(all_tokens))

word_to_index = {word: idx + 1 for idx, word in enumerate(unique_tokens)}

input_indices = [[word_to_index[word] for word in tokens] for tokens in input_tokens]
output_indices = [[word_to_index[word] for word in tokens] for tokens in output_tokens]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

input_indices_padded = pad_sequences(input_indices, padding='post')
output_indices_padded = pad_sequences(output_indices, padding='post', maxlen=input_indices_padded.shape[1])

embedding_matrix = np.zeros((len(word_to_index) + 1, w2v_model.vector_size))
for word, idx in word_to_index.items():
    if word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]

num_classes = len(unique_tokens) + 1

In [None]:
embedding_matrix

In [None]:
from tensorflow.keras.layers import LSTM

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = Sequential()
    model.add(Embedding(input_dim=len(word_to_index) + 1, output_dim=w2v_model.vector_size, input_length=input_indices_padded.shape[1], trainable=False, weights=[embedding_matrix]))
    model.add(LSTM(64, return_sequences=True))
    model.add(Dense(len(word_to_index) + 1, activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(input_indices_padded, output_indices_padded, epochs=5)

In [None]:
plt.plot(history.history['accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
index_to_word = {idx: word for word, idx in word_to_index.items()}

input_words = ['chair', 'table', 'sofa', 'bed']
input_indices = [word_to_index[word] for word in input_words]
input_indices_padded = pad_sequences([input_indices], padding='post', maxlen=input_indices_padded.shape[1])

predictions = model.predict(input_indices_padded)

predicted_word_indices = np.argmax(predictions, axis=2)[0]
predicted_words = [index_to_word.get(idx, 'UNKNOWN') for idx in predicted_word_indices]

print(f"The predicted words related to the input set are: {predicted_words}")

### Pre-Trained Model with Fine-Tuning - BART

In [None]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", forced_bos_token_id=0).to(device)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [None]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/kaggle/working/final_data.csv",
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

def compute_metrics(p):
    return {
        'loss': p.loss,
        'learning_rate': p.learning_rate,
    }

training_args = TrainingArguments(
    output_dir="/kaggle/working/fine-tuned-bart-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    use_cpu = False
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    compute_metrics=compute_metrics,
)

training_history = trainer.train()

In [None]:
train_loss = training_history.metrics['train_loss']

plt.figure(figsize=(12, 6))
plt.plot(train_loss, label='Training Loss')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:

# Assuming 'data' is your DataFrame and it has two columns: 'text' for the input text and 'label' for the target label
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [None]:

# Assuming you are using a Hugging Face Transformers model
from transformers import TrainingArguments, Trainer

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_data,            # training dataset
    eval_dataset=test_data               # evaluation dataset
)

# Start training
trainer.train()


In [None]:

# Assuming 'data' is your DataFrame and it has columns for features and a target label
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Splitting the data into features and labels
features = data.drop('label', axis=1).values
labels = data['label'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)




In [None]:

# Assuming a TensorFlow model is defined as 'model'

# Compiling the model
model.compile(optimizer='adam',
              loss='binary_crossentropy', # or another appropriate loss function depending on your problem
              metrics=['accuracy'])

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


In [60]:

# Assuming 'data' is your DataFrame and it has columns for features and a target label
from sklearn.model_selection import train_test_split
import torch

# Splitting the data into features and labels
features = list(data.drop('Target', axis=1).values)
labels = list(data['Target'].values)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

X_train_text = tokenizer(str(X_train), padding=True, truncation=True, return_tensors="pt")
y_train_labels = tokenizer(y_train, padding=True, truncation=True, return_tensors="pt")
X_test_text = tokenizer(str(X_test), padding=True, truncation=True, return_tensors="pt")
y_test_labels = tokenizer(y_test, padding=True, truncation=True, return_tensors="pt")



{'input_ids': tensor([[    0, 10975, 30766,  ..., 23324,   108,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [None]:

# Assuming a PyTorch model is defined as 'model'

# Define a loss function and optimizer
criterion = torch.nn.BCELoss() # or another appropriate loss function depending on your problem
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # number of epochs
    model.train()
    optimizer.zero_grad()   # zero the gradient buffers
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # Validation step
    model.eval()
    with torch.no_grad():
        val_output = model(X_test)
        val_loss = criterion(val_output, y_test)
        # Add accuracy calculation or other metrics as needed
