In [2]:
!pip install --upgrade tensorflow keras



In [3]:

!pip install datasets



In [4]:
import pandas as pd

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
word_relations = []
with open('/content/drive/MyDrive/NYT29/relations.txt') as f:
  word_relations = f.readlines()

word_relations = [r.strip() for r in word_relations]

mapping_label = {}
for i in range(len(word_relations)):
  mapping_label[word_relations[i]] = i
mapping_label

{'/location/administrative_division/country': 0,
 '/location/country/capital': 1,
 '/location/country/administrative_divisions': 2,
 '/location/neighborhood/neighborhood_of': 3,
 '/location/location/contains': 4,
 '/people/person/nationality': 5,
 '/people/person/place_lived': 6,
 '/people/deceased_person/place_of_death': 7,
 '/business/person/company': 8,
 '/location/us_state/capital': 9,
 '/people/person/place_of_birth': 10,
 '/people/person/children': 11,
 '/business/company/founders': 12,
 '/business/company/place_founded': 13,
 '/sports/sports_team/location': 14,
 '/people/person/ethnicity': 15,
 '/people/ethnicity/geographic_distribution': 16,
 '/people/person/religion': 17,
 '/business/company/major_shareholders': 18,
 '/location/province/capital': 19,
 '/location/br_state/capital': 20,
 '/business/company/advisors': 21,
 '/film/film_location/featured_in_films': 22,
 '/film/film/featured_film_locations': 23,
 '/location/us_county/county_seat': 24,
 '/time/event/locations': 25,
 

In [7]:
def add_entity_marks(sentences, pointers):
    # Initialize lists to hold marked sentences and their corresponding multi-class labels
    marked_sentences = []
    multi_class_labels = []

    # Iterate over each sentence and its corresponding pointer data
    for idx, pointer_line in enumerate(pointers):
        sentence = sentences[idx]  # Get the current sentence
        words = sentence.split()    # Split the sentence into words

        # Split the pointer data into individual pointer strings
        pointer_strings = pointer_line.split('|')
        label_map = {}  # Dictionary to map entity positions to their labels

        # Process each pointer string to extract entity positions and labels
        for pointer_str in pointer_strings:
            word_marks = []  # List to hold the words with entity marks

            # Extract start and end positions of entities and the label
            parts = pointer_str.split()
            start_entity1 = int(parts[0])
            end_entity1 = int(parts[1])
            start_entity2 = int(parts[2])
            end_entity2 = int(parts[3])
            entity_label = parts[4]

            # Create a tuple of entity positions
            entity_positions = (start_entity1, end_entity1, start_entity2, end_entity2)
            # Map the positions to their corresponding labels
            if entity_positions in label_map:
                label_map[entity_positions].append(entity_label)
            else:
                label_map[entity_positions] = [entity_label]

        # Iterate over each entity position and its corresponding labels
        for positions, labels in label_map.items():
            start_e1, end_e1, start_e2, end_e2 = positions

            # Iterate over the words in the sentence
            for j, word in enumerate(words):
                # Mark the first entity
                if j == start_e1:
                    word_marks.append("<e1>")
                    word_marks.append(word)
                elif j == end_e1:
                    word_marks.append(word)
                    word_marks.append("</e1>")
                # Mark the second entity
                elif j == start_e2:
                    word_marks.append("<e2>")
                    word_marks.append(word)
                elif j == end_e2:
                    word_marks.append(word)
                    word_marks.append("</e2>")
                else:
                    word_marks.append(word)  # Add the word without marking

            # Join the words to form the marked sentence
            marked_sentence = " ".join(word_marks)

            # Create a multi-class label vector for the current sentence
            class_label_vector = [float(0) for _ in range(len(word_relations))]
            for label in labels:
                class_label_vector[label_map[label]] = float(1)

            # Append the marked sentence and its labels to the lists
            multi_class_labels.append(class_label_vector)
            marked_sentences.append(marked_sentence)

    return marked_sentences, multi_class_labels


In [8]:
def load_and_label_data(dataset_split):
    # Open and read sentences from the specified dataset split
    with open(f'/content/drive/MyDrive/NYT29/{dataset_split}.sent') as sentence_file:
        sentences = sentence_file.readlines()

    # Open and read tuples from the specified dataset split
    with open(f'/content/drive/MyDrive/NYT29/{dataset_split}.tup') as tuple_file:
        tuples = tuple_file.readlines()

    # Open and read pointers from the specified dataset split
    with open(f'/content/drive/MyDrive/NYT29/{dataset_split}.pointer') as pointer_file:
        pointers = pointer_file.readlines()

    # Add marks to the sentences based on the pointers and return the marked data along with labels
    marked_sentences, labels = add_entity_marks(sentences, pointers)

    return marked_sentences, labels

# Load and label training data
training_data, training_labels = load_and_label_data("train")

# Load and label testing data
testing_data, testing_labels = load_and_label_data("test")

# Load and label development data
development_data, development_labels = load_and_label_data("dev")


In [9]:
train_df = pd.DataFrame({'marked_sentences':training_data, 'multi_class_labels': training_labels})
dev_df = pd.DataFrame({'marked_sentences':development_data, 'multi_class_labels': development_labels})
test_df = pd.DataFrame({'marked_sentences':testing_data, 'multi_class_labels': testing_labels})

In [10]:
# save
train_df.to_csv('train_data.csv')
dev_df.to_csv('dev_data.csv')
test_df.to_csv('test_data.csv')

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load data (assuming you have already saved your marked data and labels)
train_data = pd.read_csv('train_data.csv')
dev_data = pd.read_csv('dev_data.csv')
test_data = pd.read_csv('test_data.csv')


In [12]:
train_data

Unnamed: 0.1,Unnamed: 0,marked_sent,label
0,0,"then terrorism struck again , this time in the...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,"then terrorism struck again , this time in the...","[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,a12 new york\/region b1-7 enclave for middle c...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,a12 new york\/region b1-7 enclave for middle c...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,"before long , though , he 's continent-hopping...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
73387,73387,the yacht being used to train crew members for...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
73388,73388,"the interment is on monday , july 31st , 2006 ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
73389,73389,"fragonard '' is the eighteenth century , '' wr...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
73390,73390,on a hilltop patio with a stunning view of the...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [13]:

# Extract sentences and labels
X_train = train_data['marked_sent'].values
y_train = train_data['label'].values
X_dev = dev_data['marked_sent'].values
y_dev = dev_data['label'].values
X_test = test_data['marked_sent'].values
y_test = test_data['label'].values


In [14]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [15]:

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_dev_seq = tokenizer.texts_to_sequences(X_dev)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [16]:
# Pad sequences
max_length = max(max(len(seq) for seq in X_train_seq),
                 max(len(seq) for seq in X_dev_seq),
                 max(len(seq) for seq in X_test_seq))
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_dev_pad = pad_sequences(X_dev_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [17]:
# # Encode labels
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)


In [None]:
# y_dev_encoded = label_encoder.transform(y_dev)
# y_test_encoded = label_encoder.fit_transform(y_test)

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer

In [27]:




# Fit MultiLabelBinarizer on training labels
mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(y_train)

# Transform validation and test labels
y_dev_encoded = mlb.transform(y_dev)
y_test_encoded = mlb.transform(y_test)


y_train_encoded = np.argmax(y_train_encoded, axis=1)
y_dev_encoded = np.argmax(y_dev_encoded, axis=1)
y_test_encoded = np.argmax(y_test_encoded, axis=1)


##Build LSTM Model

In [28]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding
embedding_dim = 100
num_classes = len(label_encoder.classes_)

In [29]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [32]:
subset_size = int(len(X_train_pad) * 0.3)  # Use 50% of the data, for example
history = model.fit(X_train_pad[:subset_size], y_train_encoded[:subset_size],
                    validation_data=(X_dev_pad, y_dev_encoded),
                    epochs=2,
                    batch_size=128)


Epoch 1/2
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1097s[0m 6s/step - accuracy: 1.0000 - loss: 0.0022 - val_accuracy: 1.0000 - val_loss: 1.8869e-04
Epoch 2/2
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1200s[0m 7s/step - accuracy: 1.0000 - loss: 0.0010 - val_accuracy: 1.0000 - val_loss: 9.4767e-05


In [34]:
from sklearn.metrics import f1_score
import numpy as np

# Step 1: Predict on the test set
y_test_pred = model.predict(X_test_pad)

# Step 2: Convert predictions to class labels
y_test_pred_class = np.argmax(y_test_pred, axis=1)

# Step 3: Calculate F1-score (no need for np.argmax on y_test_encoded if it's already in integer label form)
f1 = f1_score(y_test_encoded, y_test_pred_class, average='weighted')

print(f"F1-score: {f1:.4f}")


[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 628ms/step
F1-score: 1.0000
