In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset
file_path = "ner_dataset.csv"  # Ensure the dataset is in the same directory
df = pd.read_csv(file_path, encoding="ISO-8859-1")

In [3]:
df.head(2)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O


In [4]:
# Fill missing sentence numbers
df["Sentence #"] = df["Sentence #"].ffill()

In [5]:
df.head(2)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O


In [12]:
df.shape

(1048575, 3)

In [6]:
# Drop POS column
df = df.drop(columns=["POS"])

In [7]:
# Group words by sentence
sentences = df.groupby("Sentence #")["Word"].apply(list).values
tags = df.groupby("Sentence #")["Tag"].apply(list).values


In [8]:
sentences,tags

(array([list(['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']),
        list(['Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'IAEA', 'surveillance', 'system', 'begins', 'functioning', '.']),
        list(['Helicopter', 'gunships', 'Saturday', 'pounded', 'militant', 'hideouts', 'in', 'the', 'Orakzai', 'tribal', 'region', ',', 'where', 'many', 'Taliban', 'militants', 'are', 'believed', 'to', 'have', 'fled', 'to', 'avoid', 'an', 'earlier', 'military', 'offensive', 'in', 'nearby', 'South', 'Waziristan', '.']),
        ...,
        list(['Following', 'Iran', "'s", 'disputed', 'June', '12', 'elections', ',', 'rights', 'groups', 'said', 'hundreds', 'of', 'people', 'were', 'detained', 'in', 'clashes', 'with', '

In [9]:
# Split dataset (70% train, 10% validation, 20% test)
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, tags, test_size=0.2, random_state=42)
train_sentences, val_sentences, train_tags, val_tags = train_test_split(train_sentences, train_tags, test_size=0.125, random_state=42)

In [16]:
train_sentences.shape

(33571,)

In [15]:
# Tokenize words and labels
word_tokenizer = Tokenizer(lower=False, oov_token="<OOV>")
word_tokenizer.fit_on_texts(train_sentences)


In [16]:
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(train_tags)


In [26]:
train_sentences

array([list(['Medina', 'Garrigues', 'raced', 'to', 'a', '05-Jan', 'lead', 'in', 'the', 'first', 'set', 'before', 'Cho', 'won', 'the', 'next', 'three', 'games', '.']),
       list(['A', 'similar', 'oil', 'loan', 'program', 'aided', 'oil', 'companies', 'after', 'a', 'hurricane', 'disrupted', 'production', 'last', 'year', '.']),
       list(['The', 'United', 'Nations', 'says', 'it', 'is', 'rushing', 'assistance', 'to', 'El', 'Salvador', 'and', 'Costa', 'Rica', ',', 'and', 'remains', 'ready', 'to', 'mobilize', 'international', 'support', 'for', 'emergency', 'relief', 'and', 'recovery', 'efforts', '.']),
       ...,
       list(['The', 'Trees', 'were', 'good-natured', 'and', 'gave', 'him', 'one', 'of', 'their', 'branches', '.']),
       list(['He', 'said', 'Hutu', 'rebels', 'in', 'Congo', 'are', 'massing', 'near', 'the', 'border', 'with', 'Rwanda', ',', 'adding', 'that', 'the', 'government', 'will', 'take', 'any', 'means', 'necessary', 'to', 'defend', 'Rwandan', 'territory', '.']),
       l

In [17]:
# Convert words and tags to sequences
X_train = word_tokenizer.texts_to_sequences(train_sentences)
X_val = word_tokenizer.texts_to_sequences(val_sentences)
X_test = word_tokenizer.texts_to_sequences(test_sentences)

y_train = tag_tokenizer.texts_to_sequences(train_tags)
y_val = tag_tokenizer.texts_to_sequences(val_tags)
y_test = tag_tokenizer.texts_to_sequences(test_tags)


In [18]:
# Padding sequences to max sentence length
max_len = max(len(seq) for seq in X_train)

X_train = pad_sequences(X_train, maxlen=max_len, padding="post")
X_val = pad_sequences(X_val, maxlen=max_len, padding="post")
X_test = pad_sequences(X_test, maxlen=max_len, padding="post")

y_train = pad_sequences(y_train, maxlen=max_len, padding="post")
y_val = pad_sequences(y_val, maxlen=max_len, padding="post")
y_test = pad_sequences(y_test, maxlen=max_len, padding="post")


In [19]:
# Convert labels to categorical format
num_tags = len(tag_tokenizer.word_index) + 1  # Add 1 for padding index
y_train = to_categorical(y_train, num_classes=num_tags)
y_val = to_categorical(y_val, num_classes=num_tags)
y_test = to_categorical(y_test, num_classes=num_tags)


In [25]:
X_train

array([[ 5344,  6617, 10418, ...,     0,     0,     0],
       [   48,   919,   106, ...,     0,     0,     0],
       [   10,    51,   202, ...,     0,     0,     0],
       ...,
       [   10, 15355,    32, ...,     0,     0,     0],
       [   62,    18,  3052, ...,     0,     0,     0],
       [ 3646,  3278,    21, ...,     0,     0,     0]], dtype=int32)

In [27]:
# Build ANN Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=64, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(num_tags * max_len, activation="softmax"),  # Output all tags at once
    tf.keras.layers.Reshape((max_len, num_tags))  # Reshape output to sequence format
])

# Compile Model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train Model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)

# Evaluate Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 14ms/step - accuracy: 0.9200 - loss: 0.3149 - val_accuracy: 0.9727 - val_loss: 0.1023
Epoch 2/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9742 - loss: 0.0915 - val_accuracy: 0.9778 - val_loss: 0.0763
Epoch 3/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 18ms/step - accuracy: 0.9796 - loss: 0.0676 - val_accuracy: 0.9804 - val_loss: 0.0683
Epoch 4/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.9829 - loss: 0.0564 - val_accuracy: 0.9824 - val_loss: 0.0634
Epoch 5/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.9859 - loss: 0.0466 - val_accuracy: 0.9840 - val_loss: 0.0599
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9838 - loss: 0.0599
Test Accuracy: 0.9837


In [23]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=64, input_length=max_len),
    tf.keras.layers.LSTM(64, return_sequences=True),  # Single-directional LSTM
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation="softmax"))
])

# Compile Model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train Model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)

# Evaluate Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 55ms/step - accuracy: 0.9512 - loss: 0.2830 - val_accuracy: 0.9888 - val_loss: 0.0471
Epoch 2/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 57ms/step - accuracy: 0.9909 - loss: 0.0368 - val_accuracy: 0.9913 - val_loss: 0.0306
Epoch 3/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 55ms/step - accuracy: 0.9933 - loss: 0.0231 - val_accuracy: 0.9914 - val_loss: 0.0286
Epoch 4/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 55ms/step - accuracy: 0.9942 - loss: 0.0183 - val_accuracy: 0.9917 - val_loss: 0.0279
Epoch 5/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 55ms/step - accuracy: 0.9948 - loss: 0.0158 - val_accuracy: 0.9917 - val_loss: 0.0288
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.9917 - loss: 0.0285
Test Accuracy: 0.9919


In [20]:
# Define BiLSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=64, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags, activation="softmax"))
])




In [21]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)


Epoch 1/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 63ms/step - accuracy: 0.9512 - loss: 0.2281 - val_accuracy: 0.9906 - val_loss: 0.0336
Epoch 2/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 68ms/step - accuracy: 0.9927 - loss: 0.0259 - val_accuracy: 0.9924 - val_loss: 0.0256
Epoch 3/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 99ms/step - accuracy: 0.9948 - loss: 0.0175 - val_accuracy: 0.9927 - val_loss: 0.0243
Epoch 4/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 120ms/step - accuracy: 0.9958 - loss: 0.0137 - val_accuracy: 0.9931 - val_loss: 0.0241
Epoch 5/5
[1m1050/1050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 83ms/step - accuracy: 0.9965 - loss: 0.0111 - val_accuracy: 0.9927 - val_loss: 0.0257


In [22]:
# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9931 - loss: 0.0245
Test Accuracy: 0.9931


In [28]:
model.save("ner_model.h5")



In [29]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

# Save word tokenizer
with open("word_tokenizer.pkl", "wb") as f:
    pickle.dump(word_tokenizer, f)

# Save tag tokenizer
with open("tag_tokenizer.pkl", "wb") as f:
    pickle.dump(tag_tokenizer, f)

In [31]:
from flask import Flask, request, jsonify
import tensorflow as tf
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
model = tf.keras.models.load_model("ner_model.h5")

# Load tokenizers
with open("word_tokenizer.pkl", "rb") as f:
    word_tokenizer = pickle.load(f)

with open("tag_tokenizer.pkl", "rb") as f:
    tag_tokenizer = pickle.load(f)

# Get tag index mapping
index_to_tag = {index: tag for tag, index in tag_tokenizer.word_index.items()}

# Initialize Flask app
app = Flask(__name__)

# Define prediction function
def predict_ner(sentence):
    words = sentence.split()
    sequence = word_tokenizer.texts_to_sequences([words])
    padded_sequence = pad_sequences(sequence, maxlen=model.input_shape[1], padding="post")

    predictions = model.predict(padded_sequence)
    predicted_tags = np.argmax(predictions, axis=-1)

    output = [{"word": word, "tag": index_to_tag.get(tag, "O")} for word, tag in zip(words, predicted_tags[0])]
    return output

# Define API endpoint
@app.route("/predict", methods=["POST"])
def predict():
    data = request.json
    sentence = data.get("sentence", "")

    if not sentence:
        return jsonify({"error": "No sentence provided"}), 400

    ner_result = predict_ner(sentence)
    return jsonify({"predictions": ner_result})

# Run Flask app
if __name__ == "__main__":
    app.run(debug=True)




 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [1]:
import requests

url = "http://127.0.0.1:5000/predict"
data = {"sentence": "Elon Musk is the CEO of Tesla."}

response = requests.post(url, json=data)

print(response.json())


{'error': 'Michael Jackson visited New York'}
