### NER ###
Note that this is a vanilla example: the training sentences are built from a very narrow vocabulary. Predictions for words that were not seen during training may therefore be poor, while for known words the model performs well.<br>
If you want, you can upload your own dataset and train the model on it.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import random

In [None]:
def generate_sentences(n):
    """Build ``n`` random template sentences, each with a sentence-level label.

    Every sentence has the shape
    ``<subject> <verb phrase> <target> <postfix word> <postfix word>``.
    The target is drawn from one of three pools (person names, locations,
    neutral nouns) and the label names the pool it came from.

    Args:
        n: Number of sentences to generate.

    Returns:
        A ``(sentences, labels)`` tuple of two lists of length ``n``; each
        label is one of ``'Person'``, ``'Location'`` or ``'None'``.
    """
    person_prefixes = [
        ['I', 'meet'], ['She', 'knows'], ['He', 'saw'], ['We', 'spoke'], ['They', 'contact'],
        ['They', 'met'], ['I', 'interviewed'], ['She', 'recognized'], ['He', 'noticed'], ['We', 'greeted'],
        ['They', 'welcomed'], ['I', 'found'], ['She', 'introduced'], ['He', 'mentioned'], ['We', 'saw'],
        ['They', 'had dinner with'], ['I', 'chatted with'], ['She', 'appreciated'], ['He', 'hugged'], ['We', 'appreciated'],
        ['They', 'talked to'], ['I', 'praised'], ['She', 'helped'], ['He', 'listened to'], ['We', 'thanked'],
        ['They', 'informed'], ['I', 'advised'], ['She', 'invited'], ['He', 'accepted'], ['We', 'met with'],
    ]
    location_prefixes = [
        ['I', 'visit'], ['We', 'are in'], ['They', 'leave'], ['She', 'is from'], ['He', 'lives'],
        ['I', 'arrived'], ['We', 'travel'], ['They', 'explore'], ['She', 'moved'], ['He', 'commutes'],
        ['I', 'love'], ['We', 'vacationed in'], ['They', 'study in'], ['She', 'works in'], ['He', 'going'],
        ['I', 'recommend'], ['We', 'enjoyed'], ['They', 'flew to'], ['She', 'visited'], ['He', 'stays in'],
        ['I', 'discovered'], ['We', 'drive'], ['They', 'departed'], ['She', 'return'], ['He', 'be in'],
        ['I', 'hiked in'], ['We', 'visit'], ['They', 'traveling'], ['She', 'was'], ['He', 'left'],
    ]
    neutral_prefixes = [
        ['I', 'have'], ['We', 'found'], ['They', 'like'], ['She', 'ate'], ['He', 'saw'],
        ['I', 'own'], ['We', 'discovered'], ['They', 'use'], ['She', 'bought'], ['He', 'enjoys'],
        ['I', 'sold'], ['We', 'lost'], ['They', 'have'], ['She', 'made'], ['He', 'found'],
        ['I', 'dropped'], ['We', 'showed'], ['They', 'ate'], ['She', 'threw'], ['He', 'washed'],
        ['I', 'bought'], ['We', 'borrowed'], ['They', 'cooked'], ['She', 'read'], ['He', 'cleaned'],
        ['I', 'wrote'], ['We', 'gave'], ['They', 'painted'], ['She', 'played with'], ['He', 'picked up'],
    ]
    person_names = [
        'John', 'Alice', 'Michael', 'Emma', 'Daniel', 'Sophie', 'Lucas', 'Olivia', 'Ethan', 'Isabella',
        'Mason', 'Ava', 'Logan', 'Mia', 'Liam', 'Gal', 'Noah', 'Ella', 'Aiden', 'Amelia',
        'Elijah', 'Avery', 'James', 'Scarlett', 'Benjamin', 'Grace', 'Jacob', 'Chloe', 'Matthew', 'Evelyn',
    ]
    locations = [
        'Paris', 'Berlin', 'New York', 'London', 'Tokyo', 'Rome', 'Sydney', 'Toronto', 'Moscow', 'Dubai',
        'Singapore', 'Madrid', 'Istanbul', 'Yavne', 'Amsterdam', 'San Francisco', 'Acre', 'Bangkok', 'Budapest', 'Vienna',
        'Mumbai', 'Shanghai', 'Cairo', 'Rio', 'Melbourne', 'Barcelona', 'Dublin', 'Seoul', 'Athens', 'Vancouver',
    ]
    neutral_words = [
        'book', 'apple', 'chair', 'dog', 'car', 'table', 'phone', 'coffee', 'pen', 'glasses',
        'shirt', 'shoes', 'bag', 'hat', 'watch', 'wallet', 'camera', 'computer', 'cake', 'bread',
    ]
    postfixes = [['last', 'week'], ['this', 'month'], ['next', 'year'], ['every', 'day'], ['on', 'Sunday']]

    # Each label maps to the (prefix pool, target pool) it is generated from.
    pools_by_label = {
        'Person': (person_prefixes, person_names),
        'Location': (location_prefixes, locations),
        'None': (neutral_prefixes, neutral_words),
    }
    label_choices = ['Person', 'Location', 'None']

    sentences, labels = [], []
    for _ in range(n):
        label = random.choice(label_choices)
        prefix_pool, target_pool = pools_by_label[label]

        # Draw order (prefix, target, postfix) is kept fixed so that a seeded
        # `random` yields the same corpus on every run.
        opener = random.choice(prefix_pool)
        target = random.choice(target_pool)
        closer = random.choice(postfixes)

        sentences.append(" ".join([*opener, target, *closer]))
        labels.append(label)

    return sentences, labels

In [None]:
# Build the synthetic corpus: 10000 sentences, each with one sentence-level label.
sentences, labels = generate_sentences(10000)

# Vectorize the input (features and labels)
# The vectorizer is fitted on every sentence (the train/test split happens
# later), and each sentence becomes one row of X_vec.
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(sentences)

# Encode the string labels as integer class ids; `le` is kept so predictions
# can be decoded back to label strings with le.inverse_transform.
le = LabelEncoder()
Y_vec = le.fit_transform(labels)

# Define the Dataset
class MyDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class ids.

    ``data`` must support row indexing that yields an object exposing
    ``toarray()`` (e.g. a SciPy sparse matrix); ``targets`` is stored as a
    ``torch.LongTensor``.
    """

    def __init__(self, data, targets):
        self.targets = torch.LongTensor(targets)
        self.data = data

    def __len__(self):
        # One sample per target entry.
        return len(self.targets)

    def __getitem__(self, index):
        # Densify only the requested row(s); the full matrix stays as given.
        features = self.data[index].toarray()
        label = self.targets[index]
        return features, label

# Split into training and testing
# Positional 80/20 cut: the first `split_idx` rows train, the rest test.
# The sentences are drawn independently at random by generate_sentences, so
# the cut is not preceded by an extra shuffle.
split_ratio = 0.8
split_idx = int(split_ratio * len(sentences))
train_dataset = MyDataset(X_vec[:split_idx], Y_vec[:split_idx])
test_dataset = MyDataset(X_vec[split_idx:], Y_vec[split_idx:])

# Batches of 64; only the training loader shuffles its samples.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Hyperparameters
# Input width: one feature per entry reported by the fitted vectorizer.
input_dim = len(vectorizer.get_feature_names_out())
# Width of the classifier's hidden layer.
hidden_dim = 64
# One output logit per distinct label known to the label encoder.
output_dim = len(le.classes_)
# Learning rate passed to the Adam optimizer.
lr = 0.001
# Number of full passes over the training DataLoader.
num_epochs = 10

# Define the model
class Classifier(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Number of units in the hidden layer.
        output_dim: Number of output logits (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return raw (unnormalised) class logits for the batch ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

model = Classifier(input_dim, hidden_dim, output_dim)

# Define the loss and the optimizer
# CrossEntropyLoss takes the raw logits returned by the model together with
# integer class ids, and by default averages the loss over the batch.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0   # sum of per-sample losses seen in this epoch
    samples_seen = 0     # number of samples seen in this epoch
    for inputs, targets in tqdm(train_dataloader):
        # Each dataset item is a densified single row, so a batch presumably
        # arrives as (batch, 1, n_features); drop the singleton axis.
        inputs = inputs.squeeze(1).float()
        targets = targets.long()

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        # `loss` is the batch mean, so weight it by the batch size to get a
        # correct average even when the last batch is smaller.
        n_in_batch = targets.size(0)
        running_loss += loss.item() * n_in_batch
        samples_seen += n_in_batch

    # Report the mean loss over the whole epoch rather than the (noisy) loss
    # of the last mini-batch; guard against an empty training loader.
    epoch_loss = running_loss / samples_seen if samples_seen else float("nan")
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

In [None]:
# Testing
# Count top-1 hits over the held-out loader with gradients disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in test_dataloader:
        inputs = inputs.squeeze(1).float()
        targets = targets.long()

        outputs = model(inputs)
        # Index of the largest logit per row is the predicted class id.
        predicted = outputs.max(dim=1).indices
        hits = predicted.eq(targets)
        total += len(targets)
        correct += int(hits.sum())

print(f"Accuracy on the test set: {100 * correct / total:.2f}%")

In [None]:
def predict(sentence):
    """Return the predicted label for a single sentence.

    Uses the module-level ``vectorizer``, ``model`` and ``le`` objects: the
    sentence is vectorized, passed through the model, and the class id with
    the largest logit is decoded back to its label string.

    Args:
        sentence: The raw sentence text to classify.

    Returns:
        The decoded label for the sentence (an element of ``le.classes_``).
    """
    # Prepare the input: a one-row feature matrix for this sentence.
    X = vectorizer.transform([sentence])

    # Convert the input to a float tensor, as expected by the linear layers.
    X_tensor = torch.from_numpy(X.toarray()).float()

    # Inference only: put the model in eval mode and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor)

    # Softmax is monotonic, so the argmax of the logits already identifies
    # the most probable class.
    predicted_classes = outputs.argmax(dim=1)

    # Decode the predicted class ids back to label strings.
    predicted_tags = le.inverse_transform(predicted_classes.numpy())

    # Return the predicted tag for the (single) sentence.
    return predicted_tags[0]

# Use the function: classify one example sentence. "iris" is not one of the
# target words used by generate_sentences, so the prediction presumably rests
# on the surrounding known words ("meet", "next", "week").

tag = predict("I meet iris next week")
print(f'The predicted tag for the sentence is: {tag}')
