Reading Dataset Files

In [14]:
#libraries
import pandas as pd
from ast import literal_eval

# List of dataset filenames
dataset_filenames = ['twitter_train.csv', 'restaurants_train.csv']

# Initialize an empty string to store the combined documents
combined_documents = ""

# Read files and combine documents
for filename in dataset_filenames:
    # Read the dataset
    data = pd.read_csv(filename)

    # Convert string representations of lists to actual lists
    data['Tokens'] = data['Tokens'].apply(literal_eval)

    # Combine the tokenized text from all rows
    all_documents = [' '.join(tokens) for tokens in data['Tokens']]
    
    # Concatenate the documents into the combined string
    combined_documents += ' '.join(all_documents) + ' '

# Print or use the combined string as needed
print(combined_documents)



Noun and Noun Phrase using Parametric Co-occurrence Matrix

In [18]:
#import library
import spacy

#loading spacy model
nlp = spacy.load("en_core_web_sm")

doc = nlp(combined_documents)
sentences = list(doc.sents)

nouns = []
noun_phrases = []
# Iterate over sentences
for sentence in sentences:
    for chunk in sentence.noun_chunks:
        noun_phrases.append(chunk.text)

    # Iterate over tokens in each sentence
    """
    for token in sentence:
        # Check if the token is a noun
        if token.pos_ == "NOUN":
            nouns.append(token.text)
""" 
# Extract noun chunks in each sentence
   

# Print the combined list of nouns and noun chunks
combined_nouns = nouns + noun_phrases
print(combined_nouns)



In [20]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

sentence = combined_documents

# Tokenize the test sentence
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))

noun_phrase = combined_nouns

# Convert test noun chunks to token positions
phrase_positions = []
for chunk in noun_phrase:
    chunk_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(chunk)))
    phrase_positions.append([i for i, token in enumerate(tokens) if token in chunk_tokens])

# Convert test tokens to IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
phrase_positions_ids = [item for sublist in phrase_positions for item in sublist]

# Create a binary label tensor where 1 indicates an aspect and 0 otherwise
labels = [1 if i in phrase_positions_ids else 0 for i in range(len(tokens))]
labels = torch.tensor(labels)

# Convert input_ids to tensor
input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

# Load pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Extract contextualized embeddings
with torch.no_grad():
    outputs = model(input_ids)

# Obtain embeddings for each token
word_embeddings = outputs.last_hidden_state.squeeze(0)

# Define a classification model for aspect identification
class AspectClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(AspectClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

# Instantiate the aspect identification model
input_size = word_embeddings.size(1)
hidden_size = 256
output_size = 1
model = AspectClassifier(input_size, hidden_size, output_size)


# Save the trained model weights
torch.save(model.state_dict(), 'aspect_model_improved.pth')

# Load the trained weights (replace 'path_to_your_model_weights.pth' with your actual file path)
model.load_state_dict(torch.load('aspect_model.pth'))
model.eval()  # Set the model to evaluation mode


# Evaluate the model on the test set
with torch.no_grad():
    predictions = (model(word_embeddings) > 0.5).float()


# Convert predictions and labels to numpy arrays for evaluation metrics
predictions_np = predictions.numpy().flatten()
labels_np = labels.numpy()


# Extract the aspects and probabilities from the test predictions
aspects_and_probs = [(tokens[i], predictions_np[i]) for i in range(len(tokens)) if predictions_np[i] == 1]

# Sort the aspects based on their probabilities in descending order
aspects_and_probs.sort(key=lambda x: x[1], reverse=True)

# Select the top 10 aspects
top_10_aspects = aspects_and_probs[:10]

# Extract the aspects and probabilities separately
top_10_aspects, top_10_probs = zip(*top_10_aspects)

# Calculate metrics for each of the top 10 aspects separately
for aspect in top_10_aspects:
    aspect_positions = [i for i, token in enumerate(tokens) if token == aspect]

    # Check if aspect_positions is not empty
    if aspect_positions:
        aspect_labels = [1 if i in aspect_positions else 0 for i in range(len(tokens))]

        aspect_labels_np = torch.tensor(aspect_labels).numpy()

        # Extract predictions only for all positions in the original sentence
        aspect_predictions_np = predictions_np[:len(tokens)]

        aspect_accuracy = accuracy_score(aspect_labels_np, aspect_predictions_np)
        aspect_precision = precision_score(aspect_labels_np, aspect_predictions_np)
        aspect_recall = recall_score(aspect_labels_np, aspect_predictions_np)
        aspect_f1 = f1_score(aspect_labels_np, aspect_predictions_np)

        # Print metrics for each aspect
        print(f"\nMetrics for Aspect '{aspect}':")
        print(f"Accuracy: {aspect_accuracy:.4f}")
        print(f"Precision: {aspect_precision:.4f}")
        print(f"Recall: {aspect_recall:.4f}")
        print(f"F1 Score: {aspect_f1:.4f}")
    else:
        print(f"\nAspect '{aspect}' not found in the test sentence.")


Token indices sequence length is longer than the specified maximum sequence length for this model (240358 > 512). Running this sequence through the model will result in indexing errors





KeyboardInterrupt: 