Code to preprocess a .json file and create tokens as per the given instructions

In [None]:
pip install nltk

In [None]:
import nltk

# word_tokenize relies on the Punkt tokenizer models. Recent NLTK releases
# load them from the "punkt_tab" resource, while older releases use "punkt".
# Request both so tokenization works whichever version is installed; an
# unknown resource id is only reported by the downloader, it does not raise.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Show the installed version to make tokenizer-resource issues easy to diagnose.
print(nltk.__version__)

In [2]:
import json
import pandas as pd
from nltk.tokenize import word_tokenize
import string  # Import the string module


# Characters treated as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _is_punctuation(token):
    """Return True if *token* is non-empty and consists only of punctuation."""
    return bool(token) and all(ch in _PUNCTUATION_CHARS for ch in token)


def _find_subsequence(tokens, term_tokens):
    """Return the first index at which *term_tokens* occurs in *tokens*, else -1."""
    width = len(term_tokens)
    if width == 0:
        # An empty term cannot be located; report "not found" instead of 0.
        return -1
    for start in range(len(tokens) - width + 1):
        if tokens[start : start + width] == term_tokens:
            return start
    return -1


def preprocessing_json(input_path, output_path, *, tokenizer=None):
    """Flatten a sentence/aspect JSON file into one record per aspect term.

    The input JSON is loaded into a DataFrame and must provide a ``sentence``
    column (text) and an ``aspect_terms`` column (an iterable of dicts with
    ``term`` and ``polarity`` keys). For every distinct aspect term of a
    sentence, one record is written with:

    * ``tokens``: the sentence tokens with punctuation-only tokens removed,
    * ``polarity``: the polarity of the first occurrence of that term,
    * ``aspect_terms``: a single-element list holding the term,
    * ``index``: position of the term's first token within ``tokens``,
      or -1 when the term is not found.

    Args:
        input_path: Path of the JSON file to read (UTF-8).
        output_path: Path of the JSON file to write (``orient="records"``).
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to NLTK's ``word_tokenize``.
    """
    tokenize = word_tokenize if tokenizer is None else tokenizer

    # Load the JSON data
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Convert the JSON data to a pandas DataFrame
    df = pd.DataFrame(data)

    processed_data = []

    for _, row in df.iterrows():
        # Drop every punctuation-only token, including multi-character ones
        # such as "..." or the "``" / "''" quote tokens some tokenizers emit.
        tokens = [
            token for token in tokenize(row["sentence"])
            if not _is_punctuation(token)
        ]

        # Track terms already emitted for this sentence; the first polarity wins.
        seen_terms = set()

        for each_aspect in row["aspect_terms"]:
            term = each_aspect["term"]
            if term in seen_terms:
                continue
            seen_terms.add(term)

            # Apply the same punctuation filter to the term so that terms
            # containing punctuation can still be matched against the
            # filtered sentence tokens.
            term_tokens = [
                token for token in tokenize(term) if not _is_punctuation(token)
            ]

            processed_data.append(
                {
                    "tokens": tokens,
                    "polarity": each_aspect["polarity"],
                    "aspect_terms": [term],
                    "index": _find_subsequence(tokens, term_tokens),
                }
            )

    # Convert the processed data to a DataFrame
    processed_df = pd.DataFrame(processed_data)

    # Save the processed data to a JSON file
    processed_df.to_json(output_path, orient="records", indent=4)


# Where the raw data is read from and where the processed records are written
input_path, output_path = "train.json", "train_task_2.json"

# Run the preprocessing step on the configured paths
preprocessing_json(input_path, output_path)