In [1]:
import json
from pathlib import Path
from typing import Any, Dict, List, Sized

import jsonlines
import spacy

In [2]:
def read_jsonl(file_path: str) -> List[Dict]:
    """
    Loads every record of a JSONL file into memory

    Args:
        file_path: Path to the JSONL file

    Returns:
        List with one entry per record, in file order
    """
    with jsonlines.open(file_path) as reader:
        # Iterating the reader yields one decoded object per line
        return list(reader)

In [3]:
def is_valid_token(token: "spacy.tokens.Token") -> bool:
    """
    Checks if a token is valid based on specific criteria

    Args:
        token: A spaCy token to validate

    Returns:
        Boolean indicating if the token is valid

    Criteria:
        - Not a stopword
        - Contains only alphabetic characters
        - Not a preposition (fine-grained "IN" tag)
    """
    # "IN" is a fine-grained Penn Treebank tag, which spaCy exposes as `tag_`.
    # `pos_` carries coarse Universal POS labels (e.g. "ADP"), so comparing it
    # with "IN" could never match and the preposition filter was a no-op.
    return (
            not token.is_stop
            and token.is_alpha
            and token.tag_ != "IN"
    )

In [4]:
def post_precess_token(token: str) -> bool:
    """
    Decides whether a token survives the final clean-up pass

    Tokens of one or two characters are dropped, as are the measurement
    unit residuals (mm, cm, m) that can be left behind in reviews once
    numbers have been removed.

    Args:
        token: String token to process

    Returns:
        Boolean indicating if the token should be kept
    """
    if len(token) <= 2:
        return False
    # The listed units are all at most two characters long, so the length
    # guard above already rejects them; the explicit check is kept as-is.
    return token not in ("mm", "cm", "m")

In [5]:
def preprocess_text(text: str) -> List[str]:
    """
    Turns raw text into a list of cleaned lemmas

    The text is lowercased and run through the global spaCy pipeline.
    Tokens rejected by is_valid_token are skipped; the lemma of each
    remaining token is kept only if post_precess_token accepts it.

    Args:
        text: Input text to process

    Returns:
        List of processed tokens
    """
    kept: List[str] = []
    for tok in nlp(text.lower()):
        if not is_valid_token(tok):
            continue
        lemma = tok.lemma_
        if post_precess_token(lemma):
            kept.append(lemma)
    return kept

In [6]:
def extract_relevant_fields(review: Dict) -> Dict[str, str]:
    """
    Extracts only relevant fields from each review

    Args:
        review: Dictionary containing review data

    Returns:
        Dictionary with only title and text fields; a field that is
        missing or explicitly null is returned as an empty string
    """
    title = review.get("title")
    text = review.get("text")
    # A key that is present with a null value bypasses the .get() default,
    # so None is normalized here to keep the Dict[str, str] contract.
    return {
        "title": "" if title is None else title,
        "text": "" if text is None else text,
    }

In [7]:
def export_data(filepath: str, data: Any) -> None:
    """
    Exports data to a JSON file

    Missing parent directories of the target path are created first, so
    the export does not fail when the output folder does not exist yet.

    Args:
        filepath: Path where to save the JSON file
        data: JSON-serializable data to export (e.g. a list of dicts or a
            list of token lists)
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(data, f, ensure_ascii=False, indent=2)

In [8]:
# Load the English spaCy pipeline once at module level; preprocess_text
# reads this global `nlp` on every call, so it must be defined before use.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Read all reviews from the JSONL file into memory
reviews = read_jsonl("data/reviews.jsonl")
# Accumulators filled by the processing loop, one entry per review
processed_reviews_separated = []  # For separate title and text tokens
processed_reviews_combined = []   # For all tokens combined

In [10]:
# Tokenize every review and record both the per-field and the merged result
for review in reviews:
    relevant_review = extract_relevant_fields(review)

    # Title is processed before text, in that order
    title_tokens, text_tokens = (
        preprocess_text(relevant_review[field]) for field in ("title", "text")
    )

    processed_review_separated = {"title_tokens": title_tokens, "text_tokens": text_tokens}
    processed_reviews_separated.append(processed_review_separated)

    # Merged list: title tokens first, then text tokens
    all_tokens = [*title_tokens, *text_tokens]
    processed_reviews_combined.append(all_tokens)

In [11]:
# Export processed data as two JSON files:
#   - processed_reviews.json: one dict per review with "title_tokens" and "text_tokens"
#   - processed_reviews_combined.json: one flat token list per review
export_data("post_process/processed_reviews.json", processed_reviews_separated)
export_data("post_process/processed_reviews_combined.json", processed_reviews_combined)