# Feature Engineering


In [None]:
# Imports
import re

import nltk
import pandas as pd
import spacy
import textstat
from nltk.sentiment import SentimentIntensityAnalyzer
from wordfreq import word_frequency


In [None]:
# Read the raw measurements; the file is JSON Lines (one JSON record per line).
data = pd.read_json(
    path_or_buf='data/energy_measurements.jsonl',
    lines=True,
)


In [None]:
# Feature: syntactic_tree_depth (deepest token in the spaCy dependency parse).
nlp = spacy.load("en_core_web_sm")


def get_syntactic_tree_depth(text):
    """Return the largest number of ancestors any token of *text* has.

    The text is parsed with the module-level ``nlp`` pipeline. A text that
    yields no tokens has depth 0.
    """
    deepest = 0
    for token in nlp(text):
        # A token's depth is the number of ancestors above it in the parse.
        depth = sum(1 for _ in token.ancestors)
        if depth > deepest:
            deepest = depth
    return deepest


data["syntactic_tree_depth"] = [
    get_syntactic_tree_depth(prompt) for prompt in data["prompt"]
]


In [None]:
# Feature: clause_count (number of clause/connective markers in the prompt).
clause_words = [
    "and", "but", "because", "that", "which", "who", "although", "if", "when", "while", "though",
    "or", "so", "yet", "nor", "for", "since", "until", "unless", "whereas", "wherever", "wherever",
    "however", "therefore", "moreover", "furthermore", "nevertheless", "meanwhile", "consequently",
    "whereas", "provided", "supposing", "in case", "as long as", "as soon as", "as if", "as though",
    "even though", "even if", "so that", "in order that", "such that", "now that", "once", "before",
    "after", "during", "throughout", "despite", "in spite of", "regardless of", "due to", "owing to",
    "thanks to", "as a result", "consequently", "thus", "hence", "accordingly", "similarly", "likewise",
    "on the other hand", "in contrast", "by contrast", "alternatively", "instead", "rather", "instead of",
    "as well as", "along with", "together with", "in addition to", "besides", "apart from", "except",
    "except for", "other than", "instead of", "rather than", "as opposed to", "in comparison to",
    "compared to", "compared with", "unlike", "like", "such as", "for example", "for instance",
    "namely", "that is", "i.e.", "e.g.", "in other words", "to put it differently", "to clarify"
]

# Count markers as whole words/phrases rather than raw substrings:
#   * `str.count` matched fragments ("or" inside "for", "if" inside
#     "different"), which inflated the count on almost every prompt;
#   * entries repeated in the list ("wherever", "whereas", ...) counted twice;
#   * nested phrases ("even though" / "though") counted twice.
# The list is de-duplicated and sorted longest-first so that, at any given
# position, the longest phrase wins and is counted exactly once.
_clause_markers = sorted(set(clause_words), key=lambda marker: (-len(marker), marker))
# Lookarounds are used instead of \b because "i.e." and "e.g." end in a
# non-word character, where \b would not match before a following space.
_clause_pattern = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(marker) for marker in _clause_markers) + r")(?!\w)"
)

data["clause_count"] = data["prompt"].str.lower().apply(
    lambda x: len(_clause_pattern.findall(x))
)


In [None]:
# Feature: flesch_kincaid_grade, computed per prompt by textstat.
data["flesch_kincaid_grade"] = data["prompt"].map(textstat.flesch_kincaid_grade)


In [None]:
# Feature: gunning_fog_index, computed per prompt by textstat.
data["gunning_fog_index"] = data["prompt"].map(textstat.gunning_fog)


In [None]:
# Feature: smog_index, computed per prompt by textstat.
data["smog_index"] = data["prompt"].map(textstat.smog_index)


In [None]:
# Feature: avg_word_frequency (mean English word frequency of the prompt).
def _mean_word_frequency(text):
    """Return the mean ``word_frequency(word, "en")`` over whitespace-split words.

    Each word is lower-cased before lookup. Returns 0 when *text* contains
    no words.
    """
    words = text.split()
    if not words:
        return 0
    return sum(word_frequency(word.lower(), "en") for word in words) / len(words)


data["avg_word_frequency"] = data["prompt"].apply(_mean_word_frequency)


In [None]:
# Feature: lexical_diversity (distinct words / sqrt(total words)).
def _lexical_diversity(text):
    """Return the number of distinct whitespace-split words over sqrt(word count).

    Returns 0 when *text* contains no words.
    """
    words = text.split()
    if not words:
        return 0
    return len(set(words)) / (len(words) ** 0.5)


data["lexical_diversity"] = data["prompt"].apply(_lexical_diversity)


In [None]:
# Feature: type_token_ratio (distinct words / total words).
def _type_token_ratio(words):
    """Return distinct-word count over word count for a list of words, or 0 if empty."""
    if not words:
        return 0
    return len(set(words)) / len(words)


data["type_token_ratio"] = data["prompt"].str.split().apply(_type_token_ratio)


In [None]:
# Feature: vocabulary_richness (distinct words / total words, epsilon-smoothed).
def _vocabulary_richness(text):
    """Return distinct-word count over (word count + 1e-6).

    The small constant in the denominator avoids division by zero for a
    prompt with no words.
    """
    words = text.split()
    return len(set(words)) / (len(words) + 1e-6)


data["vocabulary_richness"] = data["prompt"].apply(_vocabulary_richness)


In [None]:
# Feature: named_entity_density (entities found by spaCy / number of tokens).
entity_densities = []
for parsed in nlp.pipe(data["prompt"], disable=["tagger", "parser"]):
    token_total = len(parsed)
    # An empty doc has no tokens; report 0 instead of dividing by zero.
    entity_densities.append(len(parsed.ents) / token_total if token_total else 0)

data["named_entity_density"] = entity_densities


In [None]:
# Feature: tech_ai_density (distinct tech/AI keywords present / word count).
tech_ai_keywords = [
    "technology", "artificial", "intelligence", "machine", "learning", "algorithm", "data", "computer",
    "software", "programming", "code", "digital", "online", "internet", "app", "system", "platform",
    "database", "cloud", "cyber", "ai", "ml", "neural", "network", "deep", "model", "training",
    "prediction", "automation", "robotic", "bot", "api", "framework", "library", "package", "module",
    "function", "variable", "loop", "condition", "logic", "syntax", "debug", "compile", "execute",
    "server", "client", "frontend", "backend", "fullstack", "devops", "deployment", "infrastructure",
    "security", "encryption", "authentication", "authorization", "privacy", "blockchain", "crypto",
    "bitcoin", "ethereum", "smart", "contract", "defi", "nft", "metaverse", "vr", "ar", "iot",
    "sensor", "device", "hardware", "cpu", "gpu", "memory", "storage", "bandwidth", "latency"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "ai" inside "explain" or "ar" inside "are", so
# nearly every prompt scored. The set also counts a keyword once even if it
# is listed twice. Note: inflected forms (e.g. plurals) no longer match.
_tech_ai_vocab = frozenset(tech_ai_keywords)

data["tech_ai_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_tech_ai_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: business_finance_density (distinct business/finance keywords present / word count).
business_finance_keywords = [
    "business", "company", "finance", "money", "investment", "market", "economy", "profit", "revenue",
    "budget", "cost", "price", "value", "stock", "trading", "banking", "credit", "loan", "payment",
    "transaction", "capital", "funding", "venture", "startup", "enterprise", "corporation", "firm",
    "industry", "sector", "marketplace", "competition", "competitive", "strategy", "planning", "growth",
    "expansion", "acquisition", "merger", "partnership", "collaboration", "contract", "agreement",
    "negotiation", "deal", "offer", "proposal", "bid", "tender", "quotation", "invoice", "receipt",
    "accounting", "bookkeeping", "audit", "tax", "taxation", "deduction", "exemption", "refund",
    "financial", "monetary", "fiscal", "economic", "commercial", "retail", "wholesale", "distribution",
    "supply", "demand", "consumption", "production", "manufacturing", "operations", "logistics",
    "management", "leadership", "administration", "governance", "compliance", "regulation", "policy",
    "risk", "insurance", "coverage", "premium", "claim", "settlement", "liability", "asset", "equity",
    "debt", "bond", "security", "portfolio", "diversification", "return", "yield", "dividend", "interest"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "tax" inside "syntax", "deal" inside "ideal" or
# "firm" inside "confirm". The set also counts a keyword once even if it is
# listed twice. Note: inflected forms (e.g. plurals) no longer match.
_business_finance_vocab = frozenset(business_finance_keywords)

data["business_finance_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_business_finance_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: health_medical_density (distinct health/medical keywords present / word count).
health_medical_keywords = [
    "health", "medical", "doctor", "patient", "treatment", "medicine", "drug", "therapy", "surgery",
    "hospital", "clinic", "diagnosis", "symptom", "disease", "illness", "wellness", "fitness",
    "exercise", "nutrition", "diet", "healthcare", "physician", "nurse", "specialist", "surgeon",
    "dentist", "psychiatrist", "psychologist", "therapist", "counselor", "practitioner", "provider",
    "pharmacy", "pharmacist", "prescription", "medication", "dosage", "side", "effect", "allergy",
    "infection", "virus", "bacteria", "pathogen", "immune", "immunity", "vaccine", "vaccination",
    "prevention", "cure", "recovery", "rehabilitation", "physical", "mental", "emotional", "psychological",
    "chronic", "acute", "severe", "mild", "moderate", "condition", "disorder", "syndrome", "disability",
    "injury", "wound", "fracture", "sprain", "strain", "pain", "ache", "sore", "tender", "swollen",
    "inflammation", "fever", "temperature", "blood", "pressure", "heart", "rate", "pulse", "breathing",
    "respiration", "oxygen", "circulation", "cardiovascular", "respiratory", "digestive", "nervous",
    "endocrine", "immune", "system", "organ", "tissue", "cell", "gene", "genetic", "hereditary",
    "congenital", "developmental", "aging", "elderly", "pediatric", "maternal", "prenatal", "postnatal"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "rate" inside "generate", "side" inside
# "consider" or "gene" inside "general". The set also counts a keyword once
# even though some ("immune") are listed twice. Note: inflected forms
# (e.g. plurals) no longer match.
_health_medical_vocab = frozenset(health_medical_keywords)

data["health_medical_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_health_medical_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: education_learning_density (distinct education keywords present / word count).
education_learning_keywords = [
    "education", "school", "university", "college", "student", "teacher", "learning", "study", "course",
    "degree", "knowledge", "skill", "training", "research", "academic", "scholar", "expert", "professional",
    "career", "job", "classroom", "lecture", "seminar", "workshop", "tutorial", "lesson", "curriculum",
    "syllabus", "textbook", "assignment", "homework", "project", "thesis", "dissertation", "paper",
    "essay", "report", "presentation", "exam", "test", "quiz", "grade", "score", "mark", "evaluation",
    "assessment", "feedback", "instructor", "professor", "lecturer", "tutor", "mentor", "advisor",
    "counselor", "guidance", "coaching", "instruction", "teaching", "pedagogy", "methodology", "approach",
    "technique", "strategy", "practice", "exercise", "drill", "repetition", "memorization", "comprehension",
    "understanding", "analysis", "synthesis", "evaluation", "critical", "thinking", "problem", "solving",
    "creativity", "innovation", "discovery", "exploration", "investigation", "inquiry", "question",
    "answer", "solution", "explanation", "clarification", "demonstration", "example", "illustration",
    "concept", "theory", "principle", "law", "rule", "formula", "equation", "calculation", "computation",
    "literacy", "numeracy", "communication", "language", "vocabulary", "grammar", "syntax", "pronunciation",
    "reading", "writing", "speaking", "listening", "comprehension", "fluency", "proficiency", "mastery"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "test" inside "latest", "law" inside "flaw" or
# "mark" inside "market". The set also counts a keyword once even though some
# ("evaluation", "comprehension") are listed twice. Note: inflected forms
# (e.g. plurals) no longer match.
_education_learning_vocab = frozenset(education_learning_keywords)

data["education_learning_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_education_learning_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: science_research_density (distinct science/research keywords present / word count).
science_research_keywords = [
    "science", "research", "study", "experiment", "theory", "hypothesis", "analysis", "discovery",
    "innovation", "development", "testing", "results", "findings", "evidence", "proof", "method",
    "process", "technique", "approach", "scientific", "empirical", "observational", "experimental",
    "laboratory", "lab", "laboratory", "facility", "equipment", "instrument", "measurement", "data",
    "statistics", "statistical", "probability", "correlation", "causation", "variable", "control", "group",
    "sample", "population", "survey", "questionnaire", "interview", "observation", "documentation",
    "publication", "journal", "article", "paper", "conference", "presentation", "poster", "abstract",
    "citation", "reference", "bibliography", "literature", "review", "meta", "analysis", "systematic",
    "quantitative", "qualitative", "mixed", "methods", "design", "protocol", "procedure", "guideline",
    "standard", "criteria", "validity", "reliability", "accuracy", "precision", "error", "bias", "confound",
    "physics", "chemistry", "biology", "mathematics", "statistics", "engineering", "technology", "medicine",
    "psychology", "sociology", "anthropology", "geography", "geology", "astronomy", "environmental",
    "climate", "ecology", "evolution", "genetics", "molecular", "cellular", "organism", "species",
    "biodiversity", "conservation", "sustainability", "renewable", "energy", "pollution", "emission"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "lab" inside "available" or "meta" inside
# "metal". The set also counts a keyword once even though some ("laboratory",
# "analysis", "statistics") are listed twice. Note: inflected forms
# (e.g. plurals) no longer match.
_science_research_vocab = frozenset(science_research_keywords)

data["science_research_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_science_research_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: social_relationships_density (distinct social keywords present / word count).
social_relationships_keywords = [
    "relationship", "family", "friend", "social", "community", "group", "team", "partner", "marriage",
    "dating", "communication", "interaction", "connection", "support", "help", "advice", "guidance",
    "mentor", "coach", "parent", "child", "sibling", "spouse", "couple", "romance", "love", "affection",
    "intimacy", "trust", "loyalty", "commitment", "bond", "attachment", "closeness", "proximity",
    "socialization", "networking", "collaboration", "cooperation", "teamwork", "partnership", "alliance",
    "friendship", "companionship", "fellowship", "camaraderie", "solidarity", "unity", "harmony",
    "conflict", "disagreement", "argument", "dispute", "tension", "stress", "pressure", "anxiety",
    "isolation", "loneliness", "separation", "divorce", "breakup", "rejection", "abandonment", "loss",
    "grief", "mourning", "bereavement", "sadness", "depression", "anxiety", "worry", "fear", "anger",
    "jealousy", "envy", "resentment", "bitterness", "forgiveness", "reconciliation", "healing", "recovery",
    "therapy", "counseling", "support", "group", "therapy", "peer", "support", "mentoring", "coaching",
    "leadership", "followership", "influence", "persuasion", "negotiation", "mediation", "arbitration",
    "culture", "tradition", "custom", "ritual", "ceremony", "celebration", "festival", "holiday",
    "community", "neighborhood", "society", "civilization", "population", "demographics", "diversity",
    "inclusion", "exclusion", "discrimination", "prejudice", "bias", "stereotype", "stigma", "label"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "team" inside "steam", "love" inside "glove" or
# "loss" inside "glossary". The set also counts a keyword once even though
# several ("support", "therapy", "group", "anxiety", "community") are listed
# more than once. Note: inflected forms (e.g. plurals) no longer match.
_social_relationships_vocab = frozenset(social_relationships_keywords)

data["social_relationships_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_social_relationships_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: entertainment_culture_density (distinct entertainment keywords present / word count).
entertainment_culture_keywords = [
    "entertainment", "music", "movie", "film", "book", "game", "sport", "art", "culture", "fun",
    "enjoyment", "hobby", "interest", "passion", "creativity", "imagination", "story", "narrative",
    "character", "plot", "theater", "drama", "comedy", "tragedy", "romance", "action", "adventure",
    "thriller", "horror", "mystery", "fantasy", "science", "fiction", "documentary", "reality", "show",
    "television", "tv", "streaming", "netflix", "youtube", "podcast", "radio", "broadcast", "media",
    "news", "journalism", "blog", "vlog", "social", "media", "facebook", "twitter", "instagram", "tiktok",
    "dance", "ballet", "opera", "concert", "performance", "show", "exhibition", "gallery", "museum",
    "library", "bookstore", "magazine", "newspaper", "publication", "author", "writer", "poet", "novelist",
    "artist", "painter", "sculptor", "photographer", "designer", "architect", "musician", "singer",
    "composer", "conductor", "actor", "actress", "director", "producer", "screenwriter", "playwright",
    "athlete", "player", "team", "coach", "training", "practice", "competition", "tournament", "championship",
    "league", "season", "match", "game", "score", "victory", "defeat", "win", "lose", "tie", "draw",
    "fitness", "exercise", "workout", "gym", "yoga", "pilates", "running", "cycling", "swimming",
    "hiking", "climbing", "skiing", "snowboarding", "surfing", "diving", "sailing", "fishing", "hunting",
    "cooking", "baking", "recipe", "restaurant", "cuisine", "food", "drink", "wine", "beer", "cocktail",
    "travel", "vacation", "holiday", "trip", "journey", "adventure", "exploration", "discovery", "sightseeing"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "art" inside "start", "win" inside "window",
# "tie" inside "cities" or "action" inside "function". The set also counts a
# keyword once even though several ("media", "show", "game", "adventure") are
# listed twice. Note: inflected forms (e.g. plurals) no longer match.
_entertainment_culture_vocab = frozenset(entertainment_culture_keywords)

data["entertainment_culture_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_entertainment_culture_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: travel_lifestyle_density (distinct travel/lifestyle keywords present / word count).
travel_lifestyle_keywords = [
    "travel", "trip", "vacation", "holiday", "destination", "place", "country", "city", "hotel",
    "restaurant", "food", "cooking", "recipe", "lifestyle", "routine", "habit", "experience",
    "adventure", "exploration", "journey", "voyage", "expedition", "tour", "sightseeing", "landmark",
    "monument", "museum", "gallery", "park", "beach", "mountain", "forest", "desert", "ocean", "lake",
    "river", "island", "peninsula", "coast", "shore", "valley", "canyon", "cave", "volcano", "glacier",
    "airport", "airline", "flight", "plane", "aircraft", "pilot", "passenger", "ticket", "boarding",
    "passport", "visa", "customs", "immigration", "border", "checkpoint", "security", "luggage", "baggage",
    "hotel", "accommodation", "resort", "hostel", "bed", "breakfast", "suite", "room", "check", "in",
    "checkout", "reception", "concierge", "service", "amenity", "facility", "pool", "spa", "gym",
    "restaurant", "dining", "cuisine", "menu", "chef", "waiter", "waitress", "tip", "bill", "payment",
    "shopping", "store", "mall", "market", "boutique", "souvenir", "gift", "purchase", "buy", "sell",
    "currency", "money", "cash", "credit", "card", "atm", "exchange", "rate", "price", "cost", "budget",
    "transportation", "bus", "train", "subway", "metro", "taxi", "uber", "lyft", "car", "rental", "drive",
    "walking", "hiking", "cycling", "scooter", "motorcycle", "boat", "cruise", "ferry", "subway", "tram",
    "lifestyle", "daily", "routine", "schedule", "appointment", "meeting", "work", "office", "home",
    "family", "personal", "private", "leisure", "recreation", "relaxation", "stress", "relief", "wellness",
    "health", "fitness", "exercise", "diet", "nutrition", "sleep", "rest", "recovery", "energy", "vitality"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "in" inside "thinking", "car" inside "care",
# "bus" inside "business" or "rest" inside "interest", so virtually every
# prompt scored. The set also counts a keyword once even though several
# ("hotel", "restaurant", "routine", "lifestyle", "subway") are listed more
# than once. Note: inflected forms (e.g. plurals) no longer match, and "in"
# still matches as a standalone word because it is in the keyword list.
_travel_lifestyle_vocab = frozenset(travel_lifestyle_keywords)

data["travel_lifestyle_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_travel_lifestyle_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: sentiment_polarity (VADER "compound" score of each prompt).
# The VADER lexicon must be available before the analyzer is constructed.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the "compound" entry of the analyzer's polarity scores for *text*."""
    return sia.polarity_scores(text)["compound"]


data["sentiment_polarity"] = data["prompt"].apply(_compound_score)


In [None]:
# Feature: sentiment_intensity (|positive share - negative share| per prompt).
def _sentiment_intensity(text):
    """Return the absolute gap between the "pos" and "neg" polarity scores.

    The prompt is scored once and both entries are read from the same result;
    the previous lambda ran the analyzer twice per prompt for the same values.
    Relies on the module-level ``sia`` analyzer.
    """
    scores = sia.polarity_scores(text)
    return abs(scores["pos"] - scores["neg"])


data["sentiment_intensity"] = data["prompt"].apply(_sentiment_intensity)


In [None]:
# Feature: information_density (share of tokens tagged as content words).
def get_information_density(text):
    """Return the fraction of tokens whose ``pos_`` is NOUN, VERB, ADJ or ADV.

    The text is processed with the module-level ``nlp`` pipeline. Returns 0
    when the text yields no tokens.
    """
    parsed = nlp(text)
    token_total = len(parsed)
    if not token_total:
        return 0
    content_total = sum(
        1 for token in parsed if token.pos_ in ("NOUN", "VERB", "ADJ", "ADV")
    )
    return content_total / token_total


data["information_density"] = [
    get_information_density(prompt) for prompt in data["prompt"]
]


In [None]:
# Feature: avg_sentence_length_prompt (mean words per sentence).
def _mean_words_per_sentence(fragments):
    """Return the mean whitespace-split word count over non-blank fragments.

    *fragments* is the list produced by splitting a prompt on ".", "!" and
    "?". Blank fragments (e.g. after a trailing period) are ignored; returns
    0 when no non-blank fragment remains.
    """
    non_blank = [fragment for fragment in fragments if fragment.strip()]
    if not non_blank:
        return 0
    return sum(len(fragment.split()) for fragment in non_blank) / len(non_blank)


prompt_fragments = data["prompt"].str.split(r"[.!?]")
data["avg_sentence_length_prompt"] = prompt_fragments.apply(_mean_words_per_sentence)


In [None]:
# Feature: abstract_thinking_density (distinct abstract-thinking keywords present / word count).
abstract_thinking_keywords = [
    "concept", "idea", "thought", "thinking", "reasoning", "logic", "rational", "analysis", "evaluation",
    "judgment", "opinion", "perspective", "viewpoint", "understanding", "comprehension", "interpretation",
    "meaning", "significance", "importance", "value", "abstract", "theoretical", "philosophical", "metaphysical",
    "existential", "ontological", "epistemological", "phenomenological", "hermeneutical", "dialectical",
    "synthesis", "integration", "coordination", "harmony", "unity", "coherence", "consistency", "clarity",
    "precision", "accuracy", "validity", "truth", "reality", "existence", "being", "essence", "nature",
    "substance", "form", "matter", "spirit", "soul", "mind", "consciousness", "awareness", "perception",
    "cognition", "intelligence", "wisdom", "knowledge", "insight", "intuition", "imagination", "creativity",
    "innovation", "originality", "uniqueness", "distinctiveness", "individuality", "personality", "identity",
    "self", "ego", "subconscious", "unconscious", "collective", "universal", "transcendent", "immanent",
    "absolute", "relative", "objective", "subjective", "empirical", "rational", "intuitive", "mystical",
    "spiritual", "religious", "sacred", "divine", "eternal", "infinite", "timeless", "immortal", "permanent",
    "temporary", "transient", "ephemeral", "fleeting", "momentary", "instant", "duration", "continuity"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "form" inside "information", "self" inside
# "itself" or "mind" inside "remind". The set also counts a keyword once even
# though "rational" is listed twice. Note: inflected forms (e.g. plurals) no
# longer match.
_abstract_thinking_vocab = frozenset(abstract_thinking_keywords)

data["abstract_thinking_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_abstract_thinking_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: problem_solving_density (distinct problem-solving keywords present / word count).
problem_solving_keywords = [
    "problem", "solution", "challenge", "difficulty", "issue", "trouble", "obstacle", "barrier", "strategy",
    "approach", "method", "technique", "process", "procedure", "step", "plan", "goal", "objective", "target",
    "aim", "resolve", "fix", "repair", "correct", "improve", "enhance", "optimize", "maximize", "minimize",
    "efficiency", "effectiveness", "productivity", "performance", "outcome", "result", "consequence", "impact",
    "troubleshoot", "debug", "diagnose", "identify", "locate", "find", "discover", "detect", "recognize",
    "analyze", "examine", "investigate", "research", "study", "explore", "test", "experiment", "trial",
    "error", "mistake", "failure", "success", "achievement", "accomplishment", "victory", "win", "overcome",
    "defeat", "conquer", "master", "control", "manage", "handle", "deal", "cope", "adapt", "adjust",
    "modify", "change", "alter", "transform", "convert", "translate", "interpret", "explain", "clarify",
    "simplify", "complex", "complicated", "intricate", "sophisticated", "advanced", "basic", "fundamental",
    "essential", "critical", "important", "significant", "relevant", "applicable", "practical", "useful",
    "effective", "successful", "working", "functional", "operational", "systematic", "organized", "structured",
    "logical", "rational", "reasonable", "sensible", "sound", "valid", "reliable", "consistent", "stable",
    "robust", "flexible", "adaptable", "versatile", "comprehensive", "complete", "thorough", "detailed"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "fix" inside "prefix", "aim" inside "claim",
# "win" inside "window" or "plan" inside "explanation". Using a set also
# guarantees each keyword is counted at most once. Note: inflected forms
# (e.g. plurals, "-ing" forms) no longer match.
_problem_solving_vocab = frozenset(problem_solving_keywords)

data["problem_solving_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_problem_solving_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: communication_density (distinct communication keywords present / word count).
communication_keywords = [
    "communication", "discussion", "conversation", "dialogue", "debate", "argument", "explanation", "description",
    "clarification", "understanding", "misunderstanding", "confusion", "agreement", "disagreement", "consensus",
    "compromise", "negotiation", "persuasion", "influence", "convince", "speak", "talk", "say", "tell", "express",
    "convey", "transmit", "share", "exchange", "interact", "connect", "relate", "associate", "link", "bond",
    "message", "information", "data", "content", "meaning", "significance", "importance", "relevance", "context",
    "background", "history", "story", "narrative", "account", "report", "summary", "overview", "introduction",
    "conclusion", "ending", "beginning", "start", "finish", "complete", "partial", "whole", "entire", "total",
    "language", "words", "vocabulary", "terminology", "jargon", "slang", "dialect", "accent", "pronunciation",
    "grammar", "syntax", "structure", "format", "style", "tone", "mood", "attitude", "feeling", "emotion",
    "written", "spoken", "verbal", "nonverbal", "body", "language", "gesture", "facial", "expression", "eye",
    "contact", "posture", "stance", "position", "distance", "proximity", "closeness", "intimacy", "formality",
    "informality", "casual", "professional", "official", "unofficial", "public", "private", "personal", "intimate"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "say" inside "essay", "tell" inside
# "intelligence" or "tone" inside "stone". The set also counts a keyword once
# even though "language" is listed twice. Note: inflected forms (e.g. plurals)
# no longer match.
_communication_vocab = frozenset(communication_keywords)

data["communication_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_communication_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: emotional_psychological_density (distinct emotion keywords present / word count).
emotional_psychological_keywords = [
    "emotion", "feeling", "mood", "attitude", "personality", "character", "behavior", "reaction", "response",
    "motivation", "inspiration", "confidence", "anxiety", "stress", "pressure", "comfort", "satisfaction",
    "disappointment", "frustration", "excitement", "happiness", "joy", "pleasure", "delight", "satisfaction",
    "contentment", "fulfillment", "achievement", "pride", "accomplishment", "success", "victory", "triumph",
    "sadness", "sorrow", "grief", "mourning", "loss", "bereavement", "depression", "melancholy", "despair",
    "hopelessness", "helplessness", "powerlessness", "vulnerability", "fragility", "sensitivity", "tenderness",
    "anger", "rage", "fury", "wrath", "irritation", "annoyance", "frustration", "resentment", "bitterness",
    "hatred", "hostility", "aggression", "violence", "conflict", "tension", "strain", "pressure", "burden",
    "fear", "anxiety", "worry", "concern", "apprehension", "dread", "terror", "panic", "alarm", "distress",
    "shame", "guilt", "regret", "remorse", "embarrassment", "humiliation", "disgrace", "dishonor", "disgrace",
    "love", "affection", "tenderness", "caring", "compassion", "empathy", "sympathy", "understanding", "support",
    "jealousy", "envy", "covetousness", "greed", "selfishness", "selflessness", "altruism", "generosity",
    "kindness", "goodness", "virtue", "morality", "ethics", "values", "principles", "beliefs", "convictions",
    "faith", "trust", "hope", "optimism", "pessimism", "cynicism", "skepticism", "doubt", "uncertainty"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "rage" inside "average" or "storage", or "joy"
# inside "enjoy". The set also counts a keyword once even though several
# ("satisfaction", "frustration", "pressure", "anxiety", "disgrace",
# "tenderness") are listed twice. Note: inflected forms (e.g. plurals) no
# longer match.
_emotional_psychological_vocab = frozenset(emotional_psychological_keywords)

data["emotional_psychological_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_emotional_psychological_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: decision_making_density (distinct decision-making keywords present / word count).
decision_making_keywords = [
    "decision", "choice", "option", "alternative", "preference", "priority", "consideration", "factor",
    "criteria", "standard", "requirement", "condition", "constraint", "limitation", "advantage", "disadvantage",
    "benefit", "risk", "consequence", "outcome", "result", "effect", "impact", "influence", "determine",
    "decide", "choose", "select", "pick", "elect", "vote", "favor", "prefer", "recommend", "suggest",
    "advise", "counsel", "guide", "direct", "lead", "manage", "control", "govern", "rule", "regulate",
    "judgment", "assessment", "evaluation", "appraisal", "estimation", "calculation", "computation", "analysis",
    "examination", "investigation", "research", "study", "exploration", "inquiry", "question", "query",
    "weigh", "balance", "compare", "contrast", "distinguish", "differentiate", "separate", "divide",
    "categorize", "classify", "organize", "arrange", "order", "rank", "rate", "score", "grade", "mark",
    "important", "significant", "relevant", "pertinent", "applicable", "suitable", "appropriate", "fitting",
    "necessary", "essential", "required", "mandatory", "optional", "voluntary", "discretionary", "flexible",
    "rigid", "strict", "lenient", "permissive", "restrictive", "limiting", "constraining", "binding"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "rate" inside "generate", "pick" inside
# "picture", "order" inside "border" or "elect" inside "select". Using a set
# also guarantees each keyword is counted at most once. Note: inflected forms
# (e.g. plurals) no longer match.
_decision_making_vocab = frozenset(decision_making_keywords)

data["decision_making_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_decision_making_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: time_change_density (distinct time/change keywords present / word count).
time_change_keywords = [
    "time", "moment", "period", "duration", "schedule", "timeline", "deadline", "change", "development",
    "progress", "improvement", "growth", "evolution", "transformation", "transition", "shift", "trend",
    "pattern", "cycle", "phase", "stage", "step", "level", "degree", "extent", "scope", "range", "scale",
    "before", "after", "during", "while", "since", "until", "when", "then", "now", "past", "present", "future",
    "early", "late", "soon", "recently", "previously", "originally", "initially", "finally", "ultimately",
    "eventually", "gradually", "suddenly", "immediately", "instantly", "quickly", "slowly", "rapidly", "swiftly",
    "temporarily", "permanently", "constantly", "continuously", "intermittently", "occasionally", "frequently",
    "rarely", "never", "always", "sometimes", "often", "usually", "typically", "normally", "regularly",
    "irregularly", "sporadically", "randomly", "systematically", "methodically", "carefully", "hastily",
    "urgently", "emergently", "critically", "importantly", "significantly", "substantially", "considerably",
    "slightly", "moderately", "greatly", "enormously", "tremendously", "vastly", "dramatically", "radically",
    "completely", "totally", "entirely", "fully", "partially", "incompletely", "unfinished", "ongoing", "active"
]

# Match whole words only. The previous substring test (`keyword in prompt`)
# fired on fragments such as "now" inside "know", "late" inside "translate"
# or "then" inside "authentic", and one word could score several keywords at
# once ("sometimes" matched both "time" and "sometimes"). Using a set also
# guarantees each keyword is counted at most once. Note: inflected forms
# (e.g. plurals) no longer match.
_time_change_vocab = frozenset(time_change_keywords)

data["time_change_density"] = data["prompt"].str.lower().apply(
    lambda x: len(_time_change_vocab.intersection(re.findall(r"\w+", x))) / len(x.split())
    if x.split()
    else 0
)


In [None]:
# Feature: semantic_category_diversity (number of distinct part-of-speech
# tags, `token.pos_`, that occur in the prompt).
#
# The tagger must stay enabled here: this feature reads `token.pos_`, and with
# "tagger" disabled (as it was before) no POS information is assigned, so
# every non-empty prompt collapsed to the same value. Only components this
# feature does not need ("parser", "ner") are switched off for speed.
# An empty doc yields an empty set, i.e. 0, so no explicit guard is needed.
data["semantic_category_diversity"] = [
    len({token.pos_ for token in doc})
    for doc in nlp.pipe(data["prompt"], disable=["parser", "ner"])
]


In [None]:
# Write the enriched frame back out as JSON Lines (one record per line).
data.to_json(
    path_or_buf='energy_features_dataset.jsonl',
    orient='records',
    lines=True,
)
