In [1]:
import pandas as pd
import string
import spacy
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [2]:
# spaCy English pipeline, used for part-of-speech tagging
nlp = spacy.load("en_core_web_sm")

# preprocessing lookups: NLTK English stopwords and the characters in string.punctuation
english_stopwords = set(stopwords.words('english'))
punctuation_list = set(string.punctuation)

# preprocessing: normalise a single review before tagging
def remove(text):
    """Lowercase `text` and strip stopwords, punctuation and non-alphabetic tokens.

    The lowercased text is split with NLTK's ``word_tokenize``; a token is kept
    only if it is not in ``english_stopwords``, not in ``punctuation_list`` and
    consists solely of alphabetic characters. The kept tokens are returned
    joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in english_stopwords or token in punctuation_list:
            continue
        if not token.isalpha():
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# collect the descriptive (adjective + noun) phrases
def extract_descriptive_phrases(texts):
    """Return every "<adjective> <noun>" pair found in `texts`.

    Each text is parsed with the module-level spaCy pipeline ``nlp``. For every
    token tagged ``ADJ`` whose syntactic head is tagged ``NOUN``, the string
    "<token text> <head text>" is recorded. Duplicates are kept so the result
    can be counted afterwards.

    Args:
        texts: Iterable of strings to parse.

    Returns:
        List of phrases, in the order they were encountered.
    """
    descriptive_phrases = []
    # nlp.pipe processes the texts as a stream in batches and yields the docs
    # in input order, which is much faster than calling nlp(text) per text.
    for doc in nlp.pipe(texts):
        for token in doc:
            if token.pos_ == "ADJ" and token.head.pos_ == "NOUN":
                descriptive_phrases.append(f"{token.text} {token.head.text}")
    return descriptive_phrases

# filter out phrases that contain generic, unneeded words
def filter_generic_phrases(phrases, blacklist):
    """Return the phrases that contain none of the blacklisted terms.

    A term matches only as a whole, space-delimited word (or run of words):
    "best" removes "best game" but not "bestiary entries". Matching is
    case-sensitive.

    Args:
        phrases: Iterable of space-separated phrases.
        blacklist: Iterable of words (or multi-word terms) to reject.

    Returns:
        List of the phrases that passed, in their original order.
    """
    # Pad both sides with a space so `in` can only match on word boundaries;
    # build the padded terms once rather than once per phrase.
    padded_terms = [f" {term} " for term in blacklist]
    kept = []
    for phrase in phrases:
        padded_phrase = f" {phrase} "
        if not any(term in padded_phrase for term in padded_terms):
            kept.append(phrase)
    return kept

# keep only the phrases that carry clearly positive or negative sentiment
def extract_phrases(texts, threshold=0.2):
    """Return the texts whose TextBlob polarity magnitude exceeds `threshold`.

    Args:
        texts: Iterable of strings to score.
        threshold: A text is kept only when the absolute value of its
            ``TextBlob(...).sentiment.polarity`` is strictly greater than this.

    Returns:
        List of the kept texts, in input order (duplicates preserved).
    """
    return [
        candidate
        for candidate in texts
        if abs(TextBlob(candidate).sentiment.polarity) > threshold
    ]

In [3]:
# load the review dataset
df_games = pd.read_csv('dataset/Terraria.csv')

# drop rows that have no review text (mutates df_games in place)
df_games.dropna(subset=['review_text'], inplace=True)

# clean every review with remove(): lowercase, strip stopwords, punctuation and non-alphabetic tokens
target = df_games['review_text'].apply(remove)

# keep only the descriptive (adjective + noun) phrases
descriptive_phrases = extract_descriptive_phrases(target)

# generic words that should not be extracted; phrases such as "great game" or "recommend game" are not useful
# positive
positive_generic_phrases = {
    "great", "good", "fun", "love", "best", "recommend", 
    "like", "awesome", "amazing", "really", "hours", "favorite"
}

# negative
negative_generic_phrases = {
    "bad", "terrible", "hate", "worst", "boring", "awful", 
    "dislike", "sucks", "stupid", "waste" 
}

# combine both sets and drop every phrase that contains one of these words
generic_phrases = positive_generic_phrases | negative_generic_phrases
filtered_phrases = filter_generic_phrases(descriptive_phrases, generic_phrases)

# keep the phrases that pass extract_phrases' sentiment filter (default threshold)
phrases = extract_phrases(filtered_phrases)

# keep only the 50 most frequent phrases
top_phrases = Counter(phrases).most_common(50)

# save to file, one "phrase: count" line per entry
with open("Most Frequent Words.txt", "w", encoding="utf-8") as f:
    for phrase, freq in top_phrases:
        f.write(f"{phrase}: {freq}\n")