In [22]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/delilawiryono/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/delilawiryono/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/delilawiryono/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/delilawiryono/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [23]:
def get_wordnet_pos(nltk_pos_tag):
    if nltk_pos_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_pos_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_pos_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default noun

In [24]:
df = pd.read_csv("/Users/delilawiryono/Desktop/apifylink.csv")
df["cleaned"] = df["body"].str.lower().str.replace(r"[^\w\s]", "", regex=True)

print(df["cleaned"])

0     the switch 2 is pretty much what i wanted out ...
1                                                 agree
2     highly recommend picking up the amfilm glass s...
3                                              where at
4     these are the codes for the amazon products\n\...
                            ...                        
93    ah that part i heard that the system they were...
94    hey there\n\nplease remember rule 1 in the fut...
95                                              removed
96    gtboth ntsc and pal versions are available to ...
97    my ps5 controller has drift and all ive played...
Name: cleaned, Length: 98, dtype: object


In [25]:
df["tokens"] = df["cleaned"].apply(word_tokenize)
print(df["tokens"])

0     [the, switch, 2, is, pretty, much, what, i, wa...
1                                               [agree]
2     [highly, recommend, picking, up, the, amfilm, ...
3                                           [where, at]
4     [these, are, the, codes, for, the, amazon, pro...
                            ...                        
93    [ah, that, part, i, heard, that, the, system, ...
94    [hey, there, please, remember, rule, 1, in, th...
95                                            [removed]
96    [gtboth, ntsc, and, pal, versions, are, availa...
97    [my, ps5, controller, has, drift, and, all, iv...
Name: tokens, Length: 98, dtype: object


In [26]:

stop_words = set(stopwords.words('english'))

df["stopwordremove"] = df["tokens"].apply(lambda x: [word for word in x if word not in stop_words])


In [27]:
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens):
    pos_tags = pos_tag(tokens)
    return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]

df['lemmatized'] = df['stopwordremove'].apply(lemmatize_tokens)
print(df["lemmatized"])

0     [switch, 2, pretty, much, want, switch, 1, cha...
1                                               [agree]
2     [highly, recommend, pick, amfilm, glass, scree...
3                                                    []
4     [code, amazon, product, 3pack, tempered, glass...
                            ...                        
93    [ah, part, heard, system, accidentally, switch...
94    [hey, please, remember, rule, 1, future, perso...
95                                             [remove]
96    [gtboth, ntsc, pal, version, available, switch...
97    [ps5, controller, drift, ive, play, like, 150,...
Name: lemmatized, Length: 98, dtype: object


In [28]:
df["clean_text"] = df["lemmatized"].apply(lambda tokens: " ".join(tokens))
df.to_csv("reddit_final_text.csv", index=False, encoding="utf-8")

print("✅ CSV berhasil disimpan sebagai reddit_preprocessed.csv")


✅ CSV berhasil disimpan sebagai reddit_preprocessed.csv
