# **Import Library**

In [1]:
import re
import nltk
import string
import emoji
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Dicky
[nltk_data]     Zulfikar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Dicky
[nltk_data]     Zulfikar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Load Data**

In [2]:
data1 = pd.read_csv("datasets/kebebasan_pers.csv")
data2 = pd.read_csv("datasets/pembungkaman_pers.csv")

df = pd.concat([data1, data2], ignore_index=True)
df.to_csv("tweets_pers.csv", index=False)

In [3]:
df.head()

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1947629184714678376,Tue Jul 22 12:05:57 +0000 2025,0,Wakil Gubernur Bali I Nyoman Giri Prasta membe...,1947629184714678376,,,in,,0,0,0,https://x.com/undefined/status/194762918471467...,1293295315148140545,
1,1944614483714003310,Mon Jul 14 04:26:36 +0000 2025,0,Kasus dugaan intimidasi terhadap jurnalis Jawa...,1944614483714003310,https://pbs.twimg.com/amplify_video_thumb/1944...,,in,,0,0,0,https://x.com/undefined/status/194461448371400...,1571102928852897798,
2,1937912480186937535,Wed Jun 25 16:35:14 +0000 2025,1,Pembungkaman media tanda sakitnya demokrasi #d...,1937912480186937535,https://pbs.twimg.com/media/GuTZIV9bEAAl8Xv.jpg,,in,,0,0,0,https://x.com/undefined/status/193791248018693...,952232797476802562,
3,1937400838584172770,Tue Jun 24 06:42:09 +0000 2025,0,RUU Penyiaran disusun Komdigi dengan Nezar Pat...,1937400838584172770,https://pbs.twimg.com/media/GuMHySRagAIXZnr.jpg,,in,,0,0,0,https://x.com/undefined/status/193740083858417...,1704017670507819008,
4,1937395247748882705,Tue Jun 24 06:19:56 +0000 2025,1,Nezar Patria Wamenkomdigi memastikan pemerinta...,1937395247748882705,https://pbs.twimg.com/media/GuMCs8fagAM4mxz.jpg,,in,,0,0,0,https://x.com/undefined/status/193739524774888...,1698999506736713728,


# **Data Exploration**

## Cek Struktur Data

In [4]:
df.shape

(712, 15)

## Cek Tipe Data

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   conversation_id_str      712 non-null    int64  
 1   created_at               712 non-null    object 
 2   favorite_count           712 non-null    int64  
 3   full_text                712 non-null    object 
 4   id_str                   712 non-null    int64  
 5   image_url                258 non-null    object 
 6   in_reply_to_screen_name  329 non-null    object 
 7   lang                     712 non-null    object 
 8   location                 0 non-null      float64
 9   quote_count              712 non-null    int64  
 10  reply_count              712 non-null    int64  
 11  retweet_count            712 non-null    int64  
 12  tweet_url                712 non-null    object 
 13  user_id_str              712 non-null    int64  
 14  username                 0

## Cek Missing Values

In [6]:
df.isnull().sum()

conversation_id_str          0
created_at                   0
favorite_count               0
full_text                    0
id_str                       0
image_url                  454
in_reply_to_screen_name    383
lang                         0
location                   712
quote_count                  0
reply_count                  0
retweet_count                0
tweet_url                    0
user_id_str                  0
username                   712
dtype: int64

## Cek Duplicate Data

In [7]:
df.duplicated().sum()

0

# **Preprocessing**

## Drop Column

In [8]:
df = df.drop(columns=['conversation_id_str', 'id_str', 'image_url', 'in_reply_to_screen_name', 
                      'lang', 'location', 'user_id_str', 'username'], errors='ignore')

## Format Column Time

In [9]:
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")

  df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")


## Text Preprocessing

In [10]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

def clean_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Membersihkan URL, mention, hashtag, angka, emoji, dan simbol
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # hapus URL
    text = re.sub(r"@\w+|#\w+", "", text)                # hapus mention & hashtag
    text = re.sub(r"\d+", "", text)                      # hapus angka
    text = emoji.replace_emoji(text, replace='')         # hapus emoji
    text = text.translate(str.maketrans("", "", string.punctuation))  # hapus tanda baca
    
    # 3. Tokenisasi
    tokens = word_tokenize(text)
    
    # 4. Stopword removal
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]
    
    # 5. Stemming 
    stemmed_tokens = [stemmer.stem(w) for w in tokens]
    
    return " ".join(stemmed_tokens)

df['clean_text'] = df['full_text'].astype(str).apply(clean_text)

In [None]:
df_compare = df[['full_text', 'clean_text']].head()
df_compare

Unnamed: 0,full_text,clean_text
0,Wakil Gubernur Bali I Nyoman Giri Prasta membe...,wakil gubernur bal nyoman giri prasta apresias...
1,Kasus dugaan intimidasi terhadap jurnalis Jawa...,duga intimidasi jurnalis jawa pos radar bal an...
2,Pembungkaman media tanda sakitnya demokrasi #d...,bungkam media tanda sakit demokrasi
3,RUU Penyiaran disusun Komdigi dengan Nezar Pat...,ruu siar susun komdigi nezar patria bukti peri...
4,Nezar Patria Wamenkomdigi memastikan pemerinta...,nezar patria wamenkomdigi perintah gak campur ...


In [17]:
df_clean = df[['clean_text']]
df_clean = df_clean.rename(columns={'clean_text': 'text'})
df_clean

Unnamed: 0,text
0,wakil gubernur bal nyoman giri prasta apresias...
1,duga intimidasi jurnalis jawa pos radar bal an...
2,bungkam media tanda sakit demokrasi
3,ruu siar susun komdigi nezar patria bukti peri...
4,nezar patria wamenkomdigi perintah gak campur ...
...,...
707,hussein buka kotak cium bau busuk buka kardus
708,cica pulang liput hussein abri yusuf muda dong...
709,kotak isi kepala babi cica wartawan desk polit...
710,rabu maret kantor tempo kirim paket kepala bab...


In [19]:
df_clean.sample(20)

Unnamed: 0,text
339,pidana jurnalis cedera
701,harap jaksa kritik langsung ngebungkam
619,jaksa jancok
315,yuk ikut twitter space diskusi dgn audiens mod...
263,nasihat prof bagir man capres bebas pers
94,ancam tempo selidik bareskrim usut duga teror ...
445,nilai angka agregat skala fredoom house nilai ...
320,bebas pers bebas kritik tentang george orwell ...
114,mari dukung jurnalis berani bicara benar garda...
376,jurnalisme bungkam sastra bicara seno gumira a...


# **Saved Data Clean**

In [21]:
df_clean.to_csv("datasets/tweets_clean.csv")