In [6]:
import requests

url = "https://api.twitterapi.io/twitter/tweet/advanced_search"

querystring = {"cursor":"4","queryType":"Latest","query":"Transjakarta lang:id since:2024-04-16 until:2025-04-22"}

headers = {"X-API-Key": "c8fcbee491594413a5dcd710c1adc60a"}

response = requests.request("GET", url, headers=headers, params=querystring)

print(response.text)

{"tweets":[],"has_next_page":false,"next_cursor":null}


## Kode Scraping

In [10]:
# Scraping with TwitterAPI.IO

"""
Scrape 'Transjakarta' 1 Juli – 1 Agustus 2024 via twitterapi.io
Simpan → transjakarta_jul24.csv
"""

import requests, csv, time

API_KEY = "c8fcbee491594413a5dcd710c1adc60a"   # ganti milik Anda
QUERY   = "(Busway OR Transjakarta OR TJ) -from:PT_Transjakarta lang:id since:2025-05-18 until:2025-05-24"
ENDPT   = "https://api.twitterapi.io/twitter/tweet/advanced_search"
OUTCSV  = "Proof of Concept/May 18-24 2025.csv"

HEADERS = {"X-API-Key": API_KEY}
TARGET  = 2000            # total tweet yang ingin dikumpulkan
SLEEP   = 0.8            # delay antarpanggilan (hindari rate-limit)

all_rows, cursor, page = [], None, 0

while len(all_rows) < TARGET:
    page += 1
    params = {"query": QUERY}
    if cursor:
        params["cursor"] = cursor

    resp = requests.get(ENDPT, headers=HEADERS, params=params, timeout=30)
    if resp.status_code != 200:
        print(f"❌  (Error OR Eror) {resp.status_code}: {resp.text[:200]}")
        break

    data   = resp.json()
    tweets = data.get("tweets", [])

    for tw in tweets:
        all_rows.append({
            "id"        : tw["id"],
            "created_at": tw["createdAt"],
            "username"  : tw["author"]["userName"],
            "text"      : tw["text"].replace("\r", " ").replace("\n", " "),
            "retweets"  : tw["retweetCount"],
            "likes"     : tw["likeCount"],
            "lang"      : tw["lang"]
        })

    print(f"Page {page:>2}  +{len(tweets):3}  (total {len(all_rows)})")

    # pagination
    if not data.get("has_next_page"):
        break
    cursor = data.get("next_cursor") or None
    if not cursor:
        break

    time.sleep(SLEEP)

# ---------------- simpan ke CSV ----------------
if all_rows:
    with open(OUTCSV, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=all_rows[0].keys(), quoting=csv.QUOTE_ALL)
        w.writeheader()
        w.writerows(all_rows[:TARGET])
    print(f"✅  Selesai: {len(all_rows[:TARGET])} tweet → {OUTCSV}")
else:
    print("⚠️  Tidak ada data terkumpul.")

Page  1  + 17  (total 17)


TooManyRedirects: Exceeded 30 redirects.

In [12]:
import pandas as pd

# ---------- paths ----------
IN_CSV  = "Proof of Concept/POC_Merged_deduplicated.csv"
OUT_CSV = "Proof of Concept/POC_Merged_deduplicated_keyword filtered.csv"

# ---------- load ----------
df = pd.read_csv(IN_CSV, encoding="utf-8")

# ---------- filter ----------
# keep rows where 'text' contains any of TJ, Transjakarta, or Busway
pattern = r"\b(Tije|Transjakarta|Busway|TJ)\b"
mask = df["text"].str.contains(pattern, case=False, na=False)

removed = (~mask).sum()
kept    = mask.sum()
print(f"Dropped {removed} rows; kept {kept} rows.")

df_filtered = df[mask]

# ---------- save ----------
df_filtered.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"✅  Filtered file saved to {OUT_CSV}")

Dropped 3744 rows; kept 2321 rows.
✅  Filtered file saved to Proof of Concept/POC_Merged_deduplicated_keyword filtered.csv


  mask = df["text"].str.contains(pattern, case=False, na=False)


In [22]:
# merging CSV 

import os
import pandas as pd

# Define the directory path
directory = "negative tweets"

# Get a list of all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
csv_files.sort()  # Optional: sorts the files if order matters

# Read the first file to get the header
first_file_path = os.path.join(directory, csv_files[0])
merged_df = pd.read_csv(first_file_path)
header = merged_df.columns  # Save the header from the first file

# Loop through the rest of the files and append data using the same header
for file in csv_files[1:]:
    file_path = os.path.join(directory, file)
    # Read the CSV without header and skip the first row (header row)
    df = pd.read_csv(file_path, header=None, skiprows=1)
    df.columns = header  # Assign the header from the first file
    merged_df = pd.concat([merged_df, df], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv("negative tweets/negative tweets_Merged.csv", index=False)

print("CSV files merged successfully!")


CSV files merged successfully!


In [23]:
# remove duplicates

import pandas as pd
import os

# Nama file input dan output
input_filename = 'negative tweets/negative tweets_Merged.csv'
output_filename = 'negative tweets/negative tweets_Merged_Remove Duplicates.csv' # Nama file baru untuk output

# Nama kolom yang akan digunakan untuk mendeteksi duplikat
column_to_check = 'text'

print(f"Membaca file: {input_filename}")

# Membaca file CSV ke dalam DataFrame pandas
try:
    df = pd.read_csv(input_filename)
    print(f"Berhasil membaca {len(df)} baris dari {input_filename}")
except FileNotFoundError:
    print(f"Error: File '{input_filename}' tidak ditemukan. Pastikan file berada di direktori yang sama dengan script Python ini, atau berikan path lengkap.")
    exit() # Keluar jika file tidak ditemukan
except Exception as e:
    print(f"Terjadi error saat membaca file: {e}")
    exit()

# Memeriksa apakah kolom 'text' ada dalam DataFrame
if column_to_check not in df.columns:
    print(f"Error: Kolom '{column_to_check}' tidak ditemukan dalam file CSV.")
    print(f"Kolom yang tersedia: {df.columns.tolist()}")
    exit() # Keluar jika kolom tidak ditemukan

# Menghapus baris duplikat berdasarkan kolom 'text'
# keep='first' artinya mempertahankan baris pertama yang ditemukan untuk setiap nilai unik
print(f"Menghapus duplikat berdasarkan kolom '{column_to_check}'...")
df_cleaned = df.drop_duplicates(subset=[column_to_check], keep='first')

# Menampilkan informasi jumlah baris sebelum dan sesudah penghapusan duplikat
num_original_rows = len(df)
num_cleaned_rows = len(df_cleaned)
num_duplicates_removed = num_original_rows - num_cleaned_rows

print(f"Jumlah baris asli: {num_original_rows}")
print(f"Jumlah baris setelah menghapus duplikat: {num_cleaned_rows}")
print(f"Jumlah duplikat yang dihapus: {num_duplicates_removed}")

# Menyimpan DataFrame yang sudah dibersihkan ke file CSV baru
# index=False mencegah pandas menulis index DataFrame sebagai kolom
try:
    df_cleaned.to_csv(output_filename, index=False)
    print(f"Data yang sudah dibersihkan berhasil disimpan ke file: {output_filename}")
except Exception as e:
    print(f"Terjadi error saat menyimpan file: {e}")

Membaca file: negative tweets/negative tweets_Merged.csv
Berhasil membaca 794 baris dari negative tweets/negative tweets_Merged.csv
Menghapus duplikat berdasarkan kolom 'text'...
Jumlah baris asli: 794
Jumlah baris setelah menghapus duplikat: 730
Jumlah duplikat yang dihapus: 64
Data yang sudah dibersihkan berhasil disimpan ke file: negative tweets/negative tweets_Merged_Remove Duplicates.csv


In [None]:
# Clean Dataset with Remove comma

import pandas as pd
import re

SRC_CSV  = "Transjakarta/Transjakarta_TwitterApiIO_Merged.csv"
DEST_CSV = "Transjakarta/Transjakarta_TwitterApiIO_Merged_Clean.csv"

def clean_text(s: str, drop_emoji=True) -> str:
    if pd.isna(s):
        return s                       # biarkan NaN apa adanya
    # hilangkan koma
    s = s.replace(",", "")
    # ganti newline / carriage-return dengan spasi
    s = s.replace("\n", " ").replace("\r", " ")
    # # opsional: buang emoji / karakter non-ASCII (Unicode > 127)
    # if drop_emoji:
    #     s = re.sub(r"[^\x00-\x7F]+", "", s)
    # # rapikan spasi ganda
    # s = re.sub(r"\s{2,}", " ", s).strip()
    return s

df = pd.read_csv(SRC_CSV, encoding="utf-8")

# buat kolom baru atau timpa kolom lama
df["text_clean"] = df["text"].apply(clean_text)

df.to_csv(DEST_CSV, index=False, encoding="utf-8")
print(f"Selesai – hasil disimpan ke {DEST_CSV}")


In [None]:
# Remove Duplicates based on column

import pandas as pd
import re

# -------- path file --------
SMALL_CSV = "neutral tweets/neutral tweets_Merged_Remove Duplicates.csv"
BIG_CSV   = "neutral tweets/Tugas Akhir_Dataset_Merged_Original - v1.0_data processing.csv"
OUT_CSV   = "neutral tweets/neutral tweets_TwitterApiIO_Merged_Clean_Filtered.csv"   # <— hasil = subset SMALL_CSV

# ---------- fungsi normalisasi ----------
def normalize(s: str) -> str:
    """Hilangkan koma, newline, spasi ganda, lalu trim."""
    if pd.isna(s):
        return ""
    s = s.replace(",", "")
    s = s.replace("\n", " ").replace("\r", " ")
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip()

# ---------- baca file besar & bikin SET teks yang mau DIHAPUS ----------
df_big = pd.read_csv(BIG_CSV, encoding="utf-8")
big_set = {normalize(t) for t in df_big["full_text"].dropna()}
print(f"Tweets unik di BIG_CSV  : {len(big_set):,}")

# ---------- baca file kecil ----------
df_small = pd.read_csv(SMALL_CSV, encoding="utf-8")
col_small = "text_clean" if "text_clean" in df_small.columns else "text"

# normalisasi kolom kecil utk pencocokan
norm_small = df_small[col_small].apply(normalize)

# ---------- mask: SIMPAN kalau TIDAK ada di big_set ----------
keep_mask = ~norm_small.isin(big_set)

removed_cnt = (~keep_mask).sum()
kept_cnt    = keep_mask.sum()

print(f"Baris dihapus (match)  : {removed_cnt:,}")
print(f"Baris tersisa          : {kept_cnt:,}")

# ---------- simpan hasil ----------
df_small.loc[keep_mask].to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"✅  File tersaring disimpan → {OUT_CSV}")