# Crawling PTA dan Berita

### Library

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, sys, time

## 1. Crawling PTA

In [2]:
BASE_URL = "https://pta.trunojoyo.ac.id/c_search/byprod"

### Fungsi

In [3]:
def get_max_page(prodi_id):
    url = f"{BASE_URL}/{prodi_id}/1"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    # Cari tombol >> (last page)
    last_page = soup.select_one('ol.pagination a:contains("»")')
    if last_page and "href" in last_page.attrs:
        href = last_page["href"]
        # Pecah URL -> ambil angka terakhir
        max_page = int(href.split("/")[-1])
        return max_page

    # fallback kalau pagination tidak ada
    return 1

In [4]:
# Contoh pemakaian
print(get_max_page(10))

172




In [5]:
def print_progress(prodi_id, prodi, current_page, total_pages):
    percent = (current_page / total_pages) * 100
    bar_length = 20
    filled_length = int(bar_length * current_page // total_pages)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    sys.stdout.write(f'\r[{prodi_id}] {prodi} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    sys.stdout.flush()
    if current_page == total_pages:
        sys.stdout.write('\n')

### Crawling Data Fakultas Teknik

In [1]:
import time, sys, re
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://pta.trunojoyo.ac.id/c_search/byprod"

def get_max_page(prodi_id):
    """Fungsi bantu untuk mendapatkan jumlah halaman maksimum tiap prodi"""
    try:
        r = requests.get(f"{BASE_URL}/{prodi_id}/1", timeout=15)
        soup = BeautifulSoup(r.content, "html.parser")
        page_info = soup.select_one('div.pagination')  # sesuaikan selector
        if page_info:
            pages = re.findall(r'\d+', page_info.text)
            return max([int(p) for p in pages]) if pages else 1
        return 1
    except:
        return 1

def pta_prd():
    start_time = time.time()

    data = {
        "id": [], "penulis": [], "judul": [], "abstrak_id": [], 
        "abstrak_en": [], "pembimbing_pertama": [], 
        "pembimbing_kedua": [], "prodi": []
    }

    # daftar prodi
    prodi_list = [9, 10, 11, 19, 20, 23, 31, 32, 33]
    max_pages_dict = {i: get_max_page(i) for i in prodi_list}

    session = requests.Session()  # koneksi reuse

    for i in prodi_list:
        max_page = max_pages_dict[i]
        for j in range(1, max_page + 1):
            url = f"{BASE_URL}/{i}/{j}"
            try:
                r = session.get(url, timeout=15)
                soup = BeautifulSoup(r.content, "html.parser")
            except requests.exceptions.RequestException as e:
                print(f"\n⚠ Gagal ambil {url}: {e}")
                continue

            jurnals = soup.select('li[data-cat="#luxury"]')
            isii = soup.select_one('div#begin')
            if not isii:
                continue

            prodi_full = isii.select_one('h2').text.strip() if isii.select_one('h2') else "N/A"
            prodi_name = prodi_full.replace("Journal Jurusan ", "")

            for jurnal in jurnals:
                try:
                    link_keluar = jurnal.select_one('a.gray.button')['href']
                    id_match = re.search(r"/detail/(\d+)", link_keluar)
                    pta_id = id_match.group(1) if id_match else None

                    response = session.get(link_keluar, timeout=15)
                    soup1 = BeautifulSoup(response.content, "html.parser")
                    isi = soup1.select_one('div#content_journal')
                    if not isi:
                        continue

                    judul = isi.select_one('a.title').text.strip() if isi.select_one('a.title') else ""
                    penulis = isi.select_one('span:contains("Penulis")').text.split(' : ')[1] if isi.select_one('span:contains("Penulis")') else ""
                    pembimbing_pertama = isi.select_one('span:contains("Dosen Pembimbing I")').text.split(' : ')[1] if isi.select_one('span:contains("Dosen Pembimbing I")') else ""
                    pembimbing_kedua = isi.select_one('span:contains("Dosen Pembimbing II")').text.split(' :')[1] if isi.select_one('span:contains("Dosen Pembimbing II")') else ""

                    paragraf = isi.select('p[align="justify"]')
                    abstrak_id = paragraf[0].get_text(strip=True) if len(paragraf) > 0 else "N/A"
                    abstrak_en = paragraf[1].get_text(strip=True) if len(paragraf) > 1 else "N/A"

                    data["id"].append(pta_id)
                    data["penulis"].append(penulis)
                    data["judul"].append(judul)
                    data["abstrak_id"].append(abstrak_id)
                    data["abstrak_en"].append(abstrak_en)
                    data["pembimbing_pertama"].append(pembimbing_pertama)
                    data["pembimbing_kedua"].append(pembimbing_kedua)
                    data["prodi"].append(prodi_name)

                    time.sleep(0.1)  # jeda lebih cepat
                except Exception as e:
                    print(f"\n⚠ Error parsing jurnal: {e}")
                    continue

            # update progress bar per prodi
            persen = int(j / max_page * 100)
            sys.stdout.write(f"\r📑 Prodi {i} ({prodi_name}) Hal {j}/{max_page} [{persen}%]")
            sys.stdout.flush()

            # simpan CSV setiap 5 halaman
            if j % 5 == 0:
                pd.DataFrame(data).to_csv("pta_prd.csv", index=False, encoding="utf-8-sig")

        sys.stdout.write("\n")
        # simpan CSV tiap prodi selesai
        pd.DataFrame(data).to_csv("pta_prd.csv", index=False, encoding="utf-8-sig")

    # simpan final
    df = pd.DataFrame(data)
    df.to_csv("pta_prd.csv", index=False, encoding="utf-8-sig")

    # hitung durasi
    elapsed = int(time.time() - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df


In [2]:
pta_prd()



📑 Prodi 9 (Teknik Industri) Hal 1/1 [100%]
📑 Prodi 10 (Teknik Informatika) Hal 1/1 [100%]
📑 Prodi 11 (Manajemen Informatika) Hal 1/1 [100%]
📑 Prodi 19 (Teknik Multimedia Dan Jaringan) Hal 1/1 [100%]
📑 Prodi 20 (Mekatronika) Hal 1/1 [100%]
📑 Prodi 23 (Teknik Elektro) Hal 1/1 [100%]
📑 Prodi 31 (Sistem Informasi) Hal 1/1 [100%]
📑 Prodi 32 (Teknik Mesin) Hal 1/1 [100%]
📑 Prodi 33 (Teknik Mekatronika) Hal 1/1 [100%]

✅ Seluruh data berhasil dikumpulkan!
📊 Total entri: 35
⏱ Waktu eksekusi: 0 jam 0 menit 32 detik


Unnamed: 0,id,penulis,judul,abstrak_id,abstrak_en,pembimbing_pertama,pembimbing_kedua,prodi
0,80421100005,Siliwangi Fitra Rachmawanto S.T.,OPTIMASI PEMILIHAN PORTOFOLIO SAHAM PERUSAHAAN...,Portofolio adalah sekumpulan saham yang dimili...,Portofolio is a collection of stock owned by i...,"Heri Awalul Ilhamsah S.T., M.T.","Retno Indriartiningtias S.T., M.T.",Teknik Industri
1,80421100087,AHMAD MAS'UD,PERANCANGAN TATA LETAK FASILITAS LANTAI PRODUK...,PT. ABC merupakan perusahaan yang bergerak dib...,PT. ABC is a company engaged in the manufactur...,"SABARUDIN AKHMAD, S.T., M.T.","SUGENG PURWOKO, S.T., M.T.",Teknik Industri
2,80421100019,Yulianto Fauzanta,PERUMUSAN STRATEGI BISNIS UD. BUDI JAYA BANGKA...,Bangkalan merupakan salah satu kabupaten yang ...,Bangkalan is one of the districts that have th...,"Fitri Agustina, S.T., M.T","Retno Indriartiningtias, S.T., M.T",Teknik Industri
3,80421100055,M Mundir Muhlisin,USULAN PERBAIKAN UTILITAS RESOURCES PADA LANTA...,Simulasi adalah duplikasi atau abstraksi dari ...,Simulation is a duplication or abstraction of ...,Mu'alim ST MT,Sugeng Purwoko ST MT,Teknik Industri
4,80421100046,Muhibbin,Peningkatan Kepuasan Masyarakat Terhadap Pelay...,Kepuasan adalah tingkat perasaan seseorang ter...,Satisfaction is feeling level of someone to se...,Rahmad Hidayat,Retno Indriartiningtias,Teknik Industri
5,40411100468,A.Ubaidillah S.Kom,PERANCANGAN DAN IMPLEMENTASI SISTEM DATABASE \...,Sistem informasi akademik (SIAKAD) merupaka...,Academic information systems (SIAKAD) is an in...,Budi Setyono M.T,Hermawan S.T,Teknik Informatika
6,40411100476,"M. Basith Ardianto,",APLIKASI KONTROL DAN MONITORING JARINGAN KOMPU...,Berjalannya koneksi jaringan komputer dengan l...,-,"Drs. Budi Soesilo, MT","Koko Joni, ST",Teknik Informatika
7,40411100480,"Akhmad Suyandi, S.Kom",RANCANG BANGUN APLIKASI PROXY SERVER UNTUK\r\n...,Web server adalah sebuah perangkat lunak serve...,Web server is a server software functioning to...,"Drs. Budi Soesilo, M.T","Hermawan, ST, MT",Teknik Informatika
8,70411100070,Heri Supriyanto,SISTEM PENDUKUNG KEPUTUSAN OPTIMASI PENJADWALA...,Penjadwalan kuliah di Perguruan Tinggi me...,Scheduling courses in universities is a ...,"Mulaab, S.Si., M.Kom","Firli Irhamni, ST., M.Kom",Teknik Informatika
9,80411100115,Septian Rahman Hakim,SISTEM AUGMENTED REALITY ANIMASI BENDA BERGERA...,Seiring perkembangan teknologi yang ada diduni...,As the development of technology existing in t...,"Arik Kurniawati, S.Kom., M.T.","Haryanto, S.T., M.T.",Teknik Informatika


### Link Keluaran Data Fakultas Teknik

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import urllib3

# matikan warning SSL insecure
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_all_links(base_url, max_pages=50):
    visited = set()
    results = []

    def scrape_page(url):
        try:
            response = requests.get(url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # ambil semua link keluar
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)
                results.append({
                    "Page": url,
                    "Link Keluar": full_link
                })

            # cari link internal untuk dilanjutkan
            for a in soup.find_all('a', href=True):
                href = a['href'].strip()
                if not href or href == "#":
                    continue
                full_link = urljoin(url, href)
                # hanya ambil link internal (masih di domain utama)
                if full_link.startswith(base_url) and full_link not in visited:
                    visited.add(full_link)
                    if len(visited) < max_pages:
                        scrape_page(full_link)

        except Exception as e:
            print(f"⚠ Gagal akses {url}: {e}")

    # mulai dari base_url
    visited.add(base_url)
    scrape_page(base_url)

    # rapikan dataframe
    df = pd.DataFrame(results).reset_index(drop=True)
    df.index += 1
    df.insert(0, "No", df.index)
    return df

# contoh penggunaan
url = "https://informatika.trunojoyo.ac.id/"
df_links = scrape_all_links(url, max_pages=30)  # max_pages = batas biar ga infinite loop

df_links
# df_links.to_csv("semua_link.csv", index=False, encoding="utf-8-sig")

Unnamed: 0,No,Page,Link Keluar
1,1,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/
2,2,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
3,3,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
4,4,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
5,5,https://informatika.trunojoyo.ac.id/,https://informatika.trunojoyo.ac.id/bidang-min...
...,...,...,...
1704,1704,https://informatika.trunojoyo.ac.id/unit-kegia...,https://ukmfteecom.vercel.app/
1705,1705,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...
1706,1706,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...
1707,1707,https://informatika.trunojoyo.ac.id/unit-kegia...,https://informatika.trunojoyo.ac.id/berita-pro...


## 2. Crawling Berita

### Fungsi

In [None]:
# Fungsi progress bar manual
def print_progress(kategori, current_page, total_pages):
    percent = (current_page / total_pages) * 100
    bar_length = 20
    filled_length = int(bar_length * current_page // total_pages)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    sys.stdout.write(f'\r{kategori} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    sys.stdout.flush()
    if current_page == total_pages:
        sys.stdout.write('\n\n')

In [None]:
# Fungsi untuk ambil konten artikel
def get_article_content(url):
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")

    paragraphs = []
    content_divs = soup.find_all("div", id="content-wrapper")
    for div in content_divs:
        for p in div.find_all("p"):
            text = p.get_text(strip=True)
            if text and not text.lower().startswith("baca juga"):
                paragraphs.append(text)
    return " ".join(paragraphs)

### Crawling Berita

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re, sys, time
import csv

# Fungsi progress bar manual
def print_progress(kategori, current_page, total_pages):
    percent = (current_page / total_pages) * 100
    bar_length = 20
    filled_length = int(bar_length * current_page // total_pages)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    sys.stdout.write(f'\r{kategori} - Page {current_page}/{total_pages} [{bar}] {percent:.2f}%')
    sys.stdout.flush()
    if current_page == total_pages:
        sys.stdout.write('\n\n')

# Fungsi untuk ambil konten artikel
def get_article_content(url):
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "html.parser")

    paragraphs = []
    content_divs = soup.find_all("div", id="content-wrapper")
    for div in content_divs:
        for p in div.find_all("p"):
            text = p.get_text(strip=True)
            if text and not text.lower().startswith("baca juga"):
                paragraphs.append(text)
    return " ".join(paragraphs)

# Fungsi crawling berita
def berita(categories, pages_per_category=1):
    start_time = time.time()  # mulai hitung waktu

    BASE_URL = "https://www.tempo.co/indeks?page={}&category=rubrik&rubric_slug={}"

    data = {
        "id_berita": [],
        "judul_berita": [],
        "isi_berita": [],
        "kategori_berita": []
    }

    for cat_id, cat in enumerate(categories, start=1):
        for page in range(1, pages_per_category+1):
            url = BASE_URL.format(page, cat)
            r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(r.text, "html.parser")

            articles = soup.select("figure figcaption a")
            for a in articles:
                link = "https://www.tempo.co" + a["href"]
                title = a.get_text(strip=True)

                id_match = re.search(r"-(\d+)$", link)
                berita_id = id_match.group(1) if id_match else None

                try:
                    content = get_article_content(link)
                except:
                    content = ""

                data["id_berita"].append(berita_id)
                data["judul_berita"].append(title)
                data["isi_berita"].append(content)
                data["kategori_berita"].append(cat)

            print_progress(cat, page, pages_per_category)

    df = pd.DataFrame(data)
    # Save to CSV with default quoting behavior
    df.to_csv("tempo_berita.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_MINIMAL, doublequote=True)


    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # summary
    print("\n✅ Seluruh data berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

categories = ["politik", "hukum", "ekonomi", "lingkungan", "hiburan", "internasional", "otomotif", "olahraga", "sepakbola"]

In [None]:
berita(categories, pages_per_category=5)

politik - Page 5/5 [████████████████████] 100.00%

hukum - Page 5/5 [████████████████████] 100.00%

ekonomi - Page 5/5 [████████████████████] 100.00%

lingkungan - Page 5/5 [████████████████████] 100.00%

hiburan - Page 5/5 [████████████████████] 100.00%

internasional - Page 5/5 [████████████████████] 100.00%

otomotif - Page 5/5 [████████████████████] 100.00%

olahraga - Page 5/5 [████████████████████] 100.00%

sepakbola - Page 5/5 [████████████████████] 100.00%


✅ Seluruh data berhasil dikumpulkan!
📊 Total entri: 900
⏱️ Waktu eksekusi: 0 jam 33 menit 23 detik


Unnamed: 0,id_berita,judul_berita,isi_berita,kategori_berita
0,2073715,Alasan Ahmad Ali Meninggalkan NasDem untuk Gab...,KETUA Harian Partai Solidaritas Indonesia (PSI...,politik
1,2073713,Perintah Panglima TNI untuk SPPG Agar Kasus Ke...,PANGLIMA TNI JenderalAgus Subiyantomemerintahk...,politik
2,2073697,"Agar Kualitas Makanan MBG Terjaga, BGN Perinta...",Wakil Kepala Badan Gizi Nasional atauBGNNanik ...,politik
3,2073690,"BGN Melarang Makanan Pabrikan untuk MBG, tapi ...",WAKIL Kepala Badan Gizi Nasional (BGN) Nanik S...,politik
4,2073683,Kaesang Pangarep Melantik Bapak J Jadi Ketua D...,KETUA Umum Partai Solidaritas Indonesia atauPS...,politik
...,...,...,...,...
895,2071222,Hasil Liga Champions Pekan Pertama: Barcelona ...,RANGKAIAN pertandinganLiga Champions2025/2026 ...,sepakbola
896,2071219,"Imbang 1-1 Lawan Lion City Sailors, Pelatih Pe...","PELATIHPersibBandung, Bojan Hodak, mengatakan ...",sepakbola
897,2071204,Pelatih Lion City Sailors Anggap Adil Hasil Im...,"PELATIHLion City Sailors, Aleksandar Rankovic,...",sepakbola
898,2071175,Hasil Liga Champions Asia: Laga Persib Bandung...,PERTANDINGAN antaraPersib Bandungvs Lion City ...,sepakbola


## Page & Link Keluar Berita

In [None]:
def berita_links(categories, pages_per_category=1):
    start_time = time.time()  # mulai hitung waktu

    BASE_URL = "https://www.tempo.co/indeks?page={}&category=rubrik&rubric_slug={}"

    data = {
        "id_berita": [],
        "page": [],
        "link_keluar": []
    }

    for cat in categories:
        for page in range(1, pages_per_category+1):
            url = BASE_URL.format(page, cat)
            r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(r.text, "html.parser")

            articles = soup.select("figure figcaption a")
            for a in articles:
                link = "https://www.tempo.co" + a["href"]
                id_match = re.search(r"-(\d+)$", link)
                berita_id = id_match.group(1) if id_match else None

                data["id_berita"].append(berita_id)
                data["page"].append(url)         # link page
                data["link_keluar"].append(link) # link detail

            # update progress bar
            print_progress(cat, page, pages_per_category)

    df = pd.DataFrame(data)
    df.to_csv("tempo_links.csv", index=False, encoding="utf-8-sig")

    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    # summary
    print("\n✅ Seluruh link berhasil dikumpulkan!")
    print(f"📊 Total entri: {len(df)}")
    print(f"⏱️ Waktu eksekusi: {jam} jam {menit} menit {detik} detik")

    return df

categories = ["politik", "hukum", "ekonomi", "lingkungan", "hiburan", "internasional", "otomotif", "olahraga", "sepakbola"]

In [None]:
berita_links(categories, pages_per_category=5)

politik - Page 5/5 [████████████████████] 100.00%

hukum - Page 5/5 [████████████████████] 100.00%

ekonomi - Page 5/5 [████████████████████] 100.00%

lingkungan - Page 5/5 [████████████████████] 100.00%

hiburan - Page 5/5 [████████████████████] 100.00%

internasional - Page 5/5 [████████████████████] 100.00%

otomotif - Page 5/5 [████████████████████] 100.00%

olahraga - Page 5/5 [████████████████████] 100.00%

sepakbola - Page 5/5 [████████████████████] 100.00%


✅ Seluruh link berhasil dikumpulkan!
📊 Total entri: 900
⏱️ Waktu eksekusi: 0 jam 1 menit 4 detik


Unnamed: 0,id_berita,page,link_keluar
0,2068198,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/respons-abdul-kad...
1,2068188,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/menkes-budi-gunad...
2,2068167,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/penyusunan-ruu-pe...
3,2068149,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/bahlil-kirim-tim-...
4,2068145,https://www.tempo.co/indeks?page=1&category=ru...,https://www.tempo.co/politik/koalisi-sipil-sor...
...,...,...,...
895,2065734,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/calvin-verdonk-...
896,2065727,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/penyebab-bayer-...
897,2065643,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/calvin-verdonk-...
898,2065621,https://www.tempo.co/indeks?page=5&category=ru...,https://www.tempo.co/sepakbola/kualifikasi-pia...


## 3. Pre-Pocessing PTA dan Berita

### Library

In [None]:
!pip install pandas nltk spacy Sastrawi pyspellchecker
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Pre-Pocessing PTA

In [None]:
import pandas as pd
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import spacy

# Download stopwords (sekali saja)
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# === Load dataset ===
pta_all = pd.read_csv("pta_teknik.csv")

# === Daftar Prodi Fakultas Teknik ===
prodi_teknik = [
    "Teknik Industri",
    "Teknik Informatika",
    "Manajemen Informatika",
    "Teknik Multimedia Dan Jaringan",
    "Mekatronika",
    "Teknik Elektro",
    "Sistem Informasi",
    "Teknik Mesin",
    "Teknik Mekatronika"
]

# === Filter hanya Fakultas Teknik ===
pta_teknik = pta_all[pta_all["prodi"].isin(prodi_teknik)].copy()

# === Stopwords ===
stopwords_id = set(stopwords.words("indonesian"))
stopwords_en = set(stopwords.words("english"))

# === Stemmer Indonesia ===
factory = StemmerFactory()
stemmer_id = factory.create_stemmer()

# === Spell checker English ===
spell_en = SpellChecker(language="en")

# ========================
# Fungsi Preprocessing Indo
# ========================
def preprocess_text_id(text):
    if pd.isna(text):
        return []
    # (2) Hapus tanda baca & simbol
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # (5) Tokenisasi
    tokens = text.split()
    if not tokens:
        return []
    # (1) Stopword removal
    tokens = [w for w in tokens if w not in stopwords_id]
    if not tokens:
        return []
    # (4) Stemming dengan Sastrawi
    tokens = [stemmer_id.stem(w) for w in tokens]
    return tokens

# ========================
# Fungsi Preprocessing English
# ========================
def preprocess_text_en(text):
    if pd.isna(text):
        return []
    # (2) Hapus tanda baca & simbol
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # (5) Tokenisasi
    tokens = text.split()
    if not tokens:
        return []
    # (3) Cek ejaan pembakuan kata (hindari None)
    corrected = []
    for w in tokens:
        corr = spell_en.correction(w)
        corrected.append(corr if corr is not None else w)
    tokens = corrected
    # (1) Stopword removal
    tokens = [w for w in tokens if isinstance(w, str) and w not in stopwords_en]
    if not tokens:
        return []
    # (4) Lematisasi dengan spaCy (hanya jika ada token)
    doc = nlp(" ".join(tokens))
    tokens = [token.lemma_ for token in doc]
    return tokens

# === Terapkan ke abstrak Fakultas Teknik ===
pta_teknik["abstrak_id_clean"] = pta_teknik["abstrak_id"].apply(preprocess_text_id)
pta_teknik["abstrak_en_clean"] = pta_teknik["abstrak_en"].apply(preprocess_text_en)

# === Simpan hasil ===
pta_teknik.to_csv("preprocessing_pta_teknik.csv", index=False)

# Contoh hasil
print("Jumlah data Fakultas Teknik:", len(pta_teknik))
display(pta_teknik[["prodi", "abstrak_id_clean", "abstrak_en_clean"]].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Jumlah data Fakultas Teknik: 2289


Unnamed: 0,prodi,abstrak_id_clean,abstrak_en_clean
0,Teknik Industri,"[portofolio, kumpul, saham, milik, investor, s...","[portfolio, collection, stock, own, investor, ..."
1,Teknik Industri,"[pt, abc, usaha, gerak, bidang, manufaktur, ka...","[pt, arc, company, engage, manufacture, wood, ..."
2,Teknik Industri,"[bangkal, salah, kabupaten, milik, potensi, al...","[bangkalan, one, district, potential, natural,..."
3,Teknik Industri,"[simulasi, duplikasi, abstraksi, hidup, nyata,...","[simulation, duplication, abstraction, real, l..."
4,Teknik Industri,"[puas, tingkat, asa, layan, banding, kerja, ha...","[satisfaction, feel, level, someone, service, ..."


### Pre-Pocessing Berita

In [None]:
import pandas as pd
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

# Download stopwords (sekali saja)
nltk.download('stopwords')

# === Load dataset ===
tempo_berita = pd.read_csv("tempo_berita.csv")

# === Stopwords & Stemmer Indonesia ===
stopwords_id = set(stopwords.words("indonesian"))
factory = StemmerFactory()
stemmer_id = factory.create_stemmer()

# ========================
# Fungsi Preprocessing Indo
# ========================
def preprocess_text_id(text):
    if pd.isna(text):
        return ""
    # Menghilangkan simbol & tanda baca
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # Tokenisasi
    tokens = text.split()
    # Stopword removal
    tokens = [w for w in tokens if w not in stopwords_id]
    # Stemming dengan Sastrawi
    tokens = [stemmer_id.stem(w) for w in tokens]
    return tokens

# === Terapkan ke Tempo ===
tempo_berita["judul_clean"] = tempo_berita["judul_berita"].apply(preprocess_text_id)
tempo_berita["isi_clean"] = tempo_berita["isi_berita"].apply(preprocess_text_id)

# === Simpan hasil ===
tempo_berita.to_csv("preprocessing_berita.csv", index=False)

# Contoh hasil
tempo_berita[["judul_clean", "isi_clean"]].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,judul_clean,isi_clean
0,"[alas, ahmad, ali, tinggal, nasdem, gabung, psi]","[ketua, hari, partai, solidaritas, indonesia, ..."
1,"[perintah, panglima, tni, sppg, racun, mbg]","[panglima, tni, jenderalagus, subiyantomemerin..."
2,"[kualitas, makan, mbg, jaga, bgn, perintah, tu...","[wakil, kepala, badan, gizi, nasional, ataubgn..."
3,"[bgn, larang, makan, pabrikan, mbg, susu, kemas]","[wakil, kepala, badan, gizi, nasional, bgn, na..."
4,"[kaesang, pangarep, lantik, j, ketua, dewan, b...","[ketua, partai, solidaritas, indonesia, ataups..."
