In [None]:
!pip install trafilatura newspaper3k fake-useragent

Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Downloading htmldate-1.9.3-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Download

In [None]:
import pandas as pd, time, random, requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from fake_useragent import UserAgent

ua = UserAgent()


def HEADERS():
    """Build the HTTP headers for one outgoing request.

    Returns:
        dict: ``User-Agent`` read from ``ua.random`` at call time (so each
        call may yield a different value) and a fixed ``Accept-Language``.
    """
    return {"User-Agent": ua.random, "Accept-Language": "en-US,en;q=0.9"}

def polite_delay(a=1.0, b=2.5):
    """Sleep for a random duration, in seconds, drawn uniformly from ``a`` to ``b``.

    Args:
        a: One bound of the pause, in seconds.
        b: The other bound of the pause, in seconds.
    """
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)


In [None]:
def fetch_url(url, retries=3, timeout=12):
    """Download ``url`` and report whether usable HTML was obtained.

    A request is attempted up to ``retries`` times. An attempt is repeated
    when the request raises, or when the server answers with a status that
    is commonly transient (429, 500, 502, 503, 504). Any other non-success
    answer (e.g. 404, or a 200 whose Content-Type is not HTML) is returned
    immediately. Between attempts the function sleeps ``1.5 * attempt_no``
    seconds; no sleep happens after the final attempt.

    Args:
        url: Address to download.
        retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        dict with the same five keys on every path:
            ok (bool): True only for status 200 with "text/html" in the
                Content-Type header.
            status (int | None): HTTP status of the last response, or None
                if the last attempt raised / no attempt was made.
            html (str | None): Response text when ``ok`` is True.
            ctype (str | None): Lower-cased Content-Type of the last response.
            error (str | None): ``str()`` of the exception from the last
                attempt, if it raised.
    """
    retryable_statuses = {429, 500, 502, 503, 504}
    result = {"ok": False, "status": None, "html": None, "ctype": None, "error": None}

    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS(), timeout=timeout)
            # A redirect may land on a non-HTML resource (pdf, image);
            # detect that from the response header.
            ctype = resp.headers.get("Content-Type", "").lower()
            if resp.status_code == 200 and "text/html" in ctype:
                return {"ok": True, "status": resp.status_code, "html": resp.text,
                        "ctype": ctype, "error": None}
            result = {"ok": False, "status": resp.status_code, "html": None,
                      "ctype": ctype, "error": None}
            if resp.status_code not in retryable_statuses:
                # Permanent answer (404, wrong content type, ...): retrying cannot help.
                return result
        except Exception as exc:
            # Best-effort: any failure of this attempt is recorded, not raised.
            result = {"ok": False, "status": None, "html": None,
                      "ctype": None, "error": str(exc)}

        # Linear backoff, skipped after the last attempt where it would only waste time.
        if attempt < retries - 1:
            time.sleep(1.5 * (attempt + 1))

    return result


In [None]:
def parse_article_bs4(html):
    """Extract a title and paragraph text from an HTML document.

    Args:
        html: Markup handed to ``BeautifulSoup`` with the "html.parser" backend.

    Returns:
        tuple: ``(title, content)`` where ``title`` is the stripped text of
        the first ``<title>`` element (None when there is none) and
        ``content`` is the text of all ``<p>`` elements joined by single
        spaces. Paragraphs are taken from the first ``<article>`` element
        when one exists, otherwise from the whole document.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Use the <article> element's contents if present, otherwise the whole page.
    container = soup.find("article") or soup
    paragraphs = (p.get_text(" ", strip=True) for p in container.find_all("p"))
    return title, " ".join(paragraphs).strip()

def parse_fallback_trafilatura(url):
    """Fallback extractor: download ``url`` with trafilatura and extract its text.

    This is a best-effort helper and never raises. Any failure (including
    trafilatura not being importable) is logged as a warning and reported
    to the caller as None.

    Args:
        url: Address of the page to download and extract.

    Returns:
        The value of ``trafilatura.extract`` (comments and tables excluded)
        when the download yields something, otherwise None.
    """
    import logging

    try:
        import trafilatura

        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            return trafilatura.extract(downloaded, include_comments=False, include_tables=False)
    except Exception as exc:
        # Keep the pipeline running, but leave a trace instead of hiding the failure.
        logging.getLogger(__name__).warning("trafilatura fallback failed for %s: %s", url, exc)
    return None

def parse_fallback_newspaper(url):
    """Fallback extractor: download and parse ``url`` with newspaper's ``Article``.

    This is a best-effort helper and never raises. Any failure (including
    newspaper not being importable) is logged as a warning and reported to
    the caller as ``(None, None)``.

    Args:
        url: Address of the article; it is parsed with ``language='en'``.

    Returns:
        tuple: ``(title, text)`` from the parsed article, or ``(None, None)``
        on failure.
    """
    import logging

    try:
        from newspaper import Article

        article = Article(url, language='en')
        article.download()
        article.parse()
        return article.title, article.text
    except Exception as exc:
        # Keep the pipeline running, but leave a trace instead of hiding the failure.
        logging.getLogger(__name__).warning("newspaper fallback failed for %s: %s", url, exc)
        return None, None


In [None]:
def scrape_one(url):
    """Scrape a single article URL into a flat result record.

    Pipeline: fetch the page, parse it with ``parse_article_bs4``, then fall
    back to trafilatura and finally newspaper when the text is too short.

    Args:
        url: Article address.

    Returns:
        dict: always holds ``url``, ``sumber`` (the URL's netloc),
        ``http_status`` and ``scrape_status`` ("ok" or "fail"). When the
        fetch fails it adds ``error_message`` only. Otherwise it adds
        ``judul``, ``konten`` and ``content_length``, plus ``error_message``
        when no extractor produced at least 300 characters.
    """
    record = {"url": url, "sumber": urlparse(url).netloc}
    fetched = fetch_url(url)

    # Guard clause: nothing to parse when the download itself failed.
    if not fetched["ok"]:
        record["http_status"] = fetched.get("status")
        record["scrape_status"] = "fail"
        record["error_message"] = fetched.get("error") or f"ctype={fetched.get('ctype')}"
        return record

    title, content = parse_article_bs4(fetched["html"])

    # If the content is too short, try the trafilatura fallback.
    if not content or len(content) < 500:
        extracted = parse_fallback_trafilatura(url)
        if extracted and len(extracted) >= 300:
            if not title:
                # trafilatura yields text only, so borrow the title from newspaper.
                title, _unused_text = parse_fallback_newspaper(url)
            content = extracted

    # Last resort: newspaper supplies the body, and the title if still missing.
    if not (content and len(content) >= 300):
        alt_title, alt_content = parse_fallback_newspaper(url)
        title = title or alt_title
        content = alt_content

    usable = bool(content) and len(content) >= 300

    # Keys are inserted in a fixed order; it determines the column order downstream.
    record["judul"] = title
    record["konten"] = content if usable else None
    record["content_length"] = len(content) if usable else 0
    record["http_status"] = fetched["status"]
    record["scrape_status"] = "ok" if usable else "fail"
    if not usable:
        record["error_message"] = "Empty/short content after fallbacks"
    return record


In [None]:
# Load the sheet of article links and locate its URL column by name
# (first column whose name contains "url" or "link", case-insensitive).
df = pd.read_csv("LINK ARTICLE EMIRATES PBA - Sheet1(1).csv")
url_candidates = [c for c in df.columns if "url" in c.lower() or "link" in c.lower()]
if not url_candidates:
    raise ValueError(
        f"No column containing 'url' or 'link' found; columns are: {list(df.columns)}"
    )
url_col = url_candidates[0]

# Scrape every non-null link, pausing before each request.
records = []
for u in df[url_col].dropna():
    polite_delay()
    try:
        records.append(scrape_one(u))
    except Exception as exc:
        # One malformed link or unparsable page must not abort the run and
        # throw away everything scraped so far; log it as a failed row instead.
        records.append({
            "url": u,
            "scrape_status": "fail",
            "error_message": f"{type(exc).__name__}: {exc}",
        })

df_out = pd.DataFrame(records)
# Split the results into successes and failures.
ok = df_out[df_out.scrape_status == "ok"].copy()
fail = df_out[df_out.scrape_status != "ok"].copy()

ok.to_csv("articles_raw.csv", index=False)         # -> used for EDA, sentiment, TF-IDF, NER
fail.to_csv("scrape_log_failed.csv", index=False)  # -> evidence for the report + for fixing links
len(ok), len(fail)


ERROR:trafilatura.downloads:not a 200 response: 401 for URL https://www.reuters.com/world/middle-east/emirates-goes-hiring-spree-cabin-crew-pilots-ground-services-staff-2025-07-22/
ERROR:trafilatura.downloads:not a 200 response: 401 for URL https://www.reuters.com/world/middle-east/dubais-emirates-signs-preliminary-deal-add-crypto-payments-2025-07-09/
ERROR:trafilatura.downloads:not a 200 response: 401 for URL https://www.reuters.com/business/aerospace-defense/emirates-airline-says-wings-clipped-by-boeing-delays-2024-11-27
ERROR:trafilatura.downloads:not a 200 response: 401 for URL https://www.reuters.com/world/middle-east/emirates-suspends-flights-transiting-through-dubai-after-storm-2024-04-19/
ERROR:trafilatura.downloads:not a 200 response: 401 for URL https://www.reuters.com/world/middle-east/emirates-airline-satisfied-with-cash-reserves-ipo-depends-government-chairman-2025-04-29/
ERROR:trafilatura.downloads:not a 200 response: 401 for URL https://www.reuters.com/business/aerospace

(101, 49)

In [None]:
# Reload the successfully scraped articles from the CSV written by the scraping cell.
# Note: this rebinds `df`, which until here held the input sheet of links.
df = pd.read_csv("articles_raw.csv")

In [None]:
# Summarize the reloaded frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   url             101 non-null    object 
 1   sumber          101 non-null    object 
 2   http_status     101 non-null    float64
 3   scrape_status   101 non-null    object 
 4   error_message   0 non-null      float64
 5   judul           101 non-null    object 
 6   konten          101 non-null    object 
 7   content_length  101 non-null    float64
dtypes: float64(3), object(5)
memory usage: 6.4+ KB


In [None]:
# Display the reloaded articles frame.
# NOTE(review): in the rendered output the emirates.com rows have `konten` beginning with
# "An error occurred while preparing your downloa..." — this looks like page boilerplate
# captured together with the article text; verify before using `konten` downstream.
df

Unnamed: 0,url,sumber,http_status,scrape_status,error_message,judul,konten,content_length
0,https://www.emirates.com/media-centre/emirates...,www.emirates.com,200.0,ok,,"Emirates expands its mobile footprint, launche...",An error occurred while preparing your downloa...,2978.0
1,https://www.arabianbusiness.com/industries/tra...,www.arabianbusiness.com,200.0,ok,,Emirates Skywards launches Premium Economy fli...,Emirates Skywards has introduced flight reward...,3653.0
2,https://www.emirates.com/media-centre/emirates...,www.emirates.com,200.0,ok,,Emirates Skywards introduces flight rewards in...,An error occurred while preparing your downloa...,4226.0
3,https://gulfnews.com/business/aviation/emirate...,gulfnews.com,200.0,ok,,"Emirates relaunches Skywards Everyday app, her...",Members who live outside the UAE can also down...,1629.0
4,https://www.arabianbusiness.com/industries/ret...,www.arabianbusiness.com,200.0,ok,,Emirates relaunches Skywards Everyday app - Ar...,Emirates has announced the relaunch of its Sky...,1567.0
...,...,...,...,...,...,...,...,...
96,https://www.emirates.com/media-centre/soaring-...,www.emirates.com,200.0,ok,,Soaring into a Silver Jubilee - Emirates Skywa...,An error occurred while preparing your downloa...,5687.0
97,https://www.emirates.com/media-centre/emirates...,www.emirates.com,200.0,ok,,Emirates introduces Premium Economy to Kuala L...,An error occurred while preparing your downloa...,3238.0
98,https://timesofindia.indiatimes.com/world/midd...,timesofindia.indiatimes.com,200.0,ok,,Dubai: Emirates cancels multiple flights as su...,"At TOI World Desk, our dedicated team of seaso...",1168.0
99,https://www.emirates.com/media-centre/emirates...,www.emirates.com,200.0,ok,,Emirates hosted the all-female Emirates ICC ma...,An error occurred while preparing your downloa...,2791.0
