In [1]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
!tesseract --version

'tesseract' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
import cv2
import numpy as np
import pandas as pd
import re
import os
import matplotlib.pyplot as plt
import pytesseract
from typing import List, Dict, Tuple, Optional, Any
from datetime import datetime, timedelta

# ==============================================================================
# 1. KONFIGURASI TERPUSAT (Ubah semua pengaturan hanya di sini)
# ==============================================================================
CONFIG = {
    # Path ke Tesseract-OCR (uncomment jika perlu untuk Windows)
    "tesseract_cmd": r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    
    # ❗ WAJIB: Ganti dengan path folder data Anda yang sebenarnya
    "folder_path": r"C:\Users\LENOVO\Downloads\statexplore\data\c10\consumption_pct_gdp",
    
    # Nama file output yang akan disimpan
    "output_csv_file": "consumption_pct_gdp_extracted_final.csv",
    
    # Rentang tahun yang akan diproses (kosongkan list untuk proses semua file)
    "years_to_process": [],  # Contoh: [2000, 2001, 2002] atau biarkan kosong
    
    # --- Pengaturan dari Kode Referensi ---
    # Rentang sumbu Y fallback (jika OCR gagal)
    "fallback_y_min": 52.0,
    "fallback_y_max": 72.0,
    
    # Rentang "Clamping" untuk menstabilkan nilai dan membuang outlier
    "clamp_y_min": 58.0,
    "clamp_y_max": 71.0,
    
    # Konfigurasi variabel dan warna (HSV)
    "var_name": "consumption_pct_gdp",
    "lower_color": np.array([90, 80, 50]),
    "upper_color": np.array([150, 255, 255]),
    
    # Opsi debug
    "debug_mode": False
}

# Aktifkan path Tesseract jika disetel di CONFIG
if CONFIG["tesseract_cmd"] and os.path.exists(CONFIG["tesseract_cmd"]):
    pytesseract.pytesseract.tesseract_cmd = CONFIG["tesseract_cmd"]

# ==============================================================================
# 2. FUNGSI-FUNGSI BANTU & INTI
# ==============================================================================

def is_leap(year: int) -> bool:
    """Mengecek apakah suatu tahun adalah tahun kabisat."""
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

def preprocess_for_ocr(image: np.ndarray) -> np.ndarray:
    """Membersihkan gambar agar lebih mudah dibaca oleh OCR."""
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    # Thresholding sederhana seringkali cukup untuk teks yang jelas
    _, thresh = cv.threshold(gray, 150, 255, cv.THRESH_BINARY_INV)
    return cv.resize(thresh, None, fx=1.5, fy=1.5, interpolation=cv.INTER_CUBIC)

def extract_text_from_image(image_crop: Dict[str, np.ndarray]) -> Tuple[Optional[int], Optional[float], Optional[float]]:
    """Mengekstrak tahun dari judul dan rentang sumbu Y dari gambar."""
    year_ocr, y_min_ocr, y_max_ocr = None, None, None
    try:
        title_text = pytesseract.image_to_string(image_crop['title'], config="--psm 6")
        match = re.search(r'\b(20\d{2})\b', title_text)
        if match:
            year_ocr = int(match.group(1))
    except Exception: pass
    
    try:
        processed_yaxis = preprocess_for_ocr(image_crop['yaxis'])
        ocr_text = pytesseract.image_to_string(processed_yaxis, config='--psm 6')
        numbers = [float(num) for num in re.findall(r'(\d+\.?\d*)', ocr_text)]
        if numbers and len(numbers) > 1:
            y_min_ocr, y_max_ocr = min(numbers), max(numbers)
    except Exception: pass
        
    return year_ocr, y_min_ocr, y_max_ocr

def extract_series_from_mask(mask: np.ndarray, params: Dict[str, Any]) -> pd.DataFrame:
    """Mengekstrak data deret waktu, mengadopsi logika dari kode referensi."""
    extracted_data = []
    plot_h, plot_w = mask.shape
    days_in_year = 366 if is_leap(params['year']) else 365
    start_date = datetime(params['year'], 1, 1)
    
    for x_pixel in range(plot_w):
        y_pixels = np.where(mask[:, x_pixel] > 0)[0]
        if len(y_pixels) > 0:
            y_pixel_center = int(np.median(y_pixels))  # Menggunakan median untuk stabilitas
            
            # Mapping nilai
            value = params['y_max'] - ((y_pixel_center / plot_h) * (params['y_max'] - params['y_min']))
            
            # Clamping nilai agar tetap dalam rentang logis
            value = max(min(value, params['clamp_y_max']), params['clamp_y_min'])
            
            day_of_year = int((x_pixel / plot_w) * days_in_year) + 1
            date = start_date + timedelta(days=day_of_year - 1)
            
            extracted_data.append({
                "Date": date.strftime("%Y-%m-%d"),
                "Year": params['year'], 
                "DayOfYear": day_of_year, 
                params['var_name']: round(value, 2)
            })
            
    return pd.DataFrame(extracted_data)

# ==============================================================================
# 3. FUNGSI UTAMA (Runner)
# ==============================================================================

def main(config: Dict[str, Any]):
    """Fungsi utama untuk menjalankan seluruh alur kerja."""
    
    if not os.path.isdir(config['folder_path']):
        print(f"❌ KESALAHAN: Folder tidak ditemukan di '{config['folder_path']}'.")
        return

    all_files = sorted([f for f in os.listdir(config['folder_path']) if f.endswith(".png")])
    
    # Filter file berdasarkan tahun jika disetel di CONFIG
    if config['years_to_process']:
        years_set = set(config['years_to_process'])
        files_to_process = [
            fname for fname in all_files 
            if re.search(r'(\d{4})', fname) and int(re.search(r'(\d{4})', fname).group(1)) in years_set
        ]
    else:
        files_to_process = all_files
    
    if not files_to_process:
        print(f"⚠️ Peringatan: Tidak ada file .png yang cocok ditemukan.")
        return

    print(f"🚀 Ditemukan {len(files_to_process)} file. Memulai proses...")
    
    all_results = []
    for filename in files_to_process:
        full_path = os.path.join(config['folder_path'], filename)
        img = cv2.imread(full_path)
        if img is None:
            print(f"   -> [Peringatan] Gagal membaca {filename}, dilewati.")
            continue

        print(f"\n[INFO] Memproses {filename}...")
        h, w, _ = img.shape
        
        crops = {
            'title': img[0:int(h*0.1), 0:w],
            'yaxis': img[int(h*0.12):int(h*0.9), 0:int(w*0.06)],
            'plot': img[int(h*0.12):int(h*0.9), int(w*0.05):int(w*0.98)]
        }

        year_ocr, y_min_ocr, y_max_ocr = extract_text_from_image(crops)
        year_from_file = int(re.search(r'(\d{4})', filename).group(1)) if re.search(r'(\d{4})', filename) else 2000
        
        year = year_ocr if year_ocr else year_from_file
        y_min, y_max = (y_min_ocr, y_max_ocr) if y_min_ocr is not None else (config['fallback_y_min'], config['fallback_y_max'])
        
        hsv = cv2.cvtColor(crops['plot'], cv2.COLOR_BGR2HSV)
        mask = cv2.inRange(hsv, config['lower_color'], config['upper_color'])
        
        series_params = {
            'year': year, 'y_min': y_min, 'y_max': y_max,
            'clamp_y_min': config['clamp_y_min'], 'clamp_y_max': config['clamp_y_max'],
            'var_name': config['var_name']
        }

        df_single_year = extract_series_from_mask(mask, series_params)
        if not df_single_year.empty:
            all_results.append(df_single_year)
            print(f"  -> [SUCCESS] {len(df_single_year)} titik data berhasil diekstrak.")
        else:
            print(f"  -> [WARNING] Tidak ada titik data yang ditemukan.")
            
    if not all_results:
        print("\n⚠️ PROSES SELESAI: Tidak ada data yang berhasil diekstrak dari semua file.")
        return

    # Gabungkan semua hasil dan simpan
    final_df = pd.concat(all_results, ignore_index=True).drop_duplicates(subset=['Date'], keep='first').sort_values(by='Date')
    final_df.to_csv(config['output_csv_file'], index=False)
    print(f"\n✅ SUKSES! {len(final_df)} baris data disimpan ke {config['output_csv_file']}")

# ==============================================================================
# 4. TITIK MASUK PROGRAM
# ==============================================================================

if __name__ == '__main__':
    main(CONFIG)

🚀 Ditemukan 25 file. Memulai proses...

[INFO] Memproses consumption_pct_gdp tahun 2000.png...
  -> [SUCCESS] 3389 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2001.png...
  -> [SUCCESS] 3388 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2002.png...
  -> [SUCCESS] 3388 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2003.png...
  -> [SUCCESS] 3389 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2004.png...
  -> [SUCCESS] 3389 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2005.png...
  -> [SUCCESS] 3388 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2006.png...
  -> [SUCCESS] 3389 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2007.png...
  -> [SUCCESS] 3389 titik data berhasil diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2008.png...
  -> [SUCCESS] 3389 titik data berhasil diekstrak