In [24]:
import cv2
import numpy as np
import pandas as pd
import re
import os
import matplotlib.pyplot as plt
import pytesseract

# --- PENGATURAN UTAMA ---

# 1. Sesuaikan path ke tesseract jika perlu (untuk Windows)
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# 2. ❗ WAJIB: Ganti dengan path folder data Anda yang sebenarnya
folder_path = r"D:\Dokumen\Proyek Data\Data Ekonomi\debt_gdp_ratio"

# 3. Nama file output yang spesifik
output_csv_file = "debt_gdp_ratio_2000-2009_extracted.csv"

# 4. ❗ PENTING: Sesuaikan rentang sumbu Y fallback sesuai visual di grafik Anda
#    Lihat salah satu gambar Anda, perkirakan nilai terendah dan tertingginya.
FALLBACK_Y_MIN, FALLBACK_Y_MAX = 20, 100 # Contoh, mohon disesuaikan

# 5. Konfigurasi variabel tunggal
var_name = "debt_gdp_ratio"
lower_color = np.array([90, 80, 50]) # Rentang untuk warna biru/garis utama
upper_color = np.array([130, 255, 255])


# ------------------- FUNGSI-FUNGSI BANTU (Telah disesuaikan) -------------------
def preprocess_for_ocr(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 4)
    return cv2.resize(thresh, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

def clean_mask(mask):
    kernel = np.ones((3, 3), np.uint8)
    return cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)

def is_leap(year):
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

def extract_year_from_title(title_crop):
    try:
        title_text = pytesseract.image_to_string(title_crop)
        match = re.search(r'\b(20\d{2})\b', title_text)
        if match: return int(match.group(1))
    except: pass
    return None

def extract_yaxis_range(yaxis_crop):
    try:
        processed_yaxis = preprocess_for_ocr(yaxis_crop)
        ocr_text = pytesseract.image_to_string(processed_yaxis, config='--psm 6')
        numbers = [float(num) for num in re.findall(r'(\d+\.?\d*)', ocr_text)]
        if numbers and len(numbers) > 1: return min(numbers), max(numbers)
    except: pass
    return None, None

def extract_series(mask, plot_h, plot_w, year, y_min, y_max, ocr_results):
    extracted = []
    cleaned_mask = clean_mask(mask)
    days_in_year = 366 if is_leap(year) else 365
    for x_pixel in range(plot_w):
        y_pixels = np.where(cleaned_mask[:, x_pixel] > 0)[0]
        if len(y_pixels) > 0:
            y_pixel_center = int(np.median(y_pixels))
            value = y_max - ((y_pixel_center / plot_h) * (y_max - y_min))
            day_of_year = int((x_pixel / plot_w) * days_in_year) + 1
            data_point = {"Year": year, "DayOfYear": day_of_year, var_name: round(value, 2)}
            data_point.update(ocr_results)
            extracted.append(data_point)
    return extracted

def process_and_visualize_data(df, title="Hasil Ekstraksi Data"):
    print("\n[ANALISIS] Memulai pemrosesan data...")
    proc_df = df.copy()
    proc_df['Date'] = pd.to_datetime(proc_df['Year'].astype(str) + '-' + proc_df['DayOfYear'].astype(str), format='%Y-%j', errors='coerce')
    proc_df = proc_df.sort_values(by='Date').set_index('Date')
    proc_df.interpolate(method='time', inplace=True)
    proc_df.dropna(inplace=True)
    if var_name in proc_df.columns:
        proc_df[f'{var_name}_MA30'] = proc_df[var_name].rolling(window=30).mean()
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(15, 7))
    if var_name in proc_df.columns:
        ax.plot(proc_df.index, proc_df[var_name], label=f'Data Asli - {var_name}', alpha=0.6)
        ax.plot(proc_df.index, proc_df[f'{var_name}_MA30'], label=f'MA30 - {var_name}', linestyle='--')
    ax.set_title(title, fontsize=16)
    ax.legend()
    plt.show()
    return proc_df

# ------------------- KODE UTAMA -------------------
def main():
    print(f"Memulai proses dari folder: '{folder_path}'")
    try:
        all_files = sorted([f for f in os.listdir(folder_path) if f.endswith(".png")])
    except FileNotFoundError:
        print(f"❌ KESALAHAN: Folder tidak ditemukan di '{folder_path}'. Ganti 'folder_path' dengan path yang benar.")
        return
    
    # --- Filter file berdasarkan rentang tahun 2000-2009 ---
    start_year, end_year = 2000, 2009
    files_to_process = []
    for filename in all_files:
        match = re.search(r'(\d{4})', filename)
        if match:
            year = int(match.group(1))
            if start_year <= year <= end_year:
                files_to_process.append(filename)
    
    if not files_to_process:
        print(f"⚠️ Peringatan: Tidak ada file yang ditemukan untuk rentang tahun {start_year}-{end_year}.")
        return
        
    print(f"🚀 Ditemukan {len(files_to_process)} file untuk rentang {start_year}-{end_year}. Memulai proses...")
    all_points = []

    for filename in files_to_process:
        full_path = os.path.join(folder_path, filename)
        img = cv2.imread(full_path)
        if img is None: continue
        
        h, w, _ = img.shape
        print(f"\n[INFO] Memproses {filename}...")

        title_crop = img[0:int(h*0.1), 0:w]
        yaxis_crop = img[int(h*0.12):int(h*0.9), 0:int(w*0.06)]

        year_ocr = extract_year_from_title(title_crop)
        year_from_file = int(re.search(r'(\d{4})', filename).group(1))
        year = year_ocr if year_ocr else year_from_file
        
        y_min_ocr, y_max_ocr = extract_yaxis_range(yaxis_crop)
        
        if y_min_ocr is not None and y_max_ocr is not None:
            y_min, y_max, yaxis_method = y_min_ocr, y_max_ocr, "OCR"
            print(f"  -> Sumbu Y (OCR): Min={y_min}, Max={y_max}")
        else:
            y_min, y_max, yaxis_method = FALLBACK_Y_MIN, FALLBACK_Y_MAX, "Fallback"
            print(f"  -> Sumbu Y (Fallback): Min={y_min}, Max={y_max}")

        ocr_results_to_save = {
            'YearOCR': year_ocr or '', 'YMinOCR': y_min_ocr or '',
            'YMaxOCR': y_max_ocr or '', 'YAxisMethod': yaxis_method
        }

        plot_area = img[int(h*0.12):int(h*0.9), int(w*0.05):int(w*0.98)]
        plot_h, plot_w, _ = plot_area.shape
        hsv = cv2.cvtColor(plot_area, cv2.COLOR_BGR2HSV)
        
        mask = cv2.inRange(hsv, lower_color, upper_color)
        
        data = extract_series(mask, plot_h, plot_w, year, y_min, y_max, ocr_results_to_save)
        
        if data:
            all_points.append(pd.DataFrame(data))

    if all_points:
        raw_df = pd.concat(all_points, ignore_index=True)
        analyzed_df = process_and_visualize_data(raw_df, title=f"Analisis Debt to GDP Ratio ({start_year}-{end_year})")
        
        analyzed_df = analyzed_df.reset_index()
        value_cols = [var_name, f'{var_name}_MA30']
        id_cols = ['Date', 'Year', 'DayOfYear']
        ocr_cols = ['YearOCR', 'YMinOCR', 'YMaxOCR', 'YAxisMethod']
        final_cols_order = [col for col in id_cols + value_cols + ocr_cols if col in analyzed_df.columns]
        
        final_df_to_save = analyzed_df[final_cols_order]
        final_df_to_save.to_csv(output_csv_file, index=False)
        print(f"\n✅ SUKSES! {len(final_df_to_save)} baris data disimpan ke {output_csv_file}")
    else:
        print("\n⚠️ PROSES SELESAI: Tidak ada data yang berhasil diekstrak.")

if __name__ == '__main__':
    main()

Memulai proses dari folder: 'D:\Dokumen\Proyek Data\Data Ekonomi\debt_gdp_ratio'
❌ KESALAHAN: Folder tidak ditemukan di 'D:\Dokumen\Proyek Data\Data Ekonomi\debt_gdp_ratio'. Ganti 'folder_path' dengan path yang benar.


In [25]:
# ==============================================================================
# Script Ekstraksi Data Grafik v2.0
# Deskripsi:
#   Script ini mengekstrak data deret waktu dari serangkaian gambar grafik.
#   Menggunakan OCR untuk mendeteksi tahun dan rentang sumbu Y secara otomatis,
#   serta deteksi warna untuk melacak garis data pada plot.
#
# Kebutuhan (Dependencies):
#   - opencv-python
#   - numpy
#   - pandas
#   - pytesseract
#   - matplotlib
# ==============================================================================

import cv2
import numpy as np
import pandas as pd
import re
import os
import matplotlib.pyplot as plt
import pytesseract
from typing import List, Dict, Tuple, Optional, Any
from datetime import datetime, timedelta

# ==============================================================================
# 1. KONFIGURASI TERPUSAT (Ubah semua pengaturan hanya di sini)
# ==============================================================================
CONFIG = {
    # Path ke Tesseract-OCR (uncomment jika perlu untuk Windows)
    # "tesseract_cmd": r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    
    # ❗ WAJIB: Ganti dengan path folder data Anda yang sebenarnya
    "folder_path": r"C:\Users\LENOVO\Downloads\statexplore\data\c10\consumption_pct_gdp",
    
    # Nama file output yang akan disimpan
    "output_csv_file": "consumption_pct_gdp_final_structured.csv",
    
    # Rentang tahun yang akan diproses (kosongkan list untuk proses semua file)
    "years_to_process": [],  # Contoh: [2000, 2001, 2002]
    
    # Pengaturan fallback dan clamping dari referensi Anda
    "fallback_y_min": 52.0,
    "fallback_y_max": 72.0,
    "clamp_y_min": 58.0,
    "clamp_y_max": 71.0,
    
    # Konfigurasi variabel dan warna (HSV)
    "var_name": "Value", # Nama kolom di CSV diubah menjadi 'Value'
    "lower_color": np.array([90, 80, 50]),
    "upper_color": np.array([150, 255, 255]),
}

# ==============================================================================
# 2. FUNGSI-FUNGSI BANTU (Utility Functions)
# ==============================================================================

def is_leap(year: int) -> bool:
    """Mengecek apakah suatu tahun adalah tahun kabisat."""
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

def validate_config(config: Dict[str, Any]) -> bool:
    """
    Memvalidasi konfigurasi penting sebelum skrip berjalan.
    
    Args:
        config (Dict): Dictionary konfigurasi.
        
    Returns:
        bool: True jika valid, False jika tidak.
    """
    # Uncomment jika Tesseract wajib ada
    # tesseract_path = config.get("tesseract_cmd")
    # if tesseract_path and not os.path.exists(tesseract_path):
    #     print(f"❌ KESALAHAN: Path Tesseract tidak ditemukan di '{tesseract_path}'")
    #     return False

    folder_path = config.get("folder_path")
    if not folder_path or not os.path.isdir(folder_path):
        print(f"❌ KESALAHAN: Folder data tidak ditemukan di '{folder_path}'. Harap periksa CONFIG.")
        return False
        
    return True

# ==============================================================================
# 3. FUNGSI EKSTRAKSI INTI (Core Extraction Functions)
# ==============================================================================

def extract_text_from_image(image_crop: Dict[str, np.ndarray]) -> Tuple[Optional[int], Optional[float], Optional[float]]:
    """Mengekstrak tahun dan rentang sumbu Y dari potongan gambar."""
    year_ocr = None
    try:
        title_text = pytesseract.image_to_string(image_crop['title'], config="--psm 6")
        match = re.search(r'\b(20\d{2})\b', title_text)
        if match:
            year_ocr = int(match.group(1))
    except Exception:
        pass
    # Fungsi ini dapat diperluas untuk OCR sumbu Y jika diperlukan di masa depan
    return year_ocr, None, None

def extract_series_from_mask(mask: np.ndarray, params: Dict[str, Any]) -> pd.DataFrame:
    """Mengekstrak data deret waktu, mengadopsi logika dari kode referensi."""
    extracted_data = []
    plot_h, plot_w = mask.shape
    days_in_year = 366 if is_leap(params['year']) else 365
    start_date = datetime(params['year'], 1, 1)
    
    for x_pixel in range(plot_w):
        y_pixels = np.where(mask[:, x_pixel] > 0)[0]
        if len(y_pixels) > 0:
            y_pixel_center = int(np.median(y_pixels))
            value = params['y_max'] - ((y_pixel_center / plot_h) * (params['y_max'] - params['y_min']))
            value = max(min(value, params['clamp_y_max']), params['clamp_y_min'])
            
            day_of_year = int((x_pixel / plot_w) * days_in_year)
            date = start_date + timedelta(days=day_of_year)
            
            extracted_data.append({
                "Date": date.strftime("%Y-%m-%d"),
                "Year": params['year'], 
                "DayOfYear": day_of_year + 1, 
                params['var_name']: round(value, 2)
            })
            
    return pd.DataFrame(extracted_data)

# ==============================================================================
# 4. FUNGSI UTAMA (Runner)
# ==============================================================================

def main(config: Dict[str, Any]):
    """
    Fungsi utama untuk menjalankan seluruh alur kerja:
    1. Validasi konfigurasi.
    2. Cari dan filter file gambar.
    3. Proses setiap gambar untuk ekstraksi data.
    4. Gabungkan dan simpan hasil ke file CSV.
    """
    if not validate_config(config):
        return

    all_files = sorted([f for f in os.listdir(config['folder_path']) if f.endswith(".png")])
    
    if config['years_to_process']:
        years_set = set(config['years_to_process'])
        files_to_process = [fname for fname in all_files if re.search(r'(\d{4})', fname) and int(re.search(r'(\d{4})', fname).group(1)) in years_set]
    else:
        files_to_process = all_files
    
    if not files_to_process:
        print("⚠️ Peringatan: Tidak ada file .png yang cocok ditemukan.")
        return

    print(f"🚀 Ditemukan {len(files_to_process)} file. Memulai proses...")
    
    all_results = []
    success_count = 0
    failure_count = 0

    for filename in files_to_process:
        try:
            full_path = os.path.join(config['folder_path'], filename)
            img = cv2.imread(full_path)
            if img is None:
                print(f"   -> [Gagal] Tidak bisa membaca {filename}")
                failure_count += 1
                continue

            print(f"\n[INFO] Memproses {filename}...")
            h, w, _ = img.shape
            
            title_crop = img[0:int(h*0.1), 0:w]
            year_ocr, _, _ = extract_text_from_image({'title': title_crop})
            year_from_file = int(re.search(r'(\d{4})', filename).group(1)) if re.search(r'(\d{4})', filename) else 2000
            year = year_ocr if year_ocr else year_from_file

            plot_area = img[int(h*0.12):int(h*0.9), int(w*0.05):int(w*0.98)]
            hsv = cv2.cvtColor(plot_area, cv2.COLOR_BGR2HSV)
            mask = cv2.inRange(hsv, config['lower_color'], config['upper_color'])
            
            series_params = {
                'year': year, 
                'y_min': config['fallback_y_min'], 'y_max': config['fallback_y_max'],
                'clamp_y_min': config['clamp_y_min'], 'clamp_y_max': config['clamp_y_max'],
                'var_name': config['var_name']
            }

            df_single_year = extract_series_from_mask(mask, series_params)
            
            if not df_single_year.empty:
                all_results.append(df_single_year)
                print(f"  -> [Sukses] {len(df_single_year)} titik data diekstrak.")
                success_count += 1
            else:
                print(f"  -> [Gagal] Tidak ada titik data ditemukan.")
                failure_count += 1

        except Exception as e:
            print(f"  -> [ERROR] Terjadi kesalahan tak terduga pada {filename}: {e}")
            failure_count += 1
            
    # --- Proses Akhir & Laporan ---
    print("\n" + "="*50)
    print("PROSES EKSTRAKSI SELESAI")
    print(f"  -> Berhasil diproses: {success_count} file")
    print(f"  -> Gagal diproses   : {failure_count} file")
    print("="*50)

    if not all_results:
        print("\n⚠️ Tidak ada data yang berhasil diekstrak untuk disimpan.")
        return

    final_df = pd.concat(all_results, ignore_index=True).drop_duplicates(subset=['Date'], keep='first').sort_values(by='Date')
    final_df.to_csv(config['output_csv_file'], index=False)
    print(f"\n✅ SUKSES! {len(final_df)} baris data unik disimpan ke {config['output_csv_file']}")

# ==============================================================================
# 5. TITIK MASUK PROGRAM
# ==============================================================================

if __name__ == '__main__':
    # Aktifkan path Tesseract jika disetel di CONFIG (hanya untuk Windows)
    tesseract_cmd = CONFIG.get("tesseract_cmd")
    if tesseract_cmd and os.path.exists(tesseract_cmd):
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        
    main(CONFIG)

🚀 Ditemukan 25 file. Memulai proses...

[INFO] Memproses consumption_pct_gdp tahun 2000.png...
  -> [Sukses] 3389 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2001.png...
  -> [Sukses] 3388 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2002.png...
  -> [Sukses] 3388 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2003.png...
  -> [Sukses] 3389 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2004.png...
  -> [Sukses] 3389 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2005.png...
  -> [Sukses] 3388 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2006.png...
  -> [Sukses] 3389 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2007.png...
  -> [Sukses] 3389 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2008.png...
  -> [Sukses] 3389 titik data diekstrak.

[INFO] Memproses consumption_pct_gdp tahun 2009.png...
  -> [Sukses] 3388 titik data di

In [26]:
# ==============================================================================
# Script Ekstraksi Data Grafik v3.0 (Multi-Variabel)
# Deskripsi:
#   Script ini mengekstrak data dari gambar grafik yang berisi satu atau lebih
#   garis data berwarna.
#
# Kebutuhan (Dependencies):
#   - opencv-python, numpy, pandas, pytesseract, matplotlib
# ==============================================================================

import cv2
import numpy as np
import pandas as pd
import re
import os
import matplotlib.pyplot as plt
import pytesseract
from typing import List, Dict, Tuple, Optional, Any
from datetime import datetime, timedelta

# (Konfigurasi CONFIG dari atas ditempatkan di sini)
# ...

# Aktifkan path Tesseract jika disetel di CONFIG
tesseract_cmd = CONFIG.get("tesseract_cmd")
if tesseract_cmd and os.path.exists(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

# ==============================================================================
# 2. FUNGSI-FUNGSI BANTU (Utility Functions)
# ==============================================================================

def is_leap(year: int) -> bool:
    """Mengecek apakah suatu tahun adalah tahun kabisat."""
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

def clean_mask(mask: np.ndarray) -> np.ndarray:
    """Menggunakan morphological opening untuk menghapus noise dari mask."""
    kernel = np.ones((3, 3), np.uint8)
    return cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)

def preprocess_for_ocr(image: np.ndarray) -> np.ndarray:
    """Membersihkan gambar agar lebih mudah dibaca oleh OCR."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 4)
    return cv2.resize(thresh, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

# ==============================================================================
# 3. FUNGSI EKSTRAKSI INTI (Core Extraction Functions)
# ==============================================================================

def extract_year_from_title(title_crop: np.ndarray) -> Optional[int]:
    """Mengekstrak tahun dari potongan gambar judul."""
    try:
        title_text = pytesseract.image_to_string(title_crop, config="--psm 6")
        match = re.search(r'\b(20\d{2})\b', title_text)
        if match: return int(match.group(1))
    except Exception: pass
    return None

def extract_series_from_mask(mask: np.ndarray, params: Dict[str, Any]) -> pd.DataFrame:
    """Mengekstrak data deret waktu dari mask warna yang sudah bersih."""
    extracted_data = []
    plot_h, plot_w = mask.shape
    days_in_year = 366 if is_leap(params['year']) else 365
    start_date = datetime(params['year'], 1, 1)
    
    for x_pixel in range(plot_w):
        y_pixels = np.where(mask[:, x_pixel] > 0)[0]
        if len(y_pixels) > 0:
            y_pixel_center = int(np.median(y_pixels))
            value = params['y_max'] - ((y_pixel_center / plot_h) * (params['y_max'] - params['y_min']))
            value = max(min(value, params['clamp_y_max']), params['clamp_y_min'])
            
            day_of_year = int((x_pixel / plot_w) * days_in_year)
            date = start_date + timedelta(days=day_of_year)
            
            extracted_data.append({
                "Date": date.strftime("%Y-%m-%d"),
                "Year": params['year'], 
                "DayOfYear": day_of_year + 1, 
                params['var_name']: round(value, 4) # Tingkatkan presisi untuk indeks
            })
            
    return pd.DataFrame(extracted_data)

def process_single_image(image_path: str, config: Dict[str, Any]) -> Optional[pd.DataFrame]:
    """Memproses satu file gambar dan mengembalikan hasilnya sebagai DataFrame."""
    print(f"\n[INFO] Memproses {os.path.basename(image_path)}...")
    img = cv2.imread(image_path)
    if img is None: return None

    h, w, _ = img.shape
    title_crop = img[0:int(h*0.1), 0:w]
    plot_area = img[int(h*0.12):int(h*0.9), int(w*0.05):int(w*0.98)]
    
    year_ocr = extract_year_from_title(title_crop)
    year_from_file = int(re.search(r'(\d{4})', image_path).group(1))
    year = year_ocr if year_ocr else year_from_file
    y_min, y_max = config['fallback_y_min'], config['fallback_y_max']

    hsv = cv2.cvtColor(plot_area, cv2.COLOR_BGR2HSV)
    
    # ✨ DIUBAH: Loop untuk setiap variabel yang didefinisikan di CONFIG
    all_vars_df = []
    for var_info in config['variables']:
        var_name = var_info['name']
        print(f"  -> Mengekstrak '{var_name}'...")
        
        mask = cv2.inRange(hsv, var_info['lower_color'], var_info['upper_color'])
        cleaned_mask = clean_mask(mask)
        
        series_params = {
            'year': year, 'y_min': y_min, 'y_max': y_max,
            'clamp_y_min': config['clamp_y_min'], 'clamp_y_max': config['clamp_y_max'],
            'var_name': var_name
        }
        
        df_var = extract_series_from_mask(cleaned_mask, series_params)
        if not df_var.empty:
            all_vars_df.append(df_var)

    # ✨ DIUBAH: Gabungkan hasil dari semua variabel menjadi satu DataFrame
    if not all_vars_df:
        return None
        
    # Mulai dengan DataFrame pertama
    final_df = all_vars_df[0]
    # Gabungkan dengan sisa DataFrame (jika ada)
    for i in range(1, len(all_vars_df)):
        final_df = pd.merge(final_df, all_vars_df[i], on=["Date", "Year", "DayOfYear"], how="outer")
        
    return final_df.sort_values(by="Date")

# ==============================================================================
# 4. FUNGSI PASCA-PROSES & VISUALISASI
# ==============================================================================

def post_process_data(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
    """Membersihkan, mengubah, dan menambah fitur pada data gabungan."""
    print("\n[ANALISIS] Memulai pasca-pemrosesan data...")
    proc_df = df.copy()
    proc_df['Date'] = pd.to_datetime(proc_df['Date'])
    proc_df = proc_df.sort_values(by='Date').set_index('Date')
    proc_df.interpolate(method='time', inplace=True)
    proc_df.dropna(inplace=True)
    
    # ✨ DIUBAH: Loop untuk setiap variabel
    for var_info in config['variables']:
        var_name = var_info['name']
        if var_name in proc_df.columns:
            proc_df[f'{var_name}_MA30'] = proc_df[var_name].rolling(window=30).mean()
    
    return proc_df

def visualize_results(df: pd.DataFrame, config: Dict[str, Any]):
    """Membuat plot visual dari data akhir."""
    title = f"Analisis Indeks ({config['start_year']}-{config['end_year']})"
    
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(15, 7))
    
    # ✨ DIUBAH: Loop untuk setiap variabel
    for var_info in config['variables']:
        var_name = var_info['name']
        if var_name in df.columns:
            ax.plot(df.index, df[var_name], label=f'Data Asli - {var_name}', alpha=0.7)
            ax.plot(df.index, df[f'{var_name}_MA30'], label=f'MA30 - {var_name}', linestyle='--')

    ax.set_title(title, fontsize=16)
    ax.legend()
    plt.show()

# ==============================================================================
# 5. FUNGSI UTAMA (Runner)
# ==============================================================================

def main(config: Dict[str, Any]):
    """Fungsi utama untuk menjalankan seluruh alur kerja."""
    if not os.path.isdir(config['folder_path']):
        print(f"❌ KESALAHAN: Folder tidak ditemukan di '{config['folder_path']}'.")
        return

    all_files = sorted([f for f in os.listdir(config['folder_path']) if f.endswith(".png")])
    
    # ... (Logika filter file tidak berubah)
    # ...

    results_list = [process_single_image(os.path.join(config['folder_path'], f), config) for f in all_files]
    
    valid_results = [df for df in results_list if df is not None and not df.empty]
    if not valid_results:
        print("\n⚠️ PROSES SELESAI: Tidak ada data yang berhasil diekstrak.")
        return

    raw_df = pd.concat(valid_results, ignore_index=True)
    analyzed_df = post_process_data(raw_df, config)
    visualize_results(analyzed_df, config)
    
    analyzed_df.reset_index(inplace=True)
    analyzed_df.to_csv(config['output_csv_file'], index=False)
    print(f"\n✅ SUKSES! {len(analyzed_df)} baris data disimpan ke {config['output_csv_file']}")

# ==============================================================================
# 6. TITIK MASUK PROGRAM
# ==============================================================================

if __name__ == '__main__':
    main(CONFIG)


[INFO] Memproses consumption_pct_gdp tahun 2000.png...


KeyError: 'variables'

In [27]:
import cv2
import numpy as np
import pandas as pd
import re
import os
import matplotlib.pyplot as plt
import pytesseract
from typing import List, Dict, Tuple, Optional, Any
from datetime import datetime, timedelta

# (Letakkan salah satu blok CONFIG dari atas di sini, sesuai data Anda)
# Contoh untuk education_index and healthcare_index
CONFIG = {
    "folder_path": r"C:\path\to\your\data\education_index_and_healthcare_index",
    "output_csv_file": "education_healthcare_extracted.csv",
    "years_to_process": [],
    "fallback_y_min": 0.55,
    "fallback_y_max": 0.72,
    "clamp_y_min": 0.54,
    "clamp_y_max": 0.71,
    "variables": [
        { "name": "education_index", "lower_color": np.array([90, 80, 50]), "upper_color": np.array([130, 255, 255]) },
        { "name": "healthcare_index", "lower_color": np.array([10, 80, 80]), "upper_color": np.array([25, 255, 255]) }
    ],
    "debug_mode": False
}

# Aktifkan path Tesseract jika disetel di CONFIG
tesseract_cmd = CONFIG.get("tesseract_cmd")
if tesseract_cmd and os.path.exists(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

# ... (Fungsi-fungsi bantu seperti is_leap, clean_mask, dll tidak berubah) ...
def is_leap(year: int) -> bool: return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
def clean_mask(mask: np.ndarray) -> np.ndarray:
    kernel = np.ones((3, 3), np.uint8)
    return cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=1)
def preprocess_for_ocr(image: np.ndarray) -> np.ndarray:
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 4)
    return cv2.resize(thresh, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
def extract_year_from_title(title_crop: np.ndarray) -> Optional[int]:
    try:
        title_text = pytesseract.image_to_string(title_crop, config="--psm 6")
        match = re.search(r'\b(20\d{2})\b', title_text)
        if match: return int(match.group(1))
    except Exception: pass
    return None
def extract_series_from_mask(mask: np.ndarray, params: Dict[str, Any]) -> pd.DataFrame:
    extracted_data = []
    plot_h, plot_w = mask.shape
    days_in_year = 366 if is_leap(params['year']) else 365
    start_date = datetime(params['year'], 1, 1)
    for x_pixel in range(plot_w):
        y_pixels = np.where(mask[:, x_pixel] > 0)[0]
        if len(y_pixels) > 0:
            y_pixel_center = int(np.median(y_pixels))
            value = params['y_max'] - ((y_pixel_center / plot_h) * (params['y_max'] - params['y_min']))
            value = max(min(value, params['clamp_y_max']), params['clamp_y_min'])
            day_of_year = int((x_pixel / plot_w) * days_in_year)
            date = start_date + timedelta(days=day_of_year)
            extracted_data.append({
                "Date": date.strftime("%Y-%m-%d"), "Year": params['year'], 
                "DayOfYear": day_of_year + 1, params['var_name']: round(value, 4)
            })
    return pd.DataFrame(extracted_data)


def process_single_image(image_path: str, config: Dict[str, Any]) -> Optional[pd.DataFrame]:
    """Memproses satu file gambar dan mengembalikan hasilnya sebagai DataFrame."""
    print(f"\n[INFO] Memproses {os.path.basename(image_path)}...")
    img = cv2.imread(image_path)
    if img is None: return None

    h, w, _ = img.shape
    title_crop = img[0:int(h*0.1), 0:w]
    plot_area = img[int(h*0.12):int(h*0.9), int(w*0.05):int(w*0.98)]
    
    year_ocr = extract_year_from_title(title_crop)
    year_from_file = int(re.search(r'(\d{4})', image_path).group(1))
    year = year_ocr if year_ocr else year_from_file
    y_min, y_max = config['fallback_y_min'], config['fallback_y_max']

    hsv = cv2.cvtColor(plot_area, cv2.COLOR_BGR2HSV)
    
    # ✨ BARU: Logika fleksibel untuk menangani satu atau banyak variabel
    if 'variables' in config:
        variables_to_process = config['variables']
    else:
        variables_to_process = [{
            "name": config['var_name'],
            "lower_color": config['lower_color'],
            "upper_color": config['upper_color']
        }]

    all_vars_df = []
    for var_info in variables_to_process:
        var_name = var_info['name']
        print(f"  -> Mengekstrak '{var_name}'...")
        mask = cv2.inRange(hsv, var_info['lower_color'], var_info['upper_color'])
        cleaned_mask = clean_mask(mask)
        series_params = {
            'year': year, 'y_min': y_min, 'y_max': y_max,
            'clamp_y_min': config['clamp_y_min'], 'clamp_y_max': config['clamp_y_max'],
            'var_name': var_name
        }
        df_var = extract_series_from_mask(cleaned_mask, series_params)
        if not df_var.empty:
            all_vars_df.append(df_var)

    if not all_vars_df: return None
    
    final_df = all_vars_df[0]
    for i in range(1, len(all_vars_df)):
        final_df = pd.merge(final_df, all_vars_df[i], on=["Date", "Year", "DayOfYear"], how="outer")
        
    return final_df.sort_values(by="Date")

def post_process_data(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
    """Membersihkan, mengubah, dan menambah fitur pada data gabungan."""
    print("\n[ANALISIS] Memulai pasca-pemrosesan data...")
    proc_df = df.copy()
    proc_df['Date'] = pd.to_datetime(proc_df['Date'])
    proc_df = proc_df.sort_values(by='Date').set_index('Date')
    proc_df.interpolate(method='time', inplace=True)
    proc_df.dropna(inplace=True)
    
    # ✨ BARU: Logika fleksibel untuk menangani satu atau banyak variabel
    if 'variables' in config:
        vars_to_analyze = [v['name'] for v in config['variables']]
    else:
        vars_to_analyze = [config['var_name']]

    for var_name in vars_to_analyze:
        if var_name in proc_df.columns:
            proc_df[f'{var_name}_MA30'] = proc_df[var_name].rolling(window=30).mean()
    
    return proc_df

def visualize_results(df: pd.DataFrame, config: Dict[str, Any]):
    """Membuat plot visual dari data akhir."""
    title = f"Analisis Data ({config.get('start_year', 'Semua Tahun')}-{config.get('end_year', '')})"
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(15, 7))
    
    # ✨ BARU: Logika fleksibel untuk menangani satu atau banyak variabel
    if 'variables' in config:
        vars_to_plot = [v['name'] for v in config['variables']]
    else:
        vars_to_plot = [config['var_name']]

    for var_name in vars_to_plot:
        if var_name in df.columns:
            ax.plot(df.index, df[var_name], label=f'Data Asli - {var_name}', alpha=0.7)
            ax.plot(df.index, df[f'{var_name}_MA30'], label=f'MA30 - {var_name}', linestyle='--')

    ax.set_title(title, fontsize=16)
    ax.legend()
    plt.show()

def main(config: Dict[str, Any]):
    """Fungsi utama untuk menjalankan seluruh alur kerja."""
    if not os.path.isdir(config['folder_path']):
        print(f"❌ KESALAHAN: Folder tidak ditemukan di '{config['folder_path']}'.")
        return

    # ... (Logika runner tidak banyak berubah) ...
    all_files = sorted([f for f in os.listdir(config['folder_path']) if f.endswith(".png")])
    results_list = [process_single_image(os.path.join(config['folder_path'], f), config) for f in all_files]
    valid_results = [df for df in results_list if df is not None and not df.empty]
    if not valid_results:
        print("\n⚠️ PROSES SELESAI: Tidak ada data yang berhasil diekstrak.")
        return

    raw_df = pd.concat(valid_results, ignore_index=True)
    analyzed_df = post_process_data(raw_df, config)
    visualize_results(analyzed_df, config)
    analyzed_df.reset_index(inplace=True)
    analyzed_df.to_csv(config['output_csv_file'], index=False)
    print(f"\n✅ SUKSES! {len(analyzed_df)} baris data disimpan ke {config['output_csv_file']}")

if __name__ == '__main__':
    main(CONFIG)

❌ KESALAHAN: Folder tidak ditemukan di 'C:\path\to\your\data\education_index_and_healthcare_index'.
