# Google Geocoding - Συγκριτική Ανάλυση
## Σύγκριση μεθόδων geocoding

Δοκιμή 3 μεθόδων:
- **Run 1**: μόνο διεύθυνση
- **Run 2**: διεύθυνση + νομός  
- **Run 3**: διεύθυνση + δήμος + νομός
- **Best**: καλύτερη από τις 3 (με μικρότερη απόσταση)

### 1. Διαμόρφωση

In [1]:
import pandas as pd
import googlemaps
import numpy as np
from   math import radians, cos, sin, asin, sqrt
import os
from   dotenv import load_dotenv
import time
import matplotlib.pyplot as plt
import seaborn as sns
import json

load_dotenv()
GOOGLE_API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')

if not GOOGLE_API_KEY:
    print("LATHOS: Den vrethike to API Key!")
else:
    print("To API Key fortothike!")

gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

INPUT_FILE = "/Users/geo/Desktop/fuelstation-detection-thesis/data/ALL_cleaned.xlsx"
OUTPUT_FILE = "/Users/geo/Desktop/fuelstation-detection-thesis/data/geocoding_comparison.xlsx"
CACHE_FILE = "/Users/geo/Desktop/fuelstation-detection-thesis/data/geocoding_cache.json"

# Accuracy scoring - vathmologia akribeias
ACCURACY_SCORES = {
    'ROOFTOP': 5,
    'RANGE_INTERPOLATED': 3,
    'GEOMETRIC_CENTER': 2,
    'APPROXIMATE': 1,
    None: 0,
    'FAILED': 0
}

To API Key fortothike!


### 2. Φόρτωση Δεδομένων

In [2]:
df = pd.read_excel(INPUT_FILE, sheet_name='Cleaned')
print(f"Fortothikan {len(df)} stathmoi")
df.head()

Fortothikan 1026 stathmoi


Unnamed: 0,gasStationID,gasStationAddress,gasStationLat,gasStationLong,locationType,ddName,municipalityName,countyName,cluster
0,11078,6o χιλ. ΕΟ Αγρινιου Ιωαννινων,38.670417,21.346708,ROOFTOP,Δ.Δ.Νεάπολης,ΔΗΜΟΣ ΝΕΑΠΟΛΗΣ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ,0
1,28,"2,5 ΧΛΜ Ε.Ο. ΙΩΑΝΝΙΝΩΝ ΑΝΤΙΡΡΙΟΥ",38.337761,21.745611,ROOFTOP,Δ.Δ.Αντιρρίου,ΔΗΜΟΣ ΑΝΤΙΡΡΙΟΥ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ,1
2,31,4ο ΧΛΜ ΕΟ ΑΓΡΙΝΙΟΥ ΘΕΡΜΟΥ,38.608541,21.449021,ROOFTOP,Δ.Δ.Νέας Αβόρανης,ΔΗΜΟΣ ΘΕΣΤΙΕΩΝ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ,2
3,36,"5ο ΧΛΜ. ΝΑΥΠΑΚΤΟΥ-ΘΕΡΜΟΥ, ΝΑΥΠΑΚΤΟΣ",38.403461,21.785474,ROOFTOP,Δ.Δ.Αφροξυλιάς,ΔΗΜΟΣ ΝΑΥΠΑΚΤΟΥ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ,3
4,10861,6 χλμ Ε.Ο. ΑΓΡΙΝΙΟΥ - ΙΩΑΝΝΙΝΩΝ,38.670355,21.360866,ROOFTOP,Δ.Δ.Αγρινίου,ΔΗΜΟΣ ΑΓΡΙΝΙΟΥ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ,4


### 3. Βοηθητικές Συναρτήσεις

In [3]:
def haversine_distance(lat1, lon1, lat2, lon2):
    # Ypologizei apostasi se metra.
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return None
    
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    
    return c * 6371000


# Cache functions - synartiseis cache
def load_cache():
    """Load cache from JSON file - fortosi cache apo JSON arxeio"""
    try:
        if os.path.exists(CACHE_FILE):
            with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                cache = json.load(f)
            print(f"Cache fortothike: {len(cache)} cached addresses")
            return cache
        else:
            print("Den vrethike cache arxeio - tha dimiourgithei neo")
            return {}
    except Exception as e:
        print(f"Sfalma kata ti fortosi cache: {e}")
        return {}

def save_cache(cache):
    """Save cache to JSON file - apothikefsi cache se JSON arxeio"""
    try:
        # Create directory if it doesn't exist - dimiourgei ton fakelo an den yparchi
        os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True)
        
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
        print(f"Cache apothikeythike: {len(cache)} addresses sto {CACHE_FILE}")
    except Exception as e:
        print(f"Sfalma kata tin apothikefsi cache: {e}")

# Load existing cache - fortosi yparxontos cache
geocoding_cache = load_cache()

def geocode_address(address):
    # Geocode mia dieuthinsi me persistent caching.
    
    # Check cache first - elegxos cache proto
    if address in geocoding_cache:
        print(f"    [CACHE HIT] Using cached result for: {address[:50]}...")
        return geocoding_cache[address]
    
    try:
        result = gmaps.geocode(address, region='gr')
        
        if not result:
            geocoding_cache[address] = None  # Cache negative results too
            return None
        
        location = result[0]['geometry']['location']
        accuracy = result[0]['geometry']['location_type']
        
        geocoded_result = {
            'lat': location['lat'],
            'lng': location['lng'],
            'accuracy': accuracy
        }
        
        # Store in cache - apothikefsi sto cache
        geocoding_cache[address] = geocoded_result
        print(f"    [API CALL] New geocoding for: {address[:50]}...")
        
        return geocoded_result
        
    except Exception as e:
        print(f"Sfalma: {e}")
        geocoding_cache[address] = None  # Cache error results too
        return None


def compare_methods(row):
    # Dokimazi 3 methods kai krataei to best.
    address = row['gasStationAddress']
    county = row.get('countyName', '')
    municipality = row.get('municipalityName', '')
    
    ground_truth_lat = row['gasStationLat']
    ground_truth_lng = row['gasStationLong']
    
    results = {}
    
    # Run 1: Address only
    print(f"  Run 1: mono diefthinsi")
    result1 = geocode_address(address)
    if result1:
        distance1 = haversine_distance(
            ground_truth_lat, ground_truth_lng,
            result1['lat'], result1['lng']
        )
        score1 = ACCURACY_SCORES.get(result1['accuracy'], 0)
        results['run1'] = {
            **result1,
            'distance': distance1,
            'score': score1
        }
        print(f"    → {result1['accuracy']} (vathmologia:{score1}), apostasi: {distance1:.1f}m")
    else:
        results['run1'] = None
        print(f"    → APOTYCHIA")
    
    time.sleep(0.1)
    
    # Run 2: Address + County
    query2 = f"{address}, {county}"
    print(f"  Run 2: diefthinsi + nomos")
    result2 = geocode_address(query2)
    if result2:
        distance2 = haversine_distance(
            ground_truth_lat, ground_truth_lng,
            result2['lat'], result2['lng']
        )
        score2 = ACCURACY_SCORES.get(result2['accuracy'], 0)
        results['run2'] = {
            **result2,
            'distance': distance2,
            'score': score2
        }
        print(f"    → {result2['accuracy']} (vathmologia:{score2}), apostasi: {distance2:.1f}m")
    else:
        results['run2'] = None
        print(f"    → APOTYCHIA")
    
    time.sleep(0.1)
    
    # Run 3: Address + Municipality + County
    query3 = f"{address}, {municipality}, {county}"
    print(f"  Run 3: diefthinsi + dimos + nomos")
    result3 = geocode_address(query3)
    if result3:
        distance3 = haversine_distance(
            ground_truth_lat, ground_truth_lng,
            result3['lat'], result3['lng']
        )
        score3 = ACCURACY_SCORES.get(result3['accuracy'], 0)
        results['run3'] = {
            **result3,
            'distance': distance3,
            'score': score3
        }
        print(f"    → {result3['accuracy']} (vathmologia:{score3}), apostasi: {distance3:.1f}m")
    else:
        results['run3'] = None
        print(f"    → APOTYCHIA")
    
    # Best method (mikroteri distance)
    valid_methods = {k: v for k, v in results.items() if v is not None}
    
    if valid_methods:
        # Krataei to method me mikroteri distance
        best_method = min(valid_methods.keys(), key=lambda k: valid_methods[k]['distance'])
        results['best'] = {
            **valid_methods[best_method],
            'best_method': best_method
        }
        print(f"  → KALYTERI: {best_method} ({results['best']['accuracy']}, {results['best']['distance']:.1f}m)")
    else:
        results['best'] = None
    
    return results


# Function to show cache statistics - synartisi gia statistika cache
def show_cache_stats():
    print(f"\nCache Statistics:")
    print(f"- Cached addresses: {len(geocoding_cache)}")
    successful_cache = sum(1 for v in geocoding_cache.values() if v is not None)
    print(f"- Successful geocodings: {successful_cache}")
    print(f"- Failed geocodings: {len(geocoding_cache) - successful_cache}")
    print(f"- Cache file: {CACHE_FILE}")
    
    if len(geocoding_cache) > 0:
        cache_hit_potential = len(geocoding_cache) - len(set(geocoding_cache.keys()))
        print(f"- Potential API calls saved: {cache_hit_potential}")


# Function to save cache at the end - synartisi apothikefsis cache sto telos
def save_cache_and_stats():
    save_cache(geocoding_cache)
    show_cache_stats()

Cache fortothike: 3064 cached addresses


### 4. Επιλογή Δεδομένων (Όλες οι Εγγραφές)

In [4]:
# Use all records - xrisimopoiei oles tis egrafes
df_sample = df.copy()

print(f"Tha ginei geocoding gia {len(df_sample)} stathmous (OLES OI EGRAFES)")
print(f"Ypologizomenos xronos: ~{len(df_sample) * 3 * 0.5:.0f} lepta")
print("\nProtos deigmata:")
display(df_sample[['gasStationID', 'gasStationAddress', 'countyName']].head())

Tha ginei geocoding gia 1026 stathmous (OLES OI EGRAFES)
Ypologizomenos xronos: ~1539 lepta

Protos deigmata:


Unnamed: 0,gasStationID,gasStationAddress,countyName
0,11078,6o χιλ. ΕΟ Αγρινιου Ιωαννινων,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ
1,28,"2,5 ΧΛΜ Ε.Ο. ΙΩΑΝΝΙΝΩΝ ΑΝΤΙΡΡΙΟΥ",ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ
2,31,4ο ΧΛΜ ΕΟ ΑΓΡΙΝΙΟΥ ΘΕΡΜΟΥ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ
3,36,"5ο ΧΛΜ. ΝΑΥΠΑΚΤΟΥ-ΘΕΡΜΟΥ, ΝΑΥΠΑΚΤΟΣ",ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ
4,10861,6 χλμ Ε.Ο. ΑΓΡΙΝΙΟΥ - ΙΩΑΝΝΙΝΩΝ,ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ


### 5. Εκτέλεση Σύγκρισης

In [5]:
print("\n")
print("SIGKRISI METHODON GEOCODING")
print("\n")

test_results = []

for idx, row in df_sample.iterrows():
    print(f"\nID Stathmou: {row['gasStationID']}")
    print(f"Pragmatikes syntetetagmenes: ({row['gasStationLat']:.6f}, {row['gasStationLong']:.6f})")
    
    results = compare_methods(row)
    
    test_results.append({
        'gasStationID': row['gasStationID'],
        'ground_truth_lat': row['gasStationLat'],
        'ground_truth_lng': row['gasStationLong'],
        **results
    })

print("\n")
print("I DOKIMI OLOKLIROTHIKE")
print("\n")

# Save cache and show statistics - apothikefsi cache kai emfanisi statistikon
save_cache_and_stats()



SIGKRISI METHODON GEOCODING



ID Stathmou: 11078
Pragmatikes syntetetagmenes: (38.670417, 21.346708)
  Run 1: mono diefthinsi
    [CACHE HIT] Using cached result for: 6o χιλ. ΕΟ Αγρινιου Ιωαννινων...
    → APPROXIMATE (vathmologia:1), apostasi: 118384.8m
  Run 2: diefthinsi + nomos
    [CACHE HIT] Using cached result for: 6o χιλ. ΕΟ Αγρινιου Ιωαννινων, ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ...
    → GEOMETRIC_CENTER (vathmologia:2), apostasi: 5896.9m
  Run 2: diefthinsi + nomos
    [CACHE HIT] Using cached result for: 6o χιλ. ΕΟ Αγρινιου Ιωαννινων, ΑΙΤΩΛΟΑΚΑΡΝΑΝΙΑΣ...
    → GEOMETRIC_CENTER (vathmologia:2), apostasi: 5896.9m
  Run 3: diefthinsi + dimos + nomos
    [CACHE HIT] Using cached result for: 6o χιλ. ΕΟ Αγρινιου Ιωαννινων, ΔΗΜΟΣ ΝΕΑΠΟΛΗΣ, ΑΙΤ...
    → APPROXIMATE (vathmologia:1), apostasi: 2170.3m
  → KALYTERI: run3 (APPROXIMATE, 2170.3m)

ID Stathmou: 28
Pragmatikes syntetetagmenes: (38.337761, 21.745611)
  Run 1: mono diefthinsi
    [CACHE HIT] Using cached result for: 2,5 ΧΛΜ Ε.Ο. ΙΩΑΝΝΙΝΩΝ ΑΝ

KeyboardInterrupt: 

### 6. Εξαγωγή σε Excel

In [None]:
# Prepare data for Excel export - etoimasia dedomenon gia Excel
excel_data = []

for r in test_results:
    station_id = r['gasStationID']
    gt_lat = r['ground_truth_lat']
    gt_lng = r['ground_truth_lng']
    
    # Get original data for additional fields - pairno ta arxika dedomena
    original_row = df_sample[df_sample['gasStationID'] == station_id].iloc[0]
    
    row_data = {
        'gasStationID': station_id,
        'gasStationAddress': original_row['gasStationAddress'],
        'gasStationLat': gt_lat,
        'gasStationLong': gt_lng,
        'ddName': original_row.get('ddName', ''),
        'municipalityName': original_row.get('municipalityName', ''),
        'countyName': original_row.get('countyName', ''),
    }
    
    # Only keep the best result (kalytero run mono)
    if r.get('best'):
        row_data.update({
            'best_lat': r['best']['lat'],
            'best_lng': r['best']['lng'],
            'best_accuracy': r['best']['accuracy'],
            'best_distance': r['best']['distance'],
            'best_score': r['best']['score'],
            'best_run': r['best'].get('best_method', '')  # poio run itan kalytero
        })
    
    excel_data.append(row_data)

excel_df = pd.DataFrame(excel_data)

# Save to Excel - apothikefsi se Excel
excel_df.to_excel(OUTPUT_FILE, index=False)

print(f"\nTa apotelesmata apothikeytikan sto: {OUTPUT_FILE}")
print(f"Periechei {len(excel_df)} grammes me ta arxika dedomena kai mono to kalytero run gia kathe stathmo (OLES OI EGRAFES)")

### 7. Τελική Σύσταση

In [None]:
print("\n")
print("TELIKI SYSTASI")
print("\n")

# Summary for full dataset - perilepsi gia olo to dataset
print(f"\nOloklirothike i sigkrisi gia {len(test_results)} stathmous (OLES OI EGRAFES)")
print("Ta apotelesmata apothikeytikan sto Excel arxeio")
print("\nTo Excel periechei:")
print("- Ta arxika dedomena kathe stathmou")
print("- Mono to kalytero run (me mikroteri apostasi)")
print("- Ti methodo chrisimopoithike (run1, run2, i run3)")
print("- Akriveia kai apostasi tou kalyterou run")
print(f"\nSynolikos xronos geocoding: {len(test_results) * 3} API calls")
print(f"Cache efficiency: Blepete ta Cache Statistics parakatw")
print("\n")