# In [ ]:  (notebook cell marker left over from the Jupyter export)
import pandas as pd
import requests
import time
from datetime import datetime
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import numpy as np

class WeatherEnricher:
    """
    Enrich an Uber rides dataset with historical weather data.

    Expected input columns: ``Date`` (e.g. '2024-03-23'), ``Time``
    (e.g. '12:29:38') and ``Pickup Location`` / ``Drop Location``
    (e.g. 'Uttam Nagar,New Delhi').

    Locations are geocoded with Nominatim and the weather is fetched from the
    Open-Meteo archive API. Both lookups are cached on the instance.
    """

    # Pause after every Nominatim request (usage policy: at most 1 request/s).
    GEOCODE_DELAY_SECONDS = 1.1
    # Light pause after every Open-Meteo request.
    WEATHER_DELAY_SECONDS = 0.05

    # Columns produced for every row, in output order.
    _WEATHER_COLUMNS = ('condition', 'temperature', 'precipitation',
                        'rain', 'weather_code')

    # WMO weather interpretation codes -> coarse category.
    _WMO_CATEGORIES = {
        0: "Clear",
        1: "Partly Cloudy", 2: "Partly Cloudy", 3: "Partly Cloudy",
        45: "Foggy", 48: "Foggy",
        51: "Light Rain", 53: "Moderate Rain", 55: "Heavy Rain",
        # Freezing drizzle (light / dense), folded into the rain categories.
        56: "Light Rain", 57: "Heavy Rain",
        61: "Light Rain", 63: "Moderate Rain", 65: "Heavy Rain",
        # Freezing rain (light / heavy), folded into the rain categories.
        66: "Light Rain", 67: "Heavy Rain",
        71: "Snow", 73: "Snow", 75: "Snow", 77: "Snow",
        80: "Rain Showers", 81: "Rain Showers", 82: "Heavy Rain Showers",
        85: "Snow", 86: "Snow",
        95: "Thunderstorm", 96: "Thunderstorm", 99: "Heavy Thunderstorm",
    }

    _RAINY_CONDITIONS = ('Light Rain', 'Moderate Rain', 'Heavy Rain',
                         'Rain Showers', 'Heavy Rain Showers',
                         'Thunderstorm', 'Heavy Thunderstorm')
    _EXTREME_CONDITIONS = ('Heavy Rain', 'Heavy Rain Showers', 'Thunderstorm',
                           'Heavy Thunderstorm', 'Snow')

    def __init__(self):
        self.base_url = "https://archive-api.open-meteo.com/v1/archive"
        self.geolocator = Nominatim(user_agent="uber_weather_ml_project_2024")
        # (lat, lon, date_str) -> the API's 'hourly' payload for that day.
        self.cache = {}
        # raw location string -> (lat, lon), or (None, None) when not found.
        self.location_cache = {}

    def categorize_weather(self, weather_code):
        """
        Map a WMO weather code to a coarse category name.

        Returns "Unknown" when the code is missing (None/NaN) and "Cloudy"
        for any code that is not in the mapping.
        """
        if weather_code is None or pd.isna(weather_code):
            return "Unknown"
        return self._WMO_CATEGORIES.get(weather_code, "Cloudy")

    def clean_location_name(self, location):
        """
        Normalise a raw location string to improve geocoding.

        Keeps at most the first two non-empty comma-separated parts and
        appends the country, e.g.
        'Uttam Nagar,New Delhi' -> 'Uttam Nagar, New Delhi, India'.

        Returns None when the location is missing or contains no text.
        """
        if pd.isna(location):
            return None

        # Drop empty fragments so "Saket," does not become "Saket, , India".
        parts = [p.strip() for p in str(location).split(',')]
        parts = [p for p in parts if p]
        if not parts:
            return None

        return ", ".join(parts[:2] + ["India"])

    def geocode_location(self, location):
        """
        Convert a place name to ``(lat, lon)`` coordinates.

        Successful lookups and "not found" answers are cached per raw
        location string. Timeouts and other errors are not cached, so a later
        call retries them. Returns ``(None, None)`` on any failure.
        """
        if pd.isna(location):
            return None, None

        if location in self.location_cache:
            return self.location_cache[location]

        cleaned_location = self.clean_location_name(location)
        if cleaned_location is None:
            # Nothing to send to the geocoder (blank string).
            self.location_cache[location] = (None, None)
            return None, None

        try:
            loc_result = self.geolocator.geocode(cleaned_location, timeout=10)
        except GeocoderTimedOut:
            print(f"‚è±Ô∏è Timeout pour: {cleaned_location}")
            return None, None
        except Exception as e:
            # Best effort: one bad location must not abort the whole batch.
            print(f"‚ùå Erreur g√©ocodage {cleaned_location}: {e}")
            return None, None
        finally:
            # Throttle after EVERY request, including failed ones.
            time.sleep(self.GEOCODE_DELAY_SECONDS)

        if loc_result:
            coords = (loc_result.latitude, loc_result.longitude)
        else:
            print(f"‚ö†Ô∏è G√©ocodage √©chou√© pour: {cleaned_location}")
            coords = (None, None)

        self.location_cache[location] = coords
        return coords

    def get_weather(self, lat, lon, date_str, hour):
        """
        Return the weather for one place, date and hour.

        Parameters
        ----------
        lat, lon : float
            Coordinates of the place.
        date_str : str
            Date formatted like '2024-03-23'.
        hour : int
            Index into the day's hourly series (0 = first hour).

        Returns
        -------
        dict or None
            Dict with keys condition, temperature, precipitation, rain and
            weather_code; None when the hour is invalid or the data could
            not be fetched.
        """
        try:
            hour_idx = int(hour)
        except (TypeError, ValueError):
            # None / NaN hour (e.g. unparseable timestamp).
            return None

        # One API response holds the whole day, so cache per day and reuse it
        # for every hour instead of issuing one request per hour.
        cache_key = (lat, lon, date_str)
        hourly = self.cache.get(cache_key)

        if hourly is None:
            params = {
                "latitude": lat,
                "longitude": lon,
                "start_date": date_str,
                "end_date": date_str,
                "hourly": ["temperature_2m", "precipitation", "weathercode", "rain"],
                "timezone": "auto"
            }
            try:
                response = requests.get(self.base_url, params=params, timeout=15)
                response.raise_for_status()
                hourly = response.json()['hourly']
            except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                print(f"‚ùå Erreur API m√©t√©o: {e}")
                return None

            self.cache[cache_key] = hourly
            time.sleep(self.WEATHER_DELAY_SECONDS)

        try:
            codes = hourly['weathercode']
            # Reject negative hours too: they would silently index from the end.
            if not 0 <= hour_idx < len(codes):
                return None

            code = codes[hour_idx]
            return {
                'condition': self.categorize_weather(code),
                'temperature': hourly['temperature_2m'][hour_idx],
                'precipitation': hourly['precipitation'][hour_idx],
                'rain': hourly['rain'][hour_idx],
                'weather_code': code
            }
        except (KeyError, IndexError, TypeError) as e:
            print(f"‚ùå Erreur API m√©t√©o: {e}")
            return None

    def enrich_dataset(self, df, sample_size=None, use_drop_location=False):
        """
        Enrich the dataset with weather data.

        Parameters
        ----------
        df : DataFrame
            Original Uber dataset; must contain 'Date', 'Time' and the chosen
            location column.
        sample_size : int, optional
            Number of rows to process. None (or 0) processes everything.
        use_drop_location : bool
            If True, use 'Drop Location' instead of 'Pickup Location'.

        Returns
        -------
        DataFrame
            A copy of the (sampled) input with a fresh 0..n-1 index and the
            added columns datetime, date_only, hour, latitude, longitude,
            the weather columns and the derived weather features. Rows that
            could not be geocoded or dated get 'Unknown' / NaN weather.
        """
        print("üöÄ D√©but de l'enrichissement m√©t√©o du dataset Uber")
        print(f"üìä Taille du dataset: {len(df)} lignes")

        # Work on a copy so the caller's frame is left untouched.
        df_work = df.copy()

        if sample_size:
            df_work = df_work.sample(n=min(sample_size, len(df_work)), random_state=42)
            print(f"üé≤ √âchantillon de {len(df_work)} lignes")

        # Combine Date and Time; unparseable rows become NaT and are skipped
        # during the weather lookup instead of triggering a useless request.
        print("\nüìÖ Parsing des dates et heures...")
        combined = df_work['Date'].astype(str) + ' ' + df_work['Time'].astype(str)
        df_work['datetime'] = pd.to_datetime(combined, errors='coerce')
        df_work['date_only'] = df_work['datetime'].dt.date.astype(str)
        df_work['hour'] = df_work['datetime'].dt.hour

        invalid_dates = int(df_work['datetime'].isna().sum())
        if invalid_dates:
            print(f"   {invalid_dates} lignes avec date/heure invalide (sans meteo)")

        location_col = 'Drop Location' if use_drop_location else 'Pickup Location'
        print(f"üìç Utilisation de: {location_col}")

        # Geocode each distinct location only once.
        print(f"\nüó∫Ô∏è G√©ocodage des localisations uniques...")
        unique_locations = df_work[location_col].dropna().unique()
        n_locations = len(unique_locations)
        print(f"   {len(unique_locations)} localisations uniques √† g√©ocoder")

        location_coords = {}
        for i, loc in enumerate(unique_locations, 1):
            location_coords[loc] = self.geocode_location(loc)

            if i % 5 == 0 or i == n_locations:
                print(f"   Progression: {i}/{len(unique_locations)} ({i/len(unique_locations)*100:.1f}%)")

        print("\nüéØ Application des coordonn√©es au dataset...")
        no_coords = (None, None)
        # astype(float) turns None into NaN so the columns are always numeric.
        df_work['latitude'] = df_work[location_col].map(
            lambda loc: location_coords.get(loc, no_coords)[0]).astype(float)
        df_work['longitude'] = df_work[location_col].map(
            lambda loc: location_coords.get(loc, no_coords)[1]).astype(float)

        total = len(df_work)
        geocoded = int(df_work['latitude'].notna().sum())
        geocoded_pct = geocoded / total * 100 if total else 0.0
        print(f"   ‚úÖ {geocoded}/{len(df_work)} lignes g√©ocod√©es ({geocoded_pct:.1f}%)")

        print("\nüå¶Ô∏è R√©cup√©ration des donn√©es m√©t√©o...")
        weather_results = []

        rows = zip(df_work['latitude'], df_work['longitude'],
                   df_work['date_only'], df_work['hour'], df_work['datetime'])
        for lat, lon, date_str, hour, stamp in rows:
            weather = None
            if pd.notna(lat) and pd.notna(lon) and pd.notna(stamp):
                weather = self.get_weather(lat, lon, date_str, hour)

            weather_results.append(weather if weather else self._empty_weather())

            if len(weather_results) % 50 == 0:
                print(f"   Trait√©: {len(weather_results)}/{total} ({len(weather_results)/total*100:.1f}%)")

        # Explicit columns keep the schema stable even with zero rows.
        weather_df = pd.DataFrame(weather_results, columns=list(self._WEATHER_COLUMNS))
        # All-missing columns would otherwise be object dtype holding None.
        for col in ('temperature', 'precipitation', 'rain'):
            weather_df[col] = pd.to_numeric(weather_df[col], errors='coerce').astype(float)
        weather_df['weather_code'] = pd.to_numeric(weather_df['weather_code'], errors='coerce')

        # Positional concat: both frames are in the same row order.
        result_df = pd.concat([df_work.reset_index(drop=True), weather_df], axis=1)

        print("\n" + "="*60)
        print("‚úÖ ENRICHISSEMENT TERMIN√â !")
        print("="*60)
        print(f"\nüìä Distribution des conditions m√©t√©o:")
        print(result_df['condition'].value_counts())
        print(f"\nüå°Ô∏è Temp√©rature moyenne: {result_df['temperature'].mean():.1f}¬∞C")
        print(f"üíß Pr√©cipitations moyennes: {result_df['precipitation'].mean():.2f}mm")

        result_df = self._add_weather_features(result_df)

        return result_df

    def _empty_weather(self):
        """Return the placeholder weather record used when no data is available."""
        return {
            'condition': 'Unknown',
            'temperature': None,
            'precipitation': None,
            'rain': None,
            'weather_code': None
        }

    def _add_weather_features(self, df):
        """
        Add ML features derived from the weather columns (modifies ``df``).

        Adds is_rainy and extreme_weather (0/1 flags based on 'condition'),
        temp_category (binned 'temperature') and precipitation_level (binned
        'precipitation'). Missing values yield NaN categories.
        """
        print("\nüîß Cr√©ation de features suppl√©mentaires...")

        # 1. Binary rain flag
        df['is_rainy'] = df['condition'].isin(self._RAINY_CONDITIONS).astype(int)

        # 2. Extreme weather flag
        df['extreme_weather'] = df['condition'].isin(self._EXTREME_CONDITIONS).astype(int)

        # Coerce to float so all-missing (object dtype) columns can be binned.
        temperature = pd.to_numeric(df['temperature'], errors='coerce').astype(float)
        precipitation = pd.to_numeric(df['precipitation'], errors='coerce').astype(float)

        # 3. Temperature category
        df['temp_category'] = pd.cut(temperature,
                                     bins=[-np.inf, 15, 25, 35, np.inf],
                                     labels=['Cold', 'Moderate', 'Warm', 'Hot'])

        # 4. Precipitation level (lower edge below 0 so that 0 mm is 'None')
        df['precipitation_level'] = pd.cut(precipitation,
                                           bins=[-0.1, 0.1, 2.5, 10, np.inf],
                                           labels=['None', 'Light', 'Moderate', 'Heavy'])

        print("   ‚úÖ Features cr√©√©es: is_rainy, extreme_weather, temp_category, precipitation_level")

        return df


# ============== UTILISATION PRATIQUE ==============

def main():
    """
    Run the example workflow on 'uber_rides_data.csv'.

    Loads the CSV, enriches a 100-row sample with weather data, writes it to
    'uber_with_weather_SAMPLE.csv', prints a rain vs. no-rain price
    comparison when a 'Booking Value' column exists, and returns the
    enriched sample DataFrame.
    """

    # 1. Load the dataset
    print("üìÇ Chargement du dataset...")
    df = pd.read_csv('uber_rides_data.csv')

    print(f"‚úÖ Dataset charg√©: {len(df)} lignes, {len(df.columns)} colonnes")
    print("\nüîç Aper√ßu des donn√©es:")
    # Preview only the columns that exist: 'Booking Value' is treated as
    # optional further down, so it must not be required here either.
    preview_cols = [col for col in ('Date', 'Time', 'Pickup Location', 'Booking Value')
                    if col in df.columns]
    print(df[preview_cols].head())

    # 2. Create the enricher
    enricher = WeatherEnricher()

    # 3. OPTION A: test on a sample first (recommended)
    print("\n" + "="*60)
    print("PHASE 1: TEST SUR √âCHANTILLON")
    print("="*60)

    df_sample = enricher.enrich_dataset(df, sample_size=100)
    df_sample.to_csv('uber_with_weather_SAMPLE.csv', index=False)
    print(f"\nüíæ √âchantillon sauvegard√©: uber_with_weather_SAMPLE.csv")

    # 4. Check the results
    print("\nüìà Analyse de l'impact m√©t√©o sur les prix:")
    if 'Booking Value' in df_sample.columns:
        # Rows without weather data are neither rainy nor clear: leave them
        # out so they do not skew the "clear weather" average.
        known = df_sample[df_sample['condition'] != 'Unknown']
        avg_price_rain = known.loc[known['is_rainy'] == 1, 'Booking Value'].mean()
        avg_price_clear = known.loc[known['is_rainy'] == 0, 'Booking Value'].mean()

        if pd.notna(avg_price_rain) and pd.notna(avg_price_clear):
            print(f"   üí∞ Prix moyen (pluie): ‚Çπ{avg_price_rain:.2f}")
            print(f"   ‚òÄÔ∏è Prix moyen (beau): ‚Çπ{avg_price_clear:.2f}")
            # A relative increase is undefined against a zero baseline.
            if avg_price_clear != 0:
                increase = ((avg_price_rain - avg_price_clear) / avg_price_clear) * 100
                print(f"   üìä Augmentation: {increase:.1f}%")

    # 5. OPTION B: process the full dataset (do this after validating the sample)
    # Turn the string block below back into code once the sample looks good.
    """
    print("\n" + "="*60)
    print("PHASE 2: TRAITEMENT COMPLET (cela peut prendre du temps)")
    print("="*60)
    
    response = input("Voulez-vous continuer avec le dataset complet? (oui/non): ")
    if response.lower() == 'oui':
        df_full = enricher.enrich_dataset(df, sample_size=None)
        df_full.to_csv('uber_with_weather_FULL.csv', index=False)
        print(f"\nüíæ Dataset complet sauvegard√©: uber_with_weather_FULL.csv")
    """

    print("\n" + "="*60)
    print("‚úÖ TERMIN√â - Pr√™t pour votre ML model!")
    print("="*60)

    return df_sample


# Run the workflow only when the file is executed as a script (not on import);
# the enriched sample stays available as a module-level name afterwards.
if __name__ == "__main__":
    df_enriched = main()