## Preparazione dati per Tableau — city/state e aggregazione per Dealer_Region

Scopo:
- creare una colonna `dealer_region_mapped` normalizzata (Title Case)
- creare `city_state` nel formato `<City>, USA` (città + paese: nonostante il nome, la colonna non contiene lo stato) per favorire la geocodifica in Tableau
- calcolare l'aggregato per `dealer_region_mapped` con: `sales_count`, `avg_price`, `avg_income`, `price_income_ratio`
- salvare due CSV in `../data/processed/` (`agg_by_dealer_region_for_tableau.csv` e `database_for_tableau_city_state.csv`) pronti per l'import diretto in Tableau

In [6]:
# --- PARAMS / ENV  ---
import os
import pandas as pd

# The notebook is expected to run from the notebook folder, one level below
# the project root; every absolute path is derived from that assumption.
NOTEBOOK_DIR = os.getcwd()
ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))

# Canonical paths (shared by the whole notebook)
OUT_DIR = os.path.join(ROOT, "data", "processed")
CLEANED_PATH = os.path.join(ROOT, "data", "processed", "database_cleaned.csv")
# Raw file preferred by the team
RAW_PATH = os.path.join(ROOT, "data", "raw", "car_sales.csv")
# Personal copy of the raw file coming from git
RAW_BACKUP_FROM_REPO = os.path.join(ROOT, "data", "raw", "car_sales_from_repo.csv")
# Mapping files live in <notebook folder>/mappings
MAPPINGS_DIR = os.path.join(NOTEBOOK_DIR, "mappings")

# Quick sanity report: where we are and which inputs are present
print("working dir:", NOTEBOOK_DIR)
print("root:", ROOT)
for message, path in (
    ("CLEANED_PATH exists?", CLEANED_PATH),
    ("RAW_PATH exists?", RAW_PATH),
    ("RAW_BACKUP_FROM_REPO exists?", RAW_BACKUP_FROM_REPO),
    ("MAPPINGS_DIR exists?", MAPPINGS_DIR),
):
    print(message, os.path.exists(path))

# The cleaned CSV is the only mandatory input: stop here if it is missing.
if not os.path.exists(CLEANED_PATH):
    raise FileNotFoundError(f"File non trovato: {CLEANED_PATH}. Controlla che il file sia in data/processed/")

# Read only the header first, so 'Date' is parsed as a date only when present.
cols0 = list(pd.read_csv(CLEANED_PATH, nrows=0).columns)
if 'Date' in cols0:
    parse_arg = ['Date']
else:
    parse_arg = None
df = pd.read_csv(CLEANED_PATH, parse_dates=parse_arg, low_memory=False)

print("Loaded df shape:", df.shape)

working dir: /Users/serenatempesta/Documents/Progetti/Data_Analysis/progetto_finale/notebook
root: /Users/serenatempesta/Documents/Progetti/Data_Analysis/progetto_finale
CLEANED_PATH exists? True
RAW_PATH exists? True
RAW_BACKUP_FROM_REPO exists? True
MAPPINGS_DIR exists? True
Loaded df shape: (23906, 13)


In [7]:
# --- PARAMS / ENV  ---
import os
import pandas as pd
import numpy as np

# Paths, relative to the notebook folder (the expected working directory)
ROOT = ".."
CLEANED_PATH = os.path.join(ROOT, "data", "processed", "database_cleaned.csv")
MAPPINGS_DIR = os.path.join("..", "notebook", "mappings")
OUT_DIR = os.path.join("..", "data", "processed")

# Safety checks
print("Working dir:", os.getcwd())
print("CLEANED_PATH exists?", os.path.exists(CLEANED_PATH))
print("MAPPINGS_DIR exists?", os.path.exists(MAPPINGS_DIR))
os.makedirs(OUT_DIR, exist_ok=True)

# Fail with an explicit message when the mandatory input is missing.
if not os.path.exists(CLEANED_PATH):
    raise FileNotFoundError(f"File non trovato: {CLEANED_PATH}. Controlla che il file sia in data/processed/")

# Load the starting dataframe
df = pd.read_csv(CLEANED_PATH, low_memory=False)
print("Loaded df shape:", df.shape)

# Dealer_Region drives every later step, so make sure it exists.
if 'Dealer_Region' not in df.columns:
    raise RuntimeError("Colonna 'Dealer_Region' non trovata nel cleaned CSV. Controlla il nome della colonna.")

# Destination of the per-region aggregate. Nothing is written here: at this
# point df is still the row-level dataset, and saving it under the aggregate
# file name would leave a mislabeled CSV on disk. The aggregation cell writes
# the real aggregate to this same location.
out_path = os.path.join(OUT_DIR, "agg_by_dealer_region_for_tableau.csv")

Working dir: /Users/serenatempesta/Documents/Progetti/Data_Analysis/progetto_finale/notebook
CLEANED_PATH exists? True
MAPPINGS_DIR exists? True
Loaded df shape: (23906, 13)


In [2]:
# Build dealer_region_mapped: the dealer region trimmed and in Title Case.
# astype(str) would turn a missing region into the text "nan" (then "Nan"
# after title-casing), which would show up as a fake region in the exports
# and in any group-by. The trailing .where() restores NaN on those rows.
df['dealer_region_mapped'] = (
    df['Dealer_Region']
    .astype(str)
    .str.strip()
    .str.title()
    .where(df['Dealer_Region'].notna())
)

def mk_city_state(v):
    """Return the geocoding label for a region value.

    The value is converted to text and trimmed. Empty text and the
    placeholders "nan"/"none" (any letter case) give an empty string; text
    that already contains a comma is returned as is; anything else gets
    ", USA" appended.
    """
    text = str(v).strip()
    if text.lower() in ("", "nan", "none"):
        return ""
    if "," in text:
        return text
    return f"{text}, USA"

# Derive the Tableau geocoding label for every row from the normalized region.
df['city_state'] = df['dealer_region_mapped'].map(mk_city_state)

# Preview: first 20 distinct (raw region, normalized region, label) rows.
preview_cols = ['Dealer_Region', 'dealer_region_mapped', 'city_state']
display(df[preview_cols].drop_duplicates().head(20))
print("Tot righe:", df.shape[0])

Unnamed: 0,Dealer_Region,dealer_region_mapped,city_state
0,Middletown,Middletown,"Middletown, USA"
1,Aurora,Aurora,"Aurora, USA"
2,Greenville,Greenville,"Greenville, USA"
3,Pasco,Pasco,"Pasco, USA"
4,Janesville,Janesville,"Janesville, USA"
5,Scottsdale,Scottsdale,"Scottsdale, USA"
6,Austin,Austin,"Austin, USA"


Tot righe: 23906


In [3]:
# --- AGGREGAZIONE MANUALE per Tableau ---
import re
# Output files, relative to the notebook folder.
OUT_DETAIL = "../data/processed/database_for_tableau_city_state.csv"
OUT_AGG = "../data/processed/agg_by_dealer_region_for_tableau.csv"

# Column names, set by hand.
price_col, income_col = "Price ($)", "Annual Income"
# Derived columns, expected to be already present in df.
region_col, city_state_col = "dealer_region_mapped", "city_state"

# 1) numeric cleanup (strips non-numeric symbols and converts to numbers)
def clean_numeric_series(s):
    """Convert a Series of numbers or number-like text to numeric values.

    Text values are stripped of every character except digits, dot and
    hyphen (e.g. "$42,000" -> 42000) and then parsed; anything that still
    cannot be parsed becomes NaN.

    A Series that already has a numeric dtype is returned without going
    through the text cleanup: stringifying floats can produce scientific
    notation ("1e+20"), and stripping the "e" and "+" from it would silently
    change the value.
    """
    already_numeric = (
        pd.api.types.is_numeric_dtype(s) and not pd.api.types.is_bool_dtype(s)
    )
    if already_numeric:
        return pd.to_numeric(s, errors='coerce')

    stripped = s.astype(str).str.replace(r'[^\d\.\-]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')

# Numeric helper columns used by the aggregation below.
df['_price_clean'] = clean_numeric_series(df[price_col])
if income_col in df.columns:
    df['_income_clean'] = clean_numeric_series(df[income_col])
else:
    # Without income data keep a float NaN column, so avg_income and the ratio
    # come out as NaN instead of being computed from an unrelated column.
    print(f"Attenzione: {income_col} non trovato — avg_income e price_income_ratio saranno NaN")
    df['_income_clean'] = float('nan')

# Show only the columns that actually exist (income_col may be missing).
sample_cols = [
    c for c in (price_col, '_price_clean', income_col, '_income_clean')
    if c in df.columns
]
print("Converted sample:", df[sample_cols].head(3))

# 2) make sure the region and city_state columns exist
if region_col not in df.columns:
    raise RuntimeError(f"{region_col} non trovato nel dataframe")
if city_state_col not in df.columns:
    print(f"Attenzione: {city_state_col} non trovato — la mappa potrebbe non geocodare automaticamente")

# 3) aggregate per dealer_region_mapped
#    sales_count counts the rows with a parsable price.
agg = df.groupby(region_col).agg(
    sales_count=('_price_clean', 'count'),
    avg_price=('_price_clean', 'mean'),
    avg_income=('_income_clean', 'mean'),
).reset_index()

# 4) ratio: a zero average income is masked to NaN first, so the division
#    yields NaN (never inf) for zero or missing values; this also works when
#    agg has no rows.
nonzero_income = agg['avg_income'].where(agg['avg_income'] != 0)
agg['price_income_ratio'] = agg['avg_price'] / nonzero_income

# 5) save the aggregate file and the detail file for Tableau
agg.to_csv(OUT_AGG, index=False)
df.to_csv(OUT_DETAIL, index=False)

print("Saved:", OUT_AGG)
print("Saved:", OUT_DETAIL)

# 6) quick control output
display(agg.sort_values('sales_count', ascending=False).head(20))
print("Unique dealer regions:", agg.shape[0])

Converted sample:    Price ($)  _price_clean  Annual Income  _income_clean
0      26000         26000          13500          13500
1      19000         19000        1480000        1480000
2      31500         31500        1035000        1035000
Saved: ../data/processed/agg_by_dealer_region_for_tableau.csv
Saved: ../data/processed/database_for_tableau_city_state.csv


Unnamed: 0,dealer_region_mapped,sales_count,avg_price,avg_income,price_income_ratio
1,Austin,4135,28341.603628,822849.572189,0.034443
3,Janesville,3821,27833.350955,833916.268778,0.033377
6,Scottsdale,3433,27954.958928,805682.702884,0.034697
5,Pasco,3131,28119.039923,853975.290323,0.032927
0,Aurora,3130,28334.626837,845510.435783,0.033512
2,Greenville,3128,28180.819054,832667.512788,0.033844
4,Middletown,3128,27856.338875,825592.651854,0.033741


Unique dealer regions: 7
