In [4]:
import pandas as pd
from pathlib import Path

# Location of the avocado dataset, given relative to the notebook's working directory.
csv_path = Path('../data/avocado.csv')

# Read the CSV into a DataFrame, asking pandas to parse the 'Date' column as datetimes.
# NOTE(review): the preview below shows an 'Unnamed: 0' column — presumably a row
# index that was written into the CSV; confirm and consider dropping it.
df = pd.read_csv(filepath_or_buffer=csv_path, parse_dates=['Date'])

# Show the first five rows as the cell's rich output.
df.head(5)


Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [12]:
# PLU columns that the rest of this cell derives ratios and prices from.
plu_codes = ['4046', '4225', '4770']

# Coerce the PLU columns and the price column to numeric dtype; any value that
# cannot be parsed becomes NaN instead of raising.
for numeric_col in plu_codes + ['AveragePrice']:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Row-wise total of the three PLU columns (a NaN in any of them propagates).
df['PLU_Total'] = df['4046'].add(df['4225']).add(df['4770'])

# Share of each PLU within that total. Rows whose total is 0 yield NaN.
for code in plu_codes:
    df[f'ratio_{code}'] = df[code].div(df['PLU_Total'])

# Approximate per-PLU "price": AveragePrice scaled by the PLU's share of PLU_Total.
# NOTE(review): this is a share-weighted proxy, not an observed per-PLU price;
# a PLU with a small share gets a small value regardless of what it actually costs.
for code in plu_codes:
    df[f'price_{code}'] = df['AveragePrice'].mul(df[f'ratio_{code}'])

# Mean of the three proxy columns per region, with 'region' restored as a column.
price_cols = [f'price_{code}' for code in plu_codes]
region_price_by_plu = (
    df.groupby('region')[price_cols]
    .mean()
    .reset_index()
)

# Five regions with the lowest mean proxy value for each PLU.
cheap_4046 = region_price_by_plu.sort_values(by='price_4046').iloc[:5]
cheap_4225 = region_price_by_plu.sort_values(by='price_4225').iloc[:5]
cheap_4770 = region_price_by_plu.sort_values(by='price_4770').iloc[:5]

# Print each ranking under its label.
for label, cheapest in (
    ("4046이 저렴한 지역:", cheap_4046),
    ("4225이 저렴한 지역:", cheap_4225),
    ("4770이 저렴한 지역:", cheap_4770),
):
    print(label, cheapest)


4046이 저렴한 지역:                  region  price_4046  price_4225  price_4770
4                Boston    0.038530    1.477943    0.014415
23           Louisville    0.049524    1.220342    0.016820
30   NorthernNewEngland    0.067027    1.397671    0.012698
17  HartfordSpringfield    0.067862    1.742384    0.008393
9      CincinnatiDayton    0.080559    1.105923    0.022719
4225이 저렴한 지역:               region  price_4046  price_4225  price_4770
18           Houston    0.893239    0.141881    0.012809
11     DallasFtWorth    0.896689    0.178973    0.009930
27  NewOrleansMobile    1.095759    0.205604    0.003429
45      SouthCentral    0.865295    0.223844    0.012104
31           Orlando    1.156741    0.348792    0.000680
4770이 저렴한 지역:                region  price_4046  price_4225  price_4770
50              Tampa    1.052043    0.356316    0.000487
31            Orlando    1.156741    0.348792    0.000680
24  MiamiFtLauderdale    1.033246    0.394399    0.000846
1             Atlanta   

In [None]:
# Reshape the per-region table from wide to long form: each output row holds a
# region, the name of one price column ('PLU_Code') and that column's value
# ('Avg_Price').
heatmap_df = pd.melt(
    region_price_by_plu,
    id_vars='region',
    value_vars=['price_4046', 'price_4225', 'price_4770'],
    var_name='PLU_Code',
    value_name='Avg_Price',
)


In [None]:
import plotly.express as px

# Heatmap of the long-format table: PLU price column on the x axis, region on
# the y axis, cell colour and text taken from 'Avg_Price'.
#
# histfunc is set explicitly: when `z` is supplied, density_heatmap aggregates
# with 'sum' by default, which would label the colour bar as a sum and would
# silently add values together if the input ever held more than one row per
# (region, PLU_Code) cell. 'avg' matches the "average" wording of the title and
# leaves the displayed numbers unchanged when there is one row per cell.
fig = px.density_heatmap(
    heatmap_df,
    x='PLU_Code',
    y='region',
    z='Avg_Price',
    histfunc='avg',
    color_continuous_scale='YlGnBu',
    text_auto='.2f',
    title='지역별 PLU 코드별 평균 단가 (근사값)'
)

# Axis titles, plus a taller canvas than the default so the region labels on the
# y axis have more vertical room.
fig.update_layout(
    xaxis_title="PLU 코드",
    yaxis_title="지역",
    height=900
)

fig.show()
