In [1]:
import pandas as pd

# Single place to point the notebook at the competition data; the same
# absolute directory used to be spelled out once per file.
DATA_DIR = '/Users/hyunkoolee/google_mlb/kaggle/kaggle_playground-series-s4e9/data/playground-series-s4e9'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
import pandas as pd

# Normalise fuel_type: missing values and dash placeholders become 'not supported'
def clean_fuel_type(df):
    """
    Replace missing and dash-only fuel_type entries with 'not supported'.

    Parameters:
    - df: pandas DataFrame with a 'fuel_type' column; the column is overwritten in place.

    Returns:
    - The same DataFrame object.
    """
    dash_placeholders = ['-', '–']
    # Fill NaN first so that .str.strip() operates on strings only.
    stripped = df['fuel_type'].fillna('not supported').str.strip()
    df['fuel_type'] = stripped.replace(dash_placeholders, 'not supported')
    return df

# Derive a fuel label from free-text engine descriptions
def extract_fuel_from_engine(df):
    """
    Add a 'fuel_type_extracted' column derived from keywords in 'engine'.

    Keywords are tested in a fixed order and the first hit wins. Non-string
    engine values are treated as empty text and end up as 'not supported'.

    Parameters:
    - df: pandas DataFrame with an 'engine' column; modified in place.

    Returns:
    - The same DataFrame object with 'fuel_type_extracted' added or overwritten.
    """
    # (keyword, label) pairs in priority order.
    keyword_rules = (
        ('gasoline', 'Gasoline'),
        ('diesel', 'Diesel'),
        ('flex fuel', 'E85 Flex Fuel'),
        ('electric', 'Electric'),
        ('plug-in hybrid', 'Plug-In Hybrid'),
    )

    def classify(raw_engine):
        text = raw_engine.lower() if isinstance(raw_engine, str) else ''
        for keyword, label in keyword_rules:
            if keyword in text:
                return label
        # A plain hybrid only counts when the text does not mention plug-in at all.
        if 'hybrid' in text and 'plug-in' not in text:
            return 'Hybrid'
        return 'not supported'

    df['fuel_type_extracted'] = [classify(value) for value in df['engine']]
    return df

# Back-fill fuel_type from the engine-derived label, but only where it is unknown
def update_fuel_type(df):
    """
    Copy 'fuel_type_extracted' into 'fuel_type' for rows that are 'not supported' or NaN.

    Parameters:
    - df: pandas DataFrame with 'fuel_type' and 'fuel_type_extracted'; modified in place.

    Returns:
    - The same DataFrame object.
    """
    current = df['fuel_type']
    needs_fill = current.isna() | current.eq('not supported')
    df.loc[needs_fill, 'fuel_type'] = df.loc[needs_fill, 'fuel_type_extracted']
    return df

# Run the three steps in order: clean fuel_type, extract a label from engine,
# then back-fill fuel_type where it is still unknown.
train_df = (
    train_df
    .pipe(clean_fuel_type)
    .pipe(extract_fuel_from_engine)
    .pipe(update_fuel_type)
)

# Check the result after processing
print(train_df.loc[:, ['fuel_type', 'fuel_type_extracted', 'engine']].head())

# Check the count of each unique label in both columns
print(train_df['fuel_type'].value_counts())
print(train_df['fuel_type_extracted'].value_counts())


In [None]:
# Re-create the fuel_type_extracted column so that rows disagreeing with fuel_type can be listed

# Same extraction logic as the definition in the earlier cell, repeated so this cell
# does not depend on that cell having been run.
def extract_fuel_from_engine(df):
    """
    Add a 'fuel_type_extracted' column derived from keywords in 'engine'.

    Parameters:
    - df: pandas DataFrame with an 'engine' column; modified in place.

    Returns:
    - The same DataFrame object with 'fuel_type_extracted' added or overwritten.
    """
    def _label_for(engine):
        # Non-string values (e.g. NaN) are treated as empty text.
        text = engine.lower() if isinstance(engine, str) else ''
        if 'gasoline' in text:
            return 'Gasoline'
        if 'diesel' in text:
            return 'Diesel'
        if 'flex fuel' in text:
            return 'E85 Flex Fuel'
        if 'electric' in text:
            return 'Electric'
        if 'plug-in hybrid' in text:
            return 'Plug-In Hybrid'
        if 'hybrid' in text and 'plug-in' not in text:
            return 'Hybrid'
        return 'not supported'

    labels = []
    for engine in df['engine']:
        labels.append(_label_for(engine))
    df['fuel_type_extracted'] = labels
    return df


# Re-run the extraction so that fuel_type_extracted is guaranteed to exist
train_df = train_df.pipe(extract_fuel_from_engine)

# True where the declared label agrees with the engine-derived one
train_df['fuel_type_match'] = train_df['fuel_type'].eq(train_df['fuel_type_extracted'])

# Keep only the rows where the two labels disagree
mismatch_df = train_df[~train_df['fuel_type_match']]


# Quick look: first mismatching rows plus the label distribution of both columns
(
    mismatch_df[['fuel_type', 'fuel_type_extracted', 'engine']].head(),
    train_df['fuel_type'].value_counts(),
    train_df['fuel_type_extracted'].value_counts(),
)



In [None]:
# Function to adjust gasoline/hybrid relationships and flag significant mismatches
def adjust_fuel_type_mismatches(df):
    """
    Reconcile Hybrid/Gasoline disagreements and flag Gasoline-vs-Diesel conflicts.

    Parameters:
    - df: pandas DataFrame with 'fuel_type' and 'fuel_type_extracted'; modified in place.

    Returns:
    - The same DataFrame object, with 'fuel_type_extracted' adjusted and a
      'mismatch_flag' column holding 'Significant Mismatch' or ''.
    """
    declared = df['fuel_type']

    # Engine text says 'gasoline' but the declared type is Hybrid: keep Hybrid.
    hybrid_seen_as_gasoline = (declared == 'Hybrid') & (df['fuel_type_extracted'] == 'Gasoline')
    df.loc[hybrid_seen_as_gasoline, 'fuel_type_extracted'] = 'Hybrid'

    # Re-read after the adjustment above.
    extracted = df['fuel_type_extracted']
    significant = (
        ((declared == 'Gasoline') & (extracted == 'Diesel'))
        | ((declared == 'Diesel') & (extracted == 'Gasoline'))
    )

    # Vectorised instead of a row-wise apply: faster, and it also works on an
    # empty frame, where apply(axis=1) does not return a single column.
    df['mismatch_flag'] = ''
    df.loc[significant, 'mismatch_flag'] = 'Significant Mismatch'

    return df

# Adjust the extracted labels and compute the mismatch flag
train_df = train_df.pipe(adjust_fuel_type_mismatches)

# Rows carrying a non-empty flag are the significant mismatches
significant_mismatches = train_df[train_df['mismatch_flag'].ne('')]

# Inspect the result
significant_mismatches.loc[
    :, ['fuel_type', 'fuel_type_extracted', 'engine', 'mismatch_flag', 'model', 'model_year']
].head(2000)


In [None]:
# The previous context seems to have been lost, so we will regenerate the significant mismatches and proceed from there.

# Function to adjust gasoline/hybrid relationships and flag significant mismatches
def adjust_fuel_type_mismatches(df):
    """
    Reconcile Hybrid/Gasoline disagreements and flag Gasoline-vs-Diesel conflicts.

    Parameters:
    - df: pandas DataFrame with 'fuel_type' and 'fuel_type_extracted'; modified in place.

    Returns:
    - The same DataFrame object, with 'fuel_type_extracted' adjusted and a
      'mismatch_flag' column holding 'Significant Mismatch' or ''.
    """
    # Adjust cases where engine mentions 'gasoline' but fuel_type is 'Hybrid' (keep Hybrid)
    keep_hybrid = df['fuel_type'].eq('Hybrid') & df['fuel_type_extracted'].eq('Gasoline')
    df.loc[keep_hybrid, 'fuel_type_extracted'] = 'Hybrid'

    # Gasoline-vs-Diesel in either direction counts as a significant mismatch.
    gasoline_vs_diesel = df['fuel_type'].eq('Gasoline') & df['fuel_type_extracted'].eq('Diesel')
    diesel_vs_gasoline = df['fuel_type'].eq('Diesel') & df['fuel_type_extracted'].eq('Gasoline')
    significant = gasoline_vs_diesel | diesel_vs_gasoline

    # Built from a boolean mask rather than a row-wise apply: faster, and it also
    # works on an empty frame, where apply(axis=1) does not return a single column.
    df['mismatch_flag'] = significant.map({True: 'Significant Mismatch', False: ''})

    return df

# Recompute the flags so the significant mismatches are available in this cell
train_df = train_df.pipe(adjust_fuel_type_mismatches)

# Rows carrying a non-empty flag
significant_mismatches = train_df.loc[train_df['mismatch_flag'].ne('')]

# Count each (fuel_type, fuel_type_extracted) combination among the mismatches
significant_mismatch_counts = (
    significant_mismatches
    .loc[:, ['fuel_type', 'fuel_type_extracted']]
    .value_counts()
)

# Show the counts for review
significant_mismatch_counts


In [None]:
# For models declared as Diesel, correct a wrongly extracted fuel_type_extracted to Diesel
diesel_models_to_correct = [
    'Bronco Wildtrak Advanced', 'Express 1500 Cargo', '740 iL', '911 Carrera',
    '1500 Rebel', 'E-Class E 350', 'Navigator Reserve', 'Ram 2500 SLT Quad Cab',
    'Ram 2500 Laramie Quad Cab', 'F-150 Platinum', 'Charger Scat Pack',
    'Grand Wagoneer Series III', '650 i', 'Navigator L', 'Tundra Platinum',
    'XC90 T6 Inscription', 'Tahoe LTZ', 'Corvette Stingray w/3LT'
]

# Only touch rows whose declared fuel_type is Diesel. Matching on the model name
# alone would also relabel the Gasoline rows of the same model and create new
# Gasoline-vs-Diesel mismatches.
train_df.loc[
    train_df['model'].isin(diesel_models_to_correct) & train_df['fuel_type'].eq('Diesel'),
    'fuel_type_extracted',
] = 'Diesel'

# Re-check the remaining disagreements to review the correction
significant_mismatches_corrected = train_df[train_df['fuel_type'] != train_df['fuel_type_extracted']]

# Show the result
significant_mismatches_corrected[['fuel_type', 'fuel_type_extracted', 'model']].head()


## Hybrid 차량 검토

In [None]:
# Rows whose declared fuel_type is 'Hybrid'
hybrid_fuel_type_data = train_df.loc[train_df['fuel_type'].eq('Hybrid')]

# Columns needed for the review: both labels, engine text, model and brand
hybrid_fuel_type_info = hybrid_fuel_type_data.loc[
    :, ['fuel_type', 'fuel_type_extracted', 'engine', 'model', 'brand']
]

# Quick look at the first rows
hybrid_fuel_type_info.head()


In [None]:
# Label hybrids correctly and resolve rows where Gasoline and Hybrid are mixed up
def correct_hybrid_and_gasoline_data(df):
    """
    Align 'fuel_type' with the engine text for Hybrid/Gasoline confusions.

    Parameters:
    - df: pandas DataFrame with 'fuel_type' and 'engine'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    # na=False: a missing engine description matches no keyword. Without it the
    # masks contain NaN and cannot be negated or used for boolean indexing.
    mentions_gasoline = df['engine'].str.contains('gasoline', case=False, na=False)
    mentions_hybrid = df['engine'].str.contains('hybrid', case=False, na=False)

    # Declared Hybrid but the engine text only mentions gasoline: set fuel_type to Gasoline.
    condition_gasoline = (df['fuel_type'] == 'Hybrid') & mentions_gasoline & ~mentions_hybrid
    df.loc[condition_gasoline, 'fuel_type'] = 'Gasoline'

    # Any engine text mentioning hybrid gets fuel_type Hybrid.
    df.loc[mentions_hybrid, 'fuel_type'] = 'Hybrid'

    return df

# Apply the Hybrid/Gasoline correction
train_df = train_df.pipe(correct_hybrid_and_gasoline_data)

# Columns needed to verify the correction
corrected_hybrid_gasoline_data = train_df.loc[
    :, ['fuel_type', 'fuel_type_extracted', 'engine', 'model', 'brand']
]

# Show the first rows
corrected_hybrid_gasoline_data.head()


In [None]:
# Distribution of fuel_type labels after the Hybrid/Gasoline correction.
train_df['fuel_type'].value_counts()

In [10]:
# fuel_type
# Gasoline          165940
# Hybrid              6832
# E85 Flex Fuel       5406
# Diesel              3955
# –                    781
# Plug-In Hybrid       521
# not supported         15
# Name: count, dtype: int64

In [None]:
# Handle the more complex fuel types using keywords in the engine text
def correct_not_supported_fuel_types(df):
    """
    Overwrite 'fuel_type' from keywords found in 'engine'.

    Rules are written in this order: E85 Flex Fuel, Plug-In Hybrid, Gasoline,
    Hydrogen, Electric, Hybrid. A later rule overwrites an earlier one, except
    that rows matched by the Plug-In Hybrid rule are excluded from the Electric
    and Hybrid rules. Without that exclusion every Plug-In Hybrid match also
    matches 'electric' or 'hybrid' and would be overwritten.

    Parameters:
    - df: pandas DataFrame with 'engine' and 'fuel_type'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    engine = df['engine']

    def mentions(pattern):
        # na=False: a missing engine description matches no keyword.
        return engine.str.contains(pattern, case=False, na=False)

    # Gasoline + electric descriptions are treated as Plug-In Hybrid.
    is_plug_in = mentions('plug-in hybrid|gasoline/electric')
    # Gasoline only when neither hybrid nor electric is mentioned.
    is_gasoline = mentions('gasoline') & ~mentions('hybrid|electric')
    # Dual motor / electric, unless already identified as a plug-in hybrid.
    is_electric = mentions('dual motor|electric') & ~is_plug_in
    # Hybrid (including mild electric), unless already identified as a plug-in hybrid.
    is_hybrid = mentions('hybrid|mild electric') & ~is_plug_in

    df.loc[mentions('flex fuel'), 'fuel_type'] = 'E85 Flex Fuel'
    df.loc[is_plug_in, 'fuel_type'] = 'Plug-In Hybrid'
    df.loc[is_gasoline, 'fuel_type'] = 'Gasoline'
    df.loc[mentions('hydrogen'), 'fuel_type'] = 'Hydrogen'
    df.loc[is_electric, 'fuel_type'] = 'Electric'
    df.loc[is_hybrid, 'fuel_type'] = 'Hybrid'

    return df

# Apply the keyword-based correction
train_df = train_df.pipe(correct_not_supported_fuel_types)

# Rows whose fuel_type is anything other than 'not supported' after the correction
corrected_not_supported = train_df.loc[train_df['fuel_type'].ne('not supported')]

# Show the result
corrected_not_supported.loc[:, ['fuel_type', 'engine', 'model', 'brand']]


In [None]:
# Among rows still 'not supported', label as Electric those whose engine text
# contains an electric-vehicle keyword ('ev', 'plug-in', 'motor', 'battery')
def label_electric_from_not_supported_extended(df):
    """
    Relabel 'not supported' rows as 'Electric' when the engine text hints at an EV.

    Parameters:
    - df: pandas DataFrame with 'fuel_type' and 'engine'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    unresolved = df['fuel_type'] == 'not supported'

    # 'ev' is matched as a whole word so that emission ratings such as 'ULEV'
    # are not mistaken for an EV. na=False keeps missing engine text out of the mask.
    electric_hint = df['engine'].str.contains(
        r'\bev\b|plug-in|motor|battery', case=False, na=False)

    df.loc[unresolved & electric_hint, 'fuel_type'] = 'Electric'

    return df

# Apply the relabelling
train_df = train_df.pipe(label_electric_from_not_supported_extended)

# All rows currently labelled Electric (includes those just relabelled)
corrected_electric_data_extended = train_df.loc[train_df['fuel_type'].eq('Electric')]
# Show the first rows
corrected_electric_data_extended.loc[:, ['fuel_type', 'engine', 'model', 'brand']].head()



In [None]:
# Brands of the rows currently labelled Electric
train_df.loc[train_df['fuel_type'] == 'Electric', 'brand']

In [None]:
# Remove the column-width limit so long engine strings are not truncated
pd.options.display.max_colwidth = None

# Display again with the full engine text visible
corrected_electric_data_extended.loc[:, ['fuel_type', 'engine', 'model', 'brand']].head(30000)


In [None]:
# Classify brands that only build electric cars (e.g. Tesla, Rivian) as Electric
def label_electric_brands(df):
    """
    Set 'fuel_type' to 'Electric' for every row of an electric-only brand.

    Parameters:
    - df: pandas DataFrame with 'brand' and 'fuel_type'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    # Brands treated as electric-only
    ev_only = ['Tesla', 'Rivian', 'Lucid']
    from_ev_maker = df['brand'].isin(ev_only)

    # Every row of these brands becomes Electric, whatever its previous label
    df.loc[from_ev_maker, 'fuel_type'] = 'Electric'
    return df

# Apply the brand-based rule
train_df = train_df.pipe(label_electric_brands)

# All rows labelled Electric after the brand rule
corrected_electric_brands = train_df.loc[train_df['fuel_type'].eq('Electric')]

# Show the first rows
corrected_electric_brands.loc[:, ['brand', 'fuel_type', 'engine', 'model']].head()


In [None]:
# Distribution of fuel_type labels after the electric-only brand rule.
train_df['fuel_type'].value_counts()

In [None]:
# Rows currently labelled Hybrid
train_df.loc[train_df['fuel_type'].eq('Hybrid')]

In [None]:
# Rows currently labelled Electric
train_df.loc[train_df['fuel_type'].eq('Electric')]

In [None]:
# Engine text of the rows that are still 'not supported'
train_df.loc[train_df['fuel_type'] == 'not supported', 'engine']

In [None]:
# How often each engine description occurs among the 'not supported' rows
train_df.loc[train_df['fuel_type'] == 'not supported', 'engine'].value_counts()

In [None]:
import pandas as pd

# Change the pandas display options so that no row or cell value is truncated
pd.options.display.max_rows = None  # print every row
pd.options.display.max_colwidth = None  # do not cut off long cell values

# Rows whose fuel_type is still 'not supported', with the columns needed to inspect them
not_supported_data = train_df.loc[
    train_df['fuel_type'].eq('not supported'),
    ['brand', 'model', 'engine', 'fuel_type'],
]

# Print
print(not_supported_data)


In [None]:
# How often each engine description occurs among the 'not supported' rows
train_df.loc[train_df['fuel_type'].eq('not supported'), 'engine'].value_counts()

In [23]:
# # 'not supported' 데이터 및 electric, hybrid 관련 키워드를 탐지해 라벨링하는 함수
# def label_hybrid_and_electric(df):
#     # 'not supported'이면서 'hybrid', 'mild electric' 등 언급된 경우 Hybrid로 라벨링
#     condition_hybrid = (df['fuel_type'] == 'not supported') & (
#         df['engine'].str.contains('hybrid|mild electric', case=False))
    
#     # 해당 조건을 만족하는 데이터를 Hybrid로 수정
#     df.loc[condition_hybrid, 'fuel_type'] = 'Hybrid'
    
#     # 기존 electric 라벨링 로직도 유지
#     condition_electric = (df['fuel_type'] == 'not supported') & (
#         df['engine'].str.contains('electric|ev|plug-in', case=False))
#     df.loc[condition_electric, 'fuel_type'] = 'Electric'
    
#     return df

# # 수정 로직 적용
# train_df = label_hybrid_and_electric(train_df)

# # 수정된 결과에서 Hybrid로 라벨링된 데이터를 확인
# corrected_hybrid_data = train_df[train_df['fuel_type'] == 'Hybrid']

# # 결과 출력
# corrected_hybrid_data[['fuel_type', 'engine', 'model', 'brand']].head(30000)


In [None]:
# Distribution of fuel_type labels at this point in the notebook.
train_df['fuel_type'].value_counts()

In [25]:
# fuel_type
# Gasoline          165940
# Hybrid              6832
# E85 Flex Fuel       5406
# Diesel              3955
# –                    781
# Plug-In Hybrid       521
# not supported         15
# Name: count, dtype: int64

In [None]:
# Rows currently labelled Hybrid
train_df.loc[train_df['fuel_type'].eq('Hybrid')]

In [None]:
# Rows currently labelled Electric
train_df.loc[train_df['fuel_type'].eq('Electric')]

In [None]:
# Label fuel_type from the engine description (rules applied by priority)
def label_fuel_type_from_engine(df):
    """
    Relabel 'fuel_type' from keywords in 'engine', Electric having top priority.

    Rules are applied in this order: Electric, Hybrid, E85 Flex Fuel, Gasoline,
    Diesel. A later rule never overwrites a label written by an earlier one.
    Rows whose engine text matches no rule keep their current label.

    Parameters:
    - df: pandas DataFrame with 'engine' and 'fuel_type'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    # Lower-cased engine text kept in a local Series, so no temporary column is
    # added to (and then dropped from) the caller's frame.
    engine_text = df['engine'].str.lower()

    def mentions(pattern):
        # na=False: a missing engine description matches no keyword.
        return engine_text.str.contains(pattern, na=False)

    def not_labelled(*labels):
        # Evaluated at call time so it sees labels written by earlier rules;
        # a NaN fuel_type counts as "not labelled".
        return ~df['fuel_type'].isin(list(labels))

    # Electric (highest priority). 'ev' is matched as a whole word so that
    # emission ratings such as 'ULEV' are not treated as electric.
    df.loc[mentions(r'electric|\bev\b|battery|dual motor|120 ah'), 'fuel_type'] = 'Electric'

    # Hybrid (only where not already Electric)
    condition_hybrid = mentions('hybrid|mild electric|gasoline/electric') & not_labelled('Electric')
    df.loc[condition_hybrid, 'fuel_type'] = 'Hybrid'

    # Flex Fuel (only where not Electric/Hybrid)
    condition_flex_fuel = mentions('flex fuel') & not_labelled('Electric', 'Hybrid')
    df.loc[condition_flex_fuel, 'fuel_type'] = 'E85 Flex Fuel'

    # Gasoline (only where not Electric/Hybrid/Flex Fuel)
    condition_gasoline = mentions('turbo|v6|v8|i4|gdi|tfs|liter') & not_labelled(
        'Electric', 'Hybrid', 'E85 Flex Fuel')
    df.loc[condition_gasoline, 'fuel_type'] = 'Gasoline'

    # Diesel (lowest priority)
    condition_diesel = mentions('diesel|tdi') & not_labelled(
        'Electric', 'Hybrid', 'E85 Flex Fuel', 'Gasoline')
    df.loc[condition_diesel, 'fuel_type'] = 'Diesel'

    return df

# Apply the engine-based labelling to the training data
train_df = train_df.pipe(label_fuel_type_from_engine)

# Show the result
train_df.loc[:, ['brand', 'model', 'model_year', 'engine', 'fuel_type']].head(1000)


In [None]:
# Distribution of fuel_type labels after the engine-based labelling.
train_df['fuel_type'].value_counts()

In [30]:
def value_counts(df, column):
    """
    Build a tidy table of value counts and percentages for one column, missing values included.

    Parameters:
    - df: pandas DataFrame containing the data.
    - column: str, name of the column to summarise.

    Returns:
    - A pandas DataFrame with three columns: the capitalised column name,
      'Count', and 'Percentage (%)' (rounded to two decimals).
    """
    series = df[column]
    share_pct = (series.value_counts(normalize=True, dropna=False) * 100).round(2)
    absolute = series.value_counts(dropna=False)

    # Both Series are indexed by the category value; reset_index turns that
    # index into the first column.
    summary = pd.DataFrame({'Count': absolute, 'Percentage (%)': share_pct}).reset_index()
    summary.columns = [column.capitalize(), 'Count', 'Percentage (%)']

    return summary

In [31]:
def add_price_stats(df, value_counts_df, category_column, price_column='price'):
    """
    Adds minimum, maximum, and average price statistics for each category to the value counts DataFrame,
    with styled output resembling a bar plot, without currency symbols.
    
    Parameters:
    - df: Original pandas DataFrame containing the data.
    - value_counts_df: DataFrame with value counts (output of value_counts function).
    - category_column: str, name of the column containing categories.
    - price_column: str, name of the column containing prices (default is 'price').
    
    Returns:
    - A styled DataFrame with added price statistics columns and bar plot-like representation.
    """
    # Find the correct case for the category column in both DataFrames
    df_category_col = next((col for col in df.columns if col.lower() == category_column.lower()), None)
    vc_category_col = next((col for col in value_counts_df.columns if col.lower() == category_column.lower()), None)
    
    if df_category_col is None or vc_category_col is None:
        raise ValueError(f"Column '{category_column}' not found in one or both DataFrames.")
    
    # Find the correct case for the price column
    price_col = next((col for col in df.columns if col.lower() == price_column.lower()), None)
    if price_col is None:
        raise ValueError(f"Price column '{price_column}' not found in the DataFrame.")
    
    # Calculate price statistics for each category
    price_stats = df.groupby(df_category_col)[price_col].agg(['min', 'max', 'mean']).reset_index()
    price_stats.columns = [df_category_col, 'min_price', 'max_price', 'avg_price']
    
    # Round the price values to 2 decimal places
    price_stats['min_price'] = price_stats['min_price'].round(2)
    price_stats['max_price'] = price_stats['max_price'].round(2)
    price_stats['avg_price'] = price_stats['avg_price'].round(2)
    
    # Merge the price statistics with the value counts DataFrame
    result_df = pd.merge(value_counts_df, price_stats, left_on=vc_category_col, right_on=df_category_col, how='left')
    
    # Reorder columns
    column_order = [vc_category_col, 'Count', 'Percentage (%)', 'min_price', 'max_price', 'avg_price']
    result_df = result_df[column_order]
    
    # Style the DataFrame
    styled_df = result_df.style.format({
        'Percentage (%)': '{:.2f}%',
        'min_price': '{:.2f}',  # Removed $ sign
        'max_price': '{:.2f}',  # Removed $ sign
        'avg_price': '{:.2f}'   # Removed $ sign
    })
    
    # Apply bar representation
    max_min_price = result_df['min_price'].max()
    max_max_price = result_df['max_price'].max()
    max_avg_price = result_df['avg_price'].max()
    
    styled_df.bar(subset=['min_price'], color='#5ad8a6', vmin=0, vmax=max_min_price)
    styled_df.bar(subset=['max_price'], color='#ff6e76', vmin=0, vmax=max_max_price)
    styled_df.bar(subset=['avg_price'], color='#fac858', vmin=0, vmax=max_avg_price)
    
    # Set table styles for night mode compatibility
    styled_df.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#2c3e50'), ('color', 'white')]},
        {'selector': 'td', 'props': [('background-color', '#34495e'), ('color', 'white')]},
    ])
    
    return styled_df


In [None]:
# Raw fuel_type counts, for comparison with the formatted table built in the next cell.
train_df['fuel_type'].value_counts()

In [None]:
# Counts and percentages per fuel_type, missing values included
fuel_value_counts_train = value_counts(df=train_df, column='fuel_type')
fuel_value_counts_train


In [None]:
# Extend the counts table with min / max / mean price per fuel_type and render it
fuel_value_counts_with_price_stats = add_price_stats(
    df=train_df,
    value_counts_df=fuel_value_counts_train,
    category_column='fuel_type',
)
display(fuel_value_counts_with_price_stats)


In [None]:
# Label fuel_type from the engine description (handles combined fuel systems)
def label_fuel_type_from_engine(df):
    """
    Relabel 'fuel_type' from keywords in 'engine', handling Hybrid before Electric.

    Rules are applied in this order: Hybrid, Electric, E85 Flex Fuel, Gasoline,
    Diesel. Hybrid and Electric are written unconditionally (Electric only for
    engines with no hybrid wording); the remaining rules never overwrite a
    label that an earlier rule assigns. Rows matching no rule keep their label.

    Parameters:
    - df: pandas DataFrame with 'engine' and 'fuel_type'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    # Lower-cased engine text kept in a local Series, so no temporary column is
    # added to (and then dropped from) the caller's frame.
    engine_text = df['engine'].str.lower()

    def mentions(pattern):
        # na=False: a missing engine description matches no keyword.
        return engine_text.str.contains(pattern, na=False)

    def not_labelled(*labels):
        # Evaluated at call time so it sees labels written by earlier rules;
        # a NaN fuel_type counts as "not labelled".
        return ~df['fuel_type'].isin(list(labels))

    # Hybrid (handled before Electric)
    df.loc[mentions('hybrid|mild electric|gasoline/electric'), 'fuel_type'] = 'Hybrid'

    # Electric (only for engines without hybrid wording). 'ev' is matched as a
    # whole word so that emission ratings such as 'ULEV' are not treated as electric.
    condition_electric = (
        mentions(r'electric|\bev\b|battery|dual motor|120 ah')
        & ~mentions('mild|hybrid|gasoline/electric')
    )
    df.loc[condition_electric, 'fuel_type'] = 'Electric'

    # Flex Fuel (only where not Electric/Hybrid)
    condition_flex_fuel = mentions('flex fuel') & not_labelled('Electric', 'Hybrid')
    df.loc[condition_flex_fuel, 'fuel_type'] = 'E85 Flex Fuel'

    # Gasoline (only where not Electric/Hybrid/Flex Fuel)
    condition_gasoline = mentions('turbo|v6|v8|i4|gdi|tfs|liter') & not_labelled(
        'Electric', 'Hybrid', 'E85 Flex Fuel')
    df.loc[condition_gasoline, 'fuel_type'] = 'Gasoline'

    # Diesel (lowest priority)
    condition_diesel = mentions('diesel|tdi') & not_labelled(
        'Electric', 'Hybrid', 'E85 Flex Fuel', 'Gasoline')
    df.loc[condition_diesel, 'fuel_type'] = 'Diesel'

    return df


# Apply the engine-based labelling to the training data
train_df = train_df.pipe(label_fuel_type_from_engine)

# Show the result
train_df.loc[:, ['engine', 'fuel_type']].head()


In [None]:
# Rows currently labelled Electric
train_df.loc[train_df['fuel_type'].eq('Electric')]

In [None]:
# Label fuel_type from the engine description (adds ULEV handling)
def label_fuel_type_from_engine(df):
    """
    Relabel 'fuel_type' from keywords in 'engine', treating ULEV engines as Gasoline.

    Rules are applied in this order: 'electric/gas' -> Hybrid, Gasoline, Hybrid,
    Electric, E85 Flex Fuel, Diesel. The first two are written unconditionally;
    each later rule is skipped for rows already carrying one of the labels it
    is guarded against. Rows matching no rule keep their current label.

    Parameters:
    - df: pandas DataFrame with 'engine' and 'fuel_type'; 'fuel_type' is modified in place.

    Returns:
    - The same DataFrame object.
    """
    # Lower-cased engine text kept in a local Series, so no temporary column is
    # added to (and then dropped from) the caller's frame.
    engine_text = df['engine'].str.lower()

    def mentions(pattern):
        # na=False: a missing engine description matches no keyword.
        return engine_text.str.contains(pattern, na=False)

    def not_labelled(*labels):
        # Evaluated at call time so it sees labels written by earlier rules;
        # a NaN fuel_type counts as "not labelled".
        return ~df['fuel_type'].isin(list(labels))

    # Electric/Gas is explicitly a Hybrid
    df.loc[mentions('electric/gas'), 'fuel_type'] = 'Hybrid'

    # Gasoline, handled before Electric so clearly-gasoline engines are not
    # classified as Electric. In the exclusion pattern 'ev' is matched as a
    # whole word; a bare 'ev' also matches 'ulev' and would cancel the ULEV
    # keyword this rule adds.
    condition_gasoline = (
        mentions('gasoline|turbo|v6|v8|i4|gdi|tfs|liter|ulev')
        & ~mentions(r'electric|\bev\b|battery')
    )
    df.loc[condition_gasoline, 'fuel_type'] = 'Gasoline'

    # Hybrid (handled after Gasoline)
    condition_hybrid = mentions('hybrid|mild electric|gasoline/electric') & not_labelled('Gasoline')
    df.loc[condition_hybrid, 'fuel_type'] = 'Hybrid'

    # Electric (never for ULEV engines)
    condition_electric = (
        mentions('electric|battery|dual motor|120 ah')
        & ~mentions('ulev')
        & not_labelled('Gasoline', 'Hybrid')
    )
    df.loc[condition_electric, 'fuel_type'] = 'Electric'

    # Flex Fuel (only where not Electric/Hybrid/Gasoline)
    condition_flex_fuel = mentions('flex fuel') & not_labelled('Electric', 'Hybrid', 'Gasoline')
    df.loc[condition_flex_fuel, 'fuel_type'] = 'E85 Flex Fuel'

    # Diesel (lowest priority)
    condition_diesel = mentions('diesel|tdi') & not_labelled(
        'Electric', 'Hybrid', 'E85 Flex Fuel', 'Gasoline')
    df.loc[condition_diesel, 'fuel_type'] = 'Diesel'

    return df

# Apply the engine-based labelling to the training data
train_df = train_df.pipe(label_fuel_type_from_engine)

# Show the result
train_df.loc[:, ['engine', 'fuel_type']].head()


In [None]:
# Rows currently labelled Hybrid
train_df.loc[train_df['fuel_type'].eq('Hybrid')]