In [2]:
import pandas as pd

train_df = pd.read_csv('/Users/hyunkoolee/google_mlb/kaggle/kaggle_playground-series-s4e9/data/playground-series-s4e9/train.csv')
test_df = pd.read_csv('/Users/hyunkoolee/google_mlb/kaggle/kaggle_playground-series-s4e9/data/playground-series-s4e9/test.csv')

Fuel Type 결측치 not supported 로 통일

In [3]:
# Function to clean fuel_type by replacing NaN and '-' values with 'not supported'
def clean_fuel_type(df):
    df['fuel_type'] = df['fuel_type'].fillna('not supported')
    df['fuel_type'] = df['fuel_type'].str.strip().replace(['-', '–'], 'not supported')
    return df

train_df = clean_fuel_type(train_df)
test_df = clean_fuel_type(test_df)

1. 가장 기본이 되는 엔진명에 연료 직접 포함여부 기준으로 분류 상황 체크

In [4]:
train_df_1 = train_df.copy()
test_df_1 = test_df.copy()

# Improved function to extract fuel type from engine
def extract_fuel_from_engine_1(df):
    extracted_fuel_types = []
    
    for engine in df['engine']:
        engine = engine.lower() if isinstance(engine, str) else ''
        if 'gasoline' in engine:
            extracted_fuel_types.append('Gasoline')
        elif 'diesel' in engine:
            extracted_fuel_types.append('Diesel')
        elif 'flex fuel' in engine:
            extracted_fuel_types.append('E85 Flex Fuel')
        elif 'electric' in engine:
            extracted_fuel_types.append('Electric')
        elif 'plug-in hybrid' in engine:
            extracted_fuel_types.append('Plug-In Hybrid')
        elif 'hybrid' in engine and 'plug-in' not in engine:
            extracted_fuel_types.append('Hybrid')
        else:
            extracted_fuel_types.append('not supported')
    
    df['fuel_type_extracted_1'] = extracted_fuel_types
    return df

# Update function to apply extracted fuel types only where necessary
# 초기 버전은 결측치에 대해서만 엔진으로부터 추출된 정보를 채워줌
def update_fuel_type(df, trial_index):
    condition = df['fuel_type'].isin(['not supported']) | df['fuel_type'].isna()
    # df.loc[condition, 'fuel_type'] = df.loc[condition, 'fuel_type_extracted']
    df[f'fuel_type_updated_{trial_index}'] = df['fuel_type'].where(~condition, df[f'fuel_type_extracted_{trial_index}'])
    return df

# Apply the cleaning, extraction, and update process
train_df_1 = clean_fuel_type(train_df_1)  # Clean fuel_type column
train_df_1 = extract_fuel_from_engine_1(train_df_1)  # Extract fuel types from engine
train_df_1 = update_fuel_type(train_df_1, 1)  # Update fuel_type based on extracted values

# 처리 후 결과 확인
print(train_df_1[['fuel_type', 'fuel_type_extracted_1', 'fuel_type_updated_1', 'engine']].head())

# fuel_type 고유값 카운트 확인
print(train_df_1['fuel_type'].value_counts())
print(train_df_1['fuel_type_extracted_1'].value_counts())
print(train_df_1['fuel_type_updated_1'].value_counts())


       fuel_type fuel_type_extracted_1 fuel_type_updated_1  \
0       Gasoline              Gasoline            Gasoline   
1       Gasoline              Gasoline            Gasoline   
2  E85 Flex Fuel         E85 Flex Fuel       E85 Flex Fuel   
3       Gasoline              Gasoline            Gasoline   
4       Gasoline              Gasoline            Gasoline   

                                              engine  
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel  
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel  
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...  
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel  
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel  
fuel_type
Gasoline          165940
Hybrid              6832
not supported       5879
E85 Flex Fuel       5406
Diesel              3955
Plug-In Hybrid       521
Name: count, dtype: int64
fuel_type_extracted_1
Gasoline         139734
not supported     31546
Electric           7566
E85 Flex Fuel    

결측치가 엔진정보로 보간된 케이스 확인

In [5]:
# Find Changed Cases
not_supported_updated_cases = train_df_1[(train_df_1['fuel_type'] != train_df_1['fuel_type_updated_1'])]
print(not_supported_updated_cases[['fuel_type', 'fuel_type_extracted_1', 'fuel_type_updated_1', 'engine']].sample(n=20))

            fuel_type fuel_type_extracted_1 fuel_type_updated_1  \
119164  not supported              Electric            Electric   
130680  not supported              Electric            Electric   
183289  not supported              Electric            Electric   
188056  not supported              Electric            Electric   
156776  not supported              Electric            Electric   
91545   not supported              Electric            Electric   
50029   not supported              Electric            Electric   
113764  not supported              Electric            Electric   
81038   not supported              Electric            Electric   
181102  not supported              Electric            Electric   
116286  not supported              Electric            Electric   
125658  not supported              Electric            Electric   
184003  not supported              Electric            Electric   
104146  not supported              Electric            Electri

결측치가 엔진 정보로 보간되지 않은 케이스 확인 (엔진 정보로부터 연료 추출 실패)

In [6]:
# Find not updated 'not supported' cases
not_supported_unchanged_cases = train_df_1[(train_df_1['fuel_type'] == 'not supported') & (train_df_1['fuel_type_updated_1'] == 'not supported')]
print(not_supported_unchanged_cases[['fuel_type', 'fuel_type_extracted_1', 'fuel_type_updated_1', 'engine']].sample(n=20))

            fuel_type fuel_type_extracted_1 fuel_type_updated_1  \
50580   not supported         not supported       not supported   
84109   not supported         not supported       not supported   
16637   not supported         not supported       not supported   
182335  not supported         not supported       not supported   
101476  not supported         not supported       not supported   
172321  not supported         not supported       not supported   
140691  not supported         not supported       not supported   
100648  not supported         not supported       not supported   
42146   not supported         not supported       not supported   
48422   not supported         not supported       not supported   
140517  not supported         not supported       not supported   
80485   not supported         not supported       not supported   
149204  not supported         not supported       not supported   
46732   not supported         not supported       not supporte

엔진 정보가 있음에도 불구하고 연료 추출에 실패한 경우 확인

In [7]:
# Find not updated but engine info exist cases
engine_exist_but_unchanged_cases = not_supported_unchanged_cases[not_supported_unchanged_cases['engine'] != '–']
print(engine_exist_but_unchanged_cases.shape)
print(engine_exist_but_unchanged_cases[['fuel_type', 'fuel_type_extracted_1', 'fuel_type_updated_1', 'engine']].sample(n=20))

(179, 15)
            fuel_type fuel_type_extracted_1 fuel_type_updated_1  \
112813  not supported         not supported       not supported   
73898   not supported         not supported       not supported   
116523  not supported         not supported       not supported   
25497   not supported         not supported       not supported   
171843  not supported         not supported       not supported   
124698  not supported         not supported       not supported   
158470  not supported         not supported       not supported   
176165  not supported         not supported       not supported   
181625  not supported         not supported       not supported   
140103  not supported         not supported       not supported   
43295   not supported         not supported       not supported   
105568  not supported         not supported       not supported   
33836   not supported         not supported       not supported   
54186   not supported         not supported       no

In [8]:
print(engine_exist_but_unchanged_cases['engine'].unique())
print(len(engine_exist_but_unchanged_cases['engine'].unique()))
print(engine_exist_but_unchanged_cases['engine'].value_counts())

['Dual Motor - Standard' '2.0 Liter TFSI' '1.6L I4 16V GDI DOHC Turbo'
 '2.0 Liter DOHC Turbo' '3.5L V6 24V PDI DOHC Twin Turbo'
 '8.0L W16 64V GDI DOHC Twin Turbo' '6.3L V12 48V GDI DOHC'
 '3.6L V6 24V MPFI DOHC' '5.3 Liter' '5.0 Liter' 'Standard Range Battery'
 '1.5L I3 12V PDI DOHC Turbo' '111.2Ah / FR 70kW / RR 160kW (697V)'
 '3.0L I6 24V GDI DOHC Turbo' '3.5L V6 24V GDI SOHC' '5.5 Liter'
 '2.0 Liter Turbo' '3.8L V6 24V MPFI DOHC Twin Turbo'
 '4.0 Liter Twin Turbo' '3.0L V6 24V GDI DOHC Turbo' 'V8'
 '5.0L V8 32V GDI DOHC Supercharged' '2.0L I4 16V GDI DOHC Turbo'
 '5.0L V8 32V PDI DOHC' '1.3L I3 12V GDI DOHC Turbo'
 '6.0L W12 48V PDI DOHC Twin Turbo' '3.0 Liter' '120 AH'
 '5.7L V8 16V MPFI OHV' '3.5 Liter' '5.6 Liter' '1.5 Liter Turbo'
 '3.0 Liter Turbo' '2.0 Liter' '4.0 Liter']
35
engine
Dual Motor - Standard                  92
Standard Range Battery                 27
111.2Ah / FR 70kW / RR 160kW (697V)     8
1.6L I4 16V GDI DOHC Turbo              6
8.0L W16 64V GDI DOHC Twin T

In [9]:
engine_exist_but_unchanged_cases[['fuel_type', 'fuel_type_extracted_1', 'fuel_type_updated_1', 'engine']].to_excel('data/temp/engine_exist_but_unchanged_cases.xlsx', index=False)

2. 기존 영석님 전처리 마지막 버전 확인

In [10]:
train_df_2 = train_df.copy()
test_df_2 = test_df.copy()


# engine 정보를 기반으로 fuel_type을 라벨링하는 함수 (ULEV 처리 추가)
def label_fuel_type_from_engine(df):
    # 먼저 모든 엔진 값을 소문자로 변환하여 처리
    df['engine_lower'] = df['engine'].str.lower()
    
    df['updated_fuel_type'] = 'No'
    df['fuel_type_orig'] = df['fuel_type']

    # Electric/Gas 관련 정보 라벨링 (명시적으로 Hybrid로 설정)
    condition_electric_gas = df['engine_lower'].str.contains('electric/gas')
    df.loc[condition_electric_gas, 'fuel_type'] = 'Hybrid'
    df.loc[condition_electric_gas, 'updated_fuel_type'] = 'Hybrid'

    # Gasoline 관련 정보 라벨링 (Electric 이전에 처리, 명확히 가솔린일 경우 Electric으로 분류되지 않게 함)
    condition_gasoline = df['engine_lower'].str.contains('gasoline|turbo|v6|v8|i4|gdi|tfs|liter|ulev') & ~df['engine_lower'].str.contains('electric|ev|battery')
    df.loc[condition_gasoline, 'fuel_type'] = 'Gasoline'
    df.loc[condition_gasoline, 'updated_fuel_type'] = 'Gasoline'

    # Hybrid 관련 정보 라벨링 (Gasoline 이후에 처리)
    # TODO: 왜 hybrid 관련 정보 라벨링을 Gasoline 이후에 해야될까?
    condition_hybrid = df['engine_lower'].str.contains('hybrid|mild electric|gasoline/electric') & (df['fuel_type'].isna() | (df['fuel_type'] != 'Gasoline'))
    df.loc[condition_hybrid, 'fuel_type'] = 'Hybrid'
    df.loc[condition_hybrid, 'updated_fuel_type'] = 'Hybrid'

    # Electric 관련 정보 라벨링 (ULEV를 제외한 경우에만 Electric으로 설정)
    condition_electric = df['engine_lower'].str.contains('electric|battery|dual motor|120 ah') & ~df['engine_lower'].str.contains('ulev') & (df['fuel_type'].isna() | (df['fuel_type'] != 'Gasoline') & (df['fuel_type'] != 'Hybrid'))
    df.loc[condition_electric, 'fuel_type'] = 'Electric'
    df.loc[condition_electric, 'updated_fuel_type'] = 'Electric'

    # Flex Fuel 관련 정보 라벨링 (Electric/Hybrid/Gasoline이 아닌 경우)
    condition_flex_fuel = df['engine_lower'].str.contains('flex fuel') & (df['fuel_type'].isna() | (df['fuel_type'] != 'Electric') & (df['fuel_type'] != 'Hybrid') & (df['fuel_type'] != 'Gasoline'))
    df.loc[condition_flex_fuel, 'fuel_type'] = 'E85 Flex Fuel'
    df.loc[condition_flex_fuel, 'updated_fuel_type'] = 'E85 Flex Fuel'

    # Diesel 관련 정보 라벨링 (가장 낮은 우선순위)
    condition_diesel = df['engine_lower'].str.contains('diesel|tdi') & (df['fuel_type'].isna() | (df['fuel_type'] != 'Electric') & (df['fuel_type'] != 'Hybrid') & (df['fuel_type'] != 'E85 Flex Fuel') & (df['fuel_type'] != 'Gasoline'))
    df.loc[condition_diesel, 'fuel_type'] = 'Diesel'
    df.loc[condition_diesel, 'updated_fuel_type'] = 'Diesel'

    # 'engine_lower' 컬럼은 불필요하므로 제거
    df.drop(columns=['engine_lower'], inplace=True)

    return df

# 데이터에 라벨링 함수 적용
train_df_2 = label_fuel_type_from_engine(train_df_2)

# 결과 출력
train_df_2[['engine', 'fuel_type']].head()
# fuel_type 고유값 카운트 확인
print(train_df_2['fuel_type'].value_counts())
print(train_df_2['updated_fuel_type'].value_counts())


fuel_type
Gasoline         170229
Hybrid             5449
Electric           4988
E85 Flex Fuel      4215
Diesel             2872
not supported       780
Name: count, dtype: int64
updated_fuel_type
Gasoline         168511
Hybrid             5400
Electric           4988
E85 Flex Fuel      4210
Diesel             2867
No                 2557
Name: count, dtype: int64


업데이트 된 케이스 확인

In [11]:
updated_cases = train_df_2[(train_df_2['fuel_type_orig'] != train_df_2['fuel_type'])]
print(updated_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].sample(n=20))

        fuel_type_orig fuel_type updated_fuel_type  \
115093          Hybrid  Gasoline          Gasoline   
101649          Hybrid  Gasoline          Gasoline   
182855   not supported  Electric          Electric   
96877    not supported  Electric          Electric   
171901          Hybrid  Gasoline          Gasoline   
60465   Plug-In Hybrid    Hybrid            Hybrid   
76044    E85 Flex Fuel  Gasoline          Gasoline   
61441    not supported  Electric          Electric   
67073    E85 Flex Fuel  Gasoline          Gasoline   
101245  Plug-In Hybrid  Gasoline          Gasoline   
57392   Plug-In Hybrid    Hybrid            Hybrid   
96148           Hybrid  Gasoline          Gasoline   
134373   not supported  Electric          Electric   
81710    not supported  Electric          Electric   
46282   Plug-In Hybrid    Hybrid            Hybrid   
152854   not supported  Electric          Electric   
6079     not supported  Electric          Electric   
41804    not supported  Elec

In [12]:
updated_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/updated_cases.xlsx', index=False)

결측치가 엔진 정보로 보간되지 않은 케이스 확인 (엔진 정보로부터 연료 추출 실패)

In [13]:
# Find not updated 'not supported' cases
not_supported_unchanged_cases = train_df_2[(train_df_2['fuel_type'] == 'not supported') & (train_df_2['updated_fuel_type'] == 'No')]
print(not_supported_unchanged_cases[['fuel_type', 'updated_fuel_type', 'engine']].sample(n=20))

            fuel_type updated_fuel_type engine
100648  not supported                No      –
15739   not supported                No      –
55604   not supported                No      –
77832   not supported                No      –
57038   not supported                No      –
141363  not supported                No      –
63590   not supported                No      –
102463  not supported                No      –
64083   not supported                No      –
187399  not supported                No      –
113867  not supported                No      –
57585   not supported                No      –
184845  not supported                No      –
176096  not supported                No      –
151965  not supported                No      –
32758   not supported                No      –
133985  not supported                No      –
36726   not supported                No      –
78649   not supported                No      –
58651   not supported                No      –


엔진 정보가 있음에도 불구하고 연료 추출에 실패한 경우 확인

In [14]:
# Find not updated but engine info exist cases
engine_exist_but_unchanged_cases = not_supported_unchanged_cases[not_supported_unchanged_cases['engine'] != '–']
print(engine_exist_but_unchanged_cases.shape)
print(engine_exist_but_unchanged_cases[['fuel_type', 'updated_fuel_type', 'engine']])

(8, 15)
            fuel_type updated_fuel_type                               engine
30172   not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
42322   not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
85526   not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
100226  not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
101611  not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
124698  not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
131824  not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)
176787  not supported                No  111.2Ah / FR 70kW / RR 160kW (697V)


Mild Electric Cases 유효성 확인

In [15]:
mild_electric_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('mild electric')]
print(mild_electric_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].sample(n=20))
mild_electric_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/mild_electric_cases.xlsx', index=False)

       fuel_type_orig fuel_type updated_fuel_type  \
97255        Gasoline  Gasoline                No   
106815         Hybrid    Hybrid            Hybrid   
113993         Hybrid    Hybrid            Hybrid   
81768          Hybrid    Hybrid            Hybrid   
30192          Hybrid    Hybrid            Hybrid   
166782         Hybrid    Hybrid            Hybrid   
175626         Hybrid    Hybrid            Hybrid   
64593          Hybrid    Hybrid            Hybrid   
114744         Hybrid    Hybrid            Hybrid   
28606          Hybrid    Hybrid            Hybrid   
66395          Hybrid    Hybrid            Hybrid   
39026          Hybrid    Hybrid            Hybrid   
159817         Hybrid    Hybrid            Hybrid   
101518         Hybrid    Hybrid            Hybrid   
27270          Hybrid    Hybrid            Hybrid   
9131           Hybrid    Hybrid            Hybrid   
1796           Hybrid    Hybrid            Hybrid   
158508         Hybrid    Hybrid            Hyb

In [16]:
mild_electric_not_hybrid_cases = mild_electric_cases[~mild_electric_cases['engine'].str.lower().str.contains('hybrid')]
print(mild_electric_not_hybrid_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
# Mild Electric Cases 는 고려하지 않아도 된다

Empty DataFrame
Columns: [fuel_type_orig, fuel_type, updated_fuel_type, engine]
Index: []


Plug-in Hybrid cases 확인

In [17]:
plug_in_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('plug-in')]
print(plug_in_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].sample(n=20))
plug_in_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/plug_in_cases.xlsx', index=False)

        fuel_type_orig fuel_type updated_fuel_type  \
170866  Plug-In Hybrid    Hybrid            Hybrid   
92687   Plug-In Hybrid    Hybrid            Hybrid   
183142  Plug-In Hybrid    Hybrid            Hybrid   
129887  Plug-In Hybrid    Hybrid            Hybrid   
61205   Plug-In Hybrid    Hybrid            Hybrid   
30674   Plug-In Hybrid    Hybrid            Hybrid   
21485   Plug-In Hybrid    Hybrid            Hybrid   
110253  Plug-In Hybrid    Hybrid            Hybrid   
23973   Plug-In Hybrid    Hybrid            Hybrid   
105987  Plug-In Hybrid    Hybrid            Hybrid   
135323  Plug-In Hybrid    Hybrid            Hybrid   
130192  Plug-In Hybrid    Hybrid            Hybrid   
63659   Plug-In Hybrid    Hybrid            Hybrid   
64809           Hybrid    Hybrid            Hybrid   
49655   Plug-In Hybrid    Hybrid            Hybrid   
92435   Plug-In Hybrid    Hybrid            Hybrid   
177193  Plug-In Hybrid    Hybrid            Hybrid   
166409  Plug-In Hybrid    Hy

In [18]:
plug_in_without_name_mention = train_df_2[(train_df_2['fuel_type'] == 'Plug-In Hybrid') & ~(train_df_2['engine'].str.lower().str.contains('plug-in'))]
print(plug_in_without_name_mention[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
plug_in_without_name_mention[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/plug_in__without_mention_cases.xlsx', index=False)

Empty DataFrame
Columns: [fuel_type_orig, fuel_type, updated_fuel_type, engine]
Index: []


- Plug-In Hybrid 들은 모두 Engine 에 Plug-in 이 들어가야 한다

Motor Cases 다 전기찬지 확인

In [19]:
motor_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('motor')]
print(motor_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].sample(n=20))
motor_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/motor_cases.xlsx', index=False)
# Motor 는 다 electric 이 맞는듯

       fuel_type_orig fuel_type updated_fuel_type  \
80736   not supported  Electric          Electric   
87552   not supported  Electric          Electric   
74788   not supported  Electric          Electric   
18653   not supported  Electric          Electric   
38706   not supported  Electric          Electric   
129955  not supported  Electric          Electric   
161188       Gasoline  Gasoline                No   
148663       Gasoline  Gasoline                No   
125794  not supported  Electric          Electric   
178002  not supported  Electric          Electric   
102090  not supported  Electric          Electric   
15292        Gasoline  Gasoline                No   
76320   not supported  Electric          Electric   
7955    not supported  Electric          Electric   
16538   not supported  Electric          Electric   
77380   not supported  Electric          Electric   
126665       Gasoline  Gasoline                No   
11869   not supported  Electric          Elect

Ah case 중에서 암페어가 아니라 그냥 엔진 이름에 들어간 케이스 확인

In [20]:
ah_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('ah')]
print(ah_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
ah_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/ah_cases.xlsx', index=False)

      fuel_type_orig      fuel_type updated_fuel_type  \
12257       Gasoline       Gasoline                No   
30172  not supported  not supported                No   
42322  not supported  not supported                No   
85526  not supported  not supported                No   
98197  not supported       Electric          Electric   

                                    engine  
12257                               120 AH  
30172  111.2Ah / FR 70kW / RR 160kW (697V)  
42322  111.2Ah / FR 70kW / RR 160kW (697V)  
85526  111.2Ah / FR 70kW / RR 160kW (697V)  
98197                               120 AH  


Ev Cases 확인

In [21]:
ev_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('ev')]
print(ev_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
ev_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/ev_cases.xlsx', index=False)

      fuel_type_orig fuel_type updated_fuel_type             engine
5728        Gasoline  Gasoline                No  3.0 Liter SC ULEV
18782       Gasoline  Gasoline                No  3.0 Liter SC ULEV
35262         Hybrid    Hybrid                No  3.0 Liter SC ULEV
36276       Gasoline  Gasoline                No  3.0 Liter SC ULEV
39844         Hybrid    Hybrid                No  3.0 Liter SC ULEV


- 따로 엔진명에 ev가 들어간 전기차 케이스는 없다. Liter 케이스나 ULEV 케이스로 다 처리하면 될것으로 보인다

Liter Cases 확인

In [22]:
liter_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('liter')]
print(liter_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
liter_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/liter_cases.xlsx', index=False)

    fuel_type_orig fuel_type updated_fuel_type                engine
122       Gasoline  Gasoline          Gasoline  2.0 Liter DOHC Turbo
164       Gasoline  Gasoline          Gasoline  2.0 Liter DOHC Turbo
212       Gasoline  Gasoline          Gasoline             2.0 Liter
237       Gasoline  Gasoline          Gasoline       2.0 Liter Turbo
256       Gasoline  Gasoline          Gasoline       3.0 Liter Turbo


In [23]:
liter_diesel_cases = liter_cases[liter_cases['fuel_type_orig'].str.lower().str.contains('diesel')]
print(liter_diesel_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
liter_diesel_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/liter_diesel_cases.xlsx', index=False)

      fuel_type_orig fuel_type updated_fuel_type     engine
2127          Diesel  Gasoline          Gasoline  3.0 Liter
6299          Diesel  Gasoline          Gasoline  3.0 Liter
11157         Diesel  Gasoline          Gasoline  2.0 Liter
12264         Diesel  Gasoline          Gasoline  3.0 Liter
22130         Diesel  Gasoline          Gasoline  3.0 Liter


- Liter 는 Gasoline, hybrid, plug-in hybrid, not supported 중 하나. 대부분 gasoline 인것으로 보아 합성되는 과정에서 생긴 에러일수도..? 다 Gasoline 으로 처리하거나 처리하지 않고 두는게 좋을듯

tdi cases 확인

In [24]:
tdi_cases = train_df_2[train_df_2['engine'].str.lower().str.contains('tdi')]
print(tdi_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].head())
tdi_cases[['fuel_type_orig', 'fuel_type', 'updated_fuel_type', 'engine']].to_excel('data/temp/tdi_cases.xlsx', index=False)

      fuel_type_orig fuel_type updated_fuel_type          engine
2798        Gasoline  Gasoline          Gasoline  3.0 Liter GTDI
24593         Hybrid  Gasoline          Gasoline  3.0 Liter GTDI
27428       Gasoline  Gasoline          Gasoline  3.5 Liter GTDI
28129       Gasoline  Gasoline          Gasoline  3.5 Liter GTDI
30264       Gasoline  Gasoline          Gasoline  3.5 Liter GTDI


- 다 Gasoline. 디젤로 바꾸지 않는게 좋을듯

개선사항 적용

In [25]:
train_df_3 = train_df.copy()
test_df_3 = test_df.copy()


def extract_full_type_directly(df):
    extracted_fuel_types = []
    
    fuel_type_classes = ['Gasoline', 'Diesel', 'Flex Fuel', 'Electric', 'Hybrid', 'Plug-In']
    fuel_type_classes_lower = [fuel_type.lower() for fuel_type in fuel_type_classes]

    # 정확하게 매칭되는게 하나 있을때만 분류
    for engine_name in df['engine_lower']:
        engine_name = engine_name.lower() if isinstance(engine_name, str) else ''

        fuel_type_class_matching_count = sum(fuel_type in engine_name for fuel_type in fuel_type_classes_lower)
        
        if fuel_type_class_matching_count == 1:
            if 'gasoline' in engine_name:
                extracted_fuel_types.append('Gasoline')
            elif 'diesel' in engine_name:
                extracted_fuel_types.append('Diesel')
            elif 'flex fuel' in engine_name:
                extracted_fuel_types.append('E85 Flex Fuel')
            elif 'electric' in engine_name:
                extracted_fuel_types.append('Electric')
            elif 'hybrid' in engine_name:
                extracted_fuel_types.append('Hybrid')
            else:
                extracted_fuel_types.append('not supported')
        else:
            if 'plug-in' in engine_name:
                extracted_fuel_types.append('Plug-In Hybrid')
            else:
                extracted_fuel_types.append('not supported')

    df['direct_extract_fuel_type'] = extracted_fuel_types
    return df

# Todo: 2개 이상의 fuel_type 이 매칭되면 hybrid 처리..?

def extract_complex_fuel_type_pre(df):    
    # Gasoline
    # Removed Patterns: turbo|v6|i4|tfs (uncertain)
    condition_gasoline_1 = df['engine_lower'].str.contains('liter|ulev|v8|gdi|mpfi') # v8 is checked by gpt
    condition_gasoline_2 = df['engine_lower'].str.contains('dohc') & df['engine_lower'].str.contains('turbo') # DOHC Turbo is common gasoline cases (by gpt)
    df.loc[condition_gasoline_1 | condition_gasoline_2, 'complex_extract_fuel_type'] = 'Gasoline'
    
    # E85 Flex Fuel
    condition_flex_fuel = df['engine_lower'].str.contains('flex') & df['engine_lower'].str.contains('fuel')
    df.loc[condition_flex_fuel, 'complex_extract_fuel_type'] = 'E85 Flex Fuel'
    
    # Hybrid
    # Removed Patterns: mild electric (all cases have hybrid in engine name)
    raw_fuels = ['Gasoline', 'Diesel', 'Electric', 'Gas']
    raw_fuels_lower = [fuel.lower() for fuel in raw_fuels]
    def count_raw_fuels(engine_desc):
        return sum(fuel in engine_desc for fuel in raw_fuels_lower)
    
    # Apply the function row by row and store the count in a new column 'fuel_count'
    # df['fuel_count'] = df['engine_lower'].apply(count_raw_fuels)

    
    condition_hybrid = df['engine_lower'].apply(lambda x: (count_raw_fuels(x) > 1) and 'hybrid' in x and 'plug-in' not in x) # 2개 이상의 raw fuel 이 포함되있으며 plug-in 이 없으면 hybrid
    df.loc[condition_hybrid, 'complex_extract_fuel_type'] = 'Hybrid'
    
    # Disel
    # Removed Patterns: tdi (most of case was gasoline)
    # Skipped since there was no special unique pattern
    
    # Plug-In Hybrid
    # Skipped since there was no special unique pattern
    
    # Electric
    condition_electric = df['engine_lower'].str.contains('battery|motor|ah')
    df.loc[condition_electric, 'complex_extract_fuel_type'] = 'Electric'

    return df


# engine 정보를 기반으로 fuel_type을 라벨링하는 함수 (ULEV 처리 추가)
def label_fuel_type_from_engine(df):
    # 먼저 모든 엔진 값을 소문자로 변환하여 처리
    df['engine_lower'] = df['engine'].str.lower()
    df['fuel_type_orig'] = df['fuel_type']

    df['direct_extract_fuel_type'] = 'not supported'  # 엔진명에서 연료 이름으로 직접 추출한 연료 타입
    df = extract_full_type_directly(df)
    
    df['complex_extract_fuel_type'] = 'not supported'  # 복합적인 방법으로 추출한 연료 타입
    df = extract_complex_fuel_type_pre(df)

    # fuel_type 업데이트
    df['fuel_type'] = df['fuel_type'].where(df['direct_extract_fuel_type'] == 'not supported', df['direct_extract_fuel_type'])
    df['fuel_type'] = df['fuel_type'].where((df['direct_extract_fuel_type'] != 'not supported') | (df['complex_extract_fuel_type'] == 'not supported'), df['complex_extract_fuel_type'])

    # 'engine_lower' 컬럼은 불필요하므로 제거
    # df.drop(columns=['engine_lower'], inplace=True)
    # df.drop(conlumns=['fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type'], inplace=True)

    return df

# 데이터에 라벨링 함수 적용
train_df_3 = label_fuel_type_from_engine(train_df_3)

# 결과 출력
train_df_3[['engine', 'fuel_type']].head()
# fuel_type 고유값 카운트 확인
print(train_df_3['fuel_type'].value_counts())


fuel_type
Gasoline          166440
Electric            5639
Hybrid              5546
E85 Flex Fuel       5476
Diesel              4123
not supported        772
Plug-In Hybrid       537
Name: count, dtype: int64


결측치 중 엔진 정보가 있는 항목중 보간되지 않은 항목들

In [26]:
not_supported_cases = train_df_3[(train_df_3['fuel_type'] == 'not supported') & (train_df_3['engine'] != '–')]
print(not_supported_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine']].head())
# not_supported_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine']].to_excel('data/temp/improved_not_supported_cases.xlsx', index=False)
not_supported_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine']].to_excel('data/temp/improved_not_supported_cases.xlsx')

Empty DataFrame
Columns: [fuel_type, fuel_type_orig, direct_extract_fuel_type, complex_extract_fuel_type, engine]
Index: []


업데이트 된 항목들

In [27]:
updated_cases = train_df_3[(train_df_3['fuel_type_orig'] != train_df_2['fuel_type'])]
print(updated_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine']].sample(n=20))
updated_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine']].to_excel('data/temp/improved_updated_cases.xlsx', index=False)

             fuel_type  fuel_type_orig direct_extract_fuel_type  \
68465           Diesel          Diesel                   Diesel   
150479        Electric   not supported                 Electric   
174916        Electric   not supported                 Electric   
128633        Electric   not supported                 Electric   
109763          Diesel          Diesel                   Diesel   
70280         Gasoline          Hybrid            not supported   
164311          Hybrid          Hybrid                   Hybrid   
165914        Electric   not supported                 Electric   
88231           Diesel          Diesel                   Diesel   
11589         Electric   not supported                 Electric   
147381  Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
64993    E85 Flex Fuel   E85 Flex Fuel            not supported   
180121          Hybrid          Hybrid                   Hybrid   
87828         Electric   not supported                 Electri

Plug-in Cases 에러 핸들링

In [28]:
plug_in_cases = updated_cases[updated_cases['fuel_type_orig'] == 'Plug-In Hybrid']
print(plug_in_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
plug_in_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_plug_in_cases.xlsx', index=False)

             fuel_type  fuel_type_orig direct_extract_fuel_type  \
148558  Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
87626   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
8889    Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
14964   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
41152   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
16514   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
44165   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
157752  Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
52244   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
88784   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
54016   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
137063  Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
20471         Electric  Plug-In Hybrid                 Electric   
7133    Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybri

Flex Fuel 에러 핸들링

In [29]:
# E85 Flex Fuel
flex_fuel_cases = updated_cases[updated_cases['fuel_type_orig'] == 'E85 Flex Fuel']
print(flex_fuel_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
flex_fuel_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_flex_fuel_cases.xlsx', index=False)

            fuel_type fuel_type_orig direct_extract_fuel_type  \
121096  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
168629  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
129525  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
49182   E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
19911   E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
144852  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
3828    E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
36798   E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
85911   E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
160120  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
123890  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
107713  E85 Flex Fuel  E85 Flex Fuel            not supported   
67397   E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
187424  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
91690   E85 Flex Fuel  E8

In [30]:
mismatch_flex_fuel_cases = updated_cases[(updated_cases['fuel_type_orig'] == 'E85 Flex Fuel') & (updated_cases['fuel_type'] != 'E85 Flex Fuel')]
print(mismatch_flex_fuel_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
mismatch_flex_fuel_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_mismatch_flex_fuel_cases.xlsx', index=False)

       fuel_type fuel_type_orig direct_extract_fuel_type  \
148233  Gasoline  E85 Flex Fuel            not supported   
37136   Gasoline  E85 Flex Fuel            not supported   
34527   Gasoline  E85 Flex Fuel            not supported   
84118   Electric  E85 Flex Fuel                 Electric   
168158  Gasoline  E85 Flex Fuel            not supported   
145114  Gasoline  E85 Flex Fuel            not supported   
60643   Gasoline  E85 Flex Fuel            not supported   
131900  Gasoline  E85 Flex Fuel            not supported   
131907  Gasoline  E85 Flex Fuel            not supported   
163402  Gasoline  E85 Flex Fuel            not supported   
73272   Gasoline  E85 Flex Fuel            not supported   
71992   Gasoline  E85 Flex Fuel            not supported   
89541   Gasoline  E85 Flex Fuel            not supported   
32390   Gasoline  E85 Flex Fuel            not supported   
118948  Gasoline  E85 Flex Fuel            not supported   
148662  Gasoline  E85 Flex Fuel         

- 해당 사항들은 맞게 픽스된 케이스로 보임

Not supported -> Electric 케이스 확인

In [31]:
orig_not_supported_cases = train_df_3[train_df_3['fuel_type_orig'] == 'not supported']
orig_not_supported_to_electric_cases = orig_not_supported_cases[orig_not_supported_cases['fuel_type'].str.lower() == 'electric']
orig_not_supported_to_not_electric_cases = orig_not_supported_cases[orig_not_supported_cases['fuel_type'].str.lower() != 'electric']

print(len(orig_not_supported_to_electric_cases))
print(len(orig_not_supported_to_not_electric_cases))


4988
891


상당수가 electric 인것을 봐서 만약 결측치를 모두 not supported 으로 분류했을 경우, 일종의 electric class 를 만든것과 비슷한 효과를 줬을수도 있을듯

# 영석님 추가 아이디어 내용 확인

TFSI 케이스 확인

In [32]:
tfsi_cases = train_df_3[train_df_3['engine'].str.lower().str.contains('tfsi')]
print(tfsi_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
tfsi_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_tfsi_cases.xlsx', index=False)

       fuel_type fuel_type_orig direct_extract_fuel_type  \
67891   Gasoline         Hybrid            not supported   
100536  Gasoline         Hybrid            not supported   
104171  Gasoline       Gasoline            not supported   
146161  Gasoline         Hybrid            not supported   
17519   Gasoline       Gasoline            not supported   
30786   Gasoline       Gasoline            not supported   
163574  Gasoline       Gasoline            not supported   
150863  Gasoline       Gasoline            not supported   
25200   Gasoline         Hybrid            not supported   
138263  Gasoline       Gasoline            not supported   
169101  Gasoline         Hybrid            not supported   
181585  Gasoline         Hybrid            not supported   
166242  Gasoline         Hybrid            not supported   
34034   Gasoline         Hybrid            not supported   
184788  Gasoline         Hybrid            not supported   
42236   Gasoline         Diesel         

브랜드 확인

In [33]:
brand_tesla_cases = train_df_3[train_df_3['brand'].str.lower().str.contains('tesla')]
print(brand_tesla_cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
brand_tesla_cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_brand_tesla_cases.xlsx', index=False)

        brand fuel_type fuel_type_orig direct_extract_fuel_type  \
22356   Tesla  Electric  not supported                 Electric   
155555  Tesla  Electric  not supported                 Electric   
48756   Tesla  Electric  not supported                 Electric   
149504  Tesla  Electric  not supported                 Electric   
164998  Tesla  Electric  not supported                 Electric   
89544   Tesla  Electric  not supported                 Electric   
79461   Tesla  Electric  not supported                 Electric   
41336   Tesla  Electric  not supported                 Electric   
110577  Tesla  Electric       Gasoline                 Electric   
146582  Tesla  Electric  not supported                 Electric   
172408  Tesla  Electric  not supported                 Electric   
180168  Tesla  Electric  not supported            not supported   
173362  Tesla  Electric  not supported                 Electric   
125650  Tesla  Electric  not supported                 Electri

In [34]:
brand_tesla_no_electric_Cases = brand_tesla_cases[brand_tesla_cases['fuel_type'].str.lower() != 'electric']
print(brand_tesla_no_electric_Cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
brand_tesla_no_electric_Cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_brand_tesla_no_electric_Cases.xlsx', index=False)

        brand      fuel_type fuel_type_orig direct_extract_fuel_type  \
45350   Tesla       Gasoline       Gasoline                 Gasoline   
170260  Tesla       Gasoline       Gasoline                 Gasoline   
162457  Tesla       Gasoline       Gasoline                 Gasoline   
45144   Tesla       Gasoline       Gasoline            not supported   
116523  Tesla       Gasoline  not supported            not supported   
36846   Tesla       Gasoline       Gasoline                 Gasoline   
34817   Tesla  E85 Flex Fuel  E85 Flex Fuel            E85 Flex Fuel   
152891  Tesla  not supported  not supported            not supported   
119617  Tesla       Gasoline  not supported                 Gasoline   
161518  Tesla  not supported  not supported            not supported   
3416    Tesla       Gasoline       Gasoline                 Gasoline   
64397   Tesla       Gasoline       Gasoline            not supported   
115752  Tesla       Gasoline         Hybrid            not suppo

In [35]:
brand_rivian_cases = train_df_3[train_df_3['brand'].str.lower().str.contains('rivian')]
brand_rivian_no_electric_Cases = brand_rivian_cases[brand_rivian_cases['fuel_type'].str.lower() != 'electric']
print(brand_rivian_no_electric_Cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].sample(n=20))
brand_rivian_no_electric_Cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_brand_rivian_no_electric_Cases.xlsx', index=False)

         brand      fuel_type fuel_type_orig direct_extract_fuel_type  \
72413   Rivian       Gasoline       Gasoline                 Gasoline   
72294   Rivian       Gasoline  not supported            not supported   
28220   Rivian       Gasoline       Gasoline                 Gasoline   
26407   Rivian       Gasoline       Gasoline            not supported   
118459  Rivian       Gasoline       Gasoline            not supported   
144984  Rivian       Gasoline       Gasoline                 Gasoline   
11216   Rivian       Gasoline       Gasoline                 Gasoline   
33720   Rivian       Gasoline       Gasoline            not supported   
22017   Rivian       Gasoline       Gasoline                 Gasoline   
12389   Rivian  not supported  not supported            not supported   
132315  Rivian       Gasoline  not supported            not supported   
56691   Rivian       Gasoline  not supported            not supported   
137430  Rivian       Gasoline       Gasoline       

In [37]:
brand_lucid_cases = train_df_3[train_df_3['brand'].str.lower().str.contains('lucid')]
brand_lucid_no_electric_Cases = brand_lucid_cases[brand_lucid_cases['fuel_type'].str.lower() != 'electric']
print(brand_lucid_no_electric_Cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].head())
brand_lucid_no_electric_Cases[['brand', 'fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_brand_lucid_no_electric_Cases.xlsx', index=False)

       brand fuel_type fuel_type_orig direct_extract_fuel_type  \
2256   Lucid  Gasoline       Gasoline                 Gasoline   
13875  Lucid  Gasoline       Gasoline            not supported   
43281  Lucid  Gasoline       Gasoline            not supported   
48359  Lucid  Gasoline       Gasoline            not supported   
58545  Lucid  Gasoline       Gasoline                 Gasoline   

      complex_extract_fuel_type  \
2256              not supported   
13875                  Gasoline   
43281                  Gasoline   
48359                  Gasoline   
58545             not supported   

                                              engine  \
2256   262.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
13875                          5.2L V10 40V GDI DOHC   
43281                            5.3L V8 16V GDI OHV   
48359               5.2L V12 48V GDI DOHC Twin Turbo   
58545   536.0HP 5.5L 8 Cylinder Engine Gasoline Fuel   

                                        engine_lower  


Hydrogen Cases 확인

In [38]:
hydrogen_cases = train_df_3[(train_df_3['engine'].str.lower().str.contains('hydrogen')) | (train_df_3['engine'].str.lower().str.contains('fuel cell'))]
print(hydrogen_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].head())
hydrogen_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_hydrogen_cases.xlsx', index=False)

      fuel_type fuel_type_orig direct_extract_fuel_type  \
18025  Electric       Gasoline                 Electric   
19182  Electric  not supported                 Electric   
23138  Electric  not supported                 Electric   
24737  Electric         Hybrid                 Electric   
33030  Electric  not supported                 Electric   

      complex_extract_fuel_type                                engine  \
18025                  Electric  151.0HP Electric Motor Hydrogen Fuel   
19182                  Electric  182.0HP Electric Motor Hydrogen Fuel   
23138                  Electric  182.0HP Electric Motor Hydrogen Fuel   
24737                  Electric  151.0HP Electric Motor Hydrogen Fuel   
33030                  Electric  182.0HP Electric Motor Hydrogen Fuel   

                               engine_lower  
18025  151.0hp electric motor hydrogen fuel  
19182  182.0hp electric motor hydrogen fuel  
23138  182.0hp electric motor hydrogen fuel  
24737  151.0hp electri

Plug-In Hybrid Cases 확인

In [39]:
plug_in_hybrid_cases = train_df_3[train_df_3['fuel_type'].str.lower().str.contains('plug-in')]
print(plug_in_hybrid_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].head())
plug_in_hybrid_cases[['fuel_type', 'fuel_type_orig', 'direct_extract_fuel_type', 'complex_extract_fuel_type', 'engine', 'engine_lower']].to_excel('data/temp/improved_plug_in_hybrid_cases.xlsx', index=False)

           fuel_type  fuel_type_orig direct_extract_fuel_type  \
116   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
213   Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
1605  Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
2825  Plug-In Hybrid  Plug-In Hybrid           Plug-In Hybrid   
2834  Plug-In Hybrid          Hybrid           Plug-In Hybrid   

     complex_extract_fuel_type  \
116              not supported   
213              not supported   
1605             not supported   
2825             not supported   
2834             not supported   

                                                 engine  \
116   389.0HP 3.0L Straight 6 Cylinder Engine Plug-I...   
213   389.0HP 3.0L Straight 6 Cylinder Engine Plug-I...   
1605  369.0HP 1.5L 3 Cylinder Engine Plug-In Electri...   
2825  416.0HP 3.0L V6 Cylinder Engine Plug-In Electr...   
2834  416.0HP 3.0L V6 Cylinder Engine Plug-In Electr...   

                                           engine