In [2]:
# Install if needed (run in cmd if errors: pip install pandas scikit-learn joblib matplotlib)
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import matplotlib.pyplot as plt

# Quick check: List files in data folder to confirm CSV name
print("Files in 'data':", os.listdir('data'))
print("Current folder:", os.getcwd())

Files in 'data': ['.ipynb_checkpoints', 'ElectricCarData_Clean.csv']
Current folder: D:\Btech\Ai integrated\EV-Green-Chatbot


In [3]:
# Load CSV - Using EXACT name from your output
csv_name = 'ElectricCarData_Clean.csv'  # This matches your file
df = pd.read_csv(f'data/{csv_name}')

# Basic info
print("Dataset size (rows x columns):", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())
print("\nMissing values per column:")
print(df.isnull().sum())

Dataset size (rows x columns): (103, 14)

First 5 rows:
         Brand                          Model  AccelSec  TopSpeed_KmH  \
0       Tesla   Model 3 Long Range Dual Motor       4.6           233   
1  Volkswagen                       ID.3 Pure      10.0           160   
2    Polestar                               2       4.7           210   
3         BMW                            iX3        6.8           180   
4       Honda                              e        9.5           145   

   Range_Km  Efficiency_WhKm FastCharge_KmH RapidCharge PowerTrain  \
0       450              161            940         Yes        AWD   
1       270              167            250         Yes        RWD   
2       400              181            620         Yes        AWD   
3       360              206            560         Yes        RWD   
4       170              168            190         Yes        RWD   

     PlugType  BodyStyle Segment  Seats  PriceEuro  
0  Type 2 CCS      Sedan       

In [4]:
# Clean & Engineer Features (handles categoricals/numerics)
# Drop any unnamed (none here, but safe)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Price to numeric (already good, but ensure)
df['PriceEuro'] = pd.to_numeric(df['PriceEuro'], errors='coerce')
df = df.dropna(subset=['PriceEuro'])  # No change since clean

# Engineer Green Feature: Efficiency to km/kWh (higher = greener; range / efficiency = score)
df['Efficiency_KmKWh'] = 1000 / df['Efficiency_WhKm']  # e.g., 161 Wh/km -> ~6.21 km/kWh

# Select Features (inputs) & Target (price)
numerical_cols = ['AccelSec', 'TopSpeed_KmH', 'Range_Km', 'Efficiency_WhKm', 'Seats', 'FastCharge_KmH']  # Add fast charge
categorical_cols = ['Brand', 'PowerTrain', 'BodyStyle', 'Segment']  # Brands, etc. (drop 'Model'/'PlugType' for simplicity)

# Verify columns exist (prints if missing)
missing = [col for col in numerical_cols + categorical_cols if col not in df.columns]
if missing:
    print("Warning: Missing columns:", missing)
    # Auto-remove them
    numerical_cols = [col for col in numerical_cols if col in df.columns]
    categorical_cols = [col for col in categorical_cols if col in df.columns]

X = df[numerical_cols + categorical_cols]  # Features
y = df['PriceEuro']  # Target

print("Preprocessed Features Shape:", X.shape)
print("\nSample Features (X):")
print(X.head())
print("\nTarget Sample (Prices):")
print(y.head())
print("\nGreen Example: Efficiency km/kWh for first 5:")
print(df['Efficiency_KmKWh'].head())

Preprocessed Features Shape: (103, 10)

Sample Features (X):
   AccelSec  TopSpeed_KmH  Range_Km  Efficiency_WhKm  Seats FastCharge_KmH  \
0       4.6           233       450              161      5            940   
1      10.0           160       270              167      5            250   
2       4.7           210       400              181      5            620   
3       6.8           180       360              206      5            560   
4       9.5           145       170              168      4            190   

         Brand PowerTrain  BodyStyle Segment  
0       Tesla         AWD      Sedan       D  
1  Volkswagen         RWD  Hatchback       C  
2    Polestar         AWD   Liftback       D  
3         BMW         RWD        SUV       D  
4       Honda         RWD  Hatchback       B  

Target Sample (Prices):
0    55480
1    30000
2    56440
3    68040
4    32997
Name: PriceEuro, dtype: int64

Green Example: Efficiency km/kWh for first 5:
0    6.211180
1    5.988024
2  

In [6]:
import numpy as np  # Add this if not imported

# Clean & Engineer Features
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Key Fix: Replace '-' with NaN in numerical columns, then to float
for col in numerical_cols:  # Use our list
    if col in df.columns:
        df[col] = df[col].replace('-', np.nan).astype(float)

df['PriceEuro'] = pd.to_numeric(df['PriceEuro'], errors='coerce')

# Engineer Green Feature
if 'Efficiency_WhKm' in df.columns:
    df['Efficiency_KmKWh'] = 1000 / df['Efficiency_WhKm']

# Update numerical_cols to include engineered (if exists)
if 'Efficiency_KmKWh' in df.columns:
    if 'Efficiency_KmKWh' not in numerical_cols:
        numerical_cols.append('Efficiency_KmKWh')

# Drop rows with any missing in features/target (safe for 103 rows)
df = df.dropna(subset=numerical_cols + ['PriceEuro'])

categorical_cols = ['Brand', 'PowerTrain', 'BodyStyle', 'Segment']

# Verify columns
missing = [col for col in numerical_cols + categorical_cols if col not in df.columns]
if missing:
    print("Missing columns:", missing)
    numerical_cols = [col for col in numerical_cols if col in df.columns]
    categorical_cols = [col for col in categorical_cols if col in df.columns]

X = df[numerical_cols + categorical_cols]
y = df['PriceEuro']

print("Preprocessed Shape (after cleaning):", X.shape)
print("Rows dropped due to '-':", 103 - len(df))  # Original 103
print("\nSample X (now all numeric):")
print(X.head())
print("\nAny '-' left? Check FastCharge:")
print(df['FastCharge_KmH'].dtype)  # Should be float64

Preprocessed Shape (after cleaning): (98, 11)
Rows dropped due to '-': 5

Sample X (now all numeric):
   AccelSec  TopSpeed_KmH  Range_Km  Efficiency_WhKm  Seats  FastCharge_KmH  \
0       4.6         233.0     450.0            161.0    5.0           940.0   
1      10.0         160.0     270.0            167.0    5.0           250.0   
2       4.7         210.0     400.0            181.0    5.0           620.0   
3       6.8         180.0     360.0            206.0    5.0           560.0   
4       9.5         145.0     170.0            168.0    4.0           190.0   

   Efficiency_KmKWh        Brand PowerTrain  BodyStyle Segment  
0          6.211180       Tesla         AWD      Sedan       D  
1          5.988024  Volkswagen         RWD  Hatchback       C  
2          5.524862    Polestar         AWD   Liftback       D  
3          4.854369         BMW         RWD        SUV       D  
4          5.952381       Honda         RWD  Hatchback       B  

Any '-' left? Check FastCharge:


In [7]:
# Preprocessor: Scales numbers, encodes categories
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pandas as pd  # For importances

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Now includes Efficiency_KmKWh
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ])

# Process & Split
X_processed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict & Score
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Trained!")
print(f"R² Score: {r2:.3f} (0.7+ = solid for 98 samples)")
print(f"MAE: €{mae:.0f} (avg error)")
print("\nSample Predictions vs Actual:")
for i in range(min(5, len(y_test))):
    print(f"Pred: €{y_pred[i]:.0f} | Actual: €{y_test.iloc[i]:.0f} | Diff: €{abs(y_pred[i] - y_test.iloc[i]):.0f}")

# Top Features
feature_names = numerical_cols + [f'cat__{name.replace(" ", "_")}' for name in preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)]  # Fix spacing
importances = pd.Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)
print("\nTop 5 Price Drivers:")
print(importances.head())

Model Trained!
R² Score: 0.881 (0.7+ = solid for 98 samples)
MAE: €6822 (avg error)

Sample Predictions vs Actual:
Pred: €67040 | Actual: €75351 | Diff: €8311
Pred: €99832 | Actual: €79990 | Diff: €19842
Pred: €76111 | Actual: €96050 | Diff: €19939
Pred: €35361 | Actual: €30000 | Diff: €5361
Pred: €35862 | Actual: €34900 | Diff: €962

Top 5 Price Drivers:
cat__Segment_F    0.299713
AccelSec          0.271766
TopSpeed_KmH      0.174441
Range_Km          0.059624
FastCharge_KmH    0.043294
dtype: float64


In [8]:
import os
import joblib  # For saving

# Create models folder
os.makedirs('models', exist_ok=True)

# Save model, preprocessor, & cols (for later loading)
joblib.dump(model, 'models/ev_model.pkl')
joblib.dump(preprocessor, 'models/preprocessor.pkl')
joblib.dump({'numerical_cols': numerical_cols, 'categorical_cols': categorical_cols}, 'models/cols.pkl')

print("Saved! Check /models folder for .pkl files (your trained brain).")

# Test Prediction: Fake new EV (e.g., "Mid-range SUV: 350km range, 5s accel, BMW, AWD, SUV, D-segment")
test_features = pd.DataFrame({
    'AccelSec': [5.0], 'TopSpeed_KmH': [200.0], 'Range_Km': [350.0],
    'Efficiency_WhKm': [170.0], 'Seats': [5.0], 'FastCharge_KmH': [500.0],
    'Brand': ['BMW'], 'PowerTrain': ['AWD'], 'BodyStyle': ['SUV'], 'Segment': ['D']
})

# Add Efficiency_KmKWh if used
test_features['Efficiency_KmKWh'] = 1000 / test_features['Efficiency_WhKm']

# Prep & Predict
X_test_new = test_features[numerical_cols + categorical_cols]
X_test_processed = preprocessor.transform(X_test_new)
pred_price = model.predict(X_test_processed)[0]
print(f"\nTest EV: BMW AWD SUV (350km range, 5s accel) → Predicted Price: €{pred_price:.0f}")
print("Real check: Similar BMW iX3 ~€68K—close?")

Saved! Check /models folder for .pkl files (your trained brain).

Test EV: BMW AWD SUV (350km range, 5s accel) → Predicted Price: €52321
Real check: Similar BMW iX3 ~€68K—close?


In [9]:
from transformers import pipeline
import torch

generator = pipeline('text-generation', model='distilgpt2', device=-1)  # CPU mode

print("Gen AI Loaded! Test:")
test_prompt = "Hello, EV world!"
response = generator(test_prompt, max_length=20, num_return_sequences=1)[0]['generated_text']
print(response)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Gen AI Loaded! Test:
Hello, EV world!
Since I was a teenager, I found myself thinking, How can I get better with the new tech? And how can I be more productive with the new tech?
I was thinking about it on my blog, and I decided to take a look at how to improve my skills.
I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be doing this. I‪m going to be


In [10]:
import re
import ast

def extract_features(user_input):
    prompt = f"""Extract EV features as Python dict from user query: '{user_input}'.
    Use dataset columns: Brand (e.g., 'Tesla'), Range_Km (int, default 300), AccelSec (float, default 6.0),
    TopSpeed_KmH (int, default 200), Efficiency_WhKm (int, default 170), Seats (int, default 5),
    FastCharge_KmH (int, default 400), PowerTrain (e.g., 'AWD'/'RWD', default 'RWD'),
    BodyStyle (e.g., 'SUV'/'Sedan', default 'Sedan'), Segment (e.g., 'D', default 'C').
    Output ONLY the dict, e.g., {{'Brand': 'Tesla', 'Range_Km': 400, ...}}"""

    response = generator(prompt, max_length=200, num_return_sequences=1, temperature=0.3)[0]['generated_text']

    try:
        dict_match = re.search(r'\{.*\}', response, re.DOTALL)
        if dict_match:
            features_str = dict_match.group(0)
            features = ast.literal_eval(features_str)
        else:
            raise ValueError("No dict found")
    except:
        features = {
            'Brand': 'Generic', 'Range_Km': 300, 'AccelSec': 6.0, 'TopSpeed_KmH': 200,
            'Efficiency_WhKm': 170, 'Seats': 5, 'FastCharge_KmH': 400,
            'PowerTrain': 'RWD', 'BodyStyle': 'Sedan', 'Segment': 'C'
        }
        print("Warning: Used defaults—query unclear.")

    features['Efficiency_KmKWh'] = 1000 / features['Efficiency_WhKm']
    print(f"Extracted: {features}")
    return features

# Test
user_query = "Luxury Tesla SUV with 400km range, AWD, 4.5s accel"
test_features = extract_features(user_query)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Extracted: {'Brand': 'Generic', 'Range_Km': 300, 'AccelSec': 6.0, 'TopSpeed_KmH': 200, 'Efficiency_WhKm': 170, 'Seats': 5, 'FastCharge_KmH': 400, 'PowerTrain': 'RWD', 'BodyStyle': 'Sedan', 'Segment': 'C', 'Efficiency_KmKWh': 5.882352941176471}


In [11]:
def generate_explanation(price, features):
    # Improved prompt: Force clean, eco-focused output (no echo)
    prompt = f"Write a short EV price explanation (~50 words): Price €{price:.0f} for {features['Brand']} {features['BodyStyle']} with {features['Range_Km']}km range, {features['PowerTrain']}, {features['AccelSec']}s accel. Highlight cost drivers + green tip on efficiency {features['Efficiency_KmKWh']:.1f} km/kWh (higher = greener, lower emissions). End with eco suggestion."

    response = generator(prompt, max_length=120, num_return_sequences=1, temperature=0.7, do_sample=True)[0]['generated_text']

    # Better extraction: Trim prompt + clean up
    explanation = response.replace(prompt, '').strip()  # Remove echoed prompt
    explanation = ' '.join(explanation.split()[:25])  # Limit to ~50 words, trim junk

    if len(explanation) < 20:  # Fallback if too short
        explanation = f"This {features['Brand']} EV costs ~€{price:.0f} due to range and features. Green tip: {features['Efficiency_KmKWh']:.1f} km/kWh efficiency saves CO2—pair with home solar!"

    return explanation

# Load & Predict (same as before)
import joblib
import pandas as pd
model = joblib.load('models/ev_model.pkl')
preprocessor = joblib.load('models/preprocessor.pkl')
cols = joblib.load('models/cols.pkl')
numerical_cols = cols['numerical_cols']
categorical_cols = cols['categorical_cols']

test_df = pd.DataFrame([test_features])
X_test_prep = test_df[numerical_cols + categorical_cols]
X_test_processed = preprocessor.transform(X_test_prep)
test_price = model.predict(X_test_processed)[0]

explanation = generate_explanation(test_price, test_features)
print(f"Predicted: €{test_price:.0f}\nExplanation: {explanation}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Predicted: €47596
Explanation: _WhkmH = 30, 'Efficiency_WhkmH': 300, 'AccelSec': 6.0, 'TopSpeed_KmH': 400, 'PowerTrain': 'RWD', 'BodyStyle': 'Sedan', 'Segment': 'C', 'Efficiency_KmWh': 5.882352941176471}.
