In [1]:
import pandas as pd
df=pd.read_csv("EVDATASET.csv")
df.head()

Unnamed: 0,Year,Country,Brand,Model,Battery_kWh,Range_km,Motor_Power_kW,Weight_kg,Price_USD,Charging_Stations,Sales,Govt_Incentive_USD
0,2016,China,Tata,Nexon EV,69.9,332,135,1789,33741,50789,1520,1200
1,2018,USA,Hyundai,Kona Electric,50.6,353,117,1509,36758,36337,1478,900
2,2022,India,Tata,Nexon EV,39.7,381,88,1387,26752,3473,1832,600
3,2019,Germany,Tesla,Model S,49.1,268,101,1367,65866,33699,1018,810
4,2024,Germany,Audi,e-tron,59.7,490,87,1793,54986,48335,3072,360


### ELECTRIC VEHICLE DATASET ANALYSIS
### OBJECTIVE 
To perform initial data loading and exploration on an electric vehicle dataset using python and pandas 
### STEPS PERFORMED
1. imported the pandas library
2. loaded the dataset(EVDATASET.csv)using pd.read_csv()
3. displayed the first few rows with df.read() to verify successful loading
### OBSERVATION
1. the dataset contains details like year , country,brand,model,battery(kwh),range(km),motor power,weight,price,charging stations,and sales.
   data appeared clean for further analysis

## WEEK2-MACHINE LEARNING MODEL BUILDING 

In [None]:
#week 2 : Machine learning imports
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import linearregression 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import numpy as np

In [15]:
import pandas as pd
df=pd.read_csv("EVDATASET.csv")
df.head()

Unnamed: 0,Year,Country,Brand,Model,Battery_kWh,Range_km,Motor_Power_kW,Weight_kg,Price_USD,Charging_Stations,Sales,Govt_Incentive_USD
0,2016,China,Tata,Nexon EV,69.9,332,135,1789,33741,50789,1520,1200
1,2018,USA,Hyundai,Kona Electric,50.6,353,117,1509,36758,36337,1478,900
2,2022,India,Tata,Nexon EV,39.7,381,88,1387,26752,3473,1832,600
3,2019,Germany,Tesla,Model S,49.1,268,101,1367,65866,33699,1018,810
4,2024,Germany,Audi,e-tron,59.7,490,87,1793,54986,48335,3072,360


In [16]:
# drop missing values(if any)
df=df.dropna()
#convert categorical columns (country,brand,model) into numeric form using one-hot encoding
df=pd.get_dummies(df,columns=['Country','Brand','Model'],drop_first=True) 
#check shape after encoding 
df.shape 

(500, 56)

In [18]:
X=df.drop('Sales',axis=1)
y=df['Sales']

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [31]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)

In [32]:
y_pred=model.predict(X_test)
y_pred

array([ 1897.21016993,  2166.1256784 ,   836.17128027,  2031.79760125,
        1288.06362502,  1162.02722362,  -566.5462395 ,  2328.18533053,
         260.06800279,  1733.24577083,   719.35686581,  2899.85169675,
        -244.79636578,  1187.34775588,  1017.30049463,  1211.25532253,
         118.60269437,  1782.29705631,  3649.38689211, -1073.91565905,
        2955.46464849,  2498.84292054,  1902.01436761,   300.82322208,
        2242.96100634,  1552.0955948 ,  2710.79900507,  3196.63237218,
        1720.05909048,  2769.30318791,  1074.01043156,  1909.44950253,
        2534.17440119,  1067.32651497,  2286.62267231,  2372.34376911,
         328.17574639,   495.14898677,  1474.01849438,  2138.0434008 ,
         288.16545086,  1716.54043658,  -339.4268182 ,  4603.63221083,
        1719.32653984,  1203.59921358,  1619.93790273,  1358.14212715,
         987.73618647,  2101.72666564,   971.73545795,  2472.38097703,
        2373.03754599,   473.09365209,   231.23335654,  1882.8766383 ,
      

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
import numpy as np
mae=mean_absolute_error(y_test,y_pred)
mse=mean_squared_error(y_test,y_pred)
rmse=np.sqrt(mse)
r2=r2_score(y_test,y_pred)
mae,rmse,r2

(445.01732269301664, np.float64(570.746291225295), 0.7576988958150546)

In [41]:
print("mean absolute error(MAE):",mae)
print("root mean squared error(rmse):",rmse)
print("r2 score:",r2)

mean absolute error(MAE): 445.01732269301664
root mean squared error(rmse): 570.746291225295
r2 score: 0.7576988958150546


### Model Evaluation summary
The linear regression model was used to predict the range of electric vehicles based on battery capacity,efficiency,and other features.
The model performance metrics are:
.#MAE : 445 km
.#RMSE : 570km
.#R2 score:0.75
A higher r2 indicate a good model fit,meaning the model explains 75% of the variability in EV range.
### Interpretation 
.#The model performs reasonably well.
.#Some prediction errors are large (high RMSE),which suggests improvements can be made 
. #more advanced models (Random forest,gradient boosting) may improve accuracy in week-3
