In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Veri setini yükleme
cars_data = pd.read_csv('Processed_Cardetails.csv')

# İlk 5 satırı görüntüleme
print("Veri setinin ilk 5 satırı:")
print(cars_data.head())

Veri setinin ilk 5 satırı:
                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014         450000     145500  Diesel   
1  Skoda Rapid 1.5 TDI Ambition  2014         370000     120000  Diesel   
2      Honda City 2017-2020 EXi  2006         158000     140000  Petrol   
3     Hyundai i20 Sportz Diesel  2010         225000     127000  Diesel   
4        Maruti Swift VXI BSIII  2007         130000     120000  Petrol   

  seller_type transmission         owner     mileage   engine   max_power  \
0  Individual       Manual   First Owner   23.4 kmpl  1248 CC      74 bhp   
1  Individual       Manual  Second Owner  21.14 kmpl  1498 CC  103.52 bhp   
2  Individual       Manual   Third Owner   17.7 kmpl  1497 CC      78 bhp   
3  Individual       Manual   First Owner   23.0 kmpl  1396 CC      90 bhp   
4  Individual       Manual   First Owner   16.1 kmpl  1298 CC    88.2 bhp   

                     torque  seats  brand  model  
0       

In [3]:
# Eksik değerlerin kontrolü ve temizliği
print("Eksik değerler:\n", cars_data.isnull().sum())
cars_data.dropna(inplace=True)

# Yinelenen satırların kontrolü ve temizliği
print("Yinelenen satır sayısı:", cars_data.duplicated().sum())
cars_data.drop_duplicates(inplace=True)

# Torque sütununu kaldırma (kullanılmıyor)
cars_data.drop(columns=['torque'], inplace=True)

# Veri setinin boyutunu görüntüleme
print("Güncellenmiş veri seti boyutu:", cars_data.shape)

Eksik değerler:
 name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
brand              0
model              0
dtype: int64
Yinelenen satır sayısı: 1189
Güncellenmiş veri seti boyutu: (6717, 14)


In [4]:
# LabelEncoder kullanarak kategorik sütunları kodlama
le_brand = LabelEncoder()
le_model = LabelEncoder()

# 'brand' ve 'model' sütunlarını dönüştürme
cars_data['brand'] = le_brand.fit_transform(cars_data['brand'])
cars_data['model'] = le_model.fit_transform(cars_data['model'])

# Diğer kategorik sütunları manuel dönüştürme
cars_data['transmission'].replace(['Manual', 'Automatic'], [1, 2], inplace=True)
cars_data['seller_type'].replace(['Individual', 'Dealer', 'Trustmark Dealer'], [1, 2, 3], inplace=True)
cars_data['fuel'].replace(['Diesel', 'Petrol', 'LPG', 'CNG'], [1, 2, 3, 4], inplace=True)
cars_data['owner'].replace(['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car'], 
                           [1, 2, 3, 4, 5], inplace=True)

# Güncellenmiş veri tiplerini görüntüleme
print(cars_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6717 entries, 0 to 8125
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6717 non-null   object 
 1   year           6717 non-null   int64  
 2   selling_price  6717 non-null   int64  
 3   km_driven      6717 non-null   int64  
 4   fuel           6717 non-null   int64  
 5   seller_type    6717 non-null   int64  
 6   transmission   6717 non-null   int64  
 7   owner          6717 non-null   int64  
 8   mileage        6717 non-null   object 
 9   engine         6717 non-null   object 
 10  max_power      6717 non-null   object 
 11  seats          6717 non-null   float64
 12  brand          6717 non-null   int64  
 13  model          6717 non-null   int64  
dtypes: float64(1), int64(9), object(4)
memory usage: 787.1+ KB
None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars_data['transmission'].replace(['Manual', 'Automatic'], [1, 2], inplace=True)
  cars_data['transmission'].replace(['Manual', 'Automatic'], [1, 2], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars_data['seller_type'].replace(['Individual', 'Dealer', 'Trustmark

In [5]:
# Sayısal değerlerin temizlenmesi için bir yardımcı fonksiyon
def clean_data(value):
    try:
        return float(value.split(' ')[0])
    except:
        return 0.0

# 'mileage', 'max_power', ve 'engine' sütunlarını temizleme
cars_data['mileage'] = cars_data['mileage'].apply(clean_data)
cars_data['max_power'] = cars_data['max_power'].apply(clean_data)
cars_data['engine'] = cars_data['engine'].apply(clean_data)

# İlk birkaç satırı kontrol
print(cars_data[['mileage', 'max_power', 'engine']].head())

   mileage  max_power  engine
0    23.40      74.00  1248.0
1    21.14     103.52  1498.0
2    17.70      78.00  1497.0
3    23.00      90.00  1396.0
4    16.10      88.20  1298.0


In [6]:
# Hedef ve bağımsız değişkenleri ayırma
input_data = cars_data.drop(columns=['selling_price'])
output_data = cars_data['selling_price']

# Veriyi %80 eğitim, %20 test olarak bölme
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2, random_state=42)

# Eğitim ve test veri setlerinin boyutlarını görüntüleme
print("Eğitim seti boyutu:", x_train.shape)
print("Test seti boyutu:", x_test.shape)

Eğitim seti boyutu: (5373, 13)
Test seti boyutu: (1344, 13)


In [7]:
# Girdi (X) ve Çıktı (y) veri tiplerini kontrol edin
print(x_train.info())
print(x_train.head())

<class 'pandas.core.frame.DataFrame'>
Index: 5373 entries, 2583 to 941
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          5373 non-null   object 
 1   year          5373 non-null   int64  
 2   km_driven     5373 non-null   int64  
 3   fuel          5373 non-null   int64  
 4   seller_type   5373 non-null   int64  
 5   transmission  5373 non-null   int64  
 6   owner         5373 non-null   int64  
 7   mileage       5373 non-null   float64
 8   engine        5373 non-null   float64
 9   max_power     5373 non-null   float64
 10  seats         5373 non-null   float64
 11  brand         5373 non-null   int64  
 12  model         5373 non-null   int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 587.7+ KB
None
                                   name  year  km_driven  fuel  seller_type  \
2583  Hyundai Xcent 1.2 Kappa SX Option  2014      80000     2            1   
1428    Ford Figo Aspire

In [8]:
# 'name' sütununu giriş verilerinden kaldırma
x_train = x_train.drop(columns=['name'])
x_test = x_test.drop(columns=['name'])

# Değişiklik sonrası kontrol
print(x_train.info())
print(x_test.info())


<class 'pandas.core.frame.DataFrame'>
Index: 5373 entries, 2583 to 941
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          5373 non-null   int64  
 1   km_driven     5373 non-null   int64  
 2   fuel          5373 non-null   int64  
 3   seller_type   5373 non-null   int64  
 4   transmission  5373 non-null   int64  
 5   owner         5373 non-null   int64  
 6   mileage       5373 non-null   float64
 7   engine        5373 non-null   float64
 8   max_power     5373 non-null   float64
 9   seats         5373 non-null   float64
 10  brand         5373 non-null   int64  
 11  model         5373 non-null   int64  
dtypes: float64(4), int64(8)
memory usage: 545.7 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1344 entries, 7039 to 962
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          1344 non-null   int64  
 1  

In [16]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Model oluşturma ve eğitme
model = LinearRegression()
model.fit(x_train, y_train)

# Test seti üzerinde tahmin yapma
y_pred = model.predict(x_test)

# Performans değerlendirmesi
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Linear Regresyon Hata Kareler Ortalaması (RMSE):", rmse)
print("Linear Regresyon R^2 Değeri:", r2)
print("-------------------------------------------------")
# XGBoost modeli
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(x_train, y_train)
xgb_pred = xgb_model.predict(x_test)
print("XGBoost RMSE:", np.sqrt(mean_squared_error(y_test, xgb_pred)))
print("XGBoost R^2:", r2_score(y_test, xgb_pred))
print("-------------------------------------------------")
# Random Forest modeli
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)
print("En iyi model")
print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, rf_pred)))
print("Random Forest R^2:", r2_score(y_test, rf_pred))
print("-------------------------------------------------")
print("R^2 değeri 1'e ne kadar yaklaşırsa model o kadar anlamlı yanıt oluşturur.")

Linear Regresyon Hata Kareler Ortalaması (RMSE): 273879.91784433316
Linear Regresyon R^2 Değeri: 0.6581600303506623
-------------------------------------------------
XGBoost RMSE: 132724.10513408092
XGBoost R^2: 0.9197210073471069
-------------------------------------------------
En iyi model
Random Forest RMSE: 121572.6257846156
Random Forest R^2: 0.9326443460197957
-------------------------------------------------
R^2 değeri 1'e ne kadar yaklaşırsa model o kadar anlamlı yanıt oluşturur.


In [None]:
import pickle

# En iyi modeli kaydetme
with open('model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

print("Model başarıyla kaydedildi.")

Model başarıyla kaydedildi.
