In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from typing import Any, List, Dict, Tuple
from sklearn.metrics import f1_score, accuracy_score

# 1. Loading the dataset

In [2]:
df = pd.read_csv('car_prices.csv', sep=',')
df.head(10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
5,Maruti Alto LX BSIII,2007,140000,125000,Petrol,Individual,Manual,First Owner
6,Hyundai Xcent 1.2 Kappa S,2016,550000,25000,Petrol,Individual,Manual,First Owner
7,Tata Indigo Grand Petrol,2014,240000,60000,Petrol,Individual,Manual,Second Owner
8,Hyundai Creta 1.6 VTVT S,2015,850000,25000,Petrol,Individual,Manual,First Owner
9,Maruti Celerio Green VXI,2017,365000,78000,CNG,Individual,Manual,First Owner


# 2. Preparing the data

**2.1 Consider the column with the brand and model of the car, select only the brand from there, in the category 'Other' we define the brands, the amount of data for which is less than 1% of the entire dataset. In the 'fuel' column, the main categorical values are 'Petrol' and 'Diesel', the other values, which are very few, are defined as 'Other'. We conduct a binarization of these attributes and some others in the form of a nominal scale**

In [3]:
df['brand'] = df['name'].str.split().str[0]
total_records = len(df)


threshold = total_records * 0.01


brand_counts = df['brand'].value_counts()


rare_brands = brand_counts[brand_counts < threshold].index

df['brand'] = df['brand'].replace(rare_brands, 'Other')

df.drop('name', axis=1, inplace=True)
df['fuel'] = df['fuel'].replace(['CNG', 'LPG', 'Electric'], 'Other')
columns_to_encode = ['brand', 'owner', 'transmission', 'seller_type', 'fuel']
dummies = pd.get_dummies(df[columns_to_encode], prefix=columns_to_encode, drop_first=False)


df = pd.concat([df, dummies], axis=1)


df = df.drop(columns=columns_to_encode)

**2.2 We look at 25, 50, 75 percentiles on the 'year' and 'km_driven' attributes to understand the bounds on which we will binarize in the inter-ordinal scale form. Next, we binarize the given attributes.**

In [4]:
km_percentiles = np.percentile(df['km_driven'], [25, 50, 75])


df['km_category'] = pd.cut(
    df['km_driven'],
    bins=[0, km_percentiles[0], km_percentiles[1], km_percentiles[2], df['km_driven'].max()],
    labels=[f'<{km_percentiles[0]}', f'{km_percentiles[0]}-{km_percentiles[1]}', f'{km_percentiles[1]}-{km_percentiles[2]}', f'>{km_percentiles[2]}']
)

year_percentiles = np.percentile(df['year'], [25, 50, 75])


df['year_category'] = pd.cut(
    df['year'],
    bins=[0, year_percentiles[0], year_percentiles[1], year_percentiles[2], df['year'].max()],
    labels=[f'<{year_percentiles[0]}', f'{year_percentiles[0]}-{year_percentiles[1]}', f'{year_percentiles[1]}-{year_percentiles[2]}', f'>{year_percentiles[2]}']
)

dummies1 = pd.get_dummies(df[['year_category', 'km_category']], prefix = ['year', 'km'], drop_first=False)

# Adding dummies to the original DataFrame
df = pd.concat([df, dummies1], axis=1)

# Delete the original columns
df = df.drop(columns=['year', 'year_category', 'km_category', 'km_driven'])

**2.3 Since there is no classification of the targeting variable (car price) in the original dataset, we will do it ourselves by dividing car prices into 2 classes (0-Low, 1-High)**

In [5]:
df['price_class'] = pd.qcut(df['selling_price'], q=2, labels=[0, 1])

In [9]:
df['price_class'].value_counts()

price_class
0    2174
1    2166
Name: count, dtype: int64

In [10]:
df.drop('selling_price', axis=1, inplace=True)

In [11]:
df.corr()['price_class']

brand_Audi                      0.118619
brand_Chevrolet                -0.158068
brand_Ford                      0.059145
brand_Honda                     0.091100
brand_Hyundai                  -0.007935
brand_Mahindra                  0.122599
brand_Maruti                   -0.139277
brand_Nissan                    0.026988
brand_Other                     0.052991
brand_Renault                   0.028459
brand_Skoda                     0.052183
brand_Tata                     -0.152135
brand_Toyota                    0.134782
brand_Volkswagen                0.025553
owner_First Owner               0.303510
owner_Fourth & Above Owner     -0.113818
owner_Second Owner             -0.212538
owner_Test Drive Car            0.062825
owner_Third Owner              -0.158386
transmission_Automatic          0.256597
transmission_Manual            -0.256597
seller_type_Dealer              0.196211
seller_type_Individual         -0.241846
seller_type_Trustmark Dealer    0.149341
fuel_Diesel     

**2.4 Split the data on train and test samples**

In [12]:
X = df.drop('price_class', axis=1)
y = df['price_class']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 3. Lazy FCA classification:

In [14]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
class LazyClassifierFCA:
    def __init__(self):
        self.X_train = None
        self.y_train = None

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        self.X_train = X_train
        self.y_train = y_train
        
    def classify_sample(self, sample: pd.Series) -> Any:
        # Split X_train into positive and negative classes
        X_train_positive = self.X_train[y_train == 1]
        X_train_negative = self.X_train[y_train == 0]
        
        positive_classifiers = 0
        negative_classifiers = 0
        
        # Function to check if intersection with a train sample is a positive classifier
        def is_positive_classifier(intersection):
            # Find samples in X_train_positive that contain the intersection
            num_positive = ((X_train_positive | ~intersection) == True).all(axis=1).sum()
            num_negative = ((X_train_negative | ~intersection) == True).all(axis=1).sum()
            return num_negative == 0 and num_positive > 1
        
        # Function to check if intersection is a negative classifier
        def is_negative_classifier(intersection):
            # Find samples in X_train_negative that contain the intersection
            num_positive = ((X_train_positive | ~intersection) == True).all(axis=1).sum()
            num_negative = ((X_train_negative | ~intersection) == True).all(axis=1).sum()
            return num_positive == 0 and num_negative > 1
        
        # Check for positive classifiers by intersecting sample with each positive object
        for _, pos_sample in X_train_positive.iterrows():
            intersection = sample & pos_sample
            if is_positive_classifier(intersection):
                positive_classifiers += 1

        # Check for negative classifiers by intersecting sample with each negative object
        for _, neg_sample in X_train_negative.iterrows():
            intersection = sample & neg_sample
            if is_negative_classifier(intersection):
                negative_classifiers += 1

        # Determine the class based on the number of classifiers
        if positive_classifiers > negative_classifiers:
            print(f"sample {sample.name} is classified as 1, {positive_classifiers=}, {negative_classifiers=}")
            return 1  # Predict positive
            
        elif negative_classifiers > positive_classifiers:
            print(f"sample {sample.name} is classified as 0, {positive_classifiers=}, {negative_classifiers=}")
            return 0  # Predict negative

        else:
            # If equal, you can decide on a rule, like defaulting to 0 or 1, or returning 'undetermined'
            print(f"sample {sample.name} is classified as 1, default, {positive_classifiers=}, {negative_classifiers=}")
        
            return 1  # or 0, depending on the choice


    def predict(self, X_test: pd.DataFrame) -> List[Any]:
        # List to store predictions for each test sample
        predictions = []
        
        # Iterate through each sample in X_test
        for _, sample in X_test.iterrows():
            # Classify the sample and append the result to predictions
            prediction = self.classify_sample(sample)
            predictions.append(prediction)
        
        return predictions

In [15]:
sample = X_test.iloc[1]
y.iloc[1]

0

In [16]:
len(X_train[y_train == 1]), len(X_train[y_train == 0])

(1733, 1739)

In [17]:
pos_X_train = X_train[y_train == 1]
neg_X_train = X_train[y_train == 0]
len(pos_X_train), len(neg_X_train)

(1733, 1739)

In [18]:
classifier = LazyClassifierFCA()
classifier.fit(X_train, y_train)

In [19]:
classifier.classify_sample(sample)

sample 1018 is classified as 1, default, positive_classifiers=0, negative_classifiers=0


1

In [20]:
y_pred = classifier.predict(X_test)

sample 1944 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 1018 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 2071 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 3726 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 2448 is classified as 1, positive_classifiers=153, negative_classifiers=0
sample 3681 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 513 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 807 is classified as 1, positive_classifiers=88, negative_classifiers=0
sample 830 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 2639 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 3069 is classified as 1, default, positive_classifiers=0, negative_classifiers=0
sample 185 is classified as 1, positive_classifier

In [21]:
#Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7511520737327189
Classification report:
               precision    recall  f1-score   support

           0       0.91      0.56      0.69       435
           1       0.68      0.94      0.79       433

    accuracy                           0.75       868
   macro avg       0.79      0.75      0.74       868
weighted avg       0.79      0.75      0.74       868

Confusion matrix:
 [[244 191]
 [ 25 408]]


# 4. K Nearest Neighbor (KNN)

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Configuring parameters for GridSearchCV
param_grid = {
    'n_neighbors': np.arange(1, 31),       # Number of neighbors
    'weights': ['uniform', 'distance'],   # Weights: uniform or weighted
    'metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Model initialization
knn = KNeighborsClassifier()

# GridSearchCV initialization
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Model training and parameter selection
grid_search.fit(X_train, y_train)

# 8. GridSearchCV results
print("The best parametrs:", grid_search.best_params_)
print("Better accuracy on cross-validation:", grid_search.best_score_)

# Evaluation of the model with the best parameters
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

The best parametrs: {'metric': 'manhattan', 'n_neighbors': 27, 'weights': 'distance'}
Better accuracy on cross-validation: 0.8384288764953455
Accuracy: 0.8433179723502304
Classification report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.84       435
           1       0.84      0.84      0.84       433

    accuracy                           0.84       868
   macro avg       0.84      0.84      0.84       868
weighted avg       0.84      0.84      0.84       868

Confusion matrix:
 [[368  67]
 [ 69 364]]


# 5. Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB


# Model initialization
nb = GaussianNB()

# Model training
nb.fit(X_train, y_train)

# Predictions
y_pred = nb.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6117511520737328
Classification report:
               precision    recall  f1-score   support

           0       0.57      0.96      0.71       435
           1       0.88      0.26      0.40       433

    accuracy                           0.61       868
   macro avg       0.72      0.61      0.56       868
weighted avg       0.72      0.61      0.56       868

Confusion matrix:
 [[419  16]
 [321 112]]


# 6. Logistic Regression

In [24]:

from sklearn.linear_model import LogisticRegression


# Define parameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization types
    'C': np.logspace(-4, 4, 20),                   # Inverse regularization strength
    'solver': ['lbfgs', 'liblinear', 'saga'],      # Solvers
    'max_iter': [100, 200, 300]                    # Number of iterations
}

# Initialize the model
log_reg = LogisticRegression()

# GridSearchCV initialization
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_log_reg = grid_search.best_estimator_

# Predictions
y_pred = best_log_reg.predict(X_test)

# Metrics
print("The best parameters:", grid_search.best_params_)
print("Better accuracy on cross-validation:", grid_search.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

2100 fits failed out of a total of 3600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\Anaconda\Lib\

The best parameters: {'C': 0.615848211066026, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Better accuracy on cross-validation: 0.8436091472643211
Accuracy: 0.8179723502304147
Classification report:
               precision    recall  f1-score   support

           0       0.84      0.79      0.81       435
           1       0.80      0.84      0.82       433

    accuracy                           0.82       868
   macro avg       0.82      0.82      0.82       868
weighted avg       0.82      0.82      0.82       868

Confusion matrix:
 [[345  90]
 [ 68 365]]


# 7. SVM

In [25]:

from sklearn.svm import SVC


# Define parameter grid for GridSearchCV
param_grid = {
    'C': np.logspace(-3, 3, 10),         # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel types
    'gamma': ['scale', 'auto'],          # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'degree': [2, 3, 4],                 # Degree for 'poly' kernel
    'class_weight': [None, 'balanced']   # Handles class imbalance
}

# Initialize the model
svm = SVC()

# GridSearchCV initialization
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_svm = grid_search.best_estimator_

# Predictions
y_pred = best_svm.predict(X_test)

# Metrics
print("The best parameters:", grid_search.best_params_)
print("Better accuracy on cross-validation:", grid_search.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

The best parameters: {'C': 46.41588833612773, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
Better accuracy on cross-validation: 0.8617544005141708
Accuracy: 0.826036866359447
Classification report:
               precision    recall  f1-score   support

           0       0.85      0.80      0.82       435
           1       0.81      0.86      0.83       433

    accuracy                           0.83       868
   macro avg       0.83      0.83      0.83       868
weighted avg       0.83      0.83      0.83       868

Confusion matrix:
 [[346  89]
 [ 62 371]]


# 8. Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier


# Define parameter grid for GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Splitting criteria
    'max_depth': [None, 5, 10, 20, 50],           # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],              # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 5, 10],            # Minimum number of samples required to be at a leaf node
    'class_weight': [None, 'balanced']            # Handles class imbalance
}

# Initialize the model
dt = DecisionTreeClassifier()

# GridSearchCV initialization
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_dt = grid_search.best_estimator_

# Predictions
y_pred = best_dt.predict(X_test)

# Metrics
print("The best parameters:", grid_search.best_params_)
print("Better accuracy on cross-validation:", grid_search.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

The best parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Better accuracy on cross-validation: 0.8519627640826819
Accuracy: 0.826036866359447
Classification report:
               precision    recall  f1-score   support

           0       0.86      0.78      0.82       435
           1       0.80      0.87      0.83       433

    accuracy                           0.83       868
   macro avg       0.83      0.83      0.83       868
weighted avg       0.83      0.83      0.83       868

Confusion matrix:
 [[341  94]
 [ 57 376]]


# 9. Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier


# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200, 500],          # Number of trees in the forest
    'criterion': ['gini', 'entropy', 'log_loss'], # Splitting criteria
    'max_depth': [None, 10, 20, 50],              # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],              # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 5, 10],            # Minimum number of samples required to be at a leaf node
    'class_weight': [None, 'balanced']            # Handles class imbalance
}

# Initialize the model
rf = RandomForestClassifier()

# GridSearchCV initialization
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_

# Predictions
y_pred = best_rf.predict(X_test)

# Metrics
print("The best parameters:", grid_search.best_params_)
print("Better accuracy on cross-validation:", grid_search.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

The best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
Better accuracy on cross-validation: 0.8608886032384466
Accuracy: 0.8329493087557603
Classification report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83       435
           1       0.81      0.87      0.84       433

    accuracy                           0.83       868
   macro avg       0.83      0.83      0.83       868
weighted avg       0.83      0.83      0.83       868

Confusion matrix:
 [[348  87]
 [ 58 375]]


# 10. XGBoost

In [52]:
from xgboost import XGBClassifier


# Замените < и другие символы в столбцах на more_than_
X_train.columns = X_train.columns.str.replace(r'<', 'less_than_', regex=True)
X_test.columns = X_test.columns.str.replace(r'<', 'less_than_', regex=True)

X_train.columns = X_train.columns.str.replace(r'>', 'more_than_', regex=True)
X_test.columns = X_test.columns.str.replace(r'>', 'more_than_', regex=True)

y_train = y_train.astype('bool')
y_test = y_test.astype('bool')

# Определение параметров для GridSearchCV с умеренным количеством значений
param_grid = {
    'n_estimators': [100, 200, 500],  # Количество деревьев
    'learning_rate': [0.05, 0.1, 0.2],  # Шаг обучения
    'max_depth': [5, 7, 10],  # Глубина деревьев
    'subsample': [0.8, 1.0],  # Доля примеров для обучения каждого дерева
    'colsample_bytree': [0.8, 1.0],  # Доля признаков для обучения каждого дерева
    'gamma': [0, 0.1, 0.5],  # Минимальное уменьшение потерь для разделения
    'reg_alpha': [0, 0.1],  # Линейная регуляризация L1
    'reg_lambda': [1, 10],  # Линейная регуляризация L2
}

# Initialize the model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# GridSearchCV initialization
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_xgb = grid_search.best_estimator_

# Predictions
y_pred = best_xgb.predict(X_test)

# Metrics
print("The best parameters for XGBoost:", grid_search.best_params_)
print("Better accuracy on cross-validation:", grid_search.best_score_)
print("Accuracy for XGBoost:", accuracy_score(y_test, y_pred))
print("Classification report for XGBoost:\n", classification_report(y_test, y_pred))
print("Confusion matrix for XGBoost:\n", confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
The best parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0.5, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1.0}
Better accuracy on cross-validation: 0.8637716915804532
Accuracy for XGBoost: 0.8387096774193549
Classification report for XGBoost:
               precision    recall  f1-score   support

       False       0.86      0.81      0.83       435
        True       0.82      0.87      0.84       433

    accuracy                           0.84       868
   macro avg       0.84      0.84      0.84       868
weighted avg       0.84      0.84      0.84       868

Confusion matrix for XGBoost:
 [[351  84]
 [ 56 377]]


Parameters: { "use_label_encoder" } are not used.

