In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [2]:
# Load the raw training table. The path is relative, so "training.csv" must be
# in the notebook's working directory.
row_data = pd.read_csv("training.csv")

In [3]:
def fill_price(df, col):
    """Fill missing values of ``df[col]`` with the column mean, modifying ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the filled column is assigned back onto this object.
    col : str
        Name of the price column to impute.

    Returns
    -------
    None
        The result is written into ``df``; nothing is returned.
    """
    # I'm not sure what is the best way to fill prices
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the df[col] selection: the chained in-place form operates on a temporary
    # object when pandas Copy-on-Write is active and would leave df unchanged.
    df[col] = df[col].fillna(df[col].mean())

In [4]:
def data_fillna(df):
    """Return a cleaned copy of the raw table with missing values imputed.

    Steps, in order:
      * drop identifier and high-cardinality columns (RefId, WheelTypeID,
        Model, Trim, SubModel, VNZIP1, BYRNO);
      * fill missing categorical values with fixed defaults;
      * encode PRIMEUNIT and AUCGUART as integers;
      * replace Transmission with the 0/1 column ``AutoTransmition``;
      * fill missing MMR price columns with the column mean (``fill_price``);
      * convert PurchDate to integer seconds since the Unix epoch.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified: the first ``drop`` produces a new frame
        and all later changes are applied to that frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If any column referenced below is absent from ``df``.
    """
    res = df.drop(['RefId', 'WheelTypeID'], axis=1)
    # I'll drop Model for now, as I don't know how to correctly encode it.
    # Making 1063 labels with one hot encoding looks bad for me, as I would use knn.
    # The same for Trim and SubModel, maybe I would need some text processing later
    # to extract features from SubModel such as 'Wagon', 'Sedan', 'Sport' etc, but I'm not there yet
    res = res.drop(['Model', 'Trim', 'SubModel'], axis=1)

    # Default used for each categorical column when the value is missing.
    # Size: maybe we can do better, but as we have only 5 instances without
    # Size, lets do stupid things.
    categorical_defaults = {
        'WheelType': 'NotSpecified',
        'PRIMEUNIT': 'NO',
        'AUCGUART': 'RED',
        'TopThreeAmericanName': 'OTHER',
        'Size': 'MEDIUM',
        'Nationality': 'OTHER',
        'Color': 'NOT AVAIL',
    }
    # Assign each filled column back instead of using fillna(inplace=True) on a
    # column selection, which does not update the frame under Copy-on-Write.
    for column, default in categorical_defaults.items():
        res[column] = res[column].fillna(default)

    primeunit = {'NO': 0, 'YES': 1}
    # yellow exists in dataset description
    # we assume that RED < YELLOW < GREEN
    aucguart = {'RED': -1, 'YELLOW': 0, 'GREEN': 1}

    # infer_objects() turns the replaced column into an integer dtype on pandas
    # versions where replace() no longer downcasts an object column by itself.
    res['PRIMEUNIT'] = res['PRIMEUNIT'].replace(primeunit).infer_objects()
    res['AUCGUART'] = res['AUCGUART'].replace(aucguart).infer_objects()

    # Vectorised comparison; a missing Transmission compares unequal and maps to 0,
    # exactly as the row-wise version did.
    res['AutoTransmition'] = (res['Transmission'] == 'AUTO').astype(np.int64)
    res = res.drop('Transmission', axis=1)

    # I think VNST gives enough information about state
    # as VNST zipcode is categorial feature, but with more instances, so I'll remove it
    res = res.drop('VNZIP1', axis=1)

    # I don't really understand this feature, so, I'll ignore it for now,
    # As it will grow my feature space to 74 features
    res = res.drop('BYRNO', axis=1)

    # Column names are spelled exactly as in the dataset (including "Acquisiton").
    price_columns = (
        "MMRAcquisitionAuctionAveragePrice",
        "MMRAcquisitionAuctionCleanPrice",
        "MMRAcquisitionRetailAveragePrice",
        "MMRAcquisitonRetailCleanPrice",
        "MMRCurrentAuctionAveragePrice",
        "MMRCurrentAuctionCleanPrice",
        "MMRCurrentRetailAveragePrice",
        "MMRCurrentRetailCleanPrice",
    )
    for price_col in price_columns:
        fill_price(res, price_col)

    # Epoch seconds. Integer floor division keeps the arithmetic exact instead of
    # going through float64.
    # NOTE(review): assumes the parsed datetimes are stored with nanosecond
    # resolution, as the original 10**9 divisor implies - confirm on newer pandas.
    purch_ns = pd.to_datetime(res['PurchDate']).astype(np.int64)
    res['PurchDate'] = (purch_ns // 10**9).astype(np.int64)

    return res

In [5]:
# Apply the cleaning/imputation step (data_fillna) to the raw table.
preprocessed_data = data_fillna(row_data)

In [6]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

def final_data_preparation(df):
    """Build the scaled feature matrix and the target vector.

    Object-dtype columns are one-hot encoded with ``pd.get_dummies``; the
    remaining columns are kept as they are. The ``IsBadBuy`` column is split
    off as the target, and every feature is rescaled by the module-level
    ``min_max_scaler``, which is re-fitted on each call.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed table; it must contain an ``IsBadBuy`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the scaled feature array and ``y`` holds the
        ``IsBadBuy`` values.
    """
    nonbinary_cols = [c for c in df.columns if df[c].dtype == object]
    data_nonbinary = pd.get_dummies(df[nonbinary_cols])
    tmp = pd.concat((df.drop(nonbinary_cols, axis=1), data_nonbinary), axis=1)
    # we normalize also binary data, assuming it is [0, 1]
    # get_values() was deprecated in pandas 0.25 and removed in 1.0; to_numpy()
    # is its replacement. Requesting float64 avoids an object array when the
    # dummy columns are boolean (the default in recent pandas versions).
    y = tmp['IsBadBuy'].to_numpy()
    X = tmp.drop('IsBadBuy', axis=1).to_numpy(dtype=np.float64)
    X = min_max_scaler.fit_transform(X)
    return (X, y)

In [7]:
# X: min-max scaled features (object columns one-hot encoded); y: the IsBadBuy column.
X, y = final_data_preparation(preprocessed_data)

In [26]:
from sklearn.cluster import KMeans,DBSCAN,AffinityPropagation

# KMeans

In [9]:
# Two clusters, to be compared against the binary IsBadBuy target below;
# random_state is fixed so the initialisation is repeatable.
kmeans = KMeans(n_clusters=2, random_state=42)

In [10]:
# Unsupervised fit: y is not passed, so the cluster ids have no built-in
# relation to the IsBadBuy classes.
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [11]:
# Share of rows whose cluster id differs from y, using the cluster ids as class labels as-is.
np.mean(y != kmeans.labels_)

0.6051409232286971

Ошибка 60% означает, что метки кластеров перепутаны: кластер 0 соответствует y == 1, и наоборот.

In [12]:
# Boolean array that is True where y is not 1.
# NOTE(review): despite its name, this is the inverted target; it is used to
# evaluate the swapped cluster-to-class assignment.
y_correct = y != 1

In [13]:
# Disagreement rate against the inverted target, i.e. with cluster ids 0 and 1 swapped.
np.mean(y_correct != kmeans.labels_)

0.3948590767713029

In [25]:
from sklearn.metrics import confusion_matrix
# Rows: y_correct (False first, then True); columns: cluster id (0, then 1).
confusion_matrix(y_correct, kmeans.labels_)

array([[ 2736,  6240],
       [22578, 41429]], dtype=int64)

Мы более-менее хорошо определяем хорошие машины (41429 из 64007, около 65%), но плохо определяем плохие (2736 из 8976, около 30%).

# DBSCAN

In [15]:
# NOTE(review): eps=2 and min_samples=2000 are hand-picked values; X is min-max
# scaled, so confirm that eps=2 is not wide enough to merge everything into one
# cluster. n_jobs=8 hard-codes the number of parallel jobs.
dbscan = DBSCAN(n_jobs = 8, eps=2, min_samples=2000)

In [16]:
# Unsupervised fit on the scaled features; the per-row cluster ids end up in dbscan.labels_.
dbscan.fit(X)

DBSCAN(algorithm='auto', eps=2, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=2000, n_jobs=8, p=None)

Приводим метки кластеров к значениям 0 / 1.

In [17]:
# True only for rows whose cluster id equals 1.
# NOTE(review): DBSCAN numbers clusters from 0 and marks noise as -1, so both
# cluster 0 and noise become False here - confirm this is the intended mapping.
dbscan_labels = dbscan.labels_ == 1

In [18]:
# Share of rows where the boolean cluster indicator disagrees with y.
np.mean(y != dbscan_labels)

0.12298754504473644

In [19]:
# Rows: y (0, then 1); columns: dbscan_labels (False, then True).
confusion_matrix(y, dbscan_labels)

array([[64007,     0],
       [ 8976,     0]], dtype=int64)

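Алгоритм отнёс все объекты к хорошим машинам: второй столбец матрицы нулевой, поэтому ошибка 12.3% — это просто доля плохих машин в выборке (8976 из 72983).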

# AffinityPropagation

In [20]:
# Default hyper-parameters.
affinityProp = AffinityPropagation()

In [21]:
# NOTE(review): in the recorded run this call raised MemoryError on the full X,
# so re-running the notebook top to bottom stops here.
affinityProp.fit(X)

MemoryError: 

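Метод не работает: на полной выборке (около 73 тысяч объектов) обучение завершается ошибкой MemoryError, так как алгоритм работает с матрицей попарных сходств размера n × n.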

# MiniBatchKMeans

In [28]:
from sklearn.cluster import MiniBatchKMeans, Birch

In [29]:
# Same setup as the KMeans run above (2 clusters, seed 42), using the mini-batch variant.
mbkmeans = MiniBatchKMeans(n_clusters=2, random_state=42)

In [30]:
# Unsupervised fit on the scaled features; y is not used.
mbkmeans.fit(X)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=100, max_no_improvement=10,
                n_clusters=2, n_init=3, random_state=42,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [31]:
# Disagreement rate against the inverted target (y_correct), as in the KMeans evaluation.
np.mean(y_correct != mbkmeans.labels_)

0.5049943137442966

Ошибка около 50% — на уровне случайного выбора.

# Birch

In [32]:
# Two final clusters, to match the binary target.
# NOTE(review): threshold=0.8 is a hand-picked value - confirm the choice.
birch = Birch(threshold=0.8, n_clusters=2)

In [33]:
# Unsupervised fit on the scaled features; y is not used.
birch.fit(X)

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=2,
      threshold=0.8)