# Zadatak 5: Implementacija klasifikacije

In [1]:
import math
import pandas as pd
import re

from sklearn.preprocessing import StandardScaler

## Učitavanje podataka

In [2]:
df = pd.read_csv('./db/scrapy_vozila_20220602.csv', sep=';', names=['url', 'naslov', 'cena', 'stanje', 'marka', 'model', 'godiste', 'kilometraza', 'karoserija', 'gorivo', 'kubikaza', 'snaga', 'menjac', 'vrata', 'boja', 'lokacija_prodavca'])

df

Unnamed: 0,url,naslov,cena,stanje,marka,model,godiste,kilometraza,karoserija,gorivo,kubikaza,snaga,menjac,vrata,boja,lokacija_prodavca
0,https://www.polovniautomobili.com/auto-oglasi/...,Fiat 500 14 16v SPORT CH,4599,Polovno vozilo,Fiat,500,2008,150661,Hečbek,Benzin,1400,74/101 (kW/KS),Manuelni 6 brzina,2/3 vrata,Siva,Vidikovac
1,https://www.polovniautomobili.com/auto-oglasi/...,Peugeot 508 2.0,8950,Polovno vozilo,Peugeot,508,2015,187000,Karavan,Dizel,1997,120/163 (kW/KS),Automatski / poluautomatski,4/5 vrata,Siva,Vidikovac
2,https://www.polovniautomobili.com/auto-oglasi/...,Hyundai Tucson 2.0 CRDI 4WD,4600,Polovno vozilo,Hyundai,Tucson,2006,231000,Džip/SUV,Dizel,1998,82/111 (kW/KS),Manuelni 5 brzina,4/5 vrata,Zlatna,Vidikovac
3,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 7 GTD,17000,Polovno vozilo,Volkswagen,Golf 7,2016,122000,Hečbek,Dizel,1968,135/184 (kW/KS),Manuelni 6 brzina,4/5 vrata,Crna,Vidikovac
4,https://www.polovniautomobili.com/auto-oglasi/...,Opel Antara 2.0 CDTI 4WD COSMO,5600,Polovno vozilo,Opel,Antara,2007,204000,Džip/SUV,Dizel,1991,110/150 (kW/KS),Manuelni 5 brzina,4/5 vrata,Crna,Vidikovac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31651,https://www.polovniautomobili.com/auto-oglasi/...,Toyota Avensis,2800,Polovno vozilo,Toyota,Avensis,2004,288000,Limuzina,Dizel,1995,85/116 (kW/KS),Manuelni 5 brzina,4/5 vrata,Zlatna,Niš
31652,https://www.polovniautomobili.com/auto-oglasi/...,Honda Civic 1.8 GT,8000,Polovno vozilo,Honda,Civic,2010,205000,Hečbek,Benzin,1799,103/140 (kW/KS),Manuelni 6 brzina,4/5 vrata,Crna,Niš
31653,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 5 1.9tdi 9,4450,Polovno vozilo,Volkswagen,Golf 5,2007,346567,Hečbek,Dizel,1896,77/105 (kW/KS),Manuelni 6 brzina,4/5 vrata,Teget,Niš
31654,https://www.polovniautomobili.com/auto-oglasi/...,Mercedes Benz A 180 A 180 cdi DCT,14800,Polovno vozilo,Mercedes Benz,A 180,2013,153000,Hečbek,Dizel,1796,80/109 (kW/KS),Automatski / poluautomatski,4/5 vrata,Siva,Niš


In [3]:
df.dtypes

url                  object
naslov               object
cena                 object
stanje               object
marka                object
model                object
godiste               int64
kilometraza           int64
karoserija           object
gorivo               object
kubikaza              int64
snaga                object
menjac               object
vrata                object
boja                 object
lokacija_prodavca    object
dtype: object

## Uklanjanje nenumeričkih vrednosti cena i snage

In [4]:
# Cena
df = df.loc[pd.to_numeric(df['cena'], errors='coerce').notnull()]
df.loc[:, 'cena'] = df.loc[:, 'cena'].astype('int64')


# Snaga - zadrzava se brojka u konjskim snagama
def sredi_snagu(snaga: str) -> str:
    try:
        return re.search('(?<=/)\d+', snaga).group(0)
    except AttributeError:
        return None

df['snaga'] = df['snaga'].apply(sredi_snagu)

df = df.loc[pd.to_numeric(df['snaga'], errors='coerce').notnull()]
df.loc[:, 'snaga'] = df.loc[:, 'snaga'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'cena'] = df.loc[:, 'cena'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['snaga'] = df['snaga'].apply(sredi_snagu)


In [5]:
df.dtypes

url                  object
naslov               object
cena                  int64
stanje               object
marka                object
model                object
godiste               int64
kilometraza           int64
karoserija           object
gorivo               object
kubikaza              int64
snaga                 int64
menjac               object
vrata                object
boja                 object
lokacija_prodavca    object
dtype: object

In [6]:
df.shape

(30487, 16)

## Dodeljivanje kategorije cenovnog opsega vozilima.

In [7]:
bins = [0, 2000, 5000, 10000, 15000, 20000, 25000, 30000, math.inf] # math.inf or np.inf?
labels = ['<=1999', '2000-4999', '5000-9999', '10000-14999', '15000-19999', '20000-24999', '25000-29999', '>=30000']
df['cenovni_opseg'] = pd.cut(df['cena'], bins=bins, labels=labels, right=False)

df = df.drop('cena', axis=1)

df

Unnamed: 0,url,naslov,stanje,marka,model,godiste,kilometraza,karoserija,gorivo,kubikaza,snaga,menjac,vrata,boja,lokacija_prodavca,cenovni_opseg
0,https://www.polovniautomobili.com/auto-oglasi/...,Fiat 500 14 16v SPORT CH,Polovno vozilo,Fiat,500,2008,150661,Hečbek,Benzin,1400,101,Manuelni 6 brzina,2/3 vrata,Siva,Vidikovac,2000-4999
1,https://www.polovniautomobili.com/auto-oglasi/...,Peugeot 508 2.0,Polovno vozilo,Peugeot,508,2015,187000,Karavan,Dizel,1997,163,Automatski / poluautomatski,4/5 vrata,Siva,Vidikovac,5000-9999
2,https://www.polovniautomobili.com/auto-oglasi/...,Hyundai Tucson 2.0 CRDI 4WD,Polovno vozilo,Hyundai,Tucson,2006,231000,Džip/SUV,Dizel,1998,111,Manuelni 5 brzina,4/5 vrata,Zlatna,Vidikovac,2000-4999
3,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 7 GTD,Polovno vozilo,Volkswagen,Golf 7,2016,122000,Hečbek,Dizel,1968,184,Manuelni 6 brzina,4/5 vrata,Crna,Vidikovac,15000-19999
4,https://www.polovniautomobili.com/auto-oglasi/...,Opel Antara 2.0 CDTI 4WD COSMO,Polovno vozilo,Opel,Antara,2007,204000,Džip/SUV,Dizel,1991,150,Manuelni 5 brzina,4/5 vrata,Crna,Vidikovac,5000-9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31651,https://www.polovniautomobili.com/auto-oglasi/...,Toyota Avensis,Polovno vozilo,Toyota,Avensis,2004,288000,Limuzina,Dizel,1995,116,Manuelni 5 brzina,4/5 vrata,Zlatna,Niš,2000-4999
31652,https://www.polovniautomobili.com/auto-oglasi/...,Honda Civic 1.8 GT,Polovno vozilo,Honda,Civic,2010,205000,Hečbek,Benzin,1799,140,Manuelni 6 brzina,4/5 vrata,Crna,Niš,5000-9999
31653,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 5 1.9tdi 9,Polovno vozilo,Volkswagen,Golf 5,2007,346567,Hečbek,Dizel,1896,105,Manuelni 6 brzina,4/5 vrata,Teget,Niš,2000-4999
31654,https://www.polovniautomobili.com/auto-oglasi/...,Mercedes Benz A 180 A 180 cdi DCT,Polovno vozilo,Mercedes Benz,A 180,2013,153000,Hečbek,Dizel,1796,109,Automatski / poluautomatski,4/5 vrata,Siva,Niš,10000-14999


## Pretvaranje kategorija u numeričke vrednosti i skaliranje numeričkih podataka

In [8]:
df = pd.get_dummies(df, columns=['cenovni_opseg', 'stanje', 'marka'])

df

Unnamed: 0,url,naslov,model,godiste,kilometraza,karoserija,gorivo,kubikaza,snaga,menjac,...,marka_Toyota,marka_Trabant,marka_UAZ,marka_Vauxhall,marka_Volkswagen,marka_Volvo,marka_Wartburg,marka_Zastava,marka_ZhiDou,marka_Škoda
0,https://www.polovniautomobili.com/auto-oglasi/...,Fiat 500 14 16v SPORT CH,500,2008,150661,Hečbek,Benzin,1400,101,Manuelni 6 brzina,...,0,0,0,0,0,0,0,0,0,0
1,https://www.polovniautomobili.com/auto-oglasi/...,Peugeot 508 2.0,508,2015,187000,Karavan,Dizel,1997,163,Automatski / poluautomatski,...,0,0,0,0,0,0,0,0,0,0
2,https://www.polovniautomobili.com/auto-oglasi/...,Hyundai Tucson 2.0 CRDI 4WD,Tucson,2006,231000,Džip/SUV,Dizel,1998,111,Manuelni 5 brzina,...,0,0,0,0,0,0,0,0,0,0
3,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 7 GTD,Golf 7,2016,122000,Hečbek,Dizel,1968,184,Manuelni 6 brzina,...,0,0,0,0,1,0,0,0,0,0
4,https://www.polovniautomobili.com/auto-oglasi/...,Opel Antara 2.0 CDTI 4WD COSMO,Antara,2007,204000,Džip/SUV,Dizel,1991,150,Manuelni 5 brzina,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31651,https://www.polovniautomobili.com/auto-oglasi/...,Toyota Avensis,Avensis,2004,288000,Limuzina,Dizel,1995,116,Manuelni 5 brzina,...,1,0,0,0,0,0,0,0,0,0
31652,https://www.polovniautomobili.com/auto-oglasi/...,Honda Civic 1.8 GT,Civic,2010,205000,Hečbek,Benzin,1799,140,Manuelni 6 brzina,...,0,0,0,0,0,0,0,0,0,0
31653,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 5 1.9tdi 9,Golf 5,2007,346567,Hečbek,Dizel,1896,105,Manuelni 6 brzina,...,0,0,0,0,1,0,0,0,0,0
31654,https://www.polovniautomobili.com/auto-oglasi/...,Mercedes Benz A 180 A 180 cdi DCT,A 180,2013,153000,Hečbek,Dizel,1796,109,Automatski / poluautomatski,...,0,0,0,0,0,0,0,0,0,0


In [9]:
scaler = StandardScaler()
columns_to_scale = ['godiste', 'kilometraza', 'kubikaza', 'snaga']
scaler.fit(df[columns_to_scale])
scaled_features = scaler.transform(df[columns_to_scale])
for i in range(len(columns_to_scale)):
    df[columns_to_scale[i]] = scaled_features[:, i]
    
df

Unnamed: 0,url,naslov,model,godiste,kilometraza,karoserija,gorivo,kubikaza,snaga,menjac,...,marka_Toyota,marka_Trabant,marka_UAZ,marka_Vauxhall,marka_Volkswagen,marka_Volvo,marka_Wartburg,marka_Zastava,marka_ZhiDou,marka_Škoda
0,https://www.polovniautomobili.com/auto-oglasi/...,Fiat 500 14 16v SPORT CH,500,-0.089751,-0.343189,Hečbek,Benzin,-0.044211,-0.440034,Manuelni 6 brzina,...,0,0,0,0,0,0,0,0,0,0
1,https://www.polovniautomobili.com/auto-oglasi/...,Peugeot 508 2.0,508,1.060758,-0.077278,Karavan,Dizel,0.007066,0.758765,Automatski / poluautomatski,...,0,0,0,0,0,0,0,0,0,0
2,https://www.polovniautomobili.com/auto-oglasi/...,Hyundai Tucson 2.0 CRDI 4WD,Tucson,-0.418468,0.244693,Džip/SUV,Dizel,0.007152,-0.246679,Manuelni 5 brzina,...,0,0,0,0,0,0,0,0,0,0
3,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 7 GTD,Golf 7,1.225117,-0.552917,Hečbek,Dizel,0.004575,1.164810,Manuelni 6 brzina,...,0,0,0,0,1,0,0,0,0,0
4,https://www.polovniautomobili.com/auto-oglasi/...,Opel Antara 2.0 CDTI 4WD COSMO,Antara,-0.254110,0.047120,Džip/SUV,Dizel,0.006551,0.507404,Manuelni 5 brzina,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31651,https://www.polovniautomobili.com/auto-oglasi/...,Toyota Avensis,Avensis,-0.747185,0.661791,Limuzina,Dizel,0.006894,-0.150002,Manuelni 5 brzina,...,1,0,0,0,0,0,0,0,0,0
31652,https://www.polovniautomobili.com/auto-oglasi/...,Honda Civic 1.8 GT,Civic,0.238966,0.054437,Hečbek,Benzin,-0.009940,0.314049,Manuelni 6 brzina,...,0,0,0,0,0,0,0,0,0,0
31653,https://www.polovniautomobili.com/auto-oglasi/...,Volkswagen Golf 5 1.9tdi 9,Golf 5,-0.254110,1.090356,Hečbek,Dizel,-0.001609,-0.362692,Manuelni 6 brzina,...,0,0,0,0,1,0,0,0,0,0
31654,https://www.polovniautomobili.com/auto-oglasi/...,Mercedes Benz A 180 A 180 cdi DCT,A 180,0.732041,-0.326074,Hečbek,Dizel,-0.010198,-0.285350,Automatski / poluautomatski,...,0,0,0,0,0,0,0,0,0,0


## e) Broj (i procentualni odnos) svih automobila za prodaju, koje po ceni pripadaju jednom od sledećih opsega: manje od 2000 €, između 2 000 i 4 999 €, između 5 000 i 9 999 €, između 10 000 € i 14 999 €, između 15 000 € i 19 999 €, između 20 000 € i 24 999 €, između 25 000 € i 29 999 €, 30 000 € ili više.

In [10]:
# fig, ax = plt.subplots()
# opseg_cena, vozila_counts = zip(*[(elem['opseg_cena'], elem['cnt']) for elem in db.opsezi_cena()])
# vozila_count_total = db.vozila_count()
# ax.bar(opseg_cena, vozila_counts)
# ax.set_title("Broj automobila po opsezima cena")
# ax.set_xlabel("cena")
# ax.set_ylabel("broj automobila")

# fig.set_size_inches(18, 3)
# for opseg_cena, cnt in zip(opseg_cena, vozila_counts):
#     ax.annotate("{} ({:.2%})".format(cnt, cnt / vozila_count_total), xy=(opseg_cena, cnt - 100))