# Data Science mit gebrauchten Autos von Willhaben

Hier befassen wir uns mit der Analyse des Datensatzes.

In [1]:
import pandas as pd
import numpy as np
import gc
# Lift pandas' display limits so wide and long frames are rendered without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Read the semicolon-separated CSV once; `data` and `df` both refer to the loaded frame.
data = df = pd.read_csv("./data/final.csv", sep=";")

  data = pd.read_csv("./data/final.csv", sep=";")



## 1. Feature Engineering

Das erste Ziel ist, den Datensatz kennenzulernen. Dabei untersuchen wir die vorhandenen Features mittels deskriptiver Methoden.

In [3]:
# List every column name of the freshly loaded frame.
df.columns

Index(['id', 'description', 'product_id', 'heading', 'body_dyn', 'price',
       'year_model', 'mileage', 'brand', 'model', 'car_type', 'no_of_owners',
       'noofseats', 'engine_effect', 'engine_fuel_resolved',
       'transmission_resolved', 'condition_resolved', 'warranty_resolved',
       'published_string', 'country', 'coordinates', 'postcode', 'state',
       'district', 'address', 'location', 'orgname', 'fnmmocount',
       'upselling_ad_searchresult', 'isprivate', 'equipment_resolved'],
      dtype='object')

Wir entfernen ein paar wenig vielversprechende Spalten:

In [4]:
# Columns that are not used in the rest of the analysis.
unpromising_columns = [
    "product_id",
    "published_string",
    "condition_resolved",
    "country",
    "fnmmocount",
    "upselling_ad_searchresult",
    "no_of_owners",
    "address",
    "description",
]
df = df.drop(columns=unpromising_columns)

In [5]:
# Preview the first six rows to eyeball values and inferred types.
df.head(6)

Unnamed: 0,id,heading,body_dyn,price,year_model,mileage,brand,model,car_type,noofseats,engine_effect,engine_fuel_resolved,transmission_resolved,warranty_resolved,coordinates,postcode,state,district,location,orgname,isprivate,equipment_resolved
0,612174978,Skoda Octavia Combi Style TSI ACT,-,34990,2022,4500,Skoda,Octavia,Kombi / Family Van,5.0,110,Benzin,Schaltgetriebe,Nein,"47.05489,15.1365",8570,Steiermark,Voitsberg,Voitsberg,F. Fripertinger GmbH,0,Bordcomputer|elektr. Fensterheber|Sitzheizung|...
1,612174235,VW Golf R-Line TDI 4MOTION DSG,Extras: Coming-Home-Funktion LED-Rückleuchten ...,43990,2022,3000,VW,Golf,Limousine,5.0,110,Diesel,Automatik,Nein,"47.44651,15.30058",8605,Steiermark,Bruck-Mürzzuschlag,Kapfenberg,Porsche Kapfenberg,0,Leichtmetall-/Alufelgen|Anhängerkupplung|elekt...
2,612174149,Seat Tarraco Xcellence 2.0 TDI DSG 4Drive,Extras: Fahrwerksregelung elektronisch Automat...,38490,2019,40880,Seat,Tarraco,SUV / Geländewagen,7.0,110,Diesel,Automatik,Nein,"48.56585,13.98543",4150,Oberösterreich,Rohrbach,Rohrbach in Oberösterreich,Kneidinger Center GmbH,0,Leichtmetall-/Alufelgen|Bordcomputer|elektr. F...
3,575429445,"Suzuki Vitara 1,4 Hybrid ALLGRIP 6AGS shine",Irrtümer und Tippfehler vorbehalten.,26500,2022,610,Suzuki,Vitara,SUV / Geländewagen,5.0,95,Hybrid Elektro/Benzin,Schaltgetriebe,Ja,"46.6682,12.99953",9640,Kärnten,Hermagor,Kötschach-Mauthen,Autohaus Presslauer-Webhofer GmbH,0,Servicegepflegt|Leasingfähig|elektr. Spiegel|e...
4,575429451,"Suzuki Vitara 1,4 Hybrid ALLGRIP 6AGS flash",Fahrzeug verfügbar ab 09.2022. Irrtümer und ...,27500,2022,620,Suzuki,Vitara,SUV / Geländewagen,5.0,95,Hybrid Elektro/Benzin,Schaltgetriebe,Ja,"46.6682,12.99953",9640,Kärnten,Hermagor,Kötschach-Mauthen,Autohaus Presslauer-Webhofer GmbH,0,Servicegepflegt|Leasingfähig|elektr. Spiegel|e...
5,604066734,BMW 6er-Reihe 630d xDrive Gran Turismo Aut. AC...,Sie suchen eine komfortable Reiselimousine mit...,55890,2018,30000,BMW,6er-Reihe,Limousine,5.0,195,Diesel,Automatik,Ja,"48.50931,15.51201",3542,Niederösterreich,Krems Land,Gföhl,Sinhuber GmbH,0,ABS|Servicegepflegt|Nichtraucherfahrzeug|Fahre...


Aus den ersten sechs Zeilen sieht man, wo sich `pandas` mit den Datentypen schwertut. Deshalb setzen wir die Datentypen selbst:

In [6]:
# Override the dtypes pandas inferred: identifier/free-text columns become
# plain strings, the two low-cardinality columns become categoricals.
text_columns = ["id", "body_dyn", "postcode", "equipment_resolved"]
categorical_columns = ["engine_fuel_resolved", "transmission_resolved"]
dtype_overrides = {
  **dict.fromkeys(text_columns, str),
  **dict.fromkeys(categorical_columns, "category"),
}
df = df.astype(dtype_overrides)

In [7]:
# Show dtypes and non-null counts after the explicit casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220108 entries, 0 to 220107
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   id                     220108 non-null  object  
 1   heading                220108 non-null  object  
 2   body_dyn               220108 non-null  object  
 3   price                  220108 non-null  int64   
 4   year_model             220108 non-null  int64   
 5   mileage                220108 non-null  int64   
 6   brand                  220108 non-null  object  
 7   model                  220108 non-null  object  
 8   car_type               220108 non-null  object  
 9   noofseats              217319 non-null  float64 
 10  engine_effect          220108 non-null  int64   
 11  engine_fuel_resolved   219656 non-null  category
 12  transmission_resolved  219898 non-null  category
 13  warranty_resolved      220108 non-null  object  
 14  coordinates         

Anschließend erstellen wir ein paar neue Features, die für später relevant sein können:

In [8]:
# Feature Engineering (cleaning, new features)
REFERENCE_YEAR = 2022  # year treated as "now" when deriving the vehicle age
KW_TO_PS = 1.36  # factor applied to engine_effect (presumably kW — confirm) to obtain PS

# Normalise seller names: stringify every entry and trim surrounding whitespace.
df["orgname"] = df["orgname"].map(lambda name: str(name).strip())
# Shorter aliases for the resolved fuel / transmission columns.
df["fuel"] = df["engine_fuel_resolved"]
df["transmission"] = df["transmission_resolved"]

# Natural log of strictly positive values; everything else maps to 0.
# where() swaps non-positive entries for 1, and log(1) == 0, so log(0) is never evaluated.
df["ln_price"] = np.log(df["price"].where(df["price"] > 0, 1))
df["ln_mileage"] = np.log(df["mileage"].where(df["mileage"] > 0, 1))
df["age"] = REFERENCE_YEAR - df["year_model"]
# Exact comparison: the former `x in "Ja"` was a substring test, so "", "J" and "a"
# were also counted as warranty, and a missing value raised a TypeError.
df["has_warranty"] = df["warranty_resolved"].eq("Ja").astype("int64")
# Complement of the private-seller flag (1 = commercial seller).
df["gewerblich"] = 1 - df["isprivate"]
df["ps"] = KW_TO_PS * df["engine_effect"]

def check_defect(text: str):
  """Return 1 if ``text`` mentions damage or an accident, otherwise 0.

  The check is a case-insensitive substring search for "schaden" or "unfall".
  Non-string input (e.g. ``None`` or NaN) is treated as "no mention" and
  yields 0 instead of raising.

  NOTE(review): plain substring matching also fires on negated phrases such
  as "unfallfrei" — confirm whether those should count as defects.
  """
  if not isinstance(text, str):
    return 0
  lowered = text.lower()  # lower-case once instead of once per keyword
  return int(any(keyword in lowered for keyword in ("schaden", "unfall")))
# Indicator derived from the free-text description (1 = keyword found).
df["has_defect"] = df["body_dyn"].map(check_defect)

# Indicators derived from the equipment list: new column -> keyword searched
# for (case-insensitively) in equipment_resolved.
equipment_keywords = {
  "is_servicegepflegt": "servicegepflegt",
  "is_leasing": "leasingfähig",
  "has_alufelgen": "alufelgen",
}
for flag_column, keyword in equipment_keywords.items():
  # Bind the keyword as a default argument so each lambda keeps its own value.
  df[flag_column] = df["equipment_resolved"].apply(
    lambda entry, kw=keyword: int(kw in entry.lower())
  )

# Character count of the listing title.
df["length_heading"] = df["heading"].map(len)

Nach dem Feature Engineering räumen wir den Datensatz nochmal auf, indem wir nicht mehr benötigte Spalten entfernen und Duplikate (Zeilen mit gleicher `id`) verwerfen.

In [None]:
# Source columns that have been replaced by the engineered features above.
superseded_columns = [
  "heading",
  "body_dyn",
  "engine_effect",
  "engine_fuel_resolved",
  "transmission_resolved",
  "warranty_resolved",
  "equipment_resolved",
]
# Drop them, then keep only the first row for each listing id.
df = df.drop(columns=superseded_columns).drop_duplicates(subset="id")
# Ask the garbage collector to reclaim the frames that are no longer referenced.
gc.collect()

In [13]:
# Re-check dtypes and non-null counts after dropping columns and duplicate ids.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 145428 entries, 0 to 220105
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   id                  145428 non-null  object  
 1   price               145428 non-null  int64   
 2   year_model          145428 non-null  int64   
 3   mileage             145428 non-null  int64   
 4   brand               145428 non-null  object  
 5   model               145428 non-null  object  
 6   car_type            145428 non-null  object  
 7   noofseats           143728 non-null  float64 
 8   coordinates         145163 non-null  object  
 9   postcode            145428 non-null  object  
 10  state               145428 non-null  object  
 11  district            145428 non-null  object  
 12  location            145428 non-null  object  
 13  orgname             145428 non-null  object  
 14  isprivate           145428 non-null  int64   
 15  fuel             

In [14]:
# Summary statistics for the numeric columns of the cleaned frame.
df.describe()

Unnamed: 0,price,year_model,mileage,noofseats,isprivate,ln_price,ln_mileage,age,has_warranty,gewerblich,ps,has_defect,is_servicegepflegt,is_leasing,has_alufelgen,length_heading
count,145428.0,145428.0,145428.0,143728.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0,145428.0
mean,23508.08,2013.012769,112716.857751,4.936672,0.478058,9.626442,10.735652,8.987231,0.196516,0.521942,157.271747,0.045514,0.459361,0.185542,0.670985,33.447555
std,31227.89,9.319773,91237.173815,0.938394,0.49952,1.03741,2.304749,9.319773,0.397365,0.49952,87.493063,0.208429,0.498347,0.388738,0.469857,17.082377
min,1.0,1900.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,8790.0,2009.0,32042.0,5.0,0.0,9.08137,10.374803,3.0,0.0,0.0,110.16,0.0,0.0,0.0,0.0,21.0
50%,18205.5,2016.0,99010.0,5.0,0.0,9.809479,11.502976,6.0,0.0,1.0,140.08,0.0,0.0,0.0,1.0,31.0
75%,29900.0,2019.0,174000.0,5.0,1.0,10.305614,12.066811,13.0,0.0,1.0,187.68,0.0,1.0,0.0,1.0,44.0
max,5840000.0,2022.0,940000.0,95.0,1.0,15.580241,13.753635,122.0,1.0,1.0,12920.0,1.0,1.0,1.0,1.0,662.0
