# Clean Used Car Data

In [1]:
import pandas as pd 
import numpy as np 
import statistics as stat 
import sys
sys.path.append(".")
import logging as logger

## Load the data & check the data types

In [2]:
# Load the raw vehicles CSV into a dataframe for cleaning.
# The path is relative, so it resolves against the notebook's working directory.

raw_data = pd.read_csv("../data/vehicles.csv")
  

In [None]:
raw_data.shape # (rows, columns) - sanity check that the expected data has arrived

In [None]:
# Preview the first 10 rows to eyeball the raw values in each column.
raw_data.head(10)

In [None]:
# Summary statistics; by default describe() only covers the numeric columns.
raw_data.describe()

In [None]:
%%time
raw_data.info() # per-column dtype and non-null count; the %%time cell magic reports how long the call takes

## Initial Notes
1. The data shows a number of features that contain null values.
2. There are a number of features that are not required for modelling

## Dropping columns not required

In [None]:
# Identifier, URL, location and free-text columns are not used for modelling,
# so they are removed before any further cleaning.
unused_columns = [
    'id', 'url', 'region_url', 'VIN', 'county', 'lat', 'long',
    'posting_date', 'image_url', 'description', 'model',
]
data_1 = raw_data.drop(columns=unused_columns)

## Calculate the number of NaNs per column

In [None]:
# Tally the missing values per column and express each tally as a
# fraction of the total row count (the '%' column holds a 0-1 ratio).

null_check = pd.DataFrame({"Null": data_1.isnull().sum()})
total = len(raw_data)
null_check['%'] = null_check['Null'] / total
null_check.sort_values(by='%', ascending=False)


## Removing rows with NaNs in features that are missing only a small share of values

In [None]:
# Drop every row that has a NaN in one of the features missing <= 10% of
# their values, then re-tally the remaining nulls per column.
# NOTE: the ratio is still computed against the raw row count (len(raw_data)),
# not against the number of rows left in data_2.
sparse_null_features = ['year', 'transmission', 'fuel', 'odometer', 'title_status']
data_2 = data_1.dropna(subset=sparse_null_features)
null_check_2 = pd.DataFrame({"Null": data_2.isnull().sum()})
total = len(raw_data)
null_check_2['%'] = null_check_2['Null'] / total
null_check_2.sort_values(by='%', ascending=False)

# Check the value counts for all the categorical features

In [None]:
# Check the levels of each categorical feature.
# value_counts() excludes NaN entries by default, so only observed levels are listed.
data_2['size'].value_counts()

In [None]:
# Frequency of each observed `condition` level (NaN excluded by default).
data_2['condition'].value_counts()

In [None]:
# Frequency of each observed `drive` level (NaN excluded by default).
data_2['drive'].value_counts()

In [None]:
# Frequency of each observed `paint_color` level (NaN excluded by default).
data_2['paint_color'].value_counts()

In [None]:
# Frequency of each observed `type` level (NaN excluded by default).
data_2['type'].value_counts()

In [None]:
# Frequency of each observed `manufacturer` level (NaN excluded by default).
data_2['manufacturer'].value_counts()

In [None]:
# Frequency of each `title_status` level; rows with a NaN here were already dropped when building data_2.
data_2['title_status'].value_counts()

In [None]:
# Frequency of each observed `region` level (NaN excluded by default).
data_2['region'].value_counts()

In [None]:
# Frequency of each `fuel` level; rows with a NaN here were already dropped when building data_2.
data_2['fuel'].value_counts()

In [None]:
# Frequency of each `transmission` level; rows with a NaN here were already dropped when building data_2.
data_2['transmission'].value_counts()

In [None]:
# Frequency of each observed `state` level (NaN excluded by default).
data_2['state'].value_counts()

## Notes 
1. Fuel has 30k records classed as other. The actual fuel type cannot be known or predicted, so these rows will be removed.
2. Transmission has ~62k records classed as other. The actual transmission type cannot be known or predicted, so these rows will be removed.
3. in the feature title status, the field is used to highlight if the item being sold is 

In [None]:
# Remove the rows whose fuel OR transmission is classed as 'other'.
# Both conditions are applied in one mask: filtering data_2 twice in a row
# would let the second assignment overwrite the first, silently keeping the
# fuel == 'other' rows.
keep_rows = (data_2['fuel'] != 'other') & (data_2['transmission'] != 'other')
data_3 = data_2[keep_rows]

In [None]:
# Re-tally the nulls per column after the 'other' rows were filtered out.
# NOTE: the ratio is still computed against the raw row count (len(raw_data)).
null_check_3 = pd.DataFrame({"Null": data_3.isnull().sum()})
total = len(raw_data)
null_check_3['%'] = null_check_3['Null'] / total
null_check_3.sort_values(by='%', ascending=False)


## Encoding features
Preparing each of the features for imputation using KNN.
Each feature will require its NaN rows to be removed to a separate dataframe. These will be merged back once the feature is encoded. 


In [None]:
# Working dataframe for the encoding step.
# An explicit copy is taken because the encoding loop writes into this frame
# with .loc; a plain alias of data_3 (itself a filtered slice of data_2) would
# mutate data_3 as well and trigger pandas' SettingWithCopyWarning.
prep_encoding = data_3.copy()

In [None]:
# Build the list of categorical columns to encode: every column except the
# numeric features, which are left untouched.
numeric_features = ['year', 'price', 'odometer']
cols = prep_encoding.columns.tolist()
for numeric_feature in numeric_features:
    cols.remove(numeric_feature)
cols

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# One fitted OrdinalEncoder per categorical column, keyed by column name,
# so the encoding can be reversed after imputation.
encoder_dict = {}

for name in cols:
    encoder = OrdinalEncoder()
    encoder_dict[name] = encoder

    # Only the observed (non-null) values are encoded; NaN rows are left
    # untouched so they can be imputed afterwards.
    column = prep_encoding[name]
    observed = column.notnull()
    observed_values = column[observed].values.reshape(-1, 1)

    # Write the ordinal codes back over the original category values.
    codes = encoder.fit_transform(observed_values)
    prep_encoding.loc[observed, name] = np.squeeze(codes)

In [None]:
# Inspect the first 50 rows to confirm the categorical columns now hold ordinal codes (NaNs remain).
prep_encoding.head(50)

# Impute NaNs using KNN 

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

In [None]:
# KNN imputer that fills each NaN from the 5 nearest rows.
imputer = KNNImputer(n_neighbors=5)
# NOTE: the min-max scaling step below is currently disabled, so the imputer
# is fitted on unscaled features and no `scale` object exists in this notebook.
#create a scaler to bring all values on the same scale
#scale = MinMaxScaler()
#scaled_data = pd.DataFrame(scale.fit_transform(prep_encoding), columns=prep_encoding.columns) # new dataframe 



In [None]:
# Impute the encoded (currently unscaled) data with the KNN imputer.
# For testing purposes only a reproducible 10% sample is imputed; switch to
# the full dataframe for a complete run.

sample = prep_encoding.sample(frac=0.1, random_state=1, replace=False)
imputed_values = imputer.fit_transform(sample)
finaldataset = pd.DataFrame(imputed_values, columns=prep_encoding.columns)


In [None]:
# The imputed dataset (still ordinal-encoded; the scaling step is commented out, so it is not scaled).
# NOTE(review): this bare expression is not the cell's last statement, so it is presumably not rendered.
finaldataset
# Persist the dataframe with the IPython %store magic so the modelling file can access it.
%store finaldataset

In [None]:
# Display the imputed (still ordinal-encoded) dataset.
finaldataset

# Reverse the scaled data to its normal form

In [None]:
x = pd.DataFrame(scale.inverse_transform(finaldataset), columns=finaldataset.columns)
%store readydata

In [None]:
# Display the dataframe that was persisted with %store in the previous cell.
readydata

In [None]:
# Reverse the ordinal encoding to recover the original (but now imputed)
# category labels.
for col in x:
    # Numeric features (year, price, odometer) were never encoded and have
    # no entry in encoder_dict, so they are left as they are.
    if col not in encoder_dict:
        continue

    # KNN imputation averages neighbouring codes, which yields fractional
    # values; round to the nearest valid code before decoding instead of
    # relying on the encoder's integer cast, which truncates.
    codes = np.rint(x[col].values).reshape(-1, 1)
    x[col] = encoder_dict[col].inverse_transform(codes).ravel()

In [None]:
# Check that all nulls were removed: each column should report False.
x.isnull().any()

In [None]:
# Preview the first 35 rows of the decoded dataframe.
x.head(35)