In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw train and test splits from the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
# Stack train on top of test (row-wise) to inspect the shared schema.
# Row labels are not reset, so labels from train repeat in the test part.
df = pd.concat(objs=[train, test], axis='index')

In [4]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,myear,transmission,fuel,km,ip,oem,model,variant,City,top_features,...,Max Torque At,body,Engine Type,Gear Box,Steering Type,Front Brake Type,Rear Brake Type,Tyre Type,exterior_color,listed_price
0,2013,manual,petrol,74211.0,0,toyota,toyota corolla altis,1.8 j,mumbai,"['power steering', 'power windows front', 'air...",...,4000.0,sedan,2zr-fe gasoline 4cylinder,6 speed,power,ventilated disc,disc,tubeless radial,white,440000.0
1,2014,manual,petrol,73259.0,0,hyundai,hyundai i10,sportz,new delhi,"['power steering', 'power windows front', 'air...",...,4000.0,hatchback,dohc kappa2 engine,5 speed,power,ventilated disc,drum,tubeless radial,white,252915.0
2,2018,manual,petrol,74275.0,0,hyundai,hyundai i20,1.2 spotz,lucknow,"['power steering', 'power windows front', 'air...",...,4000.0,hatchback,kappa vtvt petrol engine,5 speed,power,disc,drum,tubeless,silver,475000.0
3,2015,automatic,petrol,58212.0,1,mercedes-benz,mercedes-benz s-class,s 500 l,mumbai,"['power steering', 'power windows front', 'air...",...,2650.0,sedan,v-type petrol engine,7 speed,power,disc,disc,tubeless radial,blue,5399999.0
4,2019,manual,petrol,16093.0,0,maruti,maruti baleno,delta,gurgaon,"['power steering', 'power windows front', 'air...",...,4200.0,hatchback,1.2l vvt engine,5 speed,power,disc,drum,tubeless radial,blue,583412.0


In [5]:
# List every column name of the combined frame.
df.columns

Index(['myear', 'transmission', 'fuel', 'km', 'ip', 'oem', 'model', 'variant',
       'City', 'top_features', 'comfort_features', 'interior_features',
       'exterior_features', 'safety_features', 'No of Cylinder',
       'Valves per Cylinder', 'Valve Configuration', 'Turbo Charger',
       'Super Charger', 'Length', 'Width', 'Height', 'Wheel Base',
       'Front Tread', 'Rear Tread', 'Kerb Weight', 'Drive Type', 'Seats',
       'Top Speed', 'Acceleration', 'Doors', 'Cargo Volume', 'state',
       'owner_type', 'Fuel Supply System', 'Max Power Delivered',
       'Max Power At', 'Max Torque Delivered', 'Max Torque At', 'body',
       'Engine Type', 'Gear Box', 'Steering Type', 'Front Brake Type',
       'Rear Brake Type', 'Tyre Type', 'exterior_color', 'listed_price'],
      dtype='object')

## 1. Cleaning the data

### 1.1. Cleaning exterior_color

In [6]:
# Example of a multi-word colour name, split into whitespace-separated words.
train.loc[101, 'exterior_color'].split()

['metallic', 'woodland', 'brown']

In [7]:
# The 40 most frequent colour names in train, most common first.
train['exterior_color'].value_counts().head(40)

white                       9152
silver                      4012
grey                        2962
red                         2549
blue                        1827
black                       1790
brown                        930
other                        553
golden                       446
gray                         435
maroon                       428
silky silver                 344
polar white                  341
orange                       262
pearl arctic white           260
sleek silver                 257
pure white                   224
superior white               208
candy white                  179
glistening grey              172
granite grey                 168
golden brown metallic        142
diamond white                122
carnelian red pearl          120
green                        119
pearl white                  108
urban titanium metallic      100
metallic silky silver         99
tafeta white                  95
star dust                     94
glacier wh

In [8]:
# Base colour buckets. reduceColor scans this list front to back, so the
# generic 'metallic' bucket sits last and only wins when no real colour word
# is present in the name.
colors = [
    'white',
    'silver',
    'grey',
    'red',
    'blue',
    'black',
    'brown',
    'maroon',
    'orange',
    'green',
    'metallic',
]

In [9]:
#Reducing the cardinality of color and fixing some data entry errors
def reduceColor(row):
    """Collapse a free-text ``exterior_color`` value into a base colour.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['exterior_color']``.

    Returns
    -------
    object
        The reduced colour name, or the original value when no rule applies.

    Rules, applied in this order:

    1. Missing / non-string values (``None``, ``NaN``) are returned unchanged
       instead of raising ``AttributeError`` on ``.split()``.
    2. Any name containing the word ``'gray'`` becomes ``'grey'``.
    3. A name that already is one of ``colors`` is kept as is.
    4. ``'other'`` becomes ``'white'``.
       NOTE(review): presumably chosen because white is the most frequent
       colour in train -- confirm this is the intended default.
    5. Otherwise the first entry of ``colors`` that occurs as a whole word in
       the name is returned (e.g. ``'metallic woodland brown'`` -> ``'brown'``).
    6. Names matching nothing are returned unchanged.
    """
    value = row['exterior_color']

    # Guard: NaN is a float and None has no .split(); pass them through.
    if not isinstance(value, str):
        return value

    words = value.split()

    # Spelling variant is normalised before anything else.
    if 'gray' in words:
        return 'grey'
    if value in colors:
        return value
    if value == 'other':
        return 'white'

    # Whole-word match only: the name is compared token by token, never by
    # substring, and the order of `colors` decides ties.
    for color in colors:
        if color in words:
            return color
    return value

In [10]:
# Collapse the colour names row by row in both splits.
for frame in (train, test):
    frame['exterior_color'] = frame.apply(reduceColor, axis='columns')

In [11]:
# Number of distinct colour values left in train after the reduction.
train['exterior_color'].nunique()

154

### 1.2. Cleaning top_features

In [12]:
# Raw value of the first row: a Python-style list stored as one string.
train.loc[0, 'top_features']

"['power steering', 'power windows front', 'air conditioner', 'heater', 'adjustable head lights', 'power adjustable exterior rear view mirror', 'anti lock braking system', 'brake assist', 'speakers front']"

In [13]:
# Drop the surrounding brackets, then count the ', '-separated entries.
sample_features = train.loc[0, 'top_features']
len(sample_features[1:-1].split(', '))

9

In [14]:
#Converting each entry to the number of features it lists
def reduceTF(row):
    """Return how many features ``row['top_features']`` lists.

    The column holds a list rendered as a string, e.g.
    ``"['power steering', 'heater']"``; the entries are counted by splitting
    the text between the brackets on ``', '``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['top_features']``.

    Returns
    -------
    int or float
        The number of entries, or ``np.nan`` when the list is empty
        (``'[]'``, an empty/blank string) or the value is ``None``.
        Non-string values are returned unchanged, so ``NaN`` stays ``NaN`` and
        an already computed count survives a second run of the apply cell.
    """
    value = row['top_features']

    # Not a string: either missing or already converted to a number.
    if not isinstance(value, str):
        return np.nan if value is None else value

    inner = value.strip()[1:-1]
    # Nothing between the brackets means no features were recorded.
    if not inner.strip():
        return np.nan
    return len(inner.split(', '))

In [23]:
#Replacing every top_features entry with its feature count, in both splits
for frame in (train, test):
    frame['top_features'] = frame.apply(reduceTF, axis='columns')

#Empty lists were turned into NaN; report how many there are per split
print("In train, Nans in top_features: ", train['top_features'].isna().sum())
print("In test, Nans in top_features: ", test['top_features'].isna().sum())

In train, Nans in top_features:  228
In test, Nans in top_features:  34


### 1.3. Cleaning comfort_features

In [24]:
#Converting each entry to the number of features it lists
def reduceCF(row):
    """Return how many features ``row['comfort_features']`` lists.

    The column holds a list rendered as a string; the entries are counted by
    splitting the text between the brackets on ``', '``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['comfort_features']``.

    Returns
    -------
    int or float
        The number of entries, or ``np.nan`` when the list is empty
        (``'[]'``, an empty/blank string) or the value is ``None``.
        Non-string values are returned unchanged, so ``NaN`` stays ``NaN`` and
        an already computed count survives a second run of the apply cell.
    """
    value = row['comfort_features']

    # Not a string: either missing or already converted to a number.
    if not isinstance(value, str):
        return np.nan if value is None else value

    inner = value.strip()[1:-1]
    # Nothing between the brackets means no features were recorded.
    if not inner.strip():
        return np.nan
    return len(inner.split(', '))

In [26]:
#Replacing every comfort_features entry with its feature count, in both splits
for frame in (train, test):
    frame['comfort_features'] = frame.apply(reduceCF, axis='columns')

#Empty lists were turned into NaN; report how many there are per split
print("In train, Nans in comfort_features: ", train['comfort_features'].isna().sum())
print("In test, Nans in comfort_features: ", test['comfort_features'].isna().sum())

In train, Nans in comfort_features:  232
In test, Nans in comfort_features:  36


### 1.4. Cleaning interior_features

In [29]:
#Converting each entry to the number of features it lists
def reduceIF(row):
    """Return how many features ``row['interior_features']`` lists.

    The column holds a list rendered as a string; the entries are counted by
    splitting the text between the brackets on ``', '``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['interior_features']``.

    Returns
    -------
    int or float
        The number of entries, or ``np.nan`` when the list is empty
        (``'[]'``, an empty/blank string) or the value is ``None``.
        Non-string values are returned unchanged, so ``NaN`` stays ``NaN`` and
        an already computed count survives a second run of the apply cell.
    """
    value = row['interior_features']

    # Not a string: either missing or already converted to a number.
    if not isinstance(value, str):
        return np.nan if value is None else value

    inner = value.strip()[1:-1]
    # Nothing between the brackets means no features were recorded.
    if not inner.strip():
        return np.nan
    return len(inner.split(', '))

In [30]:
#Replacing every interior_features entry with its feature count, in both splits
for frame in (train, test):
    frame['interior_features'] = frame.apply(reduceIF, axis='columns')

#Empty lists were turned into NaN; report how many there are per split
print("In train, Nans in interior_features: ", train['interior_features'].isna().sum())
print("In test, Nans in interior_features: ", test['interior_features'].isna().sum())

In train, Nans in interior_features:  273
In test, Nans in interior_features:  38


### 1.5. Cleaning exterior_features

In [33]:
#Converting each entry to the number of features it lists
def reduceEF(row):
    """Return how many features ``row['exterior_features']`` lists.

    The column holds a list rendered as a string; the entries are counted by
    splitting the text between the brackets on ``', '``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['exterior_features']``.

    Returns
    -------
    int or float
        The number of entries, or ``np.nan`` when the list is empty
        (``'[]'``, an empty/blank string) or the value is ``None``.
        Non-string values are returned unchanged, so ``NaN`` stays ``NaN`` and
        an already computed count survives a second run of the apply cell.
    """
    value = row['exterior_features']

    # Not a string: either missing or already converted to a number.
    if not isinstance(value, str):
        return np.nan if value is None else value

    inner = value.strip()[1:-1]
    # Nothing between the brackets means no features were recorded.
    if not inner.strip():
        return np.nan
    return len(inner.split(', '))

In [34]:
#Replacing every exterior_features entry with its feature count, in both splits
for frame in (train, test):
    frame['exterior_features'] = frame.apply(reduceEF, axis='columns')

#Empty lists were turned into NaN; report how many there are per split
print("In train, Nans in exterior_features: ", train['exterior_features'].isna().sum())
print("In test, Nans in exterior_features: ", test['exterior_features'].isna().sum())

In train, Nans in exterior_features:  304
In test, Nans in exterior_features:  40


### 1.6. Cleaning safety_features

In [36]:
#Converting each entry to the number of features it lists
def reduceSF(row):
    """Return how many features ``row['safety_features']`` lists.

    The column holds a list rendered as a string; the entries are counted by
    splitting the text between the brackets on ``', '``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['safety_features']``.

    Returns
    -------
    int or float
        The number of entries, or ``np.nan`` when the list is empty
        (``'[]'``, an empty/blank string) or the value is ``None``.
        Non-string values are returned unchanged, so ``NaN`` stays ``NaN`` and
        an already computed count survives a second run of the apply cell.
    """
    value = row['safety_features']

    # Not a string: either missing or already converted to a number.
    if not isinstance(value, str):
        return np.nan if value is None else value

    inner = value.strip()[1:-1]
    # Nothing between the brackets means no features were recorded.
    if not inner.strip():
        return np.nan
    return len(inner.split(', '))

In [37]:
#Replacing every safety_features entry with its feature count, in both splits
for frame in (train, test):
    frame['safety_features'] = frame.apply(reduceSF, axis='columns')

#Empty lists were turned into NaN; report how many there are per split
print("In train, Nans in safety_features: ", train['safety_features'].isna().sum())
print("In test, Nans in safety_features: ", test['safety_features'].isna().sum())

In train, Nans in safety_features:  345
In test, Nans in safety_features:  46


## 2. Imputing the newly created NaNs

In [40]:
#Names of the train columns holding at least one NaN, in column order
nan_cols = train.columns[train.isna().any()].tolist()
nan_cols

['top_features',
 'comfort_features',
 'interior_features',
 'exterior_features',
 'safety_features']

In [42]:
#Creating the simple imputer object
# Missing entries will be replaced by the mean of their column, as learned
# from whatever data the imputer is fitted on.
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'mean')

In [44]:
#Imputing the features
# The column means are learned from train only and then reused for test, so
# no information from test leaks into the fill values.
# The imputer returns a bare array; the column names and the original row
# index are attached right away so that assigning the result back into
# train/test aligns row for row, whatever index those frames carry.
imp_train = pd.DataFrame(si.fit_transform(train[nan_cols]),
                         columns=nan_cols, index=train.index)
imp_test = pd.DataFrame(si.transform(test[nan_cols]),
                        columns=nan_cols, index=test.index)

In [51]:
#Labelling the imputed frames with the names of the columns they came from
for imputed in (imp_train, imp_test):
    imputed.columns = nan_cols
#No index fix-up is done here: both indexes start from 0

In [55]:
#Overwriting the NaN-bearing columns of each split with their imputed version
for frame, imputed in ((train, imp_train), (test, imp_test)):
    frame[nan_cols] = imputed

In [58]:
#Writing the cleaned splits to csv files
# NOTE(review): to_csv runs with its defaults, so the row index is written as
# an unnamed first column -- confirm the notebook reading these files expects it.
for frame, path in ((train, 'train2.csv'), (test, 'test2.csv')):
    frame.to_csv(path)