## Importing libraries

In [1]:
import pandas as pd
import numpy as np

## Reading CSV data

In [2]:
# Column labels are supplied explicitly; because `names` is given without
# `header`, pandas reads the first line of the file as data, not as a header.
column_names = [
    'Price', 'Title', 'Description', 'DOP', 'Make', 'Model',
    'Year', 'Fuel', 'Milage', 'Location', 'Area',
]
olx_raw = pd.read_csv("olx_data.csv", names=column_names)
print(olx_raw.shape)

(5032, 11)


## Sample data

In [3]:
# Preview the first five rows of the raw listings.
olx_raw.head(n=5)

Unnamed: 0,Price,Title,Description,DOP,Make,Model,Year,Fuel,Milage,Location,Area
0,735008.0,"Mahindra Xuv500 XUV500 W4, 2015, Diesel",Selling Price 811000 / Deal Of The Day F...,2019-09-07T15:52:40+05:30,mahindra,mahindra-xuv500,2015.0,diesel,50000,Vadodara,Akota
1,737000.0,"Mahindra Xuv500 XUV500 W4, 2015, Diesel",Selling Price 811000 / Deal Of The Day F...,2019-09-07T15:12:33+05:30,mahindra,mahindra-xuv500,2015.0,diesel,52000,Vadodara,Akota
2,740000.0,"Mahindra Xuv500 XUV500 W4, 2015, Diesel",Selling Price 811000 / Deal Of The Day F...,2019-09-07T15:16:49+05:30,mahindra,mahindra-xuv500,2015.0,diesel,54000,Vadodara,Akota
3,790000.0,"Mahindra Xuv500 XUV500 W6, 2016, Diesel",Well maintained genuine kilometre Company reco...,2019-09-15T15:32:47+05:30,mahindra,mahindra-xuv500,2016.0,diesel,125000,Vadodara,Gotri Road
4,210000.0,Jeep for Sale,CJ3B jeep for URGENT sale. Single owner,2019-09-15T12:49:18+05:30,mahindra,mahindra-others,1984.0,diesel,23000,Vadodara,Kareli Bagh


## Dropping columns

In [4]:
# Remove the columns listed below; Price, Model, Year, Fuel and Milage remain.
unused_columns = ['Title', 'Description', 'DOP', 'Location', 'Area', 'Make']
olx_lr = olx_raw.drop(columns=unused_columns)

In [5]:
# Preview the first five rows after the column drop.
olx_lr.head(n=5)

Unnamed: 0,Price,Model,Year,Fuel,Milage
0,735008.0,mahindra-xuv500,2015.0,diesel,50000
1,737000.0,mahindra-xuv500,2015.0,diesel,52000
2,740000.0,mahindra-xuv500,2015.0,diesel,54000
3,790000.0,mahindra-xuv500,2016.0,diesel,125000
4,210000.0,mahindra-others,1984.0,diesel,23000


## Dropping rows with zero mileage

In [6]:
# Keep only the rows whose Milage value is different from 0.
has_milage = olx_lr['Milage'].ne(0)
olx_lr = olx_lr.loc[has_milage]

## Replacing infrequent car models with NA

In [7]:
# A value that occurs `threshold` times or fewer (note: the comparison is
# inclusive) is considered too rare and is replaced with NaN.
threshold = 30
for col in ['Model']:
    value_counts = olx_lr[col].value_counts()  # occurrences of each distinct value
    to_remove = value_counts[value_counts <= threshold].index
    # Assign the masked column back to the frame rather than calling
    # `.replace(..., inplace=True)` on `olx_lr[col]`: that form is chained
    # assignment, which mutates only a temporary Series and leaves `olx_lr`
    # unchanged when pandas copy-on-write is active.
    olx_lr = olx_lr.assign(**{col: olx_lr[col].mask(olx_lr[col].isin(to_remove))})

## Dropping rows with NA values

In [8]:
# Discard every row that contains a missing value in any column,
# then report the remaining (rows, columns).
olx_lr = olx_lr.dropna(axis=0, how='any')
print(olx_lr.shape)

(3608, 5)


## Checking NA values

In [9]:
# Count the missing values per column.
olx_lr.isna().sum(axis=0)

Price     0
Model     0
Year      0
Fuel      0
Milage    0
dtype: int64

## Inspecting data

In [10]:
def inspect_data(data):
    """Summarise each column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with three columns:
        "Data Type" (the column dtype), "No of Levels" (the number of
        distinct values returned by ``unique()``) and "Levels" (the string
        form of those distinct values).
    """
    level_counts = data.apply(lambda column: len(column.unique()), axis=0)
    level_values = data.apply(lambda column: str(column.unique()), axis=0)
    summary = {
        "Data Type": data.dtypes,
        "No of Levels": level_counts,
        "Levels": level_values,
    }
    return pd.DataFrame(summary)

# Show dtype, number of distinct values and the distinct values per column.
inspect_data(data=olx_lr)

Unnamed: 0,Data Type,No of Levels,Levels
Price,float64,533,[735008.0 737000.0 740000.0 790000.0 124999.0 ...
Model,object,55,['mahindra-xuv500' 'mahindra-scorpio' 'mahindr...
Year,float64,25,[2015.0 2016.0 2004.0 2014.0 2012.0 2007.0 201...
Fuel,object,4,['diesel' 'petrol' 'cng' 'lpg']
Milage,int64,772,[50000 52000 54000 125000 96000 30950 53000 95...


## Casting numeric columns to integer

In [11]:
# Cast Price, Year and Milage to integers in a single pass.
integer_columns = {'Price': 'int', 'Year': 'int', 'Milage': 'int'}
olx_lr = olx_lr.astype(integer_columns)

In [12]:
# Write the cleaned frame to disk without the index column.
olx_lr.to_csv(path_or_buf="olx_clean.csv", index=False)