In [109]:
import pandas as pd

## Read the CSV file

In [110]:
# Load the dataset from the relative data directory, using the file's "ID"
# column as the DataFrame index, then display the frame as the cell output.
df = pd.read_csv("../data/auto.csv", index_col='ID')
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


## Count the observations

In [111]:
# DataFrame.count() reports the number of non-null entries in each column,
# so columns containing missing values show a smaller figure.
observation_counts = df.count()
print("Number of observations:")
observation_counts

Number of observations:


CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## drop the duplicates

In [112]:
# Two rows count as duplicates when they agree on all of the key columns below;
# within each group of duplicates only the last occurrence is kept.
# The frame is modified in place.
duplicate_key = ['CarNumber', 'Make_n_model', 'Fines']
df.drop_duplicates(subset=duplicate_key, keep='last', inplace=True)

# Non-null counts per column after de-duplication.
remaining_counts = df.count()
print("Number of observations:")
remaining_counts

Number of observations:


CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## work with missing values

In [113]:
# Build a boolean mask of null cells and sum it column-wise to get the
# number of missing values in each column.
missing_per_column = df.isnull().sum()
print("number of missing values:")
missing_per_column

number of missing values:


CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [114]:
# `thresh` is the minimum number of non-null values a column must hold to be
# kept; here that is the current row count minus 500. Columns below the
# threshold are removed from the frame in place.
min_non_null = len(df.index) - 500
df.dropna(axis=1, thresh=min_non_null, inplace=True)

missing_per_column = df.isnull().sum()
print("number of missing values:")
missing_per_column


number of missing values:


CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [115]:
# Forward-fill the Refund column: every missing value takes the most recent
# non-null value that precedes it in the current row order (a missing value
# with nothing before it stays missing).
# Series.ffill() is used instead of fillna(method='ffill'): the `method`
# keyword of fillna is deprecated since pandas 2.1 and raises a FutureWarning.
df['Refund'] = df['Refund'].ffill()

missing_per_column = df.isnull().sum()
print("number of missing values:")
missing_per_column

number of missing values:


CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [116]:
# Replace missing Fines with the column mean, computed over the non-null
# values only (skipna=True).
fines_mean = df['Fines'].mean(skipna=True)
df['Fines'] = df['Fines'].fillna(fines_mean)

missing_per_column = df.isnull().sum()
print("number of missing values:")
missing_per_column

number of missing values:


CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## split and parse the make and model

In [117]:
# Split "Make_n_model" once on whitespace: the first token is the make and
# everything after it is the model.
#   "Ford Focus"       -> Make "Ford",   Model "Focus"
#   "Toyota"           -> Make "Toyota", Model missing (not a copy of the make)
#   "Land Rover Sport" -> Make "Land",   Model "Rover Sport" (nothing dropped)
# The vectorized .str accessor also propagates missing values instead of
# raising, which a per-row lambda calling x.split() would do on NaN.
name_parts = df["Make_n_model"].str.split(n=1)

# Append the two new columns directly (aligned on the existing index, no join
# needed), drop the combined source column, and renumber the rows from 0.
df = (
    df.assign(Make=name_parts.str.get(0), Model=name_parts.str.get(1))
    .drop(columns='Make_n_model')
    .reset_index(drop=True)
)

# Export as a JSON array with one object per row.
df.to_json('auto.json', orient='records')
