## Preprocessing

In [1]:
import pandas as pd

### Load the dataset and make ID the index column

In [2]:
# Read the raw records from the comma-separated file. The first line is the
# header row, and the file's ID column becomes the DataFrame index.
csv_path = '../../datasets/auto.csv'
views = pd.read_csv(csv_path, index_col='ID', header=0, sep=',')
print(views)

        CarNumber    Make_n_model  Refund    Fines  History
ID                                                         
0    Y163O8161RUS      Ford Focus     2.0   3200.0      NaN
1     E432XX77RUS    Toyota Camry     1.0   6500.0      NaN
2     7184TT36RUS      Ford Focus     1.0   2100.0      NaN
3    X582HE161RUS      Ford Focus     2.0   2000.0      NaN
4    E34877152RUS      Ford Focus     2.0   6100.0      NaN
..            ...             ...     ...      ...      ...
926  Y163O8161RUS      Ford Focus     2.0   1600.0      NaN
927  M0309X197RUS      Ford Focus     1.0  22300.0      NaN
928  O673E8197RUS      Ford Focus     2.0    600.0      NaN
929  8610T8154RUS      Ford Focus     1.0   2000.0      NaN
930  H419XE197RUS  Toyota Corolla     2.0      NaN      2.0

[931 rows x 5 columns]


### Count the number of observations using the method count()

In [3]:
# count() reports the number of non-null CarNumber entries.
print(views.loc[:, 'CarNumber'].count())

931


### Drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

In [4]:
# Two rows are duplicates when they agree on all of these columns; of each
# group of duplicates only the last occurrence is kept.
dedup_columns = ['CarNumber', 'Make_n_model', 'Fines']
views = views.drop_duplicates(keep='last', subset=dedup_columns)
print(views.loc[:, 'CarNumber'].count())

725


### Work with missing values

In [5]:
# A column is dropped when it has more than this many missing values.
MAX_MISSING_PER_COLUMN = 500

# Missing values per column before any cleaning.
print(views.isna().sum())

# `thresh` is the minimum number of non-NA values a column must have to be
# kept, so "at most MAX_MISSING_PER_COLUMN missing" translates to
# "at least len(views) - MAX_MISSING_PER_COLUMN present".
views = views.dropna(
    axis='columns',
    thresh=len(views) - MAX_MISSING_PER_COLUMN,
)
print(views.isna().sum())

# Refund: forward fill, i.e. carry the previous row's Refund value down.
# (The earlier version filled the gaps from the forward-filled *Fines* column,
# which put fine amounts into Refund.)
# Fines: replace gaps with the column mean (mean() skips NaN by default).
# assign() builds a new frame instead of writing into a derived one.
views = views.assign(
    Refund=views['Refund'].ffill(),
    Fines=views['Fines'].fillna(views['Fines'].mean()),
)
print(views.isna().sum())

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64
CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64
CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64


### Split and parse the make and model

In [6]:
# Split "Make Model" on the first space only, so multi-word model names stay
# intact. The vectorised .str accessor replaces a per-row apply() that built
# one pd.Series per record.
make_and_model = (
    views['Make_n_model']
    .astype(str)                        # same str(x) coercion as before
    .str.split(' ', n=1, expand=True)
    # Guarantee exactly two columns even if no value contains a space;
    # values without a model part end up missing in the second column.
    .reindex(columns=[0, 1])
)
views[['Make', 'Model']] = make_and_model
views = views.drop(columns=['Make_n_model'])
print(views)



        CarNumber  Refund         Fines    Make    Model
ID                                                      
0    Y163O8161RUS     2.0   3200.000000    Ford    Focus
1     E432XX77RUS     1.0   6500.000000  Toyota    Camry
2     7184TT36RUS     1.0   2100.000000    Ford    Focus
3    X582HE161RUS     2.0   2000.000000    Ford    Focus
5    92918M178RUS     1.0   5700.000000    Ford    Focus
..            ...     ...           ...     ...      ...
926  Y163O8161RUS     2.0   1600.000000    Ford    Focus
927  M0309X197RUS     1.0  22300.000000    Ford    Focus
928  O673E8197RUS     2.0    600.000000    Ford    Focus
929  8610T8154RUS     1.0   2000.000000    Ford    Focus
930  H419XE197RUS     2.0   8594.586466  Toyota  Corolla

[725 rows x 5 columns]


In [7]:
# Persist the cleaned frame as a JSON array with one object per row
# (orient='records' emits column values only).
views.to_json(path_or_buf='auto.json', orient='records')

In [8]:
# Sanity check on the cleaned frame: non-null entries per column, followed by
# the mean of the Fines column.
non_null_per_column = views.count()
mean_fine = views['Fines'].mean()
print(non_null_per_column)
print(mean_fine)

CarNumber    725
Refund       725
Fines        725
Make         725
Model        716
dtype: int64
8594.586466165412
