In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import KNNImputer

**Attribute Information:**

- A1:	b, a.
- A2:	continuous.
- A3:	continuous.
- A4:	u, y, l, t.
- A5:	g, p, gg.
- A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
- A7:	v, h, bb, j, n, z, dd, ff, o.
- A8:	continuous.
- A9:	t, f.
- A10:	t, f.
- A11:	continuous.
- A12:	t, f.
- A13:	g, p, s.
- A14:	continuous.
- A15:	continuous.
- A16: +, - (class attribute)

**Missing Attribute Values:**

*37 cases (5%) have one or more missing values. The missing values from particular attributes are:*

- A1:  12
- A2:  12
- A4:   6
- A5:   6
- A6:   9
- A7:   9
- A14: 13

In [2]:
# Column names A1 … A16 (the raw file has no header row; names are supplied to read_csv).
header_list = list(map('A{}'.format, range(1, 17)))
# The raw file marks missing entries with '?'. Declaring that at read time lets pandas
# parse the continuous columns that contain gaps (A2, A14) as floats; replacing '?'
# after the fact would leave those columns as object/string dtype.
df = pd.read_csv('crx.data', names=header_list, na_values='?')

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [4]:
# Missing-value count per column (compare with the figures in the attribute notes above).
df.isna().sum(axis=0)

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [5]:
# Distinct non-null values per column; the low counts correspond to the categorical attributes.
df.nunique(axis=0, dropna=True)

A1       2
A2     349
A3     215
A4       3
A5       3
A6      14
A7       9
A8     132
A9       2
A10      2
A11     23
A12      2
A13      3
A14    170
A15    240
A16      2
dtype: int64

## Split train/dev

In [6]:
# One stratified shuffle split holding out 30% of the rows; the fixed seed keeps the
# partition reproducible across runs.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.3,
    random_state=0,
)

In [7]:
# The splitter is configured with n_splits=1, so its generator yields exactly one
# (train, test) pair of positional row indices; stratification uses the class column A16.
train_index, test_index = next(sss.split(df.drop(columns='A16'), df['A16']))
train_df = df.iloc[train_index]
dev_df = df.iloc[test_index]

In [8]:
# Missing values per column in the training split.
train_df.isna().sum(axis=0)

A1     10
A2      6
A3      0
A4      5
A5      5
A6      7
A7      7
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    11
A15     0
A16     0
dtype: int64

In [9]:
# Missing values per column in the dev split.
dev_df.isna().sum(axis=0)

A1     2
A2     6
A3     0
A4     1
A5     1
A6     2
A7     2
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    2
A15    0
A16    0
dtype: int64

## Encoding categorical

In [10]:
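# Disabled alternative, kept for reference: one-hot encode the categorical columns
# instead of using the integer codes defined in the next cell.
# one_hot = pd.get_dummies(df[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']])
# df = df.drop(['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'], axis=1)
# df = df.join(one_hot)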

In [11]:
# Integer codes for every categorical column, in the nested {column: {label: code}}
# form accepted by DataFrame.replace. Each column's codes start at a base value and
# increase by one following the label order listed here.
cleanup_data = {
    column: {label: first_code + offset for offset, label in enumerate(labels)}
    for column, first_code, labels in [
        ('A1', 10, ['b', 'a']),
        ('A4', 40, ['u', 'y', 'l', 't']),
        ('A5', 50, ['g', 'p', 'gg']),
        ('A6', 600, ['c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff']),
        ('A7', 70, ['v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o']),
        ('A9', 90, ['t', 'f']),
        ('A10', 100, ['t', 'f']),
        ('A12', 120, ['t', 'f']),
        ('A13', 130, ['g', 'p', 's']),
    ]
}
# The class attribute uses a signed encoding rather than a base-plus-offset code.
cleanup_data['A16'] = {'+': 1, '-': -1}

In [12]:
# Apply the same label -> integer-code mapping to both splits; cells holding values
# absent from the mapping (including NaN) are left unchanged by replace.
train_df, dev_df = (
    split.replace(to_replace=cleanup_data) for split in (train_df, dev_df)
)

## Fill missing values

In [13]:
# Fit a 3-nearest-neighbour imputer on the training split only; the dev split is
# transformed later with this same fitted imputer.
# NOTE(review): train_df still includes the class column A16 at this point, so the
# label takes part in the neighbour distance — consider fitting on feature columns only.
# NOTE(review): no feature scaling is applied in the cells shown, so large-valued
# columns presumably dominate the distance — confirm this is intended.
imputer = KNNImputer(n_neighbors=3)
imputer.fit(train_df)

KNNImputer(add_indicator=False, copy=True, metric='nan_euclidean',
           missing_values=nan, n_neighbors=3, weights='uniform')

In [14]:
# Fill the gaps in both splits with the imputer fitted on the training split.
# transform() returns a plain array, so each result is wrapped back into a DataFrame
# with the original column names (the row index is rebuilt from 0).
train_df, dev_df = (
    pd.DataFrame(imputer.transform(split), columns=header_list)
    for split in (train_df, dev_df)
)

In [15]:
# Snap the imputed categorical columns (A1, A4-A7: the categorical attributes that had
# missing values) back to whole-number codes; all other columns are left untouched.
# NOTE(review): an averaged-then-rounded code may not match any neighbour's category —
# confirm this is acceptable for these nominal attributes.
train_df = train_df.round(dict.fromkeys(['A1', 'A4', 'A5', 'A6', 'A7'], 0))
dev_df = dev_df.round(dict.fromkeys(['A1', 'A4', 'A5', 'A6', 'A7'], 0))

In [16]:
# Verify that imputation left no missing values in the training split.
train_df.isna().sum(axis=0)

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

In [17]:
# Verify that imputation left no missing values in the dev split.
dev_df.isna().sum(axis=0)

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

## Save data

In [18]:
# Persist both processed splits as CSV files without the row index.
for split_frame, csv_path in ((train_df, 'train.csv'), (dev_df, 'dev.csv')):
    split_frame.to_csv(csv_path, index=False)