# Missing value treatment

In [108]:
import pandas as pd
from io import StringIO

In [109]:
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''

df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [110]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [111]:
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [112]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [113]:
from sklearn.preprocessing import Imputer

In [114]:
imr = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imr.fit(df)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,   3.,   8.],
       [  0.,  11.,  12.,   4.]])

# Categorical feature

In [115]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'S', 15, 'class1']
])

In [116]:
df.columns = ['color', 'size', 'price', 'classlabel']

In [117]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,S,15.0,class1


In [118]:
size_mapping = {
    'L':3,
    'M':2,
    'S':1,
}

In [119]:
df['size'] = df['size'].map(size_mapping)

In [120]:
df

Unnamed: 0,color,size,price,classlabel
0,green,2,10.1,class1
1,red,3,13.5,class2
2,blue,1,15.0,class1


In [121]:
inv_size_mapping = {v:k for k, v in size_mapping.items()}

In [122]:
df['size'] = df['size'].map(inv_size_mapping)

In [123]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,S,15.0,class1


In [124]:
class_mapping = {label:idx for idx,label in enumerate(np.unique(df['classlabel']))}

In [125]:
class_mapping

{'class1': 0, 'class2': 1}

In [126]:
df['classlabel'] = df['classlabel'].map(class_mapping)

In [127]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,S,15.0,0


In [128]:
inv_class_mapping = {v: k  for k,v in class_mapping.items()}

In [129]:
inv_class_mapping

{0: 'class1', 1: 'class2'}

In [130]:
df['classlabel'] = df['classlabel'].map(inv_class_mapping)

In [131]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,S,15.0,class1


In [132]:
from sklearn.preprocessing import LabelEncoder

In [133]:
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([0, 1, 0], dtype=int64)

In [134]:
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

In [164]:
X = df[['color', 'size', 'price']].values

In [165]:
X

array([['green', 2, 10.1],
       ['red', 3, 13.5],
       ['blue', 1, 15.0]], dtype=object)

In [166]:
color_le = LabelEncoder()

In [167]:
X[:, 0] = color_le.fit_transform(X[:, 0])

In [168]:
X

array([[1, 2, 10.1],
       [2, 3, 13.5],
       [0, 1, 15.0]], dtype=object)

In [169]:
df['size'] = df['size'].map(size_mapping)

In [170]:
from sklearn.preprocessing import OneHotEncoder

In [172]:
X

array([[1, 2, 10.1],
       [2, 3, 13.5],
       [0, 1, 15.0]], dtype=object)

In [174]:
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()

array([[  0. ,   1. ,   0. ,   2. ,  10.1],
       [  0. ,   0. ,   1. ,   3. ,  13.5],
       [  1. ,   0. ,   0. ,   1. ,  15. ]])

In [175]:
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,,0,1,0
1,13.5,,0,0,1
2,15.0,,1,0,0


# Scailing

In [177]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)

df_wine.columns = ['Class label', 'Alcohol',
                  'Malic acid', 'Ash',
                  'Alcalinity of ash', 'Magnesium',
                  'Total phenols', 'Flavanoids',
                  'Nonflavanoid phenols',
                  'Proanthocyanins',
                  'Color intensity', 'Hue',
                  'OD280/OD315 of diluted wines',
                  'Proline']

df_wine.head()

from sklearn.cross_validation import train_test_split

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.fit_transform(X_test)

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.fit_transform(X_test)

In [213]:
from sklearn.linear_model import LogisticRegression

In [214]:
lr = LogisticRegression(penalty='l1', C=0.1)
lr.fit(X_train_std, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [215]:
lr.score(X_train_std, y_train)

0.9838709677419355

In [217]:
lr.score(X_test_std, y_test)

0.98148148148148151

In [218]:
lr.intercept_

array([-0.38383556, -0.15814855, -0.70047056])

In [219]:
lr.coef_

array([[ 0.28025029,  0.        ,  0.        , -0.02805439,  0.        ,
         0.        ,  0.71002146,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.23597787],
       [-0.64374821, -0.06896563, -0.05717455,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -0.92723727,
         0.05980803,  0.        , -0.37100233],
       [ 0.        ,  0.06156755,  0.        ,  0.        ,  0.        ,
         0.        , -0.63524493,  0.        ,  0.        ,  0.49784481,
        -0.35838675, -0.57191374,  0.        ]])