### 0. Dataset creation

In [27]:
import pandas as pd
import numpy as np

# Toy dataset with one column per dtype family (datetime, boolean, integer,
# float, string). The timestamps are written as strings and parsed into
# datetime values in the same expression that builds the frame.
df = pd.DataFrame(
    dict(
        time_columns=[
            '2010-01-01 00:00:00',
            '2010-01-02 00:10:00',
            '2010-01-03 00:00:00',
            '2010-01-04 00:20:00',
        ],
        boolean_column=[True, False, True, False],
        integer_column=[1, 2, 3, 4],
        float_column=[5., 2.5, 3., 4.5],
        categorical_column=['a', 'b', 'a', 'c'],
    )
).assign(
    time_columns=lambda frame: pd.to_datetime(
        frame['time_columns'], format='%Y-%m-%d %H:%M:%S'
    )
)
df

Unnamed: 0,boolean_column,categorical_column,float_column,integer_column,time_columns
0,True,a,5.0,1,2010-01-01 00:00:00
1,False,b,2.5,2,2010-01-02 00:10:00
2,True,a,3.0,3,2010-01-03 00:00:00
3,False,c,4.5,4,2010-01-04 00:20:00


### 1. Verifying the variable types

In [2]:
# Split the frame into one sub-frame per dtype family.
boolean_columns = df.select_dtypes(include=['bool'])
numerical_columns = df.select_dtypes(include=[np.number])
categorical_columns = df.select_dtypes(include=['O'])
time_columns = df.select_dtypes(include=['datetime'])

# Show the selections in the order they were built.
print(boolean_columns)
print(numerical_columns)
print(categorical_columns)
print(time_columns)

   boolean_column
0            True
1           False
2            True
3           False
   float_column  integer_column
0           5.0               1
1           2.5               2
2           3.0               3
3           4.5               4
  categorical_column
0                  a
1                  b
2                  a
3                  c
         time_columns
0 2010-01-01 00:00:00
1 2010-01-02 00:10:00
2 2010-01-03 00:00:00
3 2010-01-04 00:20:00


### 2. Feature selection class

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype : str or type
        Used as the single entry of the ``include`` list handed to
        ``DataFrame.select_dtypes`` (e.g. ``'bool'``, ``np.number``,
        ``'O'``, ``'datetime'``).
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        ``X`` has to be a pandas DataFrame; any other input fails the assertion.
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)

In [4]:
# Run the selector once per dtype family and print what it keeps.
for selected_dtype in ('bool', np.number, 'O', 'datetime'):
    obj_type_selector = TypeSelector(dtype=selected_dtype)
    print(obj_type_selector.fit_transform(df))


   boolean_column
0            True
1           False
2            True
3           False
   float_column  integer_column
0           5.0               1
1           2.5               2
2           3.0               3
3           4.5               4
  categorical_column
0                  a
1                  b
2                  a
3                  c
         time_columns
0 2010-01-01 00:00:00
1 2010-01-02 00:10:00
2 2010-01-03 00:00:00
3 2010-01-04 00:20:00


### 3. Union of the standardized variables

In [5]:
from sklearn.pipeline import Pipeline
# The interactive help lookup that used to live here was leftover IPython
# introspection: it is not valid Python and breaks non-interactive runs.
# Type `Pipeline?` in a live session to read the docstring when needed.

In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

def _numeric_branch(with_mean, with_std):
    """Build one FeatureUnion branch: numeric-column selection, then a StandardScaler."""
    return Pipeline([
        ('selector', TypeSelector(np.number)),
        ('scaler', StandardScaler(with_mean=with_mean, with_std=with_std)),
    ])


# Two branches over the same numeric columns, placed side by side:
#   - 'numericals_normality_2' centers on the mean without scaling,
#   - 'numericals_normality_1' scales by the standard deviation without centering.
list_mission_1 = [
    ('features', FeatureUnion(
        n_jobs=1,
        transformer_list=[
            ('numericals_normality_2', _numeric_branch(with_mean=True, with_std=False)),
            ('numericals_normality_1', _numeric_branch(with_mean=False, with_std=True)),
        ],
    )),
]

data_preparation_1 = Pipeline(steps=list_mission_1)

In [7]:
# Fit the preparation pipeline on df and display the transformed array.
# Both FeatureUnion branches select the numeric columns, so each numeric column
# shows up twice in the result: mean-centered first, then scaled by its std.
data_preparation_1.fit_transform(df)

array([[ 1.25      , -1.5       ,  4.8507125 ,  0.89442719],
       [-1.25      , -0.5       ,  2.42535625,  1.78885438],
       [-0.75      ,  0.5       ,  2.9104275 ,  2.68328157],
       [ 0.75      ,  1.5       ,  4.36564125,  3.57770876]])

### 4. Feature Creation

In [8]:
# Add an extra column during the data preparation step:
#     <column>_cos = cos(<column>)
#
# The step is written as a class exposing `fit` and `transform` so that it can
# be plugged into a Pipeline like any other transformer.

class Define_cos_column(BaseEstimator, TransformerMixin):
    """Transformer that appends the cosine of one column as a new column.

    Parameters
    ----------
    columns_name : str
        Name of the existing column to take the cosine of. The derived column
        is named ``columns_name + '_cos'``.
    """

    def __init__(self, columns_name):
        self.columns_name = columns_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``<columns_name>_cos`` column added.

        The input frame is left untouched. Writing the new column straight
        into ``X`` would silently modify the caller's DataFrame (and trigger
        chained-assignment warnings when ``X`` is a slice of another frame).

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame or lacks ``columns_name``.
        """
        assert isinstance(X, pd.DataFrame), 'X must be a pandas DataFrame'
        assert self.columns_name in X.columns, (
            'column %r not found in X' % (self.columns_name,)
        )
        cos_column_name = self.columns_name + '_cos'
        # assign() builds a new frame, so the caller's data is never mutated.
        return X.assign(**{cos_column_name: np.cos(X[self.columns_name])})

In [9]:
# Same steps as mission 1, with the cosine-feature step placed first so the
# derived column exists before the numeric selectors run.
list_mission_2 = list(list_mission_1)
list_mission_2.insert(0, ('add_cos_column', Define_cos_column(columns_name='float_column')))

data_preparation_2 = Pipeline(steps=list_mission_2)
data_preparation_2.fit_transform(df)

array([[ 1.25      , -1.5       ,  0.71322962,  4.8507125 ,  0.89442719,
         0.56487176],
       [-1.25      , -0.5       , -0.37157618,  2.42535625,  1.78885438,
        -1.59536036],
       [-0.75      ,  0.5       , -0.56042507,  2.9104275 ,  2.68328157,
        -1.97142529],
       [ 0.75      ,  1.5       ,  0.21877163,  4.36564125,  3.57770876,
        -0.41976901]])

### 5. Support Vector Machine Modelling

In [26]:
from sklearn.svm import LinearSVC

# Preparation steps of mission 2 followed by a linear SVM classifier.
list_mission_3 = list_mission_2 + [('estimator', LinearSVC(random_state=999))]
data_preparation_3 = Pipeline(steps=list_mission_3)

# Select the target and the features by name rather than by position: the
# column order of a frame built from a dict depends on the pandas version,
# so `iloc[:, 0]` is not guaranteed to be `boolean_column`.
y_train = df['boolean_column']
X_train = df.drop('boolean_column', axis=1)

data_preparation_3.fit(X=X_train, y=y_train)

# NOTE: the model is evaluated on the very rows it was fitted on, so this is
# only a smoke test of the pipeline, not a measure of generalisation.
print('observation -----------')
print(y_train.values)

print('prediction -----------')
print(data_preparation_3.predict(X=X_train))

observation -----------
[ True False  True False]
prediction -----------
[ True False False False]


### 6. Hyperparameter tuning

In [25]:
from sklearn.model_selection import GridSearchCV

# Preparation steps of mission 2 followed by a grid search over the SVM
# regularisation strength C, scored with 2-fold cross-validation.
list_mission_4 = list_mission_2 + [
    ('estimator', GridSearchCV(
        LinearSVC(random_state=999),
        {'C': [0.0001, 0.1, 1, 100, 100000]},
        cv=2,
    )),
]
data_preparation_4 = Pipeline(steps=list_mission_4)

# Select the target and the features by name rather than by position: the
# column order of a frame built from a dict depends on the pandas version,
# so `iloc[:, 0]` is not guaranteed to be `boolean_column`.
y_train = df['boolean_column']
X_train = df.drop('boolean_column', axis=1)

data_preparation_4.fit(X=X_train, y=y_train)

print('observation -----------')
print(y_train.values)

# Report the predictions of the tuned pipeline fitted in this cell
# (data_preparation_4), not those of the untuned data_preparation_3.
print('prediction -----------')
print(data_preparation_4.predict(X=X_train))


observation -----------
[ True False  True False]
prediction -----------
[ True False False False]
