
----------------------------


# DATA PREPROCESSING
- ### A) DEALING WITH NULL VALUES
- ### B) GETTING CATEGORICAL DATA INTO SHAPE FOR ML
- ### C) FEATURE SELECTION

In [95]:
from io import StringIO

import pandas as pd

# Small in-memory CSV with two missing cells (second data row / column C and
# third data row / column D), used to demonstrate null-value handling.
# The u'' prefix makes the literal unicode on Python 2 (io.StringIO only
# accepts unicode text there) and is a no-op on Python 3, so the
# Python-2-only unicode() conversion is not needed.
csv_data = u'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''

# Parse the text as if it were a file; empty cells become NaN
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,0,11,12.0,


### A) Dealing with null values
#### Removing rows / Columns that contain NaN, null values

In [96]:
# First inspection: check whether the DataFrame has null values by counting
# them per column (nothing is dropped here).
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [97]:
# One solution is to remove the rows (samples) or columns (features) that contain null values.
# Called without arguments, dropna() removes the rows that contain them.
df.dropna()

Unnamed: 0,A,B,C,D
0,1,2,3,4


In [99]:
# axis=1 makes dropna() operate on columns: every column that contains a null value is removed
df.dropna(axis=1)

Unnamed: 0,A,B
0,1,2
1,5,6
2,0,11


In [102]:
# Remove only the rows that have NaN in a certain column, e.g. 'D'
df.dropna(subset= ['D'])

Unnamed: 0,A,B,C,D
0,1,2,3.0,4
1,5,6,,8


Since removing values is not always an option, because we may run out of data, other possibilities should be taken into consideration.

#### Imputing missing values: replace missing values with estimates such as the column mean or median.

In [116]:
import numpy as np

# Imputer was deprecated in scikit-learn 0.20 and removed in 0.22.
# SimpleImputer is its replacement; it always imputes column-wise.
try:
    from sklearn.impute import SimpleImputer
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    # axis=0 computes the mean per column; axis=1 would compute it per row
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Fit and transform on the same underlying array so both steps see the same input type
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
# transform() returns a plain numpy array; wrap it in a DataFrame again,
# reusing the original column names instead of hard-coding them
pd.DataFrame(imputed_data, columns=df.columns)

Unnamed: 0,A,B,C,D
0,1,2,3.0,4
1,5,6,7.5,8
2,0,11,12.0,6


### B) Handling Categorical data

In [143]:
import pandas as pd

# Toy dataset with one column of each kind:
#   color -> nominal (no order), size -> ordinal (S < M < L),
#   price -> numerical, model -> class label
df = pd.DataFrame(
    {
        'color': ['yellow', 'purple', 'pink'],
        'size': ['S', 'M', 'L'],
        'price': [22.4, 25.4, 29.4],
        'model': ['model1', 'model1', 'model2'],
    },
    columns=['color', 'size', 'price', 'model'],
)
df

# Only "price" is numerical; the other columns hold strings.
# To ensure the correct behaviour of certain ML algorithms, those columns
# need to be mapped to numbers first.

Unnamed: 0,color,size,price,model
0,yellow,S,22.4,model1
1,purple,M,25.4,model1
2,pink,L,29.4,model2


In [179]:
# "size" is ordinal (one size is bigger than another), so it is encoded with
# ordered integers instead of being expanded into dummy variables
size_mapping = dict(S=1, M=2, L=3)
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,model
0,yellow,1,22.4,0
1,purple,2,25.4,0
2,pink,3,29.4,1


### Label Encoding: It is convenient to perform the same kind of mapping with non-numeric class labels

In [140]:
# Similar approach to the one followed before: give every distinct class
# label its position in the sorted array returned by np.unique
import numpy as np
model_mapping = dict(
    (label, idx) for idx, label in enumerate(np.unique(df['model']))
)
model_mapping

{'model1': 0, 'model2': 1}

In [141]:
# transform the class labels into integers using the mapping built above
df['model'] = df['model'].map(model_mapping)

In [142]:
# display the DataFrame after encoding the 'model' column
df

Unnamed: 0,color,size,price,model
0,yellow,1,22.4,0
1,purple,2,25.4,0
2,pink,3,29.4,1


In [153]:
# NOTE: scikit-learn ships a LabelEncoder class that provides the same result
# as the manual dictionary mapping.
from sklearn.preprocessing import LabelEncoder

# Rebuild the raw DataFrame so that 'model' holds the string labels again
df = pd.DataFrame([['yellow', 'S', 22.4, 'model1'],
                   ['purple', 'M', 25.4, 'model1'],
                   ['pink', 'L', 29.4, 'model2']],
                  columns=['color', 'size', 'price', 'model'])

# fit_transform learns the distinct labels and returns their integer codes
model_le = LabelEncoder()
y = model_le.fit_transform(df['model'].values)

In [154]:
# integer-encoded class labels produced by the LabelEncoder
y

array([0, 0, 1])

In [180]:
# the original text labels can be recovered by applying the inverse transformation
model_le.inverse_transform(y)

array(['model1', 'model1', 'model2'], dtype=object)

### NEW TRY:  with DUMMY FEATURES

Unfortunately, we cannot apply the same encoding to nominal features such as the color column. If we did (say yellow = 0, purple = 1, pink = 2), the learning algorithm would assume that pink is larger than purple, and purple larger than yellow, although colors have no such order.

So we need to use a technique called "one-hot encoding". This approach creates a new dummy feature for each unique value in the nominal feature column.

We can use `OneHotEncoder` from `sklearn.preprocessing` or, even more conveniently, pandas:

In [186]:
from sklearn.preprocessing import LabelEncoder

# Start again from the raw, string-valued DataFrame
df = pd.DataFrame(
    {
        'color': ['yellow', 'purple', 'pink'],
        'size': ['S', 'M', 'L'],
        'price': [22.4, 25.4, 29.4],
        'model': ['model1', 'model1', 'model2'],
    },
    columns=['color', 'size', 'price', 'model'],
)

# First encode the class label as integers, same as before
model_le = LabelEncoder().fit(df['model'].values)
y = model_le.transform(df['model'].values)
df['model'] = y
df


Unnamed: 0,color,size,price,model
0,yellow,S,22.4,0
1,purple,M,25.4,0
2,pink,L,29.4,1


In [187]:
# "size" is an ordinal feature (S < M < L): map it onto ordered integers.
# The remaining non-numeric features get dummy variables instead.
size_mapping = {'S': 1, 'M': 2, 'L': 3}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,model
0,yellow,1,22.4,0
1,purple,2,25.4,0
2,pink,3,29.4,1


In [188]:
# PANDAS approach

# get_dummies one-hot encodes the remaining string column ('color');
# the already-numeric columns (size, price, model) are passed through unchanged
pd.get_dummies(df[['size','color','price','model']])

Unnamed: 0,size,price,model,color_pink,color_purple,color_yellow
0,1,22.4,0,0,0,1
1,2,25.4,0,0,1,0
2,3,29.4,1,1,0,0


In [190]:
# Download the UCI wine dataset; header=None because the file has no header
# row (column names are assigned separately)
df_wine = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)

In [192]:
# The data was read with header=None, so name the columns explicitly:
# the class label followed by the 13 features.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',  # fixed: was ' Magnesium'; the leading space broke df_wine['Magnesium']
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    # NOTE(review): 'Proanthocyasnins' looks like a misspelling of
    # 'Proanthocyanins'; kept as-is so existing lookups by this name still work.
    'Proanthocyasnins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [194]:
# preview the first rows of the wine dataset
df_wine.head()


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyasnins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [196]:
# Exploring data: list the different class labels.
# Passing one pre-formatted string makes this line valid and identical in
# output under both the Python 2 print statement and the Python 3 print function.
print('Class labels:  %s' % (np.unique(df_wine['Class label']),))

Class labels:  [1 2 3]


## Partition the dataset into training and test sets
### using scikit-learn's `train_test_split`

In [202]:

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20, so fall back
# to it only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# First, separate the features (every column after the first) from the
# class label (first column)
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
# Hold out 30% of the samples for testing; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

array([[  1.42300000e+01,   1.71000000e+00,   2.43000000e+00, ...,
          1.04000000e+00,   3.92000000e+00,   1.06500000e+03],
       [  1.32000000e+01,   1.78000000e+00,   2.14000000e+00, ...,
          1.05000000e+00,   3.40000000e+00,   1.05000000e+03],
       [  1.31600000e+01,   2.36000000e+00,   2.67000000e+00, ...,
          1.03000000e+00,   3.17000000e+00,   1.18500000e+03],
       ..., 
       [  1.32700000e+01,   4.28000000e+00,   2.26000000e+00, ...,
          5.90000000e-01,   1.56000000e+00,   8.35000000e+02],
       [  1.31700000e+01,   2.59000000e+00,   2.37000000e+00, ...,
          6.00000000e-01,   1.62000000e+00,   8.40000000e+02],
       [  1.41300000e+01,   4.10000000e+00,   2.74000000e+00, ...,
          6.10000000e-01,   1.60000000e+00,   5.60000000e+02]])

## Transforming features onto the same scale
### Two common methods to bring different features to the same scale: 
- ### Normalization
 
 Features between [0,1]
 
 Normalization via min-max scaling is commonly used when values are needed in a bounded interval; however, standardization can be more practical for many ML algorithms.

   $$x_{norm}^{(i)} = \frac{x^{(i)} - x_{min}}{x_{max} - x_{min}}$$
   
- ### Standardization

    Feature columns are centered at mean 0 with standard deviation 1, so each column has the same mean and variance as a standard normal distribution (the shape of the distribution itself is not changed). It maintains useful information about outliers and makes the algorithm less sensitive to them, in contrast to min-max scaling.
  
  $$x_{std}^{(i)} = \frac{x^{(i)} - \mu_x}{\sigma_x}$$

In [211]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the training set only, then
# apply that same scaling to both the training and the test set
mms = MinMaxScaler().fit(X_train)
X_train_norm, X_test_norm = mms.transform(X_train), mms.transform(X_test)

# The scaler returns numpy arrays; wrap the result in a pandas DataFrame to explore it
pd.DataFrame(data=X_train_norm).head()


# as shown, the training values are now normalized to [0,1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.72043,0.203782,0.537634,0.309278,0.336957,0.543165,0.737003,0.25,0.401899,0.240688,0.487179,1.0,0.585425
1,0.319892,0.084034,0.311828,0.43299,0.23913,0.453237,0.480122,0.480769,0.525316,0.135626,0.273504,0.641026,0.0
2,0.602151,0.712185,0.483871,0.484536,0.543478,0.176259,0.067278,0.557692,0.294304,0.851958,0.042735,0.106227,0.423482
3,0.572581,0.563025,0.424731,0.536082,0.347826,0.143885,0.024465,0.557692,0.278481,0.25979,0.051282,0.150183,0.419433
4,0.760753,0.130252,0.704301,0.742268,0.173913,0.665468,0.730887,0.134615,0.458861,0.200573,0.700855,0.692308,0.079352


In [213]:
from sklearn.preprocessing import StandardScaler

# Estimate each feature's mean and standard deviation on the training set
# only, then standardize both the training and the test set with them
stdsc = StandardScaler().fit(X_train)
X_train_std, X_test_std = stdsc.transform(X_train), stdsc.transform(X_test)

# The scaler returns numpy arrays; wrap the result in a pandas DataFrame to explore it
pd.DataFrame(data=X_train_std).head()

# as shown, the values are standardized: each training column has mean 0 and STD = 1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.910831,-0.462599,-0.011426,-0.820679,0.062417,0.588204,0.935654,-0.761914,0.130072,-0.512387,0.657066,1.943545,0.93701
1,-0.956099,-0.966087,-1.537254,-0.147487,-0.554019,0.169986,0.072432,0.207913,0.784626,-0.982107,-0.408595,0.58118,-1.413367
2,0.359522,1.675016,-0.374718,0.13301,1.363782,-1.118128,-1.314889,0.531189,-0.440566,2.220529,-1.559509,-1.448466,0.286837
3,0.221695,1.047864,-0.77434,0.413506,0.13091,-1.268687,-1.458759,0.531189,-0.524483,-0.426984,-1.516883,-1.281645,0.270582
4,1.098777,-0.771758,1.11478,1.535493,-0.964977,1.156982,0.915101,-1.246827,0.432174,-0.691735,1.722727,0.775804,-1.094782


# Regularization: regularization and dimensionality reduction
Zero out irrelevant features (L1).

### Regularization with L1:
L1 regularization yields sparse weight vectors: most feature weights will be zero, so only a few entries are non-zero. This makes it useful as a form of dimensionality reduction (feature selection).

### Regularization with L2:
One approach to reduce the complexity of a model by penalizing large individual weights: the penalty term is the sum of the squared weights.
L1 regularization, in contrast, replaces the squares of the weights by the sum of the absolute values of the weights.


In [224]:
# L1-regularized logistic regression
# http://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic_l1_l2_sparsity.html
from sklearn.linear_model import LogisticRegression

# The solver is named explicitly because only some solvers support the L1
# penalty; newer scikit-learn releases default to 'lbfgs', which rejects
# penalty='l1'. 'liblinear' was the default in older releases, so results
# there are unchanged.
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)

# Each print receives a single pre-formatted string so the line is valid and
# prints the same text under both Python 2 and Python 3.
print('Training accuracy:  %s' % lr.score(X_train_std, y_train))
print('Test accuracy:  %s' % lr.score(X_test_std, y_test))
# similar training and test accuracy: no indication of overfitting

print('Intercept: %s' % (lr.intercept_,))
# Multiclass dataset: liblinear uses the One-vs-Rest approach.
# First intercept: class 1 versus 2 and 3
# Second intercept: class 2 versus 1 and 3
# Third intercept: class 3 versus 1 and 2
print('Weight - scope:  %s' % (lr.coef_,))
# One row per class; each row has 13 weights, one per feature, and each
# weight is multiplied by the respective feature.


Training accuracy:  0.983870967742
Test accuracy:  0.981481481481
Intercept: [-0.38378766 -0.15811471 -0.70038748]
Weight - scope:  [[ 0.28029291  0.          0.         -0.02805272  0.          0.
   0.71007683  0.          0.          0.          0.          0.
   1.23591472]
 [-0.64384698 -0.06886261 -0.05718511  0.          0.          0.          0.
   0.          0.         -0.92703131  0.05994892  0.         -0.3710137 ]
 [ 0.          0.06140729  0.          0.          0.          0.
  -0.63726805  0.          0.          0.49852684 -0.35811144 -0.57011968
   0.        ]]


* #### Note that the weight vectors are sparse. It means that they only have a few non-zero entries. 

+ #### As a result of L1 regularization, which serves as a method for FEATURE SELECTION, the trained model is robust to potentially irrelevant features.


# C) FEATURE SELECTION: 

## Sequential feature selection algorithms:

Alternative way to reduce complexity of models and avoid overfitting: dimensionality reduction via FEATURE SELECTION. Useful for unregularized models or algorithms that do not support regularization.

### TWO MAIN TECHNIQUES OF DIMENSIONALITY REDUCTION 
+ #### Feature selection
    A subset of the original features is selected.
+ #### Feature extraction
    Information is derived from the feature set to construct a new feature subspace.
    
    Aim: compress a dataset onto a lower-dimensional feature subspace.
    
    
    
Sequential feature selection algorithms are a family of greedy search algorithms that are used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace, where k < d.

Aims of feature selection algorithms:
+ Select a subset of features that are most relevant to the problem, to improve computational efficiency.
+ Reduce the generalization error of the model by removing irrelevant features or noise. This is useful for algorithms that do not support regularization.


##### A sequential feature selection algorithm: SEQUENTIAL BACKWARD SELECTION (SBS)

+ Aims to reduce the dimensionality of the initial feature subspace with a minimum decay in the performance of the classifier, in order to improve computational efficiency.
+ Under certain conditions, SBS can even improve the predictive power of the model if it suffers from overfitting.
    
+ HOW IT WORKS: 
        It's pretty simple. The algorithm sequentially removes features from the full feature set until the new feature subspace contains the desired number of features. To decide which feature to remove, at each step it removes the feature whose removal causes the least performance loss.

#### Obtaining FEATURE importance using RANDOM FORESTS

Above we used L1 regularization to zero out irrelevant features via logistic regression, and described the SBS algorithm for feature selection.

Another approach to selecting the most relevant features is to use RANDOM FORESTS. A random forest is an ensemble of decision trees (note that tree-based models do not require the features to be on the same scale, so no normalization/standardization is needed). A fitted random forest exposes the importance of each feature.

Furthermore, random forests come with a caveat: if two or more features are highly correlated, one of them may be ranked very high while the others are ranked low, so the correlated features may not show up as important.

## Feature extraction: compressing data via dimensionality reduction

Summarize the information content of a dataset by transforming it onto
a new feature subspace of lower dimensionality than the original one.

Data compression is an important topic in ML: it helps us store and analyze the increasing amounts of data.
+ Principal Component Analysis (PCA)
    For unsupervised data compression
+ Linear Discriminant Analysis (LDA)
    As a supervised dimensionality reduction technique for maximizing class separability
+ Non-linear dimensionality reduction via KERNEL PRINCIPAL COMPONENT ANALYSIS


While we maintained the original features in FEATURE SELECTION ALGORITHMS (sequential backward selection), in FEATURE EXTRACTION we transform or project the data onto a new feature space.
It can be understood as an approach to data compression with the goal of maintaining most of the relevant information.
+ Used to improve computational efficiency
+ But it can also help to reduce the curse of dimensionality, especially in non-regularized models.



+ ### PCA:
Sensitive to data scale. We need to standardize the features prior to PCA in order to assign equal importance to all features.

 1. Standardize the d-dimensional dataset.
 2. Construct the covariance matrix.
 3. Decompose the covariance matrix into its eigenvectors and eigenvalues.
 4. Select the k eigenvectors that correspond to the k largest eigenvalues, where k is the dimensionality of the new feature subspace (k <= d).
 5. Construct a projection matrix W from the "top" k eigenvectors.
 6. Transform the d-dimensional input dataset X using the projection matrix W to obtain the new k-dimensional feature subspace.
 
 
 PCA projects data onto a lower-dimensional subspace to maximize the variance along the orthogonal feature axes while ignoring the class labels.
 

In [232]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

# Project the standardized features onto the first two principal components.
# PCA is fitted on the training set only and then applied to the test set.
pca = PCA(n_components=2)
lr = LogisticRegression()
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
lr.fit(X_train_pca, y_train)

# Each print receives a single pre-formatted string so the line is valid and
# prints the same text under both Python 2 and Python 3.
print('Training accuracy:  %s' % lr.score(X_train_pca, y_train))
print('Test accuracy:  %s' % lr.score(X_test_pca, y_test))
# similar training and test accuracy: no indication of overfitting

print('Intercept: %s' % (lr.intercept_,))
# Multiclass dataset: one intercept per class (class 1, class 2, class 3).
# NOTE(review): the One-vs-Rest reading (class k versus the other two) holds
# for the liblinear solver, the default in older scikit-learn releases;
# confirm which solver the installed version defaults to.
print('Weight - scope:  %s' % (lr.coef_,))
# One row per class; after PCA each row has 2 weights (one per principal
# component), not 13 as with the original features.


 Training accuracy:  0.967741935484
Test accuracy:  0.981481481481
Intercept: [-1.627888  -1.0724269 -1.9969406]
Weight - scope:  [[-1.59954057  1.44963046]
 [-0.06658775 -2.90262559]
 [ 1.51638726  1.57383394]]


+ ### LDA:

    In contrast to PCA, LDA is a technique for supervised dimensionality reduction. It considers class information in the training dataset to attempt to maximize the class separability in a linear feature space.

+ ### Kernel version of PCA:
    
    Allows us to map nonlinear datasets onto a lower-dimensional feature space where the classes become linearly separable.
    
    
    
    
    
    
### PIPELINE

+ Split data into X= features, Y = target/label. Train & Test subsets
+ Pipeline:
    1. Scaling: Standardization/Normalization
    2. Dimensionality Reduction: PCA / others
    3. Learning Algorithm
    4. Predictive model. Predict