In [21]:
import pandas as pd
import numpy as np
diabetes = pd.read_csv('../input/diabetes.csv')
X = diabetes.drop('Outcome',axis=1)
y = diabetes['Outcome']


In [11]:
#   Can drop missing values but in this case if drops half of the data
#

X.Insulin.replace(0, np.nan, inplace=True)
X.SkinThickness.replace(0, np.nan, inplace=True)
X.BMI.replace(0, np.nan, inplace=True)
print(X.info())
print(X.dropna().info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               541 non-null float64
Insulin                     394 non-null float64
BMI                         757 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
dtypes: float64(4), int64(4)
memory usage: 48.1 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 393 entries, 3 to 765
Data columns (total 8 columns):
Pregnancies                 393 non-null int64
Glucose                     393 non-null int64
BloodPressure               393 non-null int64
SkinThickness               393 non-null float64
Insulin                     393 non-null float64
BMI                         393 non-null float64
DiabetesPedigreeFunction    393 non-null float64
Age            

In [24]:
#   Handle missing data with imputation
#
# Below code is now deprecated and replaced with SimpleImputer
#
# from sklearn.preprocessing import Imputer
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp.fit(X)
# X = imp.transform(X)
# X

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
X = imp.transform(X)
X


array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [27]:
#
# Pipeline
#
X = diabetes.drop('Outcome',axis=1)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
logreg = LogisticRegression(solver='liblinear')

steps = [('imputation', imp),
         ('logistic_regression', logreg),]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

pipeline.score(X_test, y_test)


0.7532467532467533

In [29]:
import pandas as pd

namelist = [
    'party',
    'infants',
    'water',
    'budget',
    'physician',
    'salvador',
    'religious',
    'satellite',
    'aid',
    'missile',
    'immigration',
    'synfuels',
    'education',
    'superfund',
    'crime',
    'duty-free-exports',
    'eaa-rsa',]

partylist = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', names=namelist, delimiter=',')
partylist = partylist.replace({'n': 0, 'y': 1, '?':0}, None)
print(partylist.head())

# Import necessary modules
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Setup the pipeline steps: steps
steps = [('imputation', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
        ('SVM', SVC())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test, y_pred))


        party  infants  water  budget  physician  salvador  religious  \
0  republican        0      1       0          1         1          1   
1  republican        0      1       0          1         1          1   
2    democrat        0      1       1          0         1          1   
3    democrat        0      1       1          0         0          1   
4    democrat        1      1       1          0         1          1   

   satellite  aid  missile  immigration  synfuels  education  superfund  \
0          0    0        0            1         0          1          1   
1          0    0        0            0         0          1          1   
2          0    0        0            0         1          0          1   
3          0    0        0            0         1          0          1   
4          0    0        0            0         1          0          1   

   crime  duty-free-exports  eaa-rsa  
0      1                  0        1  
1      1                  0     

  'precision', 'predicted', average, warn_for)


## Normalization

*Standardization* Subtract the mean and divide by the variance.
- all features are centered around zero and have variance one
Can also subtract the minimum and divide by the range
- Minimum zero and maximum one
Can also normalize fo the data ranges from -1 to +1


In [33]:
from sklearn.preprocessing import scale
X_scaled = scale(X['Glucose'])
print(np.mean(X['Glucose']), np.std(X['Glucose']))
print(np.mean(X_scaled), np.std(X_scaled))


120.89453125 31.95179590820272
-9.25185853854297e-18 1.0




In [35]:
# Import the necessary modules
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Setup the pipeline steps: steps
steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
        
# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline to the training set: knn_scaled
knn_scaled = pipeline.fit(X_train, y_train)

# Instantiate and fit a k-NN classifier to the unscaled data
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

# Compute and print metrics
print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test)))
print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test)))



Accuracy with Scaling: 0.7012987012987013
Accuracy without Scaling: 0.6883116883116883


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [37]:

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

# Setup the pipeline steps: steps
steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline 
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(pipeline, parameters)

# Fit to the training set
gm_cv.fit(X_train, y_train)

# Compute and print the metrics
r2 = gm_cv.score(X_test, y_test)
print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))




Tuned ElasticNet Alpha: {'elasticnet__l1_ratio': 0.0}
Tuned ElasticNet R squared: 0.22549670336491268
