In [2]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

def load_data(path):
    diabetes = pd.read_csv(path,header=0)
    return diabetes

def load_diabetes_data():
    PATH = "datasets/Diabetes/pima-indians-diabetes.csv"
    diabetes_df = load_data(PATH)
    feature_names = diabetes_df.columns.array.copy()
    return diabetes_df,feature_names
    
def Separate_X_y_data(data):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    data = data.iloc[shuffled_indices]
    y = data[["Target"]]
    X = data.drop(columns='Target')
    return X,y

# This returns X_train, X_test, y_train, y_test
def produce_train_test_data(X,y,test_ratio):
    test_set_size = int(len(data)*test_ratio)
    return X[test_set_size:], X[:test_set_size],y[test_set_size:],y[:test_set_size]

class RemoveInvalidZeroValuesAttributes(BaseEstimator, TransformerMixin):
    def __init__(self,attributes):
        self.attributes = attributes
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        attrs = self.attributes
        for attribute in attrs:
            X= X[X[attribute]>0]
        return X

class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self,features):
        self.features = features
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        features = self.features
        X = X[features]
        return X

def preprocess_data(X,features,attributes):
    data_preprocess_pipeline = Pipeline([("feature_selection",FeatureSelection(features)),("clean_zero",RemoveInvalidZeroValuesAttributes(attributes)),("scaling",StandardScaler()),])
    X = data_preprocess_pipeline.fit_transform(X)
    X = pd.DataFrame.from_records(X)
    return X

data,features = load_diabetes_data()
X,y = Separate_X_y_data(data)

features=["Plasma","BMI","AGE","PEDIGREE","BP","NOP","SERUM_INSULIN","TSKIN_THICKNESS"]
attributes = ["Plasma","BMI","BP"]
X = preprocess_data(X,features,attributes)



  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [3]:
data_test = data[5:11]

In [4]:
data,features = load_diabetes_data()
data_test = data[5:11]

In [5]:
X_test,y_test = Separate_X_y_data(data_test)

In [6]:
X_test

Unnamed: 0,NOP,Plasma,BP,TSKIN_THICKNESS,SERUM_INSULIN,BMI,PEDIGREE,AGE
5,5,116,74,0,0,25.6,0.201,30
6,3,78,50,32,88,31.0,0.248,26
10,4,110,92,0,0,37.6,0.191,30
7,10,115,0,0,0,35.3,0.134,29
9,8,125,96,0,0,0.0,0.232,54
8,2,197,70,45,543,30.5,0.158,53


In [7]:
y_test

Unnamed: 0,Target
5,0
6,1
10,0
7,0
9,1
8,1


In [8]:
data_test
features = ["Plasma","BMI","BP"]
attributes = ["BP"]

class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self,features):
        self.features = features
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        features = self.features
        X = X[features]
        return X

def preprocess_data_test(X,y,features,attributes):
    data_preprocess_pipeline = Pipeline([("clean_zero",RemoveInvalidZeroValuesAttributes(attributes)),])
    X = data_preprocess_pipeline.fit_transform(X)
    return X


X_test2= preprocess_data_test(X_test,y_test,features,attributes)
data_test

Unnamed: 0,NOP,Plasma,BP,TSKIN_THICKNESS,SERUM_INSULIN,BMI,PEDIGREE,AGE,Target
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1
10,4,110,92,0,0,37.6,0.191,30,0


In [23]:
from sklearn.compose import ColumnTransformer
allFeatures=["NOP","Plasma","BP","TSKIN_THICKNESS","SERUM_INSULIN","BMI","PEDIGREE","AGE","Target"]
#[("feature_selection",FeatureSelection(features)),("clean_zero",RemoveInvalidZeroValuesAttributes(attributes)),("scaling",StandardScaler()),])
ct = ColumnTransformer([("feature_selection",FeatureSelection(features),features)],remainder='passthrough')

In [24]:
data_test_r = ct.fit_transform(data_test)

In [25]:
features

['Plasma', 'BMI', 'BP']

In [26]:
data_test_r

array([[1.16e+02, 2.56e+01, 7.40e+01, 5.00e+00, 0.00e+00, 0.00e+00,
        2.01e-01, 3.00e+01, 0.00e+00],
       [7.80e+01, 3.10e+01, 5.00e+01, 3.00e+00, 3.20e+01, 8.80e+01,
        2.48e-01, 2.60e+01, 1.00e+00],
       [1.15e+02, 3.53e+01, 0.00e+00, 1.00e+01, 0.00e+00, 0.00e+00,
        1.34e-01, 2.90e+01, 0.00e+00],
       [1.97e+02, 3.05e+01, 7.00e+01, 2.00e+00, 4.50e+01, 5.43e+02,
        1.58e-01, 5.30e+01, 1.00e+00],
       [1.25e+02, 0.00e+00, 9.60e+01, 8.00e+00, 0.00e+00, 0.00e+00,
        2.32e-01, 5.40e+01, 1.00e+00],
       [1.10e+02, 3.76e+01, 9.20e+01, 4.00e+00, 0.00e+00, 0.00e+00,
        1.91e-01, 3.00e+01, 0.00e+00]])

In [151]:
data = X_test.join(y_test)

 

In [152]:
data

Unnamed: 0,NOP,Plasma,BP,TSKIN_THICKNESS,SERUM_INSULIN,BMI,PEDIGREE,AGE,Target
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
10,4,110,92,0,0,37.6,0.191,30,0
7,10,115,0,0,0,35.3,0.134,29,0
9,8,125,96,0,0,0.0,0.232,54,1
8,2,197,70,45,543,30.5,0.158,53,1


In [153]:
y_test

5     0
6     1
10    0
7     0
9     1
8     1
Name: Target, dtype: int64

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
log_clf = LogisticRegression()
svm_clf = SVC()

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def preprocess_data(data,ratio,attributesToClean):
    data_preprocess_pipeline = Pipeline([("purge_zero",RemoveInvalidZeroValuesAttributes(attributesToClean)),])
    data = data_preprocess_pipeline.fit_transform(data)
    data = pd.DataFrame.from_records(data)
    X, y = Separate_X_y_data(data)
    X_train, X_test, y_train, y_test = produce_train_test_data(X,y,ratio)
    return X_train, X_test, y_train, y_test




import numpy as np
# This returns X_train, X_test, y_train, y_test
def produce_train_test_data(X,y,test_ratio):
    test_set_size = int(len(data)*test_ratio)
    return X[test_set_size:], X[:test_set_size],y[test_set_size:],y[:test_set_size]

def data_selected_feature_and_scaling(features,cleanAttributes,scaling = True,ratio=0.1):
    data = load_diabetes_data()
    data_preprocess_pipeline = Pipeline([("purge_zero",RemoveInvalidZeroValuesAttributes(cleanAttributes)),])
    data = data_preprocess_pipeline.fit_transform(data)
    data = pd.DataFrame.from_records(data)
    data = data[features]
    X, y = Separate_X_y_data(data)
    scaler = StandardScaler()
    X=scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = produce_train_test_data(X,y,ratio)
    return X_train, X_test, y_train, y_test

features=["Plasma","BMI","AGE","PEDIGREE","BP","NOP","SERUM_INSULIN","Target","TSKIN_THICKNESS"]
cleanAttributes = ["Plasma","BMI","BP"]
X_train, X_test, y_train, y_test = data_selected_feature_and_scaling(features,cleanAttributes)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [45]:
log_clf.fit(X_train,y_train)
svm_clf.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [46]:
log_clf.coef_

array([[ 1.11061389,  0.7121622 ,  0.1622974 ,  0.35501381, -0.10553732,
         0.42996716, -0.06959884, -0.0613727 ]])

In [47]:
feature_names = X_train.columns.get_values()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [48]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(X_train,y_train)
for name,score in zip(feature_names, rnd_clf.feature_importances_):
    print(name,score)
    


('NOP', 0.25293392328326364)
('Plasma', 0.16664728494846462)
('BP', 0.13697947975255406)
('TSKIN_THICKNESS', 0.12582020892782955)
('SERUM_INSULIN', 0.08629796494218565)
('BMI', 0.07950339990776505)
('PEDIGREE', 0.08075881623595042)
('AGE', 0.07105892200198671)


In [49]:
# 1: Plasma, 2:BMI, 3:AGE, 4:PEDIGREE ,5:BP, 6:NOP , 7:SERUM_INSULIN, 8: TSKIN_THICKNESS

In [50]:
# trying chapter 7 concepts

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score

rnd_clf = RandomForestClassifier()
log_clf = LogisticRegression()
svm_clf = SVC()
tree_clf = DecisionTreeClassifier()
KNN_clf = KNeighborsClassifier()
gradient_clf = GradientBoostingClassifier()
voting_clf = VotingClassifier(
    estimators = [('lr',log_clf),('rf',rnd_clf),('svc',svm_clf),('tree',tree_clf),('KNN',KNN_clf),('gradient',gradient_clf)],
    voting = 'hard')

voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', RandomFo...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [51]:
from sklearn.metrics import accuracy_score
for clf in (log_clf,rnd_clf,svm_clf,voting_clf,tree_clf,KNN_clf,gradient_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

('LogisticRegression', 0.7631578947368421)
('RandomForestClassifier', 0.7368421052631579)
('SVC', 0.7236842105263158)
('VotingClassifier', 0.7631578947368421)
('DecisionTreeClassifier', 0.7631578947368421)
('KNeighborsClassifier', 0.7236842105263158)
('GradientBoostingClassifier', 0.7894736842105263)


In [141]:
df = pd.DataFrame([{"name":"hh"}])

In [143]:
df1 = pd.DataFrame([{"name2":"hh2"}])

In [144]:
dff = df.append(df1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [145]:
dff

Unnamed: 0,name,name2
0,hh,
0,,hh2
