## Objective
Run a few naive baseline models, naive in both feature selection and hyperparameter tuning. Their scores set the benchmark that more carefully designed models should outperform.

In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

In [3]:
# One (features, target, label) triple per dataset variant. The label is the
# value that run_pipe_cv stores under the 'data' key of each result record.
data_target = list(zip(
    (Xuci_1, Xuci_2, Xuci_3, Xdb_1, Xdb_2, Xdb_3),
    (yuci_1, yuci_2, yuci_3, ydb_1, ydb_2, ydb_3),
    ('uci_1', 'uci_2', 'uci_3', 'db_1', 'db_2', 'db_3'),
))

In [4]:
# In scikit-learn, C is the inverse of the regularization strength, so a value
# this large leaves LogisticRegression and SVC essentially unregularized
# (presumably intentional for a naive baseline -- confirm).
C = 10 ** 9

# Estimators to benchmark; everything except C is left at library defaults.
model_list = [
    LogisticRegression(C=C),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    SVC(C=C),
]

In [41]:
def _take_rows(data, positions):
    """Return the rows of ``data`` at the given integer positions.

    Objects exposing ``.iloc`` (pandas DataFrame/Series) are sliced through it
    so the positions are never interpreted as index labels; anything else is
    converted with ``np.asarray`` and indexed directly.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def run_pipe_cv(X, y, data_name, model, results_out=None):
    """Cross-validate ``model`` behind a StandardScaler and record mean scores.

    A ``StandardScaler -> model`` pipeline is fitted on each training fold of
    a shuffled 10-fold ``StratifiedKFold`` (``random_state=42``) and scored on
    both the training fold and the held-out fold. Two records are appended to
    the results list: one with the mean training-fold score
    (``'test_train': 'train'``) and one with the mean held-out-fold score
    (``'test_train': 'test'``).

    Parameters
    ----------
    X : feature table
        Rows are selected by position for every fold.
    y : target values
        Must line up row-for-row with ``X``. Also selected by position, so a
        pandas Series with a non-default index is handled correctly.
    data_name : str
        Label stored under the ``'data'`` key of each record.
    model : estimator
        Used as the final pipeline step.
    results_out : list, optional
        List that receives the two records. When omitted, the module-level
        ``results`` list is used, which must exist before the call.

    Returns
    -------
    None
        Results are communicated by appending to the results list.
    """
    # Resolve the destination first so a missing global fails before any fitting.
    sink = results if results_out is None else results_out

    # Scaling lives inside the pipeline so it is re-fitted on each training fold.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model),
    ])

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    train_scores = []
    test_scores = []
    # split() yields integer positions, hence the positional row selection.
    for train_cv_index, val_cv_index in skf.split(X, y):
        X_train_fold = _take_rows(X, train_cv_index)
        y_train_fold = _take_rows(y, train_cv_index)
        X_val_fold = _take_rows(X, val_cv_index)
        y_val_fold = _take_rows(y, val_cv_index)

        pipe.fit(X_train_fold, y_train_fold)
        # For scikit-learn classifiers, score() is the mean accuracy.
        train_scores.append(pipe.score(X_train_fold, y_train_fold))
        test_scores.append(pipe.score(X_val_fold, y_val_fold))

    # The stored scaler/model are the pipeline's steps as left by the last fold.
    for split_name, fold_scores in (('train', train_scores),
                                    ('test', test_scores)):
        sink.append({
            'test_train': split_name,
            'data': data_name,
            'scaler': pipe.named_steps['scaler'],
            'model': pipe.named_steps['model'],
            'score': np.mean(fold_scores),
        })

    # TODO: the 'test' records are cross-validation (held-out fold) scores.
    # Evaluating on a true test set would require holding one out before the
    # CV loop (e.g. with train_test_split) and scoring a final refit on it.

In [42]:
# Start from an empty list so re-running this cell does not duplicate records;
# run_pipe_cv appends two records (train / test) per dataset-model pair.
results = []

for Xy in data_target:
    for model in model_list:
        # Xy is an (X, y, data_name) triple, spread into the first three arguments.
        run_pipe_cv(*Xy, model)

# One row per appended record.
results_df = pd.DataFrame(results)

In [43]:
# Mean training-fold score for every dataset/model pair, best first
results_df.loc[results_df['test_train'].eq('train')].sort_values(by='score', ascending=False)

Unnamed: 0,data,model,scaler,score,test_train
0,uci_1,"LogisticRegression(C=1000000000, class_weight=...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
22,uci_3,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
42,db_3,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
40,db_3,"LogisticRegression(C=1000000000, class_weight=...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
38,db_2,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
34,db_2,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
32,db_2,"LogisticRegression(C=1000000000, class_weight=...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
30,db_1,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
26,db_1,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",1.0,train
2,uci_1,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",1.0,train


In [44]:
# Mean held-out (validation) fold score for every dataset/model pair, best first
results_df.loc[results_df['test_train'].eq('test')].sort_values(by='score', ascending=False)

Unnamed: 0,data,model,scaler,score,test_train
27,db_1,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",0.608064,test
35,db_2,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",0.594262,test
43,db_3,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",0.593401,test
15,uci_2,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",0.584091,test
23,uci_3,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",0.581951,test
11,uci_2,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",0.575,test
47,db_3,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",0.564926,test
31,db_1,"SVC(C=1000000000, cache_size=200, class_weight...","StandardScaler(copy=True, with_mean=True, with...",0.553046,test
37,db_2,"KNeighborsClassifier(algorithm='auto', leaf_si...","StandardScaler(copy=True, with_mean=True, with...",0.552863,test
3,uci_1,"DecisionTreeClassifier(class_weight=None, crit...","StandardScaler(copy=True, with_mean=True, with...",0.552301,test
