In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import env


# exercises

In [2]:
df = data("mpg")
target="trans"
df[target] = np.where(df.trans.str.startswith("auto"),"auto","manual")
train,test = train_test_split(df[["trans","displ","year",'cyl',"cty","hwy"]],test_size=.25,random_state=123,stratify=df["trans"])
X_train = train.drop(columns=target)
y_train=train[target]
X_test = test.drop(columns=target) 
y_test=test[target]
X_train.shape,X_test.shape

((175, 5), (59, 5))

In [21]:
df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto,f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual,f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual,f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto,f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto,f,16,26,p,compact


In [19]:
def logistic_regression_classifier_blind_loop(  X_train,y_train,X_test,y_test,
            solver = ["newton-cg","lbfgs","liblinear","sag","saga"],
            max_iter = range(10,300,10),
            multi_class = ["auto","ovr","multinomial"],
            cv_folds=5):
    ''' 
    input:  
    '''
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression

    glm = LogisticRegression(random_state=123)

    params = {  "solver":solver,
                "max_iter":max_iter,
                "multi_class":multi_class,
                }


    grid = GridSearchCV(glm,params,cv=cv_folds)

    grid.fit(X_train,y_train)

    glm_model = grid.best_estimator_

    for params,score in zip(grid.cv_results_["params"], grid.cv_results_["mean_test_score"]):
        params["score"] = score
        

    return glm_model.score(X_test,y_test),glm_model.get_params(),pd.DataFrame(grid.cv_results_["params"]).sort_values("score",ascending=False)


In [20]:
logistic_regression_classifier_blind_loop(  X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

(0.6779661016949152,
 {'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 130,
  'multi_class': 'multinomial',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 123,
  'solver': 'sag',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
      max_iter  multi_class     solver  score
 383       260          ovr        sag   0.68
 328       220  multinomial        sag   0.68
 313       210  multinomial        sag   0.68
 368       250          ovr        sag   0.68
 193       130  multinomial        sag   0.68
 ..        ...          ...        ...    ...
 372       250  multinomial  liblinear    NaN
 387       260  multinomial  liblinear    NaN
 402       270  multinomial  liblinear    NaN
 417       280  multinomial  liblinear    NaN
 432       290  multinomial  liblinear    NaN
 
 [435 rows x 4 columns])

In [10]:
def multi_layer_perceptron_classifier_blind_loop(  X_train,y_train,X_test,y_test,
            hidden_layer_sizes = (100,),
            activation = ['identity', 'logistic', 'tanh', 'relu'],
            solver = ["adam"],
            alpha = [.0001,.001,.01],
            learning_rate = ["adaptive"],
            max_iter = [50,100,150,200],
            cv_folds=5):
    ''' 
    input:  "hidden_layer_sizes":(100,),
            "activation":['identity', 'logistic', 'tanh', 'relu'],
            "solver":["lbfgs","sgd","adam"],
            "alpha":range(1,1000)/10_000,
            "learning_rate":["constant","invscaling","adaptive"],
            "max_iter":200,
    '''
    from sklearn.model_selection import GridSearchCV
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(random_state=123)

    params = {  "hidden_layer_sizes":hidden_layer_sizes,
                "activation":activation,
                "solver":solver,
                "alpha":alpha,
                "learning_rate":learning_rate,
                "max_iter":max_iter,
                }


    grid = GridSearchCV(mlp,params,cv=cv_folds)

    grid.fit(X_train,y_train)

    mlp_model = grid.best_estimator_

    for params,score in zip(grid.cv_results_["params"], grid.cv_results_["mean_test_score"]):
        params["score"] = score
        

    return mlp_model.score(X_test,y_test),mlp_model.get_params(),pd.DataFrame(grid.cv_results_["params"]).sort_values("score",ascending=False)


In [11]:
multi_layer_perceptron_classifier_blind_loop(  X_train,y_train,X_test,y_test)



(0.6949152542372882,
 {'activation': 'logistic',
  'alpha': 0.01,
  'batch_size': 'auto',
  'beta_1': 0.9,
  'beta_2': 0.999,
  'early_stopping': False,
  'epsilon': 1e-08,
  'hidden_layer_sizes': 100,
  'learning_rate': 'adaptive',
  'learning_rate_init': 0.001,
  'max_fun': 15000,
  'max_iter': 100,
  'momentum': 0.9,
  'n_iter_no_change': 10,
  'nesterovs_momentum': True,
  'power_t': 0.5,
  'random_state': 123,
  'shuffle': True,
  'solver': 'adam',
  'tol': 0.0001,
  'validation_fraction': 0.1,
  'verbose': False,
  'warm_start': False},
    activation   alpha  hidden_layer_sizes learning_rate  max_iter solver  \
 21   logistic  0.0100                 100      adaptive       100   adam   
 23   logistic  0.0100                 100      adaptive       200   adam   
 26       tanh  0.0001                 100      adaptive       150   adam   
 20   logistic  0.0100                 100      adaptive        50   adam   
 17   logistic  0.0010                 100      adaptive       100

In [None]:
def k_neighbors_classifier_blind_loop(  X_train,y_train,X_test,y_test,
                                        n_neighbors=range(1,8),
                                        weights = ["uniform","distance"],
                                        algorithm = ['auto'],
                                        leaf_size = range(5,50,5),
                                        cv_folds=5):
    ''' 
    input:  "n_neighbors":range(2,8),
            "weights":["uniform","distance"],
            "algorithm":['auto', 'ball_tree', 'kd_tree', 'brute'],
            "leaf_size":range(10,50,5),
    '''
    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier

    knc = KNeighborsClassifier()

    params = {  "n_neighbors":n_neighbors,
                "weights":weights,
                "algorithm":algorithm,
                "leaf_size":leaf_size,
                }


    grid = GridSearchCV(knc,params,cv=cv_folds)

    grid.fit(X_train,y_train)

    knc_model = grid.best_estimator_

    for params,score in zip(grid.cv_results_["params"], grid.cv_results_["mean_test_score"]):
        params["score"] = score
        

    return knc_model.score(X_test,y_test),knc_model.get_params(),pd.DataFrame(grid.cv_results_["params"]).sort_values("score",ascending=False)


In [None]:
k_neighbors_classifier_blind_loop(X_train,y_train,X_test,y_test)

(0.6101694915254238,
 {'algorithm': 'auto',
  'leaf_size': 5,
  'metric': 'minkowski',
  'metric_params': None,
  'n_jobs': None,
  'n_neighbors': 4,
  'p': 2,
  'weights': 'uniform'},
      algorithm  leaf_size  n_neighbors   weights     score
 34        auto         15            4   uniform  0.697143
 6         auto          5            4   uniform  0.697143
 20        auto         10            4   uniform  0.697143
 258    kd_tree          5            4   uniform  0.697143
 160  ball_tree         15            4   uniform  0.697143
 ..         ...        ...          ...       ...       ...
 350    kd_tree         40            1   uniform  0.582857
 113       auto         45            1  distance  0.582857
 112       auto         45            1   uniform  0.582857
 84        auto         35            1   uniform  0.582857
 99        auto         40            1  distance  0.582857
 
 [504 rows x 5 columns])

In [None]:
def decison_tree_blind_loop(X_train,y_train,X_test,y_test,
                            criterion=["gini","entropy"],
                            splitter=["best"],
                            max_depth=range(1,10),
                            min_samples_split=range(2,3),
                            min_samples_leaf=range(1,11),
                            max_features=[None],
                            cv_folds=5):
    ''' 
    inputs: (if unsure or looking for easy results, leave as is)
            criterion=["gini","entropy","log_loss"],
            splitter=["best","random"],
            max_depth=range(1,10),
            min_samples_split=range(2,10),
            min_samples_leaf=range(1,11),  
            max_features=["sqrt","log2",None]
    does a grid search on input features to determine best parameters on training set, and passes it to test set for accuracy check
    returns the score(float),best_params(dict),model_results(dataframe)
    '''
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    dt = DecisionTreeClassifier(random_state=123)

    params = {  "criterion":criterion,
                "splitter":splitter,
                "max_depth":max_depth,
                "min_samples_split":min_samples_split,
                "min_samples_leaf":min_samples_leaf,
                "max_features":max_features
                }

    grid = GridSearchCV(dt,params,cv=cv_folds)

    grid.fit(X_train,y_train)

    dt_model = grid.best_estimator_

    for params,score in zip(grid.cv_results_["params"], grid.cv_results_["mean_test_score"]):
        params["score"] = score
        

    return dt_model.score(X_test,y_test),dt_model.get_params(),pd.DataFrame(grid.cv_results_["params"]).sort_values("score",ascending=False)

In [None]:
decison_tree_blind_loop(X_train,y_train,X_test,y_test)

(0.711864406779661,
 {'ccp_alpha': 0.0,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': 6,
  'max_features': None,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'random_state': 123,
  'splitter': 'best'},
     criterion  max_depth max_features  min_samples_leaf  min_samples_split  \
 50       gini          6         None                 1                  2   
 51       gini          6         None                 2                  2   
 118   entropy          3         None                 9                  2   
 148   entropy          6         None                 9                  2   
 138   entropy          5         None                 9                  2   
 ..        ...        ...          ...               ...                ...   
 150   entropy          7         None                 1                  2   
 17       gini          2         None           

# from in class

In [None]:
df = data("mpg")

In [None]:
df.trans = np.where(df.trans.str.startswith("auto"),"auto","manual")
df.head()

((175, 5), (59, 5))

In [None]:
target = "trans"

In [None]:
train,test = train_test_split(df[["trans","displ","year",'cyl',"cty","hwy"]],test_size=.25,random_state=123,stratify=df["trans"])
X_train = train.drop(columns=target)
y_train=train[target]
X_test = test.drop(columns=target) 
y_test=test[target]
X_train.shape,X_test.shape

((175, 5), (59, 5))

In [None]:
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_train,y_train)
tree.score(X_train,y_train)
tree.score(X_test,y_test)

0.6440677966101694

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
tree = DecisionTreeClassifier(max_depth=4,random_state=123)
cross_val_score(tree,X_train,y_train,cv=5)

array([0.68571429, 0.74285714, 0.65714286, 0.62857143, 0.6       ])

In [None]:
cross_val_score(tree,X_train,y_train,cv=5).mean()

0.6628571428571429

In [None]:
results = []
for x in range(1,21):
    tree = DecisionTreeClassifier(max_depth=x,random_state=123)
    score = cross_val_score(tree,X_train,y_train,cv=5).mean()
    results.append([x,score])
pd.DataFrame(results,columns=["max_depth","accuracy"])

Unnamed: 0,max_depth,accuracy
0,1,0.64
1,2,0.617143
2,3,0.645714
3,4,0.662857
4,5,0.651429
5,6,0.742857
6,7,0.645714
7,8,0.64
8,9,0.674286
9,10,0.691429


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
knn = KNeighborsClassifier()
params = {'n_neighbors':range(1,21)}
grid = GridSearchCV(knn,params,cv=5)
grid.fit(X_train,y_train)
grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_n_neighbors', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [None]:
grid.cv_results_["mean_test_score"]

array([0.58285714, 0.67428571, 0.67428571, 0.69142857, 0.64      ,
       0.65142857, 0.61714286, 0.61714286, 0.62857143, 0.60571429,
       0.62857143, 0.61714286, 0.62285714, 0.61142857, 0.61714286,
       0.63428571, 0.62285714, 0.60571429, 0.60571429, 0.63428571])

In [None]:
grid.best_estimator_

In [None]:
knn_model=grid.best_estimator_
knn_model.score(X_test,y_test)

0.6440677966101694

In [None]:
from sklearn.model_selection import GridSearchCV

dt = DecisionTreeClassifier(random_state=123)

params = {  "criterion":["gini","entropy","log_loss"],
            "splitter":["best","random"],
            "max_depth":range(1,21),
            "min_samples_split":range(1,5),
            "min_samples_leaf":range(1,11),
            "max_features":["sqrt","log2",None]
            }

GridSearchCV(dt,params,cv=5)

grid = GridSearchCV(dt,params,cv=5)

grid.fit(X_train,y_train)

dt_model = grid.best_estimator_

dt_model.get_params()

dt_model.score(X_test,y_test)

NameError: name 'X_train' is not defined

In [None]:
grid = GridSearchCV(dt,params,cv=5)
grid.fit(X_train,y_train)

In [None]:
grid.cv_results_["mean_test_score"][:5]

array([0.64, 0.64, 0.64, 0.64, 0.64])

In [None]:
for params,score in zip(grid.cv_results_["params"],
                grid.cv_results_["mean_test_score"]):
    params["score"] = score

In [None]:
grid.cv_results_["params"][:5]

[{'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'score': 0.64},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2, 'score': 0.64},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 3, 'score': 0.64},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 4, 'score': 0.64},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5, 'score': 0.64}]

In [None]:
for params,score in zip(grid.cv_results_["params"],
                grid.cv_results_["mean_test_score"]):
    params["score"] = score
pd.DataFrame(grid.cv_results_["params"]).sort_values("score",ascending=False)

Unnamed: 0,criterion,max_depth,min_samples_leaf,score
50,gini,6,1,0.742857
170,gini,18,1,0.708571
180,gini,19,1,0.708571
190,gini,20,1,0.708571
160,gini,17,1,0.708571
...,...,...,...,...
352,entropy,16,3,0.605714
17,gini,2,8,0.600000
16,gini,2,7,0.600000
15,gini,2,6,0.600000


In [None]:
grid.best_estimator_

In [None]:
dt_model = grid.best_estimator_
dt_model.get_params()
dt_model.score(X_test,y_test)

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 6,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 123,
 'splitter': 'best'}

In [None]:
dt_model.score(X_test,y_test)

0.711864406779661

In [None]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
le = LabelEncoder()

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
y_train_encoded

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1])

In [None]:
# tensorflow imports
from tensorflow import keras
from keras import models,layers
# getting our data
from keras.datasets import mnist

import pandas as pd
import numpy as np



#acquisition
# basic ann model creation:
# make a model type --> invoked similary to the way we do in sklearn
network = models.Sequential()
# add layers into our model:

#middle layer
#512 is the middle layer and is reduced from the total (784) hence dense (not necessarily needed, but ideal if you're trying to reduce down)
# weight*activation + bias
network.add(layers.Dense(int(X_train.shape[1]*.5), activation="relu"))

#output layter
# only 10 in output layer due to only ten classes
network.add(layers.Dense(   len(y_train.unique()), 
                            activation="sigmoid" #softmax,sigmoid,relu
                            ))


network.compile(optimizer = "adam",
                loss= "binary_crossentropy",
                metrics=["accuracy"])


train_labels = keras.utils.to_categorical(y_train_encoded.astype("float"))
test_labels = keras.utils.to_categorical(y_test_encoded.astype("float"))


network.fit(X_train, 
            train_labels, 
            epochs=20,
            batch_size=int(len(y_train)/29))
            
test_loss, test_acc = network.evaluate(X_test,test_labels)
print(f'\n\n----------------\n\n\naccuracy of network on test set: {test_acc}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


----------------


accuracy of network on test set: 0.6779661178588867
