In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
import numpy as np


# Load the iris data set: feature matrix, class labels and feature names.
iris = load_iris()
X = iris.data
y = iris.target
features = iris.feature_names

# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=111
)

# Shuffled 5-fold splitter over the training rows, reused by the model-selection loops.
crossvalidation = KFold(
    n=len(X_train), n_folds=5, shuffle=True, random_state=1
)

In [23]:
# Logistic Regression
from sklearn import linear_model

# Large C used only as a comparison point against the cross-validated choice.
# NOTE: in scikit-learn, C is the INVERSE of the regularization strength,
# so a larger C means weaker regularization.
BASELINE_C = 150


def report_logistic_regression(C):
    """Fit LogisticRegression(C=C) on the training split and report on the test split.

    Prints the predicted test labels, the true test labels and the test
    accuracy (in that order), then returns ``(classifier, predicted)``.
    """
    classifier = linear_model.LogisticRegression(C=C, random_state=111)
    classifier.fit(X_train, y_train)
    test_predictions = classifier.predict(X_test)
    print(test_predictions)
    print(y_test)
    print(accuracy_score(y_test, test_predictions))  # 1.0 is 100 percent accuracy
    return classifier, test_predictions


maxScore = 0.0
maxReg = 0.0

# Try C = 1.0, 1.5, ..., 9.5 and keep the value with the best mean CV accuracy.
for regularization in np.arange(1, 10, 0.5):
    candidate = linear_model.LogisticRegression(C=regularization, random_state=111)
    score = np.mean(cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=crossvalidation, n_jobs=1))
    # Strict '>' keeps the first (smallest) C when several values tie.
    if score > maxScore:
        maxScore = score
        maxReg = regularization
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Regularization:{0}  Accuracy: {1:.3f}'.format(regularization, score))

# Baseline: fixed C = 150
logClassifier, predicted = report_logistic_regression(BASELINE_C)

print(maxReg)

# Model using the C value found in the loop above
logClassifier, predicted = report_logistic_regression(maxReg)

Regularization:1.0  Accuracy: 0.942
Regularization:1.5  Accuracy: 0.958
Regularization:2.0  Accuracy: 0.967
Regularization:2.5  Accuracy: 0.958
Regularization:3.0  Accuracy: 0.958
Regularization:3.5  Accuracy: 0.958
Regularization:4.0  Accuracy: 0.958
Regularization:4.5  Accuracy: 0.958
Regularization:5.0  Accuracy: 0.958
Regularization:5.5  Accuracy: 0.958
Regularization:6.0  Accuracy: 0.958
Regularization:6.5  Accuracy: 0.958
Regularization:7.0  Accuracy: 0.958
Regularization:7.5  Accuracy: 0.958
Regularization:8.0  Accuracy: 0.958
Regularization:8.5  Accuracy: 0.958
Regularization:9.0  Accuracy: 0.958
Regularization:9.5  Accuracy: 0.958
[0 0 2 2 1 0 0 2 2 1 2 0 1 2 2 0 2 1 0 2 1 2 1 1 2 0 0 2 0 2]
[0 0 2 2 1 0 0 2 2 1 2 0 2 2 2 0 2 1 0 2 1 2 1 1 2 0 0 1 0 2]
0.933333333333
2.0
[0 0 2 2 1 0 0 2 2 1 2 0 2 2 2 0 2 1 0 2 1 2 1 1 2 0 0 2 0 2]
[0 0 2 2 1 0 0 2 2 1 2 0 2 2 2 0 2 1 0 2 1 2 1 1 2 0 0 1 0 2]
0.966666666667


In [15]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forests are reproducible and the configuration scored in
# cross-validation is the same model that is later fitted and evaluated.
RF_SEED = 111

maxScore = 0.0
maxTreeSet = 0

# Try forests of 10, 20, ..., 100 trees and keep the size with the best mean CV accuracy.
for treeset in np.arange(10, 110, 10):
    candidate = RandomForestClassifier(n_estimators=treeset, random_state=RF_SEED)
    score = np.mean(cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=crossvalidation, n_jobs=1))
    # Strict '>' keeps the first (smallest) forest when several sizes tie.
    if score > maxScore:
        maxScore = score
        maxTreeSet = treeset
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('n_estimators:{0}  Accuracy: {1:.3f}'.format(treeset, score))  # to identify the best n_estimators iteratively

# Evaluate two forests on the held-out test set: first the size selected by the
# loop above, then a fixed 100-tree forest for comparison.
for n_trees in (maxTreeSet, 100):
    ## Building the model
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=RF_SEED)

    ## Training the classifier
    clf.fit(X_train, y_train)

    ## Predicting the Species
    predicted = clf.predict(X_test)

    ## Checking the accuracy
    print(accuracy_score(y_test, predicted))

n_estimators:10  Accuracy: 0.967
n_estimators:20  Accuracy: 0.958
n_estimators:30  Accuracy: 0.967
n_estimators:40  Accuracy: 0.967
n_estimators:50  Accuracy: 0.967
n_estimators:60  Accuracy: 0.967
n_estimators:70  Accuracy: 0.975
n_estimators:80  Accuracy: 0.967
n_estimators:90  Accuracy: 0.967
n_estimators:100  Accuracy: 0.967
0.9
0.9


In [24]:
# CART
from sklearn import tree

maxScore = 0.0
maxDepth = 0

# Try increasing depth limits and keep the one with the best mean CV accuracy.
for depth in range(1, 10):
    tree_classifier = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    # Stop once the tree fitted on the training split is shallower than the
    # current limit: the limit is no longer what bounds the tree.
    if tree_classifier.fit(X_train, y_train).tree_.max_depth < depth:
        break
    score = np.mean(cross_val_score(tree_classifier, X_train, y_train, scoring='accuracy', cv=crossvalidation, n_jobs=1))
    # Strict '>' keeps the shallowest depth when several depths tie.
    if score > maxScore:
        maxScore = score
        maxDepth = depth
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Depth:{0}  Accuracy: {1:.3f}'.format(depth, score))  # to identify the best depth iteratively

# Refit with the selected depth and evaluate on the held-out test set.
tree_classifier = tree.DecisionTreeClassifier(max_depth=maxDepth, random_state=0)
tree_classifier.fit(X_train, y_train)

predicted = tree_classifier.predict(X_test)

print(predicted)
print(y_test)

## Checking the accuracy (arguments in the documented (y_true, y_pred) order)
print(accuracy_score(y_test, predicted))

Depth:1  Accuracy: 0.617
Depth:2  Accuracy: 0.975
Depth:3  Accuracy: 0.992
Depth:4  Accuracy: 0.983
[0 0 2 2 2 0 0 2 2 1 2 0 1 2 2 0 2 1 0 2 1 2 1 1 2 0 0 2 0 2]
[0 0 2 2 1 0 0 2 2 1 2 0 2 2 2 0 2 1 0 2 1 2 1 1 2 0 0 1 0 2]
0.9


In [14]:
# Neural Network
from sklearn.neural_network import MLPClassifier

maxSize = 0
maxScore = 0.0

# Try a single hidden layer of 1..14 units (alpha is held fixed at 1e-5) and
# keep the size with the best mean CV accuracy.
for size in range(1, 15):
    candidate = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(size,), random_state=1)
    score = np.mean(cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=crossvalidation, n_jobs=1))
    # Strict '>' keeps the smallest layer size when several sizes tie.
    if score > maxScore:
        maxScore = score
        maxSize = size
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('hidden_layer_sizes:{0}  Accuracy: {1:.3f}'.format(size, score))  # to identify the best hidden_layer_sizes iteratively

# Evaluate two networks on the held-out test set: first the size found above,
# then a fixed size of 15 as a comparison point.
for hidden_units in (maxSize, 15):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(hidden_units,), random_state=1)

    clf.fit(X_train, y_train)

    predicted = clf.predict(X_test)

    ## Checking the accuracy (arguments in the documented (y_true, y_pred) order)
    print(accuracy_score(y_test, predicted))



hidden_layer_sizes:1  Accuracy: 0.250
hidden_layer_sizes:2  Accuracy: 0.733
hidden_layer_sizes:3  Accuracy: 0.250
hidden_layer_sizes:4  Accuracy: 0.942
hidden_layer_sizes:5  Accuracy: 0.425
hidden_layer_sizes:6  Accuracy: 0.250
hidden_layer_sizes:7  Accuracy: 0.933
hidden_layer_sizes:8  Accuracy: 0.967
hidden_layer_sizes:9  Accuracy: 0.975
hidden_layer_sizes:10  Accuracy: 0.983
hidden_layer_sizes:11  Accuracy: 0.975
hidden_layer_sizes:12  Accuracy: 0.967
hidden_layer_sizes:13  Accuracy: 0.975
hidden_layer_sizes:14  Accuracy: 0.983
0.966666666667
0.933333333333


Report:

For each of the algorithms we have used KFold with fold = 5 for cross validation.

    1. Logistic Regression:
    
       We can observe from above that the regularization parameter C can be tuned to obtain a better accuracy. We looped over different values of C and chose the one with the best cross-validation score. Note that in scikit-learn C is the inverse of the regularization strength, so a larger C means weaker regularization. It is interesting to note that the much larger value, C = 150, gives a lower test accuracy (0.933).
       The smaller value selected by the loop, C = 2.0, gives a higher test accuracy (0.967).
       
    2. Random Forest:
        
       For Random Forest we used the forest size (the number of trees) as the parameter for better fitting the data. As above, a loop was used to find the best forest size. A higher forest size has given better accuracy. There may be some forest sizes which do not contribute much to the accuracy, but the ones that do make a huge difference, and that is the reason we go for higher numbers of trees in a random forest.
    
    3. CART:
    
       Using CART, we followed the same procedure as above. This time the depth of the tree is the parameter that we used to choose a better model. The deeper the tree, the more complex it becomes. It is also interesting to note that beyond a certain depth the accuracy starts decreasing, so we need to find the depth with the best score. In our example above we can see that depth = 3 gives the best results; depth = 4 has a lower score than depth = 3.
       
    4. Neural Network:
    
      For neural networks we used MLPClassifier from scikit-learn. Here alpha was fixed at 1e-5 and the hidden layer size was the parameter we varied (from 1 to 14) to find the best model. We can observe that, with alpha = 1e-5, hidden layer sizes of 10 and 14 give the highest cross-validation accuracy (0.983); the loop keeps the smaller of the two, 10.
    
    In our opinion, the neural network is the best model here: it gave a good accuracy.
    