In [1]:
#########################
# User input
#########################
# Location of the training dataset (a CSV that pandas.read_csv can open)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Alternative dataset:
# url ='https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data'

# Column names, in file order.
# ******* The label column must be the LAST entry: the code below always
# ******* treats the final column as the target.
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
# Alternative column list:
# names=['age', 'work_class','fnlwt','edu','edu_num','mat_stat','occu','rela','race','sex','capital_gain','capital_los','hr_week','native','label']

########################
# Load libraries
########################
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# scatter_matrix lives in pandas.plotting in current pandas; the old
# pandas.tools.plotting path is kept as a fallback for legacy installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler #for standardizing data
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Random seed handed to the train/validation split and to cross-validation.
seed = 10
# Shared encoder used further down to turn text columns into integer codes.
le = LabelEncoder()


############################
# Load dataset
############################
def load_data(url):
    """Read the CSV found at ``url`` into a pandas DataFrame.

    Column labels come from the module-level ``names`` list.
    """
    return pd.read_csv(url, engine='python', names=names)

#Run the function

# Load the data into the frame that every later step works on.
dataset = load_data(url)
# Optional: keep only a slice of the rows
# dataset = dataset.iloc[1:100,:]




##########################
#Data Exploration
#########################
#Numeric exploration

def explore(dataset):
    """Print a numeric overview of ``dataset`` and show a correlation heatmap.

    Reports, in order: shape, column dtypes, ``DataFrame.info()``, a Pearson
    correlation heatmap, skew, summary statistics, the row count of every
    label (the last column is treated as the label column), a class-imbalance
    warning, and the first ten rows.

    Args:
        dataset: pandas DataFrame to describe.

    Returns:
        None. Everything is printed or plotted.
    """
    # Shape
    print("The dimention of the dataset is: "+ str(dataset.shape))
    print(' ')

    # Data type of each column
    print("The data types of the different classes are")
    print(dataset.dtypes)
    print(' ')

    # Column dtypes, non-null counts and memory usage. info() prints its own
    # report and returns None, so wrapping it in print() only added a stray
    # "None" line.
    dataset.info()
    print(' ')

    # Correlation between attributes
    print("The correlation between attributes are")
    # Use the fully qualified option name; the bare 'precision' key is
    # ambiguous in recent pandas releases.
    pd.set_option('display.precision', 3)
    # Correlation only makes sense for numeric columns; selecting them
    # explicitly keeps text columns from raising in recent pandas.
    numeric = dataset.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr(method='pearson')
    if not corr.empty:
        sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
        # seaborn no longer exposes pyplot as ``sns.plt``.
        plt.show()

    print(' ')

    # Skewness of the univariate distributions.
    # Skew describes how far a distribution that is assumed Gaussian
    # (normal / bell curve) is shifted or squashed in one direction.
    print('The Sqew of the attributes from Gaussian are')
    skew = dataset.skew(numeric_only=True)
    print(skew)
    print(' ')

    # Summary statistics
    print("The summary statistics of the dataset is: ")
    print(dataset.describe())
    print(' ')

    # Number of rows for each class (grouped by the last column).
    label_column = dataset.columns[-1]
    counts = dataset.groupby(label_column).size()
    print("The labels in the data are: ")

    print("There are %s labels in the dataset \n"% (len(counts)))

    # Iterate over (label, count) pairs instead of indexing by position, which
    # is ambiguous when the labels themselves are integers.
    for label, count in counts.items():
        print("The label %s has %s elements \n" %( label, count))

    # The imbalance check needs at least two classes to compare.
    if len(counts) >= 2:
        largest, second = counts.nlargest(2)
        if int(second*2) <= int(largest):
            print("**********There is huge difference in the counts between the classes****** \n")
            print("**********Need to fix this before modeling****** ")
        else:
            print("The count of the largest class is %s times the second largest class \n" % (largest/second))

    # Top 10 rows
    print(' ')

    print('The first 10 rows are: ')
    print(dataset.head(10))
    print(' ')
    

#Run the function
# explore(dataset)

#Data Exploration by visualzation

def data_vis_all(dataset):
    """Draw box plots, histograms and a scatter matrix for ``dataset``.

    One box plot and one histogram panel is drawn per plotted column, laid
    out on a single row, followed by a scatter matrix of the frame.

    Args:
        dataset: pandas DataFrame to visualise.

    Returns:
        None. The figures are shown with ``plt.show()``.
    """
    print("Univariate plots and Multivariate plots:")
    # One row; -1 lets pandas work out how many panels are needed, so the
    # function is not tied to frames with exactly four plotted columns.
    layout = (1, -1)
    # Box-and-whisker plots
    dataset.plot(kind='box', subplots=True, figsize=(20, 5), layout=layout,
                 sharex=False, grid=True, sharey=False)
    # Histograms
    dataset.plot(kind='hist', subplots=True, figsize=(20, 5), layout=layout,
                 sharex=False, grid=True, sharey=False)
    # Pairwise scatter plots
    scatter_matrix(dataset, figsize=(10, 10))
    plt.show()

#Run the function
#data_vis_all(data)

#plot just one component

def data_vis_one(dataset, col1, col2=None):
    """Show side-by-side histograms for one or two columns of ``dataset``.

    Args:
        dataset: pandas DataFrame holding the data.
        col1: name of the column drawn in the left-hand histogram.
        col2: name of the column drawn in the right-hand histogram.
            Defaults to ``col1`` when omitted.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if col2 is None:
        col2 = col1

    fig, ax = plt.subplots(1, 2)
    # Look the columns up by name on the frame that was passed in (the
    # previous code read an undefined ``dataset1`` and fixed attribute names).
    ax[0].hist(dataset[col1], 10, facecolor='red', alpha=0.5, label="Give me a label")
    ax[1].hist(dataset[col2], 10, facecolor='white', ec="black", lw=0.5, alpha=0.5, label="Give me a label")
    fig.subplots_adjust(left=0, right=1, bottom=0, top=0.5, hspace=0.05, wspace=1)
    # Fixed y-range on the left panel only; adjust for small datasets.
    ax[0].set_ylim([0, 1000])
    # Axis captions are placeholders to be replaced by the user.
    ax[0].set_xlabel("This is my xlabel")
    ax[0].set_ylabel("Frequency")
    ax[1].set_xlabel("This is my Y label")
    ax[1].set_ylabel("Frequency")
    #ax[0].legend(loc='best')
    #ax[1].legend(loc='best')
    fig.suptitle("Distribution")
    plt.show()
    
# data_vis_one(dataset,age)


#########################
#Data processing
#########################

#Handling columns with unusual numbers
#Here we replace the unusual number '0' with NaN
# dataset[['col1','col2']]= dataset[['col1','col2']].replace(0, np.NaN)

#############
#Handling null values
# Count the rows that contain at least one missing value.
nrows = int(dataset.isnull().any(axis=1).sum())
print(' ')
print("There are "+str(nrows)+ " rows with null values")
print(' ')

#removing rows with null values
# dataset.dropna(inplace=True)

# Fill missing values with the column mean. Means only exist for numeric
# columns, so restrict the computation to them; recent pandas raises when
# asked for the mean of a text column.
dataset = dataset.fillna(dataset.mean(numeric_only=True))
##############

#Convert categorical attributes to numeric values
# Every text (object) column is replaced by integer codes. The shared encoder
# is refitted per column, so afterwards it only remembers the last one.
for col in dataset.columns.values:
    if dataset[col].dtypes == 'object':
        dataset[col] = le.fit_transform(dataset[col].values)
        

###########################
#Test and Train data split
##########################



def test_train(dataset,seed):
    """Split ``dataset`` into training and validation arrays.

    All columns except the last are used as features and the last column as
    the target. 20% of the rows are held out for validation; ``seed`` is
    passed on as the split's ``random_state``.

    Returns:
        X_train, X_validation, Y_train, Y_validation
    """
    values = dataset.values
    last = len(dataset.columns) - 1
    features = values[:, 0:last]
    target = values[:, last]
    holdout_fraction = 0.20
    parts = model_selection.train_test_split(
        features, target, test_size=holdout_fraction, random_state=seed)
    return tuple(parts)

# Perform the split and report progress
X_train, X_validation, Y_train, Y_validation = test_train(dataset, seed)
for line in (' ', "Data split done", ' '):
    print(line)

###########################################
#Preprocessing data before modeling
#We do data standardization. Standardization is a way to deal with feature values that lie far apart:
#each feature is rescaled to have a mean of 0 and a standard deviation of 1
###########################################

def standard(X_train,X_test):
    """Standardize both feature sets with a scaler fitted on the training set.

    The StandardScaler is fitted on ``X_train`` only and then applied to
    ``X_train`` and ``X_test`` alike.

    Returns:
        The scaled (X_train, X_test) pair.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    scaled_train, scaled_test = [scaler.transform(part) for part in (X_train, X_test)]
    return scaled_train, scaled_test

#################################
#Building models for classification
#################################

def crossval(X_train, Y_train,seed):
    """Score six candidate classifiers with 10-fold cross-validation.

    Args:
        X_train: training feature matrix.
        Y_train: training labels.
        seed: random state used to shuffle the rows before folding.

    Returns:
        A ``(results, names, result_mean)`` tuple:
        results     -- list with one array of per-fold accuracies per model
        names       -- list of model short names, in the same order
        result_mean -- dict mapping model short name to its mean accuracy
    """
    models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
    ]
    scoring = 'accuracy'

    # random_state only has an effect when shuffling is enabled; recent
    # scikit-learn raises ValueError if it is set while shuffle is False.
    # The same splitter (hence the same folds) is used for every model.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    results = []
    model_names = []
    result_mean = {}

    for name, model in models:
        cv_result = model_selection.cross_val_score(
            model, X_train, Y_train, cv=kfold, scoring=scoring)
        results.append(cv_result)
        model_names.append(name)
        result_mean[name] = cv_result.mean()

    return results, model_names, result_mean
        

#Run the function
# The outputs get their own names so they do not overwrite the column list
# `names` (read by load_data) or collide with the `results` function below.
cv_results, model_names, result_mean = crossval(X_train, Y_train, seed)



# Pick the classifier with the highest mean cross-validation accuracy
max_key = max(result_mean, key=result_mean.get)


print("Cross validation done")
print(' ')
###########################
#Plotting results of cross_validation
#########################

# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax= fig.add_subplot(111)
# plt.boxplot(cv_results)
# ax.set_xticklabels(model_names)
# plt.show()

################################
#Using the best classifier
################################

def best_class(max_key, X_train, Y_train, X_validation, Y_validation,seed):
    """Fit the selected classifier and evaluate it on the validation set.

    Args:
        max_key: short name of the classifier to use; one of
            'KNN', 'LR', 'LDA', 'CART', 'NB' or 'SVM'.
        X_train: training feature matrix.
        Y_train: training labels.
        X_validation: validation feature matrix.
        Y_validation: validation labels.
        seed: not used by this function; kept so existing calls still work.

    Returns:
        A ``(accuracy, confusion, report, classifier)`` tuple: the accuracy
        score, the confusion matrix, the text classification report and the
        fitted classifier.

    Raises:
        ValueError: if ``max_key`` is not one of the supported names.
    """
    # Short name -> estimator class. A single lookup replaces the per-model
    # branches, so every model (KNN included) yields a fitted ``classifier``.
    estimators = {
        'KNN': KNeighborsClassifier,
        'LR': LogisticRegression,
        'LDA': LinearDiscriminantAnalysis,
        'CART': DecisionTreeClassifier,
        'NB': GaussianNB,
        'SVM': SVC,
    }
    if max_key not in estimators:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError('Error in model selection: unknown classifier %r' % (max_key,))

    classifier = estimators[max_key]()
    classifier.fit(X_train, Y_train)

    # Make predictions on the validation dataset
    predictions = classifier.predict(X_validation)

    # Accuracy: observations predicted correctly / total number of observations
    accuracy = accuracy_score(Y_validation, predictions)

    # Confusion matrix: correct and incorrect predictions for each class
    confusion = confusion_matrix(Y_validation, predictions)

    # Precision (Positive Predictive Value) = TP / (TP + FP)
    # Recall (Sensitivity)                  = TP / (TP + FN)
    # F1 score = 2 * (precision * recall) / (precision + recall)
    report = classification_report(Y_validation, predictions)

    return accuracy, confusion, report, classifier

# Fit the winning classifier and collect its validation metrics
accuracy, confusion, report, classifier = best_class(
    max_key, X_train, Y_train, X_validation, Y_validation, seed)

for line in ("Model fitting done", ' '):
    print(line)

def results(accuracy, confusion,report,max_key):
    """Print the chosen classifier and its validation metrics.

    Args:
        accuracy: accuracy as a fraction; it is printed multiplied by 100.
        confusion: confusion matrix, printed as-is.
        report: classification report, printed as-is.
        max_key: name of the best classifier.
    """
    output = (
        "The best classifier is",
        max_key,
        "The accuracy of the model is: ",
        accuracy*100,
        ' ',
        "Confusion Matrix:",
        confusion,
        ' ',
        "Classification Report: ",
        report,
    )
    for item in output:
        print(item)
    
# Print the evaluation summary
results(accuracy=accuracy, confusion=confusion, report=report, max_key=max_key)

####################################
#Model into action
###################################

def classify_data(classifier, data):
    """Print the class that ``classifier`` predicts for the first row of ``data``.

    Args:
        classifier: object with a ``predict`` method.
        data: samples passed to ``predict``; only the first prediction is shown.
    """
    print("According to the model the correct classification is:")
    predicted = classifier.predict(data)
    first_prediction = predicted[0]
    print(first_prediction)

#Give the data here

# data= [[6.7,3.1,5.6,2.4]]
# data =[[50,5,83311,9 ,13,2,4,0,4,1,0,0,13,25]]

#Here are the results

#classify_data(classifier, data)



 
There are 0 rows with null values
 
 
Data split done
 
Cross validation done
 
Model fitting done
 
The best classifier is
LDA
The accuracy of the model is: 
100.0
 
Confusion Matrix:
[[10  0  0]
 [ 0 13  0]
 [ 0  0  7]]
 
Classification Report: 
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        10
        1.0       1.00      1.00      1.00        13
        2.0       1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00        30



In [16]:
# Show the last three rows of the processed dataset
dataset.tail(3)


Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
149,5.9,3.0,5.1,1.8,2


In [2]:
# One sample with four feature values, in the same order as the feature columns
data = [[4.6, 3.1, 1.5, 0.2]]
classify_data(classifier, data)

According to the model the correct classification is:
0.0
