<h1 align=center>Project 4</h1>
<br>
<div align=center>
$$
\textbf{Team G} \\ 
\text{Evangelou Sotiris 2159} \\ 
\text{Kalais Konstantinos 2146} \\ 
\text{Chatziefremidis Leuteris 2209} \\ 
$$
<div>

<h1>Problem 1 - Decision Trees</h1>

# $\triangleright$ Libraries

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing
from collections import deque

# $\triangleright$ Read from csv

In [2]:
#Get the csv data and  split into features,labels
df = pd.read_csv('./tennis.csv')
print(df.head())

    Outlook Temperature Humidity    Wind PlayTennis
0     Sunny         Hot     High    Weak         No
1     Sunny         Hot     High  Strong         No
2  Overcast         Hot     High    Weak        Yes
3      Rain        Mild     High    Weak        Yes
4      Rain        Cool   Normal    Weak        Yes


# $\triangleright$ Returns the labels of a column

In [3]:
def getLabelsFromFeature(feature):
   # print(feature)
    le = preprocessing.LabelEncoder()
    le.fit(feature)
   # print(le.classes_.tolist())
    return le.classes_.tolist()

# $\triangleright$ Entropy of all dataset

In [4]:
def full_dataset_entropy(dataset):
    
    #Parameters we will use:
    entropy = 0
    numOfYes = 0
    numOfNo = 0
    
    #Retrieve only the labels 
    df = pd.DataFrame(dataset,columns=['PlayTennis'])
    labels = np.array(df)
    
    for item in labels:
        if(item[0] =='Yes'):
            numOfYes+=1
        else:
            numOfNo+=1
    
    #Calculate the probabilities of Yes,No
    p_plus = numOfYes/len(labels)
    p_minus = numOfNo/len(labels)
    
    if(p_plus == 0 and p_minus == 0):
        return 0
    if(p_plus == 0):
        return - p_minus*math.log(p_minus,2)
    if(p_minus == 0):
        return -p_plus*math.log(p_plus,2)
    
    #Return the entropy 
    return -p_plus*math.log(p_plus,2) - p_minus*math.log(p_minus,2)

# $\triangleright$ Entropy of a feature's values

In [5]:
def entropy_for_feature_value(dataset,feature_name,value):
    
    #Retrieve the feature column with also labels included
    feature_with_labels_df = pd.DataFrame(dataset,columns=[feature_name,'PlayTennis'])
    aboveArr = np.array(feature_with_labels_df)

    #Calculate how many positive and negative a value in a column has
    #in order to compute the probability
    countedValues = 0
    isPositive = 0
    isNegative = 0
    for item in aboveArr:
        if(item[0] == value):
            countedValues+=1
        if(item[0]==value and item[1]=='Yes'):
            isPositive+=1
        if(item[0]==value and item[1]=='No'):
            isNegative+=1
    
    p_plus = isPositive/countedValues
    p_minus = isNegative/countedValues
    
    #Return the entropy
    if(p_plus == 0 and p_minus == 0):
        return 0
    if(p_plus == 0):
        return - p_minus*math.log(p_minus,2)
    if(p_minus == 0):
        return -p_plus*math.log(p_plus,2)
    
    return -p_plus*math.log(p_plus,2) - p_minus*math.log(p_minus,2)

# $\triangleright$ Entropy of a feature

In [6]:
#Calculates how many times a value appears in a feature
def countHowManyItAppears(col,value):
    count=0
    
    for item in col:
        if(item == value):
            count+=1
    return count
def entropy_for_feature(dataset,feature_name):
    
    #Get the values that a feature has
    values = getLabelsFromFeature(dataset[feature_name])
    col = dataset[feature_name]
    
    entropy =0
    
    #Calculate the propability and then the entropy
    for v in values:
        prop = (countHowManyItAppears(col,v)/len(col))
        entropy+= prop * entropy_for_feature_value(dataset,feature_name,v) 
    return entropy

# $\triangleright$ Information Gain

In [7]:
def information_gain(dataset,column_name):
    return full_dataset_entropy(dataset) - entropy_for_feature(dataset,column_name)

In [8]:
def findMaxGain(attributes,dataset):
    
    best_attr = 0
    best_attr_name=None
    
    for item in attributes:
        if(information_gain(dataset,item) >= best_attr):
            best_attr = information_gain(dataset,item)
            best_attr_name=item
    return best_attr_name     

# $\triangleright$ Class of tree node

In [9]:
class Node(object):
    def __init__(self):
        self.value = None
        self.next = None
        self.childs = None

# $\triangleright$ Check if the dataset has only one label

In [10]:
def onlyOneLabel(dataset):
    
    #Get the labels
    df = pd.DataFrame(dataset,columns=['PlayTennis'])
    arr = np.array(df)
    
    #Check if all the labels are Yes or No
    lab = arr[0][0]
    
    for i in range(0,len(arr)):
        if(arr[i][0] !=lab):
            return False
    return True

# $\triangleright$ Get the dominant label in a dataset

In [11]:
def getDominantLabel(dataset):
    
    dataset = pd.DataFrame(dataset,columns=['PlayTennis'])
    data = np.array(dataset)
    
    countYes = 0 
    countNo = 0
    for item in data:
        if(item =='Yes'):
            countYes+=1
        else:
            countNo+=1
    if(countYes > countNo):
        return 'Yes'
    return 'No'

# $\triangleright$ Leaf node dataset

In [12]:
def leafNodeDataset(dataset,column_name,value):
    return dataset.loc[dataset[column_name] == value] 

# $\triangleright$ ID3 Algorithm

In [13]:
def ID3(dataset,attributes):
    root = Node()
    
    #Check if all is Yes or No
    if(onlyOneLabel(dataset)):
        #print("Only One Label")
        root.value = np.array(pd.DataFrame(dataset,columns=['PlayTennis']))[0]
        return root
    
    #If there are no features to test pick the dominant
    if(len(attributes) == 0):
        #print("No attributes")
        root.value = getDominantLabel(dataset)
        return root
    
    #Calculate the best attribute based on the information gain
    bestAttribute = findMaxGain(attributes,dataset)
    #print(bestAttribute)
    
    #Set to the root the best attribute
    #print("Best Attribute: ",bestAttribute)
    root.value = bestAttribute
    root.childs = []
    
    #Get values of feature
    values = getLabelsFromFeature(dataset[bestAttribute])
    
    #print(values)
    for v in values:
        
        #Create a child node for each value that
        #the feature has 
        child = Node()
        child.value = v
        
        #Append each child to the root with the
        #best information gain
        root.childs.append(child)
        
        #Get the samples who has the values of 
        #best attribute equal to True
        nextDataset = leafNodeDataset(dataset,bestAttribute,v)
        
        #If we are on leaf
        if(len(nextDataset) == 0):
            child.next = getDominantLabel(dataset)
        else:
            at_copy  = list(attributes)
            at_copy.remove(bestAttribute)
            child.next = ID3(nextDataset,at_copy)
    return root

# $\triangleright$ Print tree

In [14]:
def printTree(root):
    if root:
        roots = deque()
        roots.append(root)
        
        while( len(roots) > 0):
            root = roots.popleft()
            print(root.value)
            if(root.childs):
                for child in root.childs:
                    print('({})'.format(child.value))
                    roots.append(child.next)
            elif root.next:
                print(root.next)

# $\triangleright$ Train

In [15]:
def train(df):
    
    #Get attributes
    df_c = df
    df_c = df_c.drop(columns=['PlayTennis'])
    cols = df_c.columns.tolist()
    
    #Return the root of the tree
    return ID3(df,cols)

# $\triangleright$ Predict

In [16]:
#It can predict one instance at a time
def predict(instance,root,oldVal):
    
    predictions=[]
    
    #print(root.value)
    if(root.value =='Yes' or root.value == 'No'):
        return root.value

    #Get the value of the attribute
    rootVal = instance[root.value].values[0]
    #print(rootVal)
    for child in root.childs:
        if(child.value == rootVal):
            if(child.next):
                oldVal = predict(instance,child.next,oldVal)
    return oldVal

# $\triangleright$ Test

In [17]:
def test(df,root):
    
    #Get the labels of df
    labels = df['PlayTennis']
    
    predicted = []
    
    for i in range(len(df)):
        tempDf = pd.DataFrame(df,index=[i])
        oldVal = None
        predicted.append(predict(tempDf,root,oldVal))
        
    countMatch = 0
    
    for i in range(len(labels)):
        if(predicted[i] == labels[i]):
            countMatch+=1
    return countMatch/len(labels)

# $\triangleright$ Exercise 1

In [18]:
#Give full set
root = train(df)
acc = test(df,root)
print("Full Set Accuracy: ",acc*100)

Full Set Accuracy:  100.0


# $\triangleright$ Exercise 2

In [19]:
#The temperature could be included at the learned tree if we remove outlook
#because of the information gain
ex2 = pd.DataFrame(df,columns=['Temperature','Humidity','Wind','PlayTennis'])
root = train(ex2)
printTree(root)
acc = test(ex2,root)
print("Full Set Accuracy: ",acc*100)

Humidity
(High)
(Normal)
Wind
(Strong)
(Weak)
Wind
(Strong)
(Weak)
Temperature
(Hot)
(Mild)
Temperature
(Hot)
(Mild)
Temperature
(Cool)
(Mild)
['Yes']
['No']
No
No
No
No
['Yes']
Full Set Accuracy:  71.42857142857143


# $\triangleright$ Exercise 3

In [20]:
#Give D1-D7
df_copy = df
df = pd.DataFrame(df,index=[0,1,2,3,4,5,6])
root = train(df)

#Training accuracy
acc = test(df,root)
print("D1-D7 Training Accuracy: ",acc*100)

#Test accuracy
acc = test(df_copy,root)
print("D1-D7 Test Accuracy: ",acc*100)
print 
print("So we have overfitting.")

D1-D7 Training Accuracy:  100.0
D1-D7 Test Accuracy:  85.71428571428571
So we have overfitting.


### Explanation:
Training Accuracy is 100.0%, because we train our model on these instances, thus the model learns perfectly to predict the training set. On the other hand, the test instances are unknown to the model and it has to base the predictions on the knowledge it gains from the training set, so we have lower accuracy.

# $\triangleright$ Exercise 4 Pruning stategy

<h3>$\bullet$  Reduced error pruning</h3>
One of the simplest forms of pruning is reduced error pruning. Starting at the leaves, each node is replaced with its most popular class. If the prediction accuracy is not affected then the change is kept. While somewhat naive, reduced error pruning has the advantage of simplicity and speed.
<h3>$\bullet$  Cost complexity pruning</h3>
Cost complexity pruning generates a series of trees $T_0...T_m$ where $T_0$ is the initial tree and $T_m$ is the root alone.At step $i$ the tree is created by removing a subtree from tree $i-1$ and replacing it with a leaf node with value chosen as in the tree building algorithm. The subtree that is removed is chosen as follows:

* Define the error rate of tree $T$ over data set $S$ as $err(T,S)$
* The subtree that minimizes $\frac{err(prune(T,s),S) - err(T,S))}{\left | leaves(T)  \right | - \left | leaves(prune(T,t))  \right |}$ is chosen for removal

The function $prune(T,t)$ defines the tree gotten by pruning the subtrees $t$ from the tree $T$.

<h1>Problem 2 - Neural Networks</h1>

In [21]:
import numpy as np
import math

class Perceptron():
    def __init__(self,*args):
        if(len(args)==1):
            self.weights = np.random.uniform(0,1,2)
            self.bias = - 0.5
            self.lr = args[0]
        elif(len(args))== 2:
            self.weights = args[1]
            self.bias = - 0.5
            self.lr = args[0]
        else:
            print("Constructor hasn't initialized properly!")
    def activation_function(self,z):
        if(z >0):
            return 1
        else:
            return 0
    def predict(self,inputs):
        out = 0
        for i in range(len(self.weights)):
            out+=self.weights[i]*inputs[i]
        return self.activation_function(out + self.bias)
    def train(self,inputs,target):
        
        #Make a prediction
        pred = self.predict(inputs)
        
        #Calculate the error
        err = target -pred

        #Update the weights
        for i in range(len(self.weights)):
            self.weights[i]+= self.lr *err *inputs[i]
           
        #Update the bias
        self.bias = self.bias + self.lr * err
        return [pred,err]

# $\triangleright$ Exercise 1

In [22]:
#We set rate 0.2 W_1  0.1 W_2 0.3 
inputs = [[0,0],[0,1],[1,0],[1,1]]
targets =[0,1,1,1]
br  = Perceptron(0.2,[0.1,0.3])

iterUntilConverge=0
#Train until it predicts everything corrent
results = []
while True:
    correctAtTrain=0
    iterUntilConverge+=1
   #Check if the error is zero then added it to correct ones
    for i in range(len(targets)):
        
        oldW1 = br.weights[0]
        oldW2 = br.weights[1]
        
        returnedVals = br.train(inputs[i],targets[i])
        if(returnedVals[1]== 0):
            correctAtTrain+=1
        
        #Save the changes into a array
        results.append([inputs[i][0],inputs[i][1],oldW1,oldW2,returnedVals[0],targets[i],
                        returnedVals[1],br.weights[0],br.weights[1]])
    #If all inputs are corrent then quit training
    if(correctAtTrain == len(inputs)):
        break
print("Iterations until converge: ",iterUntilConverge)
df = pd.DataFrame(results,columns=['X_1','X_2','W_1','W_2','Z','Y','Error','W_1','W_2'])
df

Iterations until converge:  2


Unnamed: 0,X_1,X_2,W_1,W_2,Z,Y,Error,W_1.1,W_2.1
0,0,0,0.1,0.3,0,0,0,0.1,0.3
1,0,1,0.1,0.3,0,1,1,0.1,0.5
2,1,0,0.1,0.5,0,1,1,0.3,0.5
3,1,1,0.3,0.5,1,1,0,0.3,0.5
4,0,0,0.3,0.5,0,0,0,0.3,0.5
5,0,1,0.3,0.5,1,1,0,0.3,0.5
6,1,0,0.3,0.5,1,1,0,0.3,0.5
7,1,1,0.3,0.5,1,1,0,0.3,0.5


# $\triangleright$ Exercise 2

The prediction formula for simple Perceptron is defined as :


$$Ζ_{i}=\tilde{y} = \sigma(X_{i,1}W_1 + X_{i,2}W_2 + b)$$

So with the above formula we can define the  squared error function E:
$$
Ε = \frac{1}{n}\sum_{i=1}^{n} (y_i - \tilde{y})^{2}
$$

$$
Ε = \frac{1}{n}\sum_{i=1}^{n} (y_i - \sigma(X_{i,1}W_1 + X_{i,2}W_2 + b))^{2}
$$

The derivative of the sigmoid s(x) function has the property : 

$$
s'(x) = s(x)[ 1 - s(x)]
$$

With the above property the nabla of  squared error E is defined as:

$$
\bigtriangledown E(w) = \frac{2}{n}\sum_{i=1}^{n} (y_i - \tilde{y})[- \tilde{y} (1 - \tilde{y})(X_{i,1} + X_{i,2})]
$$
So the weights update rule :

$$
 w(t+1) = w(t) - n\bigtriangledown E(w)
$$

$$
    w(t+1) = w(t) - n\frac{2}{n}\sum_{i=1}^{n} (y_i - \tilde{y})[- \tilde{y} (1 - \tilde{y})(X_{i,1} + X_{i,2})]
$$

If we simplify the above formula we will get the weights update rule for the perceptron.

# $\triangleright$ Exercise 3

If we run the above code the algorithm will not converge because we added the noisy instance.

<h1>Problem 3 - Naive Bayes</h1>

# $\triangleright$ Libraries

In [23]:
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

In [24]:
from os import listdir
from os.path import isfile, join
import sys
import re
import string
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
import random
from random import shuffle

# $\triangleright$ Get the files and their category

In [25]:
def getFilesFromDirectory(dirName):
    
    #Retrieve the name of the files that dirName contains
    msgFiles = [f for f in listdir(dirName) if isfile(join(dirName, f))]
    
    #Check if it is spam or not spam
    category = []
    for f in msgFiles:
        
        if f.startswith("sp"):
            category.append("SPAM")
        else:
            category.append("MAIL")
    return category,msgFiles

# $\triangleright$ Text preprocess and creation of dictionary

In [26]:
def readAllTheFile(fileName):
    contents = ""
    with open(fileName) as f:
        for line in f.readlines():
            contents += line
    return contents
def processEachTextFile(fileContent):
    
    #Turn to lower 
    fileContent = fileContent.lower()

    #Remove irrelevant numbers
    fileContent = re.sub(r'\d+', '', fileContent)
    
    #Remove punctuation
    fileContent = fileContent.translate(str.maketrans("","",string.punctuation))
    
    #Remove whitespaces
    fileContent = fileContent.strip()
    
    #Remove stop words and tokenization
    stop_words = set(stopwords.words('english'))
    
    tokens = word_tokenize(fileContent)
    fileContent = [i for i in tokens if not i in stop_words]

    return fileContent
def createTheDictionary(dirName,category,msgFiles):
    
    #Create a global diction
    globalDictionary = {}
    
    
    for k in range(0,len(msgFiles)):
        
        #Create the full path
        fullPath = dirName+'/'+msgFiles[k]
        
        #Get the words of the file we process
        wordsFromFile = processEachTextFile(readAllTheFile(fullPath))
        
        for i in range(0,len(wordsFromFile)):
            
            #Get the word
            word = wordsFromFile[i]
            
            #If the word doesn't exist then add to dictionary
            if globalDictionary.get(word)== None :
                
                #Put a json object inside dictionary
                if(category[k] == "MAIL"):
                    globalDictionary[word] = json.dumps({"MAIL":1,"SPAM":0})
                else:
                    globalDictionary[word] = json.dumps({"MAIL":0,"SPAM":1})
            else :
                
                #Get the old values
                tempWordMail = json.loads(globalDictionary.get(word))["MAIL"]
                tempWordSpam = json.loads(globalDictionary.get(word))["SPAM"]
                
                #Update the word fields from spam and mail based on what category
                #our file is.
                if(category[k] == "MAIL"):
                    globalDictionary[word] = json.dumps({"MAIL":tempWordMail+1,"SPAM":tempWordSpam})
                else:
                    globalDictionary[word] = json.dumps({"MAIL":tempWordMail,"SPAM":tempWordSpam+1})
    return globalDictionary

# $\triangleright$ N(Class) and P(word | class)

In [27]:
def countClassSize(category,dictionary):
    
    countClass = 0
    for item in dictionary:
        
        #Retrieve each object
        getJSON = json.loads(dictionary.get(item))
        
        if getJSON[category] > 0 :
            countClass+=1
    return countClass
def wordPropability(word,category,countClass,dictionary):
    
    #Retrieve the word
    if(category == "MAIL"):
        countWClass = json.loads(dictionary.get(word))["MAIL"]
    else:
        countWClass = json.loads(dictionary.get(word))["SPAM"]

    return (countWClass + 1)/(countClass + len(dictionary))

# $\triangleright$ Train on full set

In [28]:
def train(dirName):
    
    #Get the file names and categories
    category,msgFiles = getFilesFromDirectory(dirName)
    
    #Create the dictionary
    wordDictionary = createTheDictionary(dirName,category,msgFiles)

    #Calculate the N of each class
    Nspam = countClassSize("SPAM",wordDictionary)
    Nmail = countClassSize("MAIL",wordDictionary)
    
    propabilityDictionary = {}
    
    for item in wordDictionary:
        
        #Calculate the two probabilities
        spamPropability = wordPropability(item,"SPAM",Nspam,wordDictionary)
        mailPropability = wordPropability(item,"MAIL",Nmail,wordDictionary)
        
        #Save the probabilities into a dictionary
        propabilityDictionary[item] = json.dumps({"MAIL":mailPropability,"SPAM":spamPropability})
    return propabilityDictionary,wordDictionary

#Train the model and create the dictionary 
#with the probabilities
probabilities,wordDictionary = train("./Email_spam/train")

# $\triangleright$ Test on full set

In [29]:
def test(probabilities,dirName,wordDictionary):
    
    #Get the file names and categories
    category,msgFiles = getFilesFromDirectory(dirName)
    
    #Calculate the N of each class
    Nspam = countClassSize("SPAM",wordDictionary)
    Nmail = countClassSize("MAIL",wordDictionary)
    
    results = []
    for k in range(0,len(msgFiles)):
        
        #Create the full path
        fullPath = dirName+'/'+msgFiles[k]
    
        #Get the words of the file we process
        wordsFromFile = processEachTextFile(readAllTheFile(fullPath))
    
        spamProbaForFile = 0
        mailProbaForFile = 0
        for word in wordsFromFile:
            
            #Check if exists at the probability dictionary
            if(probabilities.get(word)!= None):
                
                #Get the two probabilities
                spamP = json.loads(probabilities.get(word))["SPAM"]
                mailP = json.loads(probabilities.get(word))["MAIL"]
                
                #Add to the class of the file
                spamProbaForFile += math.log(spamP)
                mailProbaForFile += math.log(mailP)
            else:
                
                #Calculate the probabilities for the new word
                spamP = 1 /(Nspam + len(wordDictionary))
                mailP = 1 /(Nmail + len(wordDictionary))
                
                #Add to the total
                spamProbaForFile += math.log(spamP)
                mailProbaForFile += math.log(mailP)
        
        #Added the propability of the class
        spamProbaForFile = spamProbaForFile + math.log(Nspam / len(wordDictionary),2)
        mailProbaForFile = mailProbaForFile + math.log(Nmail / len(wordDictionary),2)
        
        
        if(spamProbaForFile >= mailProbaForFile):
            results.append("SPAM")
        else:
            results.append("MAIL")
        
    countCorrent = 0

    #Calculate the accuracy
    for i in range(0,len(results)):
        if results[i] == category[i]:
            countCorrent+=1

    acc = countCorrent/len(results)
    return acc


trainAcc = test(probabilities,"./Email_spam/train",wordDictionary)
print("Train Accuracy: ",trainAcc*100,"%")
testAcc = test(probabilities,"./Email_spam/test",wordDictionary)
print("Test Accuracy: ",testAcc*100,"%")
print("The test accuracy is higher because the test size is smaller than the train size.")

Train Accuracy:  99.50083194675541 %
Test Accuracy:  99.50248756218906 %
The test accuracy is higher because the test size is smaller than the train size.


# $\triangleright$ Train with subset of the dataset

In [30]:
def getFilesFromDirectorySubset(dirName,percent):
    
    #Retrieve the name of the files of a subset
    msgFiles = [f for f in listdir(dirName) if isfile(join(dirName, f))]
    
    #Shuffle the files before getting one percent
    shuffle(msgFiles)
    msgFiles = msgFiles[0:int(percent*len(msgFiles))]
    
    #Check if it is spam or not spam
    category = []
    for f in msgFiles:
        
        if f.startswith("sp"):
            category.append("SPAM")
        else:
            category.append("MAIL")
    return category,msgFiles

def trainWithSubset(dirName,percent):
    
    #Get the file names and categories
    category,msgFiles = getFilesFromDirectorySubset(dirName,percent)
    
    #Create the dictionary
    wordDictionary = createTheDictionary(dirName,category,msgFiles)

    #Calculate the N of each class
    Nspam = countClassSize("SPAM",wordDictionary)
    Nmail = countClassSize("MAIL",wordDictionary)
    
    propabilityDictionary = {}
    
    for item in wordDictionary:
        
        #Calculate the two probabilities
        spamPropability = wordPropability(item,"SPAM",Nspam,wordDictionary)
        mailPropability = wordPropability(item,"MAIL",Nmail,wordDictionary)
        
        #Save the probabilities into a dictionary
        propabilityDictionary[item] = json.dumps({"MAIL":mailPropability,"SPAM":spamPropability})
    return propabilityDictionary,wordDictionary

print("Train with 20 % of dataset")
#Train with 20 %
probabilities,wordDictionary = trainWithSubset("./Email_spam/train",0.2)

trainAcc = test(probabilities,"./Email_spam/train",wordDictionary)
print("Train Accuracy: ",trainAcc*100,"%")
testAcc = test(probabilities,"./Email_spam/test",wordDictionary)
print("Test Accuracy: ",testAcc*100,"%")

print()

print("Train with 40 % of dataset")
#Train with 40 %
probabilities,wordDictionary = trainWithSubset("./Email_spam/train",0.4)

trainAcc = test(probabilities,"./Email_spam/train",wordDictionary)
print("Train Accuracy: ",trainAcc*100,"%")
testAcc = test(probabilities,"./Email_spam/test",wordDictionary)
print("Test Accuracy: ",testAcc*100,"%")

print()

print("Train with 60 % of dataset")
#Train with 60 %
probabilities,wordDictionary = trainWithSubset("./Email_spam/train",0.6)

trainAcc = test(probabilities,"./Email_spam/train",wordDictionary)
print("Train Accuracy: ",trainAcc*100,"%")
testAcc = test(probabilities,"./Email_spam/test",wordDictionary)
print("Test Accuracy: ",testAcc*100,"%")

print()

print("Train with 80 % of dataset")
#Train with 80 %
probabilities,wordDictionary = trainWithSubset("./Email_spam/train",0.8)

trainAcc = test(probabilities,"./Email_spam/train",wordDictionary)
print("Train Accuracy: ",trainAcc*100,"%")
testAcc = test(probabilities,"./Email_spam/test",wordDictionary)
print("Test Accuracy: ",testAcc*100,"%")

print()

print("Train with 100 % of dataset")
#Train with 100 %
probabilities,wordDictionary = trainWithSubset("./Email_spam/train",1)

trainAcc = test(probabilities,"./Email_spam/train",wordDictionary)
print("Train Accuracy: ",trainAcc*100,"%")
testAcc = test(probabilities,"./Email_spam/test",wordDictionary)
print("Test Accuracy: ",testAcc*100,"%")

print()

Train with 20 % of dataset
Train Accuracy:  96.00665557404326 %
Test Accuracy:  99.50248756218906 %

Train with 40 % of dataset
Train Accuracy:  99.00166389351082 %
Test Accuracy:  100.0 %

Train with 60 % of dataset
Train Accuracy:  99.33444259567388 %
Test Accuracy:  99.50248756218906 %

Train with 80 % of dataset
Train Accuracy:  99.33444259567388 %
Test Accuracy:  99.50248756218906 %

Train with 100 % of dataset
Train Accuracy:  99.50083194675541 %
Test Accuracy:  99.50248756218906 %



In this particular example, we are training the model using subsets of the training set. We test the accuracy of the training set based on the subset-trained model, thus the more percentage of the training set we use, the better the training accuracy is. The testing accuracy, though, is stable and almost perfect because even a small training subset is able to represent the general behaviour of the model, and consequently provide high testing accuracy.

# $\triangleright$ Cross Validation with 10 subsets

In [31]:
def splitToPortions(dirName):
    
    #Retrieve all the files inside the train
    msgFiles = [f for f in listdir(dirName) if isfile(join(dirName, f))]
    
    #Split into 10 subsets
    splitSize = int(len(msgFiles)/10)

    crossValidationSet = []
    category = []
    for i in range(0, len(msgFiles)-1, splitSize):
        
        chunk = msgFiles[i:i + splitSize]
        
        #Check if the chunk files is spam or not spam
        categoryChunk = []
        for f in chunk:

            if f.startswith("sp"):
                categoryChunk.append("SPAM")
            else:
                categoryChunk.append("MAIL")
        
        #Add our chucks inside the total 
        crossValidationSet.append(chunk)
        category.append(categoryChunk)
    
    return crossValidationSet,category

def testAtCrossValidation(probabilities,msgFiles,category,dirName,wordDictionary):
    
    #Calculate the N of each class
    Nspam = countClassSize("SPAM",wordDictionary)
    Nmail = countClassSize("MAIL",wordDictionary)
    
    results = []
    for k in range(0,len(msgFiles)):
        
        #Create the full path
        fullPath = dirName+'/'+msgFiles[k]
    
        #Get the words of the file we process
        wordsFromFile = processEachTextFile(readAllTheFile(fullPath))
    
        spamProbaForFile = 0
        mailProbaForFile = 0
        for word in wordsFromFile:
            
            #Check if exists at the probability dictionary
            if(probabilities.get(word)!= None):
                
                #Get the two probabilities
                spamP = json.loads(probabilities.get(word))["SPAM"]
                mailP = json.loads(probabilities.get(word))["MAIL"]
                
                #Add to the class of the file
                spamProbaForFile += math.log(spamP)
                mailProbaForFile += math.log(mailP)
            else:
                
                #Calculate the probabilities for the new word
                spamP = 1 /(Nspam + len(wordDictionary))
                mailP = 1 /(Nmail + len(wordDictionary))
                
                #Add to the total
                spamProbaForFile += math.log(spamP)
                mailProbaForFile += math.log(mailP)
        
        #Added the propability of the class
        spamProbaForFile = spamProbaForFile + math.log(Nspam / len(wordDictionary),2)
        mailProbaForFile = mailProbaForFile + math.log(Nmail / len(wordDictionary),2)
        
        if(spamProbaForFile >= mailProbaForFile):
            results.append("SPAM")
        else:
            results.append("MAIL")
        
    countCorrent = 0

    #Calculate the accuracy
    for i in range(0,len(results)):
        if results[i] == category[i]:
            countCorrent+=1

    acc = countCorrent/len(results)
    return acc

def crossValidation():
    
    #Prepare the data for cross validation
    crossValidation,category=splitToPortions("./Email_spam/train")
    
    for i in range(0,len(crossValidation)):
        
        #Separate test set from dataset
        testSet = []
        testSetCategory =[]
        trainSet = []
        trainSetCategory = []
        for k in range(0,len(crossValidation)):
            
            if(k != i):
                trainSet.append(crossValidation[k])
                trainSetCategory.append(category[k])
            else:
                testSet.append(crossValidation[k])
                testSetCategory.append(category[k])
        
        testSet=np.array(testSet).flatten()
        testSetCategory =np.array(testSetCategory).flatten()
        trainSet = np.array(trainSet).flatten()
        trainSetCategory = np.array(trainSetCategory).flatten()
        
        #######################   Training  #########################
        
        #Create the dictionary
        wordDictionary = createTheDictionary("./Email_spam/train",trainSetCategory,trainSet)

        #Calculate the N of each class
        Nspam = countClassSize("SPAM",wordDictionary)
        Nmail = countClassSize("MAIL",wordDictionary)

        propabilityDictionary = {}

        for item in wordDictionary:

            #Calculate the two probabilities
            spamPropability = wordPropability(item,"SPAM",Nspam,wordDictionary)
            mailPropability = wordPropability(item,"MAIL",Nmail,wordDictionary)

            #Save the probabilities into a dictionary
            propabilityDictionary[item] = json.dumps({"MAIL":mailPropability,"SPAM":spamPropability})
            
        #######################   Testing  ##############################    
        
        
        accuracy = testAtCrossValidation(propabilityDictionary,testSet,testSetCategory,"./Email_spam/train"
                                         ,wordDictionary)
        print("Iteration: %d Accuracy: %f %%"%(i,accuracy*100))
                
crossValidation()# $\triangleright$ Create the dictionaries with the words

Iteration: 0 Accuracy: 96.666667 %
Iteration: 1 Accuracy: 96.666667 %
Iteration: 2 Accuracy: 98.333333 %
Iteration: 3 Accuracy: 96.666667 %
Iteration: 4 Accuracy: 95.000000 %
Iteration: 5 Accuracy: 100.000000 %
Iteration: 6 Accuracy: 100.000000 %
Iteration: 7 Accuracy: 98.333333 %
Iteration: 8 Accuracy: 96.666667 %
Iteration: 9 Accuracy: 98.333333 %


We can conclude that the data are of high quality, because whichever training subset we choose, the model predicts equally well, so every portion of the dataset can represent the general behaviour of the data.

<h1>Problem 3 - K-Nearest Neighbors</h1>

# $\triangleright$ Libraries

In [32]:
from os import listdir
from os.path import isfile, join
import sys
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
import random
from random import shuffle
import math
import numpy as np

# $\triangleright$ Preproccessing the text

In [33]:
def readAllTheFile(fileName):
    contents = ""
    with open(fileName) as f:
        for line in f.readlines():
            contents += line
    return contents
def processEachTextFile(fileContent):
    
    #Turn to lower 
    fileContent = fileContent.lower()

    #Remove irrelevant numbers
    fileContent = re.sub(r'\d+', '', fileContent)
    
    #Remove punctuation
    fileContent = fileContent.translate(str.maketrans("","",string.punctuation))
    
    #Remove whitespaces
    fileContent = fileContent.strip()
    
    #Remove stop words and tokenization
    stop_words = set(stopwords.words('english'))
    
    tokens = word_tokenize(fileContent)
    fileContent = [i for i in tokens if not i in stop_words]

    return fileContent

# $\triangleright$ Training

In [34]:
def getFilesFromDirectory(dirName):
    
    #Retrieve the name of the files that dirName contains
    msgFiles = [f for f in listdir(dirName) if isfile(join(dirName, f))]
    
    #Check if it is spam or not spam
    category = []
    for f in msgFiles:
        
        if f.startswith("sp"):
            category.append("SPAM")
        else:
            category.append("MAIL")
    return category,msgFiles

#Returns an array with dictionaries for each message and
#how many times does each word comes up
def train(dirName):
 
    #Read the files from the directory
    category,msgFiles = getFilesFromDirectory(dirName)
    
    #Dictionary with all vectors from messages
    vectorDicArray = []
    
    for k in range(0,len(msgFiles)):
        
         #Create the full path
        fullPath = dirName+'/'+msgFiles[k]
    
        #Get the words of the file we process
        wordsFromFile = processEachTextFile(readAllTheFile(fullPath))
        
        #Define a temp dictionary for the current file
        fileDic = {}
        fileDic["$cat"]=category[k]
        
        for word in wordsFromFile:
            
            #Check if the word exists in dictionary
            if word in fileDic:          
                #If it already exists add one to the count
                fileDic[word] =fileDic[word] +1
            else:
                #Add the word to the dictionary
                fileDic[word]=1
        #Add the file dictionary to the array
        vectorDicArray.append(fileDic)
        
    return vectorDicArray 

#Train our model with the training instances
vectorDicArray = train("./Email_spam/train")

def calculateSim(firstSet,secSet):
    
    #Calculate the root of first element
    rootFirst = 0
    for key, value in firstSet.items():
        
        if(value!="MAIL" and value!="SPAM"):
            rootFirst += value**2
    rootFirst = math.sqrt(rootFirst)
    #print(rootFirst)
    
    #Calculate the root of second element
    rootSec = 0 
    for key, value in secSet.items():
        
        if(value!="MAIL" and value!="SPAM"):
            rootSec += value**2
    rootSec = math.sqrt(rootSec)
    #print(rootSec)
    
    upperSum = 0
    for key,value in firstSet.items():
        
        #Check if the key exists at both sets
        if((key in firstSet) and (key in secSet)):
            if(key!="$cat"):
                upperSum += firstSet.get(key)*secSet.get(key)  
    #Return sim
    return upperSum / (rootFirst * rootSec)

# $\triangleright$ Testing with K=1,3,5,19

In [35]:
def findKappaMax(k,arr):
    possibleMax = []
    for item in arr:
        possibleMax.append(item["Sim"])
    topK = np.argsort(possibleMax)[-k:]
    
    return topK.tolist()

def findTheMost(arr):
    countSpam = 0
    countMail = 0
    
    for item in arr:
        if(item["Category"] == "MAIL"):
            countMail+=1
        else:
            countSpam+=1
    
    if countSpam>=countMail:
        return "SPAM"
    else:
        return "MAIL"
    
def testknn(dirName,vectorDicArray,kappa,topT=False, glob_dict=None):
    
    #Read the files from the directory
    category,msgFiles = getFilesFromDirectory(dirName)
    
    correntClass = 0
    wrongClass = 0
    for k in range(0,len(msgFiles)):
        
        #Create the full path
        fullPath = dirName+'/'+msgFiles[k]
    
        #Get the words of the file we process
        wordsFromFile = processEachTextFile(readAllTheFile(fullPath))
        
        #Define a temp dictionary for the current file
        fileDic = {}
        fileDic["$cat"]=category[k]
        
        for word in wordsFromFile:
            
            #Check if the word exists in dictionary
            if word in fileDic:          
                #If it already exists add one to the count
                fileDic[word] =fileDic[word] +1
            else:
                #Add the word to the dictionary
                fileDic[word]=1
        
        results = []
        #Compare each vectorDic element with the dictionary we got
        for item in vectorDicArray:
            if topT:
                results.append({"Category":item["$cat"],"Sim":calculateSimtopT(item,fileDic, glob_dict)})
            else:
                results.append({"Category":item["$cat"],"Sim":calculateSim(item,fileDic)})
        #print(results)
        
        #Return the K top values
        maxIndex = findKappaMax(kappa,results)
       
        #Get the top K elements from the list
        topKElements = []
        for item in maxIndex:
            topKElements.append(results[item])
        
        #print(topKElements)
        
        #Find the category
        cat=findTheMost(topKElements)
    
        if fileDic["$cat"] == cat:
            correntClass+=1
        else:
            wrongClass+=1
    
    accuracy = correntClass/len(msgFiles)
    return accuracy
acc=testknn("./Email_spam/test",vectorDicArray,1)
print("\nK:",1,"Test Accuracy: ",acc*100," %")
acc=testknn("./Email_spam/test",vectorDicArray,3)
print("\nK:",3,"Test Accuracy: ",acc*100," %")
acc=testknn("./Email_spam/test",vectorDicArray,5)
print("\nK:",5,"Test Accuracy: ",acc*100," %")
acc=testknn("./Email_spam/test",vectorDicArray,19)
print("\nK:",19,"Test Accuracy: ",acc*100," %")


K: 1 Test Accuracy:  98.00995024875621  %

K: 3 Test Accuracy:  97.01492537313433  %

K: 5 Test Accuracy:  97.51243781094527  %

K: 19 Test Accuracy:  98.00995024875621  %


# $\triangleright$ Select only T features at preprocessing.

In [36]:
#Find the initial entroypy 
def e0(dictionary):
    nspam = 0
    nmail = 0
    
    #Calculate the entropy based on the dictionary 
    #we give
    entropy_dict = {}
    for word in dictionary:
        spamnum = json.loads(dictionary.get(word))["SPAM"]
        mailnum = json.loads(dictionary.get(word))["MAIL"]
        total = len(dictionary)
        if spamnum == 0:
            e0 = - (mailnum/total)*math.log(mailnum/total,2)
        elif mailnum == 0:
            e0 = - (spamnum/total)*math.log(spamnum/total,2)
        elif spamnum == 0 and mailnum == 0:
            e0 = 0
        else:
            e0 = - (spamnum/total)*math.log(spamnum/total,2) - (mailnum/total)*math.log(mailnum/total,2)
        entropy_dict[word] = e0
    return entropy_dict

def information_gain_nb(T):
    
    #Get the filenames and the category of each email
    cat , msg_files = getFilesFromDirectory('Email_spam/train')
    global_dict = createTheDictionary('Email_spam/train',cat,msg_files)
    new_dict = {}
    
    #Get the words that appear 50 or more times inside mails
    for word in global_dict:
        spamnum = json.loads(global_dict.get(word))["SPAM"]
        mailnum = json.loads(global_dict.get(word))["MAIL"]
        res = spamnum + mailnum
        if res >= 50:
            new_dict[word]= json.dumps({"MAIL":mailnum,"SPAM":spamnum}) 
            
    #Calculate the entropy 
    entropy0 = e0(new_dict)
    word_array = []
    entropy_array = []
    for field in entropy0:
        word_array.append(field)
        entropy_array.append(entropy0[field])
    word_array = [x for y, x in sorted(zip(entropy_array, word_array))]
    word_array = word_array[-T:]
    final_dict = {}
    
    #Return dictionary with the top T features
    for word in word_array:
        spamnum = json.loads(new_dict.get(word))["SPAM"]
        mailnum = json.loads(new_dict.get(word))["MAIL"]
        final_dict[word] = json.dumps({"MAIL":mailnum,"SPAM":spamnum})
    return final_dict

#Naive Bayes Training
def train_top(T):
    
    #Create the dictionary
    wordDictionary = information_gain_nb(T)

    #Calculate the N of each class
    Nspam = 0
    Nmail = 0
    for word in wordDictionary:
        Nspam += json.loads(wordDictionary.get(word))["SPAM"]
        Nmail += json.loads(wordDictionary.get(word))["MAIL"]
    
    propabilityDictionary = {}
    
    for item in wordDictionary:
        
        #Calculate the two probabilities
        spamPropability = wordPropability(item,"SPAM",Nspam,wordDictionary)
        mailPropability = wordPropability(item,"MAIL",Nmail,wordDictionary)
        
        #Save the probabilities into a dictionary
        propabilityDictionary[item] = json.dumps({"MAIL":mailPropability,"SPAM":spamPropability})
    return propabilityDictionary,wordDictionary

probs,dictionary = train_top(10)
print("The top 10 most predictive words are: ", end='')
for word in dictionary:
    print(word, end=', ')
print('\n')

print("Naive Bayes model")
for i in [20,50,100,200,500]:
    probs,dictionary = train_top(i)
    testAcc = test(probs,"./Email_spam/test",dictionary)
    print("\nT:",i,"Test Accuracy: ",testAcc*100,"%")

## KNN
def calculateSimtopT(firstSet,secSet,top_dict):
    
    #Calculate the root of first element
    rootFirst = 0
    for key, value in firstSet.items():
        
        if(value!="MAIL" and value!="SPAM"):
            rootFirst += value**2
    rootFirst = math.sqrt(rootFirst)
    #print(rootFirst)
    
    #Calculate the root of second element
    rootSec = 0 
    for key, value in secSet.items():
        
        if(value!="MAIL" and value!="SPAM"):
            rootSec += value**2
    rootSec = math.sqrt(rootSec)
    #print(rootSec)
    
    upperSum = 0
    for key,value in firstSet.items():
        
        #Check if the key exists at both sets
        if((key in firstSet) and (key in secSet) and (key in top_dict)):
            if(key!="$cat"):
                upperSum += firstSet.get(key)*secSet.get(key)  
    #Return sim
    return upperSum / (rootFirst * rootSec)

print("\nK nearest neighbours model")
for i in [20,50,100,200,500]:
    _,dictionary = train_top(i)
    vectorDicArray = train("./Email_spam/train")
    testAcc = testknn("./Email_spam/test",vectorDicArray,3,True,dictionary)
    print("\nT:",i,"Test Accuracy: ",testAcc*100,"%")


The top 10 most predictive words are: call, number, use, like, first, fax, e, work, also, would, 

Naive Bayes model

T: 20 Test Accuracy:  71.14427860696517 %

T: 50 Test Accuracy:  80.09950248756219 %

T: 100 Test Accuracy:  92.53731343283582 %

T: 200 Test Accuracy:  95.52238805970148 %

T: 500 Test Accuracy:  98.00995024875621 %

K nearest neighbours model

T: 20 Test Accuracy:  53.233830845771145 %

T: 50 Test Accuracy:  62.68656716417911 %

T: 100 Test Accuracy:  69.65174129353234 %

T: 200 Test Accuracy:  80.09950248756219 %

T: 500 Test Accuracy:  96.51741293532339 %


We can see that with higher T, the algorithms both learn better, which was expected. KNN does not perform equally well with more feature selection, due to the nature of the algorithm that needs more data to perform. NB has an advantage of performing better with good feature selection because dealing with probabilities is better in that domain than dealing with pure distances.

# $\triangleright$ Our Model.

In [37]:
def breakthrough_method(dictionary):
    nspam = 0
    nmail = 0
    our_dict = {}
    for word in dictionary:
        our_dict[word] = len(word)
    return our_dict

def create_new_dict(T):
    cat , msg_files = getFilesFromDirectory('Email_spam/train')
    global_dict = createTheDictionary('Email_spam/train',cat,msg_files)
    occurence = 0 
    amazing = breakthrough_method(global_dict)
    word_array = []
    wordlength_array = []
    for field in amazing:
        word_array.append(field)
        wordlength_array.append(amazing[field])
    word_array = [x for y, x in sorted(zip(wordlength_array, word_array))]
    word_array = word_array[:T]
    final_dict = {}
    
    for word in word_array:
        spamnum = json.loads(global_dict.get(word))["SPAM"]
        mailnum = json.loads(global_dict.get(word))["MAIL"]
        final_dict[word] = json.dumps({"MAIL":mailnum,"SPAM":spamnum})
    return final_dict

def train_top_2(T):
    
    #Create the dictionary
    wordDictionary = create_new_dict(T)

    #Calculate the N of each class
    Nspam = 0
    Nmail = 0
    for word in wordDictionary:
        Nspam += json.loads(wordDictionary.get(word))["SPAM"]
        Nmail += json.loads(wordDictionary.get(word))["MAIL"]
    
    propabilityDictionary = {}
    
    for item in wordDictionary:
        
        #Calculate the two probabilities
        spamPropability = wordPropability(item,"SPAM",Nspam,wordDictionary)
        mailPropability = wordPropability(item,"MAIL",Nmail,wordDictionary)
        
        #Save the probabilities into a dictionary
        propabilityDictionary[item] = json.dumps({"MAIL":mailPropability,"SPAM":spamPropability})
    return propabilityDictionary,wordDictionary

print("Naive Bayes model")
for i in [3500,3800,4000,4200]:
    print('\nT: ',i,' words with the smallest length.')
    probs,dictionary = train_top_2(i)
    testAcc = test(probs,"./Email_spam/test",dictionary)
    print("Test Accuracy: ",testAcc*100,"%")
    print("Using only ", i/18400 ," of the training dataset.")


print("\nK nearest neighbours model")
for i in [700,800,900]:
    print('\nT:',i,' words with the smallest length.')
    _,dictionary = train_top_2(i)
    vectorDicArray = train("./Email_spam/train")
    testAcc = testknn("./Email_spam/test",vectorDicArray,7,True,dictionary)
    print("Test Accuracy: ",testAcc*100,"%")
    print("Using only ", i/18400 ," of the training dataset.")

Naive Bayes model

T:  3500  words with the smallest length.
Test Accuracy:  78.60696517412936 %
Using only  0.19021739130434784  of the training dataset.

T:  3800  words with the smallest length.
Test Accuracy:  79.60199004975125 %
Using only  0.20652173913043478  of the training dataset.

T:  4000  words with the smallest length.
Test Accuracy:  81.09452736318407 %
Using only  0.21739130434782608  of the training dataset.

T:  4200  words with the smallest length.
Test Accuracy:  82.58706467661692 %
Using only  0.22826086956521738  of the training dataset.

K nearest neighbours model

T: 700  words with the smallest length.
Test Accuracy:  88.05970149253731 %
Using only  0.03804347826086957  of the training dataset.

T: 800  words with the smallest length.
Test Accuracy:  89.05472636815921 %
Using only  0.043478260869565216  of the training dataset.

T: 900  words with the smallest length.
Test Accuracy:  89.05472636815921 %
Using only  0.04891304347826087  of the training dataset.


# $\triangleright$ Comparing different algorithms.

In [38]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# $\triangleright$ Preprocessing the data.

In [39]:
def readAllTheFile(fileName):
    contents = ""
    with open(fileName) as f:
        for line in f.readlines():
            contents += line
    return contents
def createTheDataFrame(dirName):
    
     #Get the files from a directory
    files = [os.path.join(dirName,fi) for fi in os.listdir(dirName)]
    
    df = pd.DataFrame(columns=['message', 'label'])
    for f in files:
        
        #Read the message of the file
        messageFromFile =readAllTheFile(f)
        
        #Check the category SPAM --> 1 OR MAIL --> 0
        filepathTokens = f.split('/')
        lastToken = filepathTokens[len(filepathTokens) - 1]
        if lastToken.startswith("sp"):
            lab = 1
        else:
            lab = 0
        
        #Pass a new row to dataframe
        row = [messageFromFile,lab]
        df.loc[len(df)] = row
    return df

# $\triangleright$ Vectorize the data 

In [40]:
#Create a dataframe with the samples for train,test
df=createTheDataFrame('Email_spam/train/')

vectorizer = TfidfVectorizer()
print(df.head())
# Extract feature column 'Text'
X = vectorizer.fit_transform(df.message)
# Extract target column 'Class'
y = df.label.astype('int')

                                             message label
0  Subject: genetic classification\n\ni wish to m...     0
1  Subject: iscll3\n\nthe third international sym...     0
2  Subject: attention smokers quit smoking immedi...     1
3  Subject: salk insitute job\n\nnew research pos...     0
4  Subject: ! find out anything about anyone on t...     1


# $\triangleright$ Test algorithms on espam dataset

In [41]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, train_size=0.80, random_state=42)

from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(X_train,y_train)
print("\nDecisionTree Classifier: ")
print("Accuracy: ",model.score(X_test,y_test)*100," %")

#pip install sklearn-contrib-py-earth
#https://contrib.scikit-learn.org/py-earth/content.html

from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(30, 3), random_state=1)
clf.fit(X_train, y_train)
print("\nNeural Network Classifier: ")
print("Accuracy: ",clf.score(X_test,y_test)*100," %")

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_train,y_train)
print("\nLogistic Regression Classifier: ")
print("Accuracy: ",clf.score(X_test,y_test)*100," %")

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)
print("\nNaive Bayers Classifier: ")
print("Accuracy: ",gnb.score(X_test.toarray(),y_test)*100," %")

from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
print("\nRegression Classifier: ")
print("Accuracy: ",reg.score(X_test,y_test)*100," %")

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X, y) 
print("\nKNN Classifier: ")
print("Accuracy: ",neigh.score(X_test,y_test)*100," %")


DecisionTree Classifier: 
Accuracy:  93.38842975206612  %

Neural Network Classifier: 
Accuracy:  99.17355371900827  %

Logistic Regression Classifier: 
Accuracy:  99.17355371900827  %

Naive Bayers Classifier: 
Accuracy:  97.52066115702479  %

Regression Classifier: 
Accuracy:  89.35199816726399  %

KNN Classifier: 
Accuracy:  100.0  %
