## Loading Dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, multivariate_normal
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, IntSlider, interact_manual
import operator

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Names of the measured quantities, in the order used to index feature columns.
features = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Read the comma-separated file into a single float array.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
data = np.loadtxt(
    'C:/Users/deshw/Downloads/Datasets ML/winery-univariate/wine.data.txt',
    delimiter=',',
)


In [None]:
# Number of shuffled rows used for training; all remaining rows form the test set.
N_TRAIN=130

# Fixed seed so the shuffle (and therefore the split) is the same on every run.
np.random.seed(0)
# Permute as many indices as there are rows instead of assuming a row count.
indices=np.random.permutation(len(data))
train_idx,test_idx=indices[:N_TRAIN],indices[N_TRAIN:]

# Column 0 is used as the class label, columns 1..13 as the features.
trainx=data[train_idx,1:14]
trainy=data[train_idx,0]
testx=data[test_idx,1:14]
testy=data[test_idx,0]

In [None]:
# Report how many training rows carry each class label (1, 2 and 3).
print("Training points per each class: ")
for i in range(1,4):
    # Compare the labels directly; no string evaluation is needed for a count.
    print('{}: '.format(i),np.sum(trainy==i))

In [None]:
# Report how many test rows carry each class label (1, 2 and 3).
print("Testing points per each class: ")
for i in range(1,4):
    # Compare the labels directly; no string evaluation is needed for a count.
    print('{}: '.format(i),np.sum(testy==i))

# 1. Univariate Generative approach

##  Visualization of features

Visualising, for each feature, the histogram of a single class together with its fitted Gaussian distribution.

In [None]:
@interact_manual(feature=IntSlider(0,0,12),label=IntSlider(1,1,3))
def density_funt(feature,label):
    """Show one class's histogram for one feature with a fitted normal curve.

    Parameters
    ----------
    feature : int
        Column index into ``trainx`` (and into the ``features`` name list).
    label : int
        Class label whose training rows are plotted.

    The normal curve uses the mean and standard deviation of the selected
    training values and is drawn over three standard deviations either side
    of the mean.
    """
    sns.set_style('dark')

    # Training values of the chosen feature restricted to the chosen class.
    samples=trainx[trainy==label,feature]
    centre,spread=np.mean(samples),np.std(samples)
    grid=np.linspace(centre-3*spread,centre+3*spread,1000)

    plt.hist(samples,density=True,color=plt.cm.Blues(130))
    plt.plot(grid,norm.pdf(grid,centre,scale=spread),'k',lw=2)

    plt.title('Winery {}'.format(label),fontsize=16)
    plt.xlabel(str(features[feature]),fontsize=12)
    plt.ylabel('Density',fontsize=12)
    plt.show()

Fitting a Gaussian distribution to each feature, separately for every class

In [None]:
def measure(x,y,feature):
    """Estimate per-class mean, standard deviation and class frequency.

    Parameters
    ----------
    x : 2-D array
        Data matrix, one row per sample.
    y : 1-D array
        Class label of every row of ``x``; each label is used (after
        ``int()``) as an index into the returned arrays.
    feature : int
        Column of ``x`` to summarise.

    Returns
    -------
    (mew, std, pi) : tuple of 1-D arrays of length ``len(set(y)) + 1``
        Entry ``int(label)`` holds the mean, standard deviation and relative
        frequency of that label. Entries for indices that are not a label
        (index 0 when labels start at 1) stay zero.
    """
    labels=set(y)
    size=len(labels)+1
    mew,std,pi=(np.zeros(size) for _ in range(3))

    column=x[:,feature]
    total=len(y)
    for label in labels:
        slot=int(label)
        members=(y==label)
        values=column[members]
        mew[slot]=np.mean(values)
        std[slot]=np.std(values)
        # Share of all samples that belong to this class.
        pi[slot]=members.sum()/total
    return mew,std,pi

In [None]:
@interact(feature=IntSlider(0,0,12))
def visual(feature):
    """Overlay the fitted normal curves of classes 1-3 for one feature.

    Parameters
    ----------
    feature : int
        Column index into ``trainx`` (and into the ``features`` name list).

    Each curve uses the per-class mean and standard deviation returned by
    ``measure`` on the training data and spans three standard deviations
    either side of that class's mean.
    """
    sns.set_style('whitegrid')
    # The class priors returned by measure() are not needed for the plot.
    mean,std,_=measure(trainx,trainy,feature)
    for i in range(1,4):
        n=np.linspace(mean[i]-3*std[i],mean[i]+3*std[i],1000)
        plt.plot(n,norm.pdf(n,mean[i],scale=std[i]),color=plt.cm.Spectral(i*25),label='Class {0}'.format(i),lw=2.5)
    plt.xlabel(features[feature],fontsize=12)
    plt.ylabel('Density',fontsize=12)
    plt.legend()
    # Call show(); a bare `plt.show` only references the function.
    plt.show()

## Predicting Labels 

Using test data for prediction 

In [None]:
@interact(feature=IntSlider(0,0,12))
def predict(feature):
    """Classify the test set from a single feature and return the error rate.

    A normal distribution per class is fitted on the training data with
    ``measure``; every test row is assigned the class with the largest
    log prior plus log likelihood.

    Parameters
    ----------
    feature : int
        Column index into ``trainx`` / ``testx``.

    Returns
    -------
    float
        Fraction of test rows whose predicted label differs from ``testy``.

    NOTE(review): a later cell defines another function also named
    ``predict`` (two features); once that cell runs it replaces this one in
    the kernel namespace.
    """
    mean,std,pi=measure(trainx,trainy,feature)
    values=testx[:,feature]
    classes=[int(j) for j in set(trainy)]
    label_prob=np.zeros((len(testx),len(classes)+1))
    for j in classes:
        # Sum of logs instead of log of a product: the density of a point far
        # from a class mean can underflow to 0, which would give log(0).
        label_prob[:,j]=np.log(pi[j])+norm.logpdf(values,mean[j],std[j])
    # Column 0 is an unused placeholder, so skip it and shift back to labels.
    predictions=np.argmax(label_prob[:,1:],axis=1)+1
    error=np.not_equal(predictions,testy).sum()/len(testy)
    return error
    

Comparing the prediction error obtained from each individual feature, on either the test or the train dataset

In [None]:
def approach(a):
    """Plot and rank the single-feature classification error of every feature.

    For each feature a per-class normal model is fitted on the training data
    with ``measure``; the chosen dataset is then classified by the largest
    log prior plus log likelihood.

    Parameters
    ----------
    a : str
        ``'test_set'`` evaluates on ``testx``/``testy``; any other value
        evaluates on ``trainx``/``trainy``.

    Returns
    -------
    list of (int, float)
        ``(feature index, error rate)`` pairs sorted by ascending error.
    """
    if a=='test_set':
        x,y=testx,testy
    else:
        x,y=trainx,trainy

    k=len(set(trainy))
    error={}
    # Iterate over every feature column; a fixed upper bound of 12 would leave
    # out the last of the 13 features.
    for feature in range(x.shape[1]):
        mean,std,pi=measure(trainx,trainy,feature)
        feature_prob=np.zeros((len(x),k+1))
        for j in range(1,k+1):
            # Sum of logs avoids log(0) when a density underflows to zero.
            feature_prob[:,j]=np.log(pi[j])+norm.logpdf(x[:,feature],mean[j],std[j])
        # Column 0 is an unused placeholder, so skip it and shift back to labels.
        predictions=np.argmax(feature_prob[:,1:],axis=1)+1
        error[feature]=np.not_equal(predictions,y).sum()/len(y)

    sns.set_style('white')
    plt.figure(figsize=(12,6))
    plt.plot(list(error.keys()),list(error.values()),marker='o',markersize=4,markerfacecolor='k')
    # Annotate every point with its error value, slightly to the right.
    for feature,e in error.items():
        plt.text(s=str(format(e,'.3f')),x=feature+0.2,y=e,fontsize=10)
    plt.xlabel('Features',fontsize=14)
    plt.xlim(-1,13)
    plt.ylabel('Error',fontsize=14)
    plt.show()

    return sorted(error.items(),key=operator.itemgetter(1))

In [None]:
# Widget with a choice between the two dataset names; each selection calls
# approach() with the chosen name.
interact(approach,a=['test_set','train_set'])

# 2. Bivariate Generative Approach

In [None]:
def measure_bi(x,features):
    """Return the mean vector and covariance matrix of selected columns.

    Parameters
    ----------
    x : 2-D array
        Data matrix, one row per sample.
    features : sequence of int
        Column indices of ``x`` to use.

    Returns
    -------
    (mean, cov)
        Column-wise mean of the selected columns and their covariance,
        normalised by the number of rows (``bias=True``).
    """
    selected=x[:,features]
    centre=np.mean(selected,axis=0)
    # rowvar=False: each column is a variable, each row an observation.
    spread=np.cov(selected,rowvar=False,bias=True)
    return centre,spread

#### Setting limit of variables

In [None]:
def limits(x):
    """Return plotting bounds padded by 30% of the data range on each side.

    Parameters
    ----------
    x : iterable of numbers
        Values whose range is to be padded.

    Returns
    -------
    (upper, lower)
        Note the order: the upper bound comes first.
    """
    highest,lowest=max(x),min(x)
    width=highest-lowest
    return (highest+0.3*width,lowest-0.3*width)

#### Plotting contour lines

In [None]:
def contours(x,feature,mean,cov):
    """Draw three log-density contour lines of a bivariate Gaussian.

    Parameters
    ----------
    x : 2-D array
        Data whose two selected columns determine the plotted region
        (their range padded by ``limits``).
    feature : sequence of two ints
        Column indices of ``x`` for the horizontal and vertical axes.
    mean, cov :
        Mean vector (length 2) and 2x2 covariance of the Gaussian.

    Prints a message and returns without drawing when both indices are
    equal. Draws onto the current matplotlib axes; returns None.
    """
    if feature[0]==feature[1]:
        print('Choose different value!')
        return
    x1_up,x1_low=limits(x[:,feature[0]])
    x2_up,x2_low=limits(x[:,feature[1]])
    x1=np.linspace(x1_low,x1_up,200)
    x2=np.linspace(x2_low,x2_up,200)
    multi=multivariate_normal(mean=mean,cov=cov)

    # Evaluate the log-density on the whole grid in one call. The grid has
    # shape (len(x2), len(x1), 2), so z[j, i] is the value at (x1[i], x2[j]),
    # which is the row/column layout plt.contour expects.
    grid=np.dstack(np.meshgrid(x1,x2))
    z=multi.logpdf(grid)

    # For a 2-D Gaussian with positive-definite covariance (sign == 1) this is
    # the log-density at the mean, so the levels below lie 1, 2 and 3 log
    # units under the peak.
    sign,logdet=np.linalg.slogdet(cov)
    normalizer=-0.5*(2*np.log(2*np.pi)+sign*logdet)
    for value in range(1,4):
        plt.contour(x1,x2,z,levels=[normalizer-value],linestyles='solid',linewidths=1,colors='k')
    

Plotting contour lines for any chosen combination of two features

In [None]:
@interact_manual(feature_1=IntSlider(0,0,12,1),feature_2=IntSlider(6,0,12,1),label=IntSlider(1,1,3,1))
def plot(feature_1,feature_2,label):
    """Scatter one class over two features with its Gaussian contour lines.

    Parameters
    ----------
    feature_1, feature_2 : int
        Column indices for the horizontal and vertical axes; they must differ,
        otherwise a message is printed and nothing is drawn.
    label : int
        Class label whose training rows are shown.
    """
    if feature_1==feature_2:
        print('Choose different value!')
        return

    pair=[feature_1,feature_2]
    # Training rows that belong to the requested class.
    class_rows=trainx[trainy==label]
    centre,spread=measure_bi(class_rows,pair)

    sns.set_style('white')
    contours(class_rows,pair,centre,spread)
    plt.scatter(class_rows[:,pair[0]],class_rows[:,pair[1]],color='green',alpha=0.7)

    name_1=str(features[feature_1])
    name_2=str(features[feature_2])
    plt.title(name_1+' vs '+name_2,fontsize=14)
    plt.xlabel(name_1,fontsize=10)
    plt.ylabel(name_2,fontsize=10)
    plt.show()
    

## Fitting gaussian for each class

In [None]:
def fit_model(x,y,feature):
    """Fit one Gaussian per class on the selected feature columns.

    Parameters
    ----------
    x : 2-D array
        Data matrix, one row per sample.
    y : 1-D array
        Class labels; the loop assumes they are the integers 1..k, where k is
        the number of distinct labels.
    feature : sequence of int
        Column indices of ``x`` to model.

    Returns
    -------
    (mew, covar, pi)
        Arrays of shape (k+1, d), (k+1, d, d) and (k+1,) holding the mean,
        covariance and relative frequency of each class at index ``label``.
        Index 0 is left as zeros.
    """
    n_dims=len(feature)
    n_classes=len(set(y))
    mew=np.zeros((n_classes+1,n_dims))
    covar=np.zeros((n_classes+1,n_dims,n_dims))
    pi=np.zeros(n_classes+1)

    for label in range(1,n_classes+1):
        members=(y==label)
        class_mean,class_cov=measure_bi(x[members],feature)
        mew[label,:]=class_mean
        covar[label,:,:]=class_cov
        # Share of all samples that belong to this class.
        pi[label]=np.sum(members)/len(y)
    return mew,covar,pi

In [None]:
@interact_manual(feature_1=IntSlider(0,0,12,1),feature_2=IntSlider(6,0,12,1))
def plot_label(feature_1,feature_2):
    """Scatter all three classes over two features with Gaussian contours.

    Parameters
    ----------
    feature_1, feature_2 : int
        Column indices for the horizontal and vertical axes; they must differ,
        otherwise a message is printed and nothing is drawn.

    For every class 1-3 the Gaussian fitted by ``fit_model`` on the training
    data is drawn with ``contours``, the class's training points are
    scattered, and the class number is written at the fitted mean.
    """
    if feature_1==feature_2:
        print('Choose different value!')
        return

    sns.set_style('darkgrid')
    plt.figure(figsize=(12,8))
    feature=[feature_1,feature_2]
    # The class priors returned by fit_model() are not needed for plotting.
    mew,covar,_=fit_model(trainx,trainy,feature)

    for label in range(1,4):
        mean,cov=mew[label,:],covar[label,:,:]
        in_class=(trainy==label)
        contours(trainx[in_class],feature,mean,cov)
        plt.scatter(trainx[in_class,feature[0]],trainx[in_class,feature[1]],
                    color=plt.cm.Spectral(27*label),label='Class '+str(label))
        # Mark the centre of the fitted Gaussian with the class number.
        plt.text(mean[0],mean[1], s=str(label),fontsize=18,fontweight='bold')

    plt.legend()
    plt.title('Bivariate Distribution',fontsize=14,fontweight='bold')
    plt.xlabel(str(features[feature_1]),fontsize=12)
    plt.ylabel(str(features[feature_2]),fontsize=12)
    plt.show()

## Predicting labels

In [None]:
@interact(f1=IntSlider(0,0,12,1),f2=IntSlider(6,0,12,1),printf=[True,False])
def predict(f1,f2,printf):
    """Classify the test set from two features and return the error rate.

    A bivariate Gaussian per class is fitted on the training data with
    ``fit_model``; every test row is assigned the class with the largest
    log prior plus log likelihood.

    Parameters
    ----------
    f1, f2 : int
        Column indices of the two features; they must differ.
    printf : bool
        When true, print the names of the two features used.

    Returns
    -------
    float or None
        Fraction of test rows whose predicted label differs from ``testy``;
        None (after printing a message) when ``f1 == f2``.

    NOTE(review): this definition shares its name with the earlier
    single-feature ``predict`` and replaces it once this cell has run.
    """
    if f1==f2:
        print('Enter a different combination')
        return

    feature=[f1,f2]
    mean,cov,pi=fit_model(trainx,trainy,feature)
    # Number of classes follows from the fitted priors (index 0 is unused).
    k=len(pi)-1
    x=testx[:,feature]

    score=np.zeros((len(testx),k+1))
    for label in range(1,k+1):
        # One vectorised log-density evaluation per class over all test rows.
        score[:,label]=np.log(pi[label])+multivariate_normal.logpdf(x,mean[label,:],cov[label,:,:])
    # Column 0 is an unused placeholder, so skip it and shift back to labels.
    labels=np.argmax(score[:,1:],axis=1)+1
    error=np.not_equal(labels,testy).sum()/len(testy)
    if printf:
        print('Features used: '+features[f1]+' and '+features[f2])
    return error
        
    

### Error analysis

In [None]:
# Test error of the two-feature classifier for every ordered pair of distinct
# features, keyed by "<name i> and <name j>".
errors={}
for i in range(0,13):
    for j in range(0,13):
        index=features[i]+' and '+features[j]
        if i!=j:
            errors[index]=predict(i,j,False)

# Rank the pairs by error, lowest first; `errors` becomes a list of
# (pair name, error) tuples from here on.
errors=sorted(errors.items(),key=lambda pair:pair[1])
best,worst=errors[0],errors[-1]
print('Feature Combination with min error: '+str(best))
print('Feature Combination with max error: '+str(worst))

### Decision boundaries

Here the height (z) of the surface takes only three distinct values, i.e. 1, 2 or 3 (the predicted class). Thus we do not
need to specify the contour levels explicitly, as we did earlier using the normalizing term.

In [None]:
@interact_manual(f1=IntSlider(0,0,12,1),f2=IntSlider(6,0,12,1))
def show_boundaries(f1,f2):
    """Plot the class regions of the two-feature Gaussian classifier.

    Parameters
    ----------
    f1, f2 : int
        Column indices for the horizontal and vertical axes; they must differ,
        otherwise a message is printed and nothing is drawn.

    A 350x350 grid over the padded training range is classified by the
    largest log prior plus log likelihood of the Gaussians fitted with
    ``fit_model``; the predicted class is shown as a colour mesh with the
    training points scattered alongside it.
    """
    # A duplicated feature gives a singular covariance matrix, which
    # multivariate_normal rejects by default, so refuse it up front as the
    # other plotting functions do.
    if f1==f2:
        print('Choose different value!')
        return

    plt.figure(figsize=(10,6))
    sns.set_style('whitegrid')
    f=[f1,f2]
    # limits() returns (upper, lower), hence index 1 first.
    x1_lim=limits(trainx[:,f1])
    x2_lim=limits(trainx[:,f2])
    x1=np.linspace(x1_lim[1],x1_lim[0],350)
    x2=np.linspace(x2_lim[1],x2_lim[0],350)

    mean,cov,pi=fit_model(trainx,trainy,f)
    # Number of classes follows from the fitted priors (index 0 is unused).
    k=len(pi)-1

    # grid has shape (len(x2), len(x1), 2): grid[j, i] is the point
    # (x1[i], x2[j]), matching the layout expected by pcolormesh/contour.
    grid=np.dstack(np.meshgrid(x1,x2))
    score=np.stack([
        np.log(pi[label])+multivariate_normal.logpdf(grid,mean[label,:],cov[label,:,:])
        for label in range(1,k+1)
    ])
    # Position 0 along the first axis is class 1, so shift back to labels.
    z=(np.argmax(score,axis=0)+1).astype(float)

    for label in range(1,k+1):
        plt.scatter(trainx[trainy==label,f1],trainx[trainy==label,f2],color=plt.cm.Spectral(label*30),
                    label='Class '+str(label))
    plt.pcolormesh(x1,x2,z)
    plt.contour(x1,x2,z,linewidths=0.3,colors='k')
    plt.title('Decision Boundaries',fontsize=14,fontweight='bold')
    plt.xlabel(features[f1],fontsize=12)
    plt.legend()
    plt.ylabel(features[f2],fontsize=12)
    plt.show()