## Data Mining and Machine Learning
### Edgar Acuna
### Error Estimation (Accuracy Estimation)
#### October 21
#### Datasets: Diabetes, Vehicle

In [6]:
#Ejemplo de estimacion de la prediccion por Validacion Cruzada
#usando el clasifidor LDA y la base de datos Diabetes
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import LeaveOneOut
import warnings
warnings.filterwarnings('ignore')

In [7]:
names= ['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7','V8','class']
diab=pd.read_table("http://academic.uprm.edu/eacuna/diabetes.dat",header=None,names=names)

In [8]:
#Computing the relative frequency of each class
diab.groupby('class').size()/len(diab)

class
1    0.651042
2    0.348958
dtype: float64

In [9]:
#Obteniendo el vector de clases y la matriz de predictoras
y=diab["class"]
y.count()
X=diab.iloc[:,0:8]
y1=y.to_numpy()
X1=X.to_numpy()

#### Estimating the accuracy  by resubstitution 

In [10]:
#Haciendo el analisis discriminante y calculando el porcentaje de precision
ldadis = LinearDiscriminantAnalysis().fit(X1,y1)
#Tasa de precision
ldadis.score(X1, y1)

0.7838541666666666

In [11]:
#Computing the recall
pred=ldadis.predict(X1)
recall_score(y1,pred)

0.892

In [12]:
#computing f1 score
f1_score(y1,pred)

0.8431001890359168

In [13]:
print(classification_report(y1, pred))

              precision    recall  f1-score   support

           1       0.80      0.89      0.84       500
           2       0.74      0.58      0.65       268

    accuracy                           0.78       768
   macro avg       0.77      0.74      0.75       768
weighted avg       0.78      0.78      0.78       768



### Acuracy estimation by cross-validation

In [14]:
#Estimando la precision  por validacion cruzada
from sklearn.model_selection import cross_val_score
clf = LinearDiscriminantAnalysis()
scores = cross_val_score(clf, X1, y1, cv=10)
scores   

array([0.74025974, 0.75324675, 0.80519481, 0.72727273, 0.72727273,
       0.77922078, 0.80519481, 0.80519481, 0.76315789, 0.82894737])

In [15]:
#Hallando la precision media y un intervalo de confianza 
print("CV Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

CV Accuracy: 0.773 (+/- 0.069)


In [16]:
#Tasa de precision usando validacion cruzada usando 10 repeticiones con 10 folds cada una
clf = LinearDiscriminantAnalysis()
pred1=[]
for i in range (0,11):
    cv = ShuffleSplit()
    scores = cross_val_score(clf, X1,y1, cv=cv)
    pred1.append(scores)
print ("Accuracy by cross validacion=", np.mean(pred1),"+/-",np.std(pred1))

Accuracy by cross validacion= 0.7855962219598582 +/- 0.04694782592956986


### Accuracy Estimation by the holdout method

In [17]:
#Estimando el error por el metodo holdout
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3,random_state=0)
X_train, y_train

X_test, y_test

ldadiab = LinearDiscriminantAnalysis().fit(X_train, y_train)
ldadiab.score(X_test, y_test)

0.7792207792207793

In [18]:
#Estimando la precision por el metodo holdout con 50 muestras
pred=[]
for i in range(0,51):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.33)
    ldadiab = LinearDiscriminantAnalysis().fit(X_train, y_train)
    pred.append(ldadiab.score(X_test, y_test))
print(np.asarray(pred).round(3))
print("Accuracy by holdout: %0.3f (+/- %0.3f)" % (np.mean(pred), np.std(pred)))

[0.787 0.78  0.783 0.791 0.78  0.772 0.783 0.803 0.76  0.756 0.744 0.732
 0.752 0.783 0.819 0.776 0.732 0.827 0.787 0.772 0.819 0.811 0.76  0.772
 0.76  0.764 0.783 0.744 0.772 0.76  0.791 0.736 0.772 0.752 0.787 0.795
 0.783 0.787 0.768 0.772 0.807 0.736 0.799 0.756 0.756 0.74  0.764 0.811
 0.776 0.78  0.772]
Accuracy by holdout: 0.775 (+/- 0.023)


### Accuracy Estimation for the Vehicle dataset

In [19]:
#Ejemplo2. Leyendo los datos de vehiculos que tienen 4 clases y 18 predictoras
df1=pd.read_csv("http://academic.uprm.edu/eacuna/vehicle.csv")
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   COMPACTNESS                846 non-null    int64 
 1   CIRCULARITY                846 non-null    int64 
 2   DISTANCE_CIRCULARITY       846 non-null    int64 
 3   RADIUS_RATIO               846 non-null    int64 
 4   PR.AXIS_ASPECT_RATIO       846 non-null    int64 
 5   MAX.LENGTH_ASPECT_RATIO    846 non-null    int64 
 6   SCATTER_RATIO              846 non-null    int64 
 7   ELONGATEDNESS              846 non-null    int64 
 8   PR.AXIS_RECTANGULARITY     846 non-null    int64 
 9   MAX.LENGTH_RECTANGULARITY  846 non-null    int64 
 10  SCALED_VARIANCE_MAJOR      846 non-null    int64 
 11  SCALED_VARIANCE_MINOR      846 non-null    int64 
 12  SCALED_RADIUS_OF_GYRATION  846 non-null    int64 
 13  SKEWNESS_ABOUT_MAJOR       846 non-null    int64 
 14  SKEWNESS_A

In [20]:
#Convirtiendo en matriz la tabla de predictoras y la columna de clases
y=df1['Class']
X=df1.iloc[:,0:18]
y1=y.to_numpy()
X1=X.to_numpy()

In [21]:
#Haciendo el analisis discriminante y calculando el porcentaje de precision
ldadis = LinearDiscriminantAnalysis().fit(X1,y1)
#Tasa de precision
ldadis.score(X1, y1)

0.7978723404255319

In [22]:
#Estimando la precision  por validacion cruzada
from sklearn.model_selection import cross_val_score
clf = LinearDiscriminantAnalysis()
scores = cross_val_score(clf, X1, y1, cv=10)
scores   

array([0.78823529, 0.75294118, 0.72941176, 0.75294118, 0.78823529,
       0.77647059, 0.83333333, 0.80952381, 0.72619048, 0.8452381 ])

In [23]:
#Hallando la precision media y un intervalo de confianza 
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.08)


In [24]:
#Estimando la precison por el metodo holdout
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=0)
X_train, y_train

X_test, y_test

ldaveh = LinearDiscriminantAnalysis().fit(X_train, y_train)
ldaveh.score(X_test, y_test)

0.7874015748031497