In [465]:
import pandas as pd

data = pd.read_csv("datasets/cross_validataion.csv")
data

Unnamed: 0,x,classification
0,1,1
1,2,1
2,4,1
3,5,0
4,6,0
5,7,0


In [466]:
X_train = data['x']
X_train

0    1
1    2
2    4
3    5
4    6
5    7
Name: x, dtype: int64

In [467]:
y_train = data['classification']
y_train

0    1
1    1
2    1
3    0
4    0
5    0
Name: classification, dtype: int64

# Algorithm A (MeanClassifier)

In [488]:
from sklearn.base import BaseEstimator
from statistics import mean
import math

class MeanClassifier(BaseEstimator):
    mean_class = {}
    def fit(self, X, y):#X should only have one attribute with numerical values
        #all categories
        target_values_counts = y.value_counts()
        target_values = []
        for value, count in target_values_counts.items():
            target_values.append(value)
        #print('target_values', target_values)    
        
        #group attribute values based on categories
        attribute_class = {}
        #print('X', X)
        #print('y', y)
        for idx,value in X.items():
            #print(idx,value)
            target_list = attribute_class.get(y[idx], [])
            target_list.append(value)
            attribute_class[y[idx]] = target_list
        
        #print('attribute_class', attribute_class)    
        #mean
        for key, list_values in attribute_class.items():
            self.mean_class[key] = mean(list_values)
            
        return self.mean_class
        
    def predict(self, X):
        ret = {}
        for x in X:
            #print(x)
            min_class= math.inf
            min_diff = math.inf
            for key, mean in self.mean_class.items():
                #print(key, mean)
            
                diff = math.fabs(x-mean)
                if(diff < min_diff):
                    min_diff = diff
                    min_class = key
            #print('min', x, min_diff, min_class)
            ret[x] = min_class
        return ret

In [489]:
mean_clf = MeanClassifier()

print(mean_clf.fit(X_train, y_train))

{1: 2.3333333333333335, 0: 6}


In [490]:
print(mean_clf.predict(X_train))

{1: 1, 2: 1, 4: 1, 5: 0, 6: 0, 7: 0}


In [491]:
print(mean_clf.predict([5]))

{5: 0}


# Algorithm B (NearestClassifier)

In [552]:
from sklearn.base import BaseEstimator
from statistics import mean
import statistics
import math

class NearestClassifier(BaseEstimator):
    def fit(self, X, y):
        self.X = X
        self.y = y
    def predict(self, X):
        ret = {}
        for x in X:
            min_idx = math.inf
            min_diff = math.inf
            for idx, value in self.X.items():
                diff = math.fabs(value-x)
                if(diff < min_diff):
                    min_diff = diff
                    min_idx = idx
            ret[x] = self.y[min_idx]
        return ret

ImportError: cannot import name 'std'

In [494]:
near_clf = NearestClassifier()

near_clf.fit(X_train, y_train)
print(near_clf.predict([5,9]))

{5: 0, 9: 0}


# Cross Validation

In [502]:
def k_cross_validation(classifier, X, y, k=3):
    # folders
    folders_X = {}
    folders_y = {}
    n_elements = int(len(X)/k)
    #print('n_elements:', n_elements)
    for idx in range(k):
        folders_X[idx] = X[idx*n_elements:idx*n_elements+n_elements]
        folders_y[idx] = y[idx*n_elements:idx*n_elements+n_elements]
    #print(folders_X)
    #print(folders_y)
    
    errors = []
    for idx in range(k):
        X_test = folders_X[idx]
        y_test = folders_y[idx]
        
        X_train = pd.Series()
        y_train = pd.Series()
        for i in range(k):
            if(i != idx):
                X_train = pd.concat([X_train, folders_X[i]], axis=0)
                y_train = pd.concat([y_train, folders_y[i]], axis=0)
        
        #print(X_train)
        #print(y_train)
        
        classifier.fit(X_train, y_train)
        y_predict = classifier.predict(X_test)
        #print(y_predict)
        y_predict_list  = list(y_predict.values())
        #print(y_predict_list)
        y_test_list = list(y_test)
        #print(y_test_list)
        diff = [test - pred for test, pred in zip(y_test_list, y_predict_list)]
        count = 0
        for element in diff:
            if(element != 0):
                count += 1
        errors.append(count)
    return mean(errors)

def cross_validation(classifier, X, y, k=3, cv = 1):
    errors = []
    for time in range(cv):
        e = k_cross_validation(classifier, X, y, k)
        errors.append(e)
    return mean(errors)
    

In [504]:
near_clf = NearestClassifier()
e = cross_validation(near_clf, X_train, y_train, 6, 1)
print(e)

0.3333333333333333


# HW3 Q2

In [537]:
import pandas as pd

index = [1,2,3,4,5,6]
data = pd.DataFrame({
    'x': [1,2,4,5,6,7],
    'classification': [1, 0, 1, 1, 0, 0]},
    index=index)

print(data)

   x  classification
1  1               1
2  2               0
3  4               1
4  5               1
5  6               0
6  7               0


In [538]:
X = data['x']
y = data['classification']

In [539]:
#part 1 
mean_clf = MeanClassifier()
e = cross_validation(mean_clf, X, y, 6, 1)
print(e)

near_clf = NearestClassifier()
e = cross_validation(near_clf, X, y, 6, 1)
print(e)

0.3333333333333333
0.5


In [550]:
#part 2
ret1 = []
ret2 = []
#1
print('permutation 1')
idx = [1, 2, 3, 4, 5, 6]
X_1, y_1 =X[idx], y[idx]
mean_clf = MeanClassifier()
e1 = cross_validation(mean_clf, X_1, y_1, 3, 1)
print('mean_clf', e1)
ret1.append(e1)

near_clf = NearestClassifier()
e2 = cross_validation(near_clf, X_1, y_1, 3, 1)
print('near_clf', e2)
ret2.append(e2)

#2
print('permutation 2')
idx = [1, 3, 5, 2, 4, 6]
X_2, y_2 = X[idx], y[idx]
mean_clf = MeanClassifier()
e1 = cross_validation(mean_clf, X_2, y_2, 3, 1)
print('mean_clf', e1)
ret1.append(e1)

near_clf = NearestClassifier()
e2 = cross_validation(near_clf, X_2, y_2, 3, 1)
print('near_clf', e2)
ret2.append(e2)

#3
print('permutation 3')
idx = [1, 6, 2, 5, 3, 4]
X_3, y_3 = X[idx], y[idx]
mean_clf = MeanClassifier()
e1 = cross_validation(mean_clf, X_3, y_3, 3, 1)
print('mean_clf', e1)
ret1.append(e1)

near_clf = NearestClassifier()
e2 = cross_validation(near_clf, X_3, y_3, 3, 1)
print('near_clf', e2)
ret2.append(e2)

print(mean(ret1), mean(ret2))


permutation 1
mean_clf 1.6666666666666667
near_clf 1.6666666666666667
permutation 2
mean_clf 0.6666666666666666
near_clf 1
permutation 3
mean_clf 1.6666666666666667
near_clf 1.3333333333333333
1.3333333333333335 1.3333333333333333


In [551]:
import print(std(ret1), std(ret2))


NameError: name 'std' is not defined

# Other

In [385]:
s1 = pd.Series([1, 2], index=['A', 'B'], name='s1')
s1

A    1
B    2
Name: s1, dtype: int64

In [170]:
s2 = pd.Series([3, 4], index=['A', 'B'], name='s2')
s2

A    3
B    4
Name: s2, dtype: int64

In [171]:
pd.concat([s1, s2], axis=1)

Unnamed: 0,s1,s2
A,1,3
B,2,4


In [172]:
s11 = pd.Series([3, 4], index=['A', 'B'], name='s1')
s11

A    3
B    4
Name: s1, dtype: int64

In [174]:
pd.concat([s1, s11], axis=0)

A    1
B    2
A    3
B    4
Name: s1, dtype: int64