In [4]:
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd

%matplotlib inline
%matplotlib notebook

## 1. Train-test split method

### 1.1 Fisher-Yates shuffle algorithm

In [10]:
def shuffle(in_list):
    n=len(in_list)
    
    for i in range(n-1,0,-1):
        j=np.random.randint(0,i)
        in_list[j],in_list[i]=in_list[i],in_list[j]
        
    return in_list

In [11]:
a=list(range(10))

for k in range(10):
    print (shuffle(a))

[1, 4, 6, 9, 5, 8, 0, 3, 7, 2]
[3, 2, 0, 6, 4, 9, 5, 8, 1, 7]
[5, 9, 7, 2, 3, 0, 6, 1, 4, 8]
[6, 3, 1, 8, 0, 4, 7, 9, 2, 5]
[9, 7, 0, 2, 4, 5, 1, 8, 3, 6]
[1, 2, 3, 6, 8, 7, 4, 5, 9, 0]
[2, 0, 4, 7, 5, 9, 8, 6, 1, 3]
[8, 2, 0, 9, 4, 1, 3, 5, 6, 7]
[4, 0, 9, 8, 1, 7, 5, 2, 3, 6]
[5, 4, 8, 1, 0, 6, 7, 9, 2, 3]


### 1.2 Sattolo's algorithm

In [12]:
def sattolo(in_list):
    n = len(in_list)
    for i in range(n-1,1,-1):
        j = np.random.randint(0,i)  
        in_list[j], in_list[i] = in_list[i], in_list[j]
    return in_list

In [13]:
a=list(range(10))

for k in range(10):
    print (sattolo(a))

[8, 3, 0, 1, 2, 7, 9, 4, 6, 5]
[5, 9, 3, 6, 7, 4, 2, 1, 0, 8]
[4, 9, 6, 7, 0, 8, 1, 5, 2, 3]
[1, 7, 8, 5, 3, 2, 6, 9, 0, 4]
[8, 0, 1, 7, 5, 9, 2, 4, 6, 3]
[1, 6, 8, 2, 4, 3, 9, 7, 5, 0]
[0, 6, 2, 4, 9, 5, 3, 1, 7, 8]
[4, 7, 8, 0, 3, 2, 6, 5, 1, 9]
[3, 2, 1, 4, 8, 9, 0, 6, 5, 7]
[1, 0, 8, 5, 4, 7, 9, 3, 6, 2]


## Train-test split method

In [45]:
## In this module, I will build the train-test split method based on the random shuffle

def train_test_split(input_array,ratio=0.7):
    """
    The input array must have the format that the last column is the 
    target value.
    
    For example:
    
    input_array is (m,n), then the dataset has n-1 features and 1 target column
    
    The ratio is the percentage of Train and Test array size. The default value is 0.7
    """
    
    ## The number of observations, which is also the number of rows of input array
    n_row = input_array.shape[0]
    
    ## Flag list which is used to generate shuffled points
    flag = shuffle(list(range(n_row)))
    
    ## flag list for train and test list
    split_number = int(ratio*len(flag))
    train_flag = flag[:split_number]
    test_flag = flag[split_number:]
    
    ## build the train and test set
    train_x = np.array([input_array[candidate,:-1] for candidate in train_flag])
    train_y = np.array([input_array[candidate,-1] for candidate in train_flag])
    test_x = np.array([input_array[candidate,:-1] for candidate in test_flag])
    test_y = np.array([input_array[candidate,-1] for candidate in test_flag])
    
    return train_x,train_y,test_x,test_y
    

In [53]:
## Test

a=np.array([[1,2,3,4,5],[2,3,4,5,6],[7,8,9,10,11]])


train_x,train_y,test_x,test_y = train_test_split(a)

print ("The train observations: ",train_x)
print ("The train target: ", train_y)
print ("The test observations: ", train_y)
print ("The test target: ", test_y)

The train observations:  [[ 2  3  4  5]
 [ 7  8  9 10]]
The train target:  [ 6 11]
The test observations:  [ 6 11]
The test target:  [5]


## 2. Accuracy of Model

### 2.1 Accuracy of classfication problem

In [6]:
def model_accuracy(test_y, result, verbose=True):
    """
    This function calculate the model accuracy of the classfication problem. It is the most simple way to anlysis the ability
    of model prediction.
    
    Input: 1. test_y, this is the already known data of target. size = (num_of_observation,)
           2. result, this is the predicted value after training by any classification algorithm. size = (num_of_observation,)
           
    Output: the accuracy score in percentage form.
            
            if verbose is true, then print out the final result.
            
    """
    
    ## test: check if the two input array has the same dimension.
    assert test_y.shape == result.shape
    
    the_same=0.0
    for test,predict in zip(test_y,result):
        if (test == predict):
            the_same += 1
    
    accuracy_score = the_same/len(test_y)
    
    if (verbose == True):
        print ("Your model accuracy is: {0:.2f}%, keep doing!".format(accuracy_score*100))
    
    return accuracy_score

In [7]:
## Test of the above function

y_test = np.array([1,1,1,0,0,0,1])
result = np.array([1,1,0,0,0,0,0])

model_accuracy(y_test,result)

Your model accuracy is: 71.43%, keep doing!


0.7142857142857143