# Appendix 2 : Code

### 1. Preparation

In [0]:
# Install and load libraries
!pip install mord
import mord
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV



In [0]:
# Loading data 
! git clone https://github.com/chaoyangzhengnash/ASL
train = pd.read_csv('ASL/datrain.txt', sep=" ", header='infer')
test = pd.read_csv('ASL/dateststudent.txt', sep=" ", header='infer')

# Feature scaling
# Columns that are standardised; listed once so train and test are guaranteed
# to be transformed over exactly the same set of features.
FEATURE_COLS = ["fixedacidity", "volatileacidity", "citricacid", "residualsugar",
                "chlorides", "freesulfurdioxide", "totalsulfurdioxide", "density",
                "pH", "sulphates", "alcohol"]

sc = StandardScaler()
# Learn the per-column mean / standard deviation from the training data only.
train[FEATURE_COLS] = sc.fit_transform(train[FEATURE_COLS])
# Apply the *training* statistics to the test data. Calling fit_transform here
# would re-fit the scaler on the test set, putting train and test on different
# scales and feeding the models inputs they were not trained on.
test[FEATURE_COLS] = sc.transform(test[FEATURE_COLS])

# Get x and y
train_x = np.array(train.drop(['y'], axis=1))  # every column except the target 'y'
train_y = np.array(train['y'])
test_x = np.array(test)  # assumes the test file has no 'y' column — TODO confirm





### 2. Design of the ordinal classifier

In [0]:
class OrdinalClassifier():
    """Ordinal classifier built from k-1 binary "is y greater than V_i?" models.

    For sorted class values V_1 < ... < V_k, one clone of the base classifier
    is fitted per threshold i = 1..k-1 on the binary target ``y > V_i``.
    Class probabilities are then recovered as

        Pr(V_1) = 1 - Pr(y > V_1)
        Pr(V_i) = Pr(y > V_{i-1}) - Pr(y > V_i)      for 1 < i < k
        Pr(V_k) = Pr(y > V_{k-1})

    Parameters
    ----------
    clf : estimator
        Unfitted binary classifier exposing ``fit`` and ``predict_proba``;
        it is cloned once per threshold and never fitted itself.
    """

    def __init__(self, clf):
        self.clf = clf
        # Fitted binary classifiers, keyed by threshold position 0..k-2.
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per ordinal threshold.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Ordinal labels; any sortable values are accepted.

        Returns
        -------
        self
        """
        y = np.asarray(y)
        self.unique_class = np.sort(np.unique(y))
        # Start from a clean slate so a refit on data with fewer classes
        # cannot leave stale classifiers behind.
        self.clfs = {}
        for i in range(self.unique_class.shape[0] - 1):
            # Binary problem for threshold i: is the label above V_i?
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of per-class probabilities.

        Columns follow the order of ``self.unique_class``. Because the
        threshold models are fitted independently, a middle-class entry can
        be slightly negative when their outputs are not monotone.
        """
        n_classes = self.unique_class.shape[0]
        if n_classes == 1:
            # Only one class was seen during fit: it is predicted with certainty.
            return np.ones((np.asarray(X).shape[0], 1))

        # exceed[i] = Pr(y > V_i), looked up by threshold position rather than
        # by label value so that labels need not be 1..k.
        exceed = [self.clfs[i].predict_proba(X)[:, 1] for i in range(n_classes - 1)]

        predicted = []
        for i in range(n_classes):
            if i == 0:
                # Lowest class: 1 - Pr(y > V_1)
                predicted.append(1 - exceed[0])
            elif i < n_classes - 1:
                # Middle classes: Pr(y > V_{i-1}) - Pr(y > V_i)
                predicted.append(exceed[i - 1] - exceed[i])
            else:
                # Highest class: Pr(y > V_{k-1})
                predicted.append(exceed[i - 1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the most probable class label (taken from the fitted labels)."""
        best = np.argmax(self.predict_proba(X), axis=1)
        # Map column positions back to the original label values.
        return self.unique_class[best]

    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)






    

### 3. Model fitting

**DecisionTreeClassifier**

In [0]:
# Search space for the randomized hyperparameter search over the tree model:
# candidate maximum depths 3, 6, ..., 57.
para_ref = dict(max_depth=[depth for depth in range(3, 60, 3)])

In [0]:
# Plain (non-ordinal) DecisionTreeClassifier baseline.
# Kept under its own name so the search below always wraps the bare tree.
base_tree = DecisionTreeClassifier()

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
scores = []
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]

    # Randomized search for max_depth, run inside the training fold only.
    # A fresh search is built around `base_tree` on every iteration; wrapping
    # the previous fold's search object instead would make `max_depth` an
    # invalid parameter from the second fold onwards.
    clf = RandomizedSearchCV(base_tree, para_ref, n_jobs=-1, cv=3)
    clf.fit(X_train, y_train)
    print('Best parameters found:\n', clf.best_params_)

    # Score the refitted best estimator once on the held-out fold.
    fold_score = clf.score(X_test, y_test)
    scores.append(fold_score)
    print("score in current fold:", fold_score)
    print("-----------------------------------------------------------------------------")

# Mean held-out score over all 50 splits.
print(np.mean(scores))

In [0]:
# Ordinal wrapper around a DecisionTreeClassifier.
clf = OrdinalClassifier(
   DecisionTreeClassifier(max_depth=125)
    )

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
scores = []
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    clf.fit(X_train, y_train)

    # Evaluate once per fold and reuse the value; scoring twice repeats the
    # whole prediction pass for no benefit.
    fold_score = clf.score(X_test, y_test)
    scores.append(fold_score)
    print("score in current fold:", fold_score)
    print("-----------------------------------------------------------------------------")

# Mean held-out accuracy over all splits.
print(np.mean(scores))

**MLPClassifier**

In [0]:
# Grid of MLP hyperparameters explored by the grid search.
parameter_space = {
    # One tuple per architecture. A single hidden layer must be written
    # (100,): a bare (100) is just the integer 100, not a tuple.
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,), (150,100,50), (125,150,200)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05, 0.1],
    'learning_rate': ['constant', 'adaptive'],
}

# Base estimator handed to the grid search; each candidate trains for at most 100 iterations.
mlp = MLPClassifier(max_iter=100)


In [0]:
# Plain (non-ordinal) MLPClassifier, tuned by grid search inside every fold.

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
scores = []
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]

    # Grid search over `parameter_space`, restricted to the training fold.
    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
    clf.fit(X_train, y_train)
    print('Best parameters found:\n', clf.best_params_)

    # Score once on the held-out fold. This is the classifier's score, not an
    # R^2 value, so the label matches the other classifier cells.
    fold_score = clf.score(X_test, y_test)
    scores.append(fold_score)
    print("score in current fold:", fold_score)
    print("-----------------------------------------------------------------------------")

# Mean held-out score over all splits.
print(np.mean(scores))

In [0]:
# Ordinal wrapper around an MLPClassifier with fixed hyperparameters.
clf = OrdinalClassifier(
    MLPClassifier(hidden_layer_sizes=(150,100,50),
                  max_iter=300,
                  activation='relu',
                  solver='adam',
                  random_state=1))


# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
scores = []
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]

    clf.fit(X_train, y_train)

    # Evaluate once per fold and reuse the value; scoring twice repeats the
    # whole prediction pass through every threshold network.
    fold_score = clf.score(X_test, y_test)
    scores.append(fold_score)
    print("score in current fold:", fold_score)
    print("-----------------------------------------------------------------------------")

# Mean held-out accuracy over all splits.
print(np.mean(scores))

**Mord**

In [0]:
# 1. LogisticIT
scores_LogisticIT = []
LogisticIT_classifier = mord.LogisticIT(alpha = 1.0,
                                        verbose = 0,
                                        max_iter = 1000)

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    LogisticIT_classifier.fit(X_train, y_train)

    # Accuracy is computed explicitly so every model in this notebook is
    # compared on the same metric, and the value is printed under its real
    # name (it is not an R^2).
    # NOTE(review): mord's own LogisticIT.score is believed to already be
    # accuracy — confirm against the mord source.
    fold_accuracy = accuracy_score(y_test, LogisticIT_classifier.predict(X_test))
    scores_LogisticIT.append(fold_accuracy)
    print("accuracy in current fold:", fold_accuracy)
    print("-----------------------------------------------------------------------------")

# Mean held-out accuracy over all splits.
print(np.mean(scores_LogisticIT))


In [0]:
# 2. LogisticAT
scores_LogisticAT = []
LogisticAT_classifier = mord.LogisticAT(alpha = 1.0,
                                        verbose = 0,
                                        max_iter = 1000)

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    LogisticAT_classifier.fit(X_train, y_train)

    # Accuracy is computed explicitly so this model is compared with the
    # others on the same metric, and the value is printed under its real name
    # (it is not an R^2).
    # NOTE(review): mord's own LogisticAT.score is believed to return the
    # negative mean absolute error rather than accuracy — confirm against the
    # mord source.
    fold_accuracy = accuracy_score(y_test, LogisticAT_classifier.predict(X_test))
    scores_LogisticAT.append(fold_accuracy)
    print("accuracy in current fold:", fold_accuracy)
    print("-----------------------------------------------------------------------------")

# Mean held-out accuracy over all splits.
print(np.mean(scores_LogisticAT))

In [0]:
# 3. OrdinalRidge
scores_OrdinalRidge = []
OrdinalRidge_classifier = mord.OrdinalRidge(alpha = 1.0,
                                        max_iter = 1000)

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    OrdinalRidge_classifier.fit(X_train, y_train)

    # Accuracy is computed explicitly so this model is compared with the
    # others on the same metric, and the value is printed under its real name
    # (it is not an R^2).
    # NOTE(review): mord's own OrdinalRidge.score is believed to return the
    # negative mean absolute error rather than accuracy — confirm against the
    # mord source.
    fold_accuracy = accuracy_score(y_test, OrdinalRidge_classifier.predict(X_test))
    scores_OrdinalRidge.append(fold_accuracy)
    print("accuracy in current fold:", fold_accuracy)
    print("-----------------------------------------------------------------------------")

# Mean held-out accuracy over all splits.
print(np.mean(scores_OrdinalRidge))

In [0]:
# 4. LAD
scores_LAD = []
LAD_classifier = mord.LAD(max_iter = 1000)

# Repeated stratified K-fold: 5 folds x 10 repeats = 50 train/validation splits.
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10,
    random_state=1)

for train_index, test_index in rskf.split(train_x, train_y):
    print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    LAD_classifier.fit(X_train, y_train)

    # Accuracy is computed explicitly so this model is compared with the
    # others on the same metric, and the value is printed under its real name
    # (it is not an R^2).
    # NOTE(review): mord's own LAD.score is believed to return the negative
    # mean absolute error rather than accuracy — confirm against the mord
    # source.
    fold_accuracy = accuracy_score(y_test, LAD_classifier.predict(X_test))
    scores_LAD.append(fold_accuracy)
    print("accuracy in current fold:", fold_accuracy)
    print("-----------------------------------------------------------------------------")

# Mean held-out accuracy over all splits.
print(np.mean(scores_LAD))

### 4. Prediction on the test data

In [0]:
# Grid search over a plain MLPClassifier on the full training set, to pick the
# hyperparameters used for the final model.
parameter_space = {
    # One tuple per architecture. A single hidden layer must be written
    # (100,): a bare (100) is just the integer 100, not a tuple.
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,), (150,100,50), (125,150,200)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.05, 0.1],
    'learning_rate': ['constant', 'adaptive'],
}

mlp = MLPClassifier()
# 3-fold cross-validated search, using all available cores.
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(train_x, train_y)
print('Best parameters found:\n', clf.best_params_)


In [0]:
# Final model: an MLP with fixed hyperparameters, wrapped in the ordinal
# classifier, trained on the full training set and applied to the test set.
final_mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate='adaptive',
    max_iter=300,
    random_state=1,
)
clf = OrdinalClassifier(final_mlp)
clf.fit(train_x, train_y)
# Last expression of the cell: displays the predicted labels for the test set.
clf.predict(test_x)


In [0]:
# Display the complete prediction array instead of NumPy's truncated summary.
import numpy
import sys

# Raising the threshold to sys.maxsize disables summarisation for any array size.
full_print_threshold = sys.maxsize
np.set_printoptions(threshold=full_print_threshold)

# Predict on the test set and show the result as the cell output.
a = clf.predict(test_x)
a

In [0]:
# Write the test-set predictions to the output file, one value per line.
test_predictions = clf.predict(test_x)
np.savetxt('chaoyang zheng', test_predictions)