In [10]:
# Imports
import numpy as np
import scipy.io as sio
from sklearn import decomposition
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest, f_classif, SelectPercentile
from sklearn import linear_model
from sklearn import cross_validation
from sklearn.semi_supervised import LabelSpreading
from sklearn.semi_supervised import LabelPropagation
from sklearn.svm import SVC
from sklearn import cross_validation
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [2]:
'''
Import your data 
'''
# Load the .mat files
events_1000 = sio.loadmat('Data/events_1000.mat')
missIdx = sio.loadmat('Data/missIdx.mat')
provideData_1000 = sio.loadmat('Data/provideData_1000.mat')
provideIdx = sio.loadmat('Data/provideIdx.mat')
trainData = sio.loadmat('Data/Train.mat')
testData = sio.loadmat('Data/Test.mat')
yTest = sio.loadmat('Data/Ytest.mat')
# 
events = events_1000.get('events')
missidx = missIdx.get('missIdx')
provideData = provideData_1000.get('provideData')
provideidx = provideIdx.get('provideIdx')

Xtrain = trainData.get('Xtrain')
Ytrain = trainData.get('Ytrain')
Yt2 = yTest.get('Ytest')
Xtest = testData.get('Xtest')
missingData = np.genfromtxt ('Data/prediction.csv', delimiter=",")

In [3]:
'''
Get unlabeled training data
'''
# Preallocate size of unlabeled data
X2 = np.zeros((1000, 5903))
# Get fully construct unlabeled data
# Do missing data first
i = 0
for index in xrange(0, np.shape(missidx)[1]):
    #print missidx[0][index]
    X2[:, missidx[0][index]-1] = missingData[:,i]
    i = i + 1
# Now do for provided data
i = 0
for index in xrange(0, np.shape(provideidx)[1]):
    X2[:, provideidx[0][index]-1] = provideData[:, i]
    i = i + 1

In [4]:
# Add this data to the training data from part 1
X = np.vstack((Xtrain, X2))

# Get the Y for the semi-supervised learning
Y = np.zeros((1501,1))
Y[:501,:] = Ytrain
Y[501:,:] = -1
print "X shape:"
print np.shape(X)
print "Y shape:"
print np.shape(Y)

X shape:
(1501, 5903)
Y shape:
(1501, 1)


In [5]:
'''
Setup Label Propogation - RBF or KNN
'''
# Parameters
kernel_ = 'knn'
gamma_ = 0.0001
alpha_ = 0.2
n_neighbors_ = 7
tol_ = 0.001
clf = LabelPropagation(kernel=kernel_, gamma=gamma_,
                       alpha=alpha_, n_neighbors=n_neighbors_,
                       tol=tol_)
# Train the Classifier
Xscale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(X)
pca = decomposition.PCA(n_components=420)
Xpca = pca.fit_transform(Xscale)
clf.fit(Xpca, np.ravel(Y))
# Get the new Y data
X2scale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(X2)
X2pca = pca.transform(X2scale)
Y2 = clf.predict(X2pca)
# Add these to the Y values
Y[501:,:] = Y2[:][0]

In [None]:
'''
Use the full data on the classification problem
'''
selection = SelectKBest(k=180)
X_pca = selection.fit(Xpca, np.ravel(Y)).transform(Xpca)

svm = SVC(C = 10, cache_size=200, coef0=0.0, gamma = 0.0001,
          degree=3, kernel='rbf', max_iter=-1,
          probability=True,random_state=None, shrinking=True, 
          tol=0.0001, verbose=False)

svm.fit(X_pca, np.ravel(Y))
scores = cross_validation.cross_val_score(clf, X_pca, np.ravel(Y), cv=10)
print scores.mean()
svm2 = SVC(C = 10, cache_size=200, coef0=0.0, gamma = 0.0001,
          degree=3, kernel='rbf', max_iter=-1,
          probability=True,random_state=None, shrinking=True, 
          tol=0.0001, verbose=False)
Xscale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(Xtrain)
pca = decomposition.PCA(n_components=420)
Xpca = pca.fit_transform(Xscale)
svm2.fit(Xtrain, np.ravel(Ytrain))
scores2 = cross_validation.cross_val_score(svm2, Xpca, np.ravel(Ytrain), cv=10)
print scores2.mean()
plt.figure()
plt.plot(scores, label='Part 3')
plt.plot(scores2, label='Part 1')
plt.xlabel('Fold')
plt.ylabel('Cross-Validation Score')
plt.title('Cross-Validation Scores for Part 1 and Part 3')
plt.legend(loc="lower right")
plt.show()

0.816815837429
0.620828300551

In [None]:
Y = label_binarize(Y, classes=[0, 1, 3])
n_samples, n_features = X.shape
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
classifier = OneVsRestClassifier(SVC(kernel='rbf', probability=True,
                                  gamma=0.0001,C=10, tol=0.0001))
classifier.fit(X_pca, np.ravel(Y))

In [None]:
n_classes = 3
XTscale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(Xtest)
XTpca = pca.transform(XTscale)
X_Tpca = selection.transform(XTpca)
y_score = classifier.decision_function(X_Tpca)
y_test = classifier.predict(X_Tpca)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])


##############################################################################
# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr[2], tpr[2], label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


##############################################################################
# Plot ROC curves for the multiclass problem

# Compute macro-average ROC curve and ROC area

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         linewidth=2)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         linewidth=2)

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                   ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")


In [None]:
'''
Predict on old testX
'''
# Predict Y values
XTscale = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(Xtest)
XTpca = pca.transform(XTscale)
X_Tpca = selection.transform(XTpca)
Ytest = svm.predict(X_Tpca)
print Ytest
# See accuracy on test set
count = 0
for i in xrange(0, np.shape(Ytest)[0]):
    if(Ytest[i] == Yt2[i][0]):
        count = count + 1
print ((float(count) / 1000) *  100), "%"