In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
%matplotlib inline 

### Prepare training and testing data matrices

In [2]:
## Read in the train/test input files as data frames
from pathlib import Path

## Directory that holds train.Matrix and test.Matrix; edit this single constant to relocate the data
DATA_DIR = Path(r"C:\Users\Eric\Desktop\Final_project_code_data\10000-1-vcf\10000-1-vcf\part1")

## The second column of the tab-delimited .Matrix file is the class: 1,0 means sick, 0,1 means not sick
LABEL_MAP = {'1,0': 1, '0,1': 0}


def _read_matrix(path):
    """Read a headerless, tab-delimited .Matrix file into a DataFrame."""
    return pd.read_csv(path, header=None, sep="\t")


def _encode_labels(label_col):
    """Map the class strings '1,0' / '0,1' to the integers 1 / 0.

    An exact-match lookup is used instead of substring replacement, so a
    malformed label cannot be silently rewritten into some other integer.

    Raises:
        ValueError: if any label (after stripping whitespace) is not a key of LABEL_MAP.
    """
    labels = label_col.str.strip()
    encoded = labels.map(LABEL_MAP)
    unknown = labels[encoded.isna()]
    if not unknown.empty:
        raise ValueError(f"Unexpected class label(s): {sorted(unknown.astype(str).unique())}")
    return encoded.astype('int').to_numpy()


train_df = _read_matrix(DATA_DIR / "train.Matrix")
test_df = _read_matrix(DATA_DIR / "test.Matrix")

## Extract the class values and save them as the y_train and y_test vectors
y_train = _encode_labels(train_df.iloc[:, 1])
y_test = _encode_labels(test_df.iloc[:, 1])

## Record dimensions of train and test
train_dim = train_df.shape
test_dim = test_df.shape
train_dim, y_train.shape, test_dim, y_test.shape

((1328, 2), (1328,), (331, 2), (331,))

In [4]:
## Count the features: each row stores them as one comma-separated string,
## so the first training row is used to get the feature count
first_feature_str = train_df.iloc[0, 0]
num_feats = len(first_feature_str.split(','))
num_feats

10000

In [5]:
## Convert the comma-separated feature strings to individual features and save them in the X matrices
def _rows_to_matrix(feature_strings, n_features):
    """Expand comma-separated feature strings into a 2-D float array.

    Args:
        feature_strings: sized iterable of strings, one per sample, each holding
            comma-separated numeric values.
        n_features: number of values every row is required to contain.

    Returns:
        A float array of shape (len(feature_strings), n_features).

    Raises:
        ValueError: if a row does not contain exactly n_features values.
    """
    matrix = np.zeros(shape=[len(feature_strings), n_features])
    for row_idx, feature_str in enumerate(feature_strings):
        values = feature_str.split(',')
        # Check the width explicitly: assigning a one-element list to a row would
        # otherwise be broadcast across every column without any error.
        if len(values) != n_features:
            raise ValueError(
                f"Row {row_idx} has {len(values)} features, expected {n_features}"
            )
        # NumPy converts the numeric strings to float on assignment
        matrix[row_idx] = values
    return matrix


X_train = _rows_to_matrix(train_df.iloc[:, 0], num_feats)
X_test = _rows_to_matrix(test_df.iloc[:, 0], num_feats)

X_train.shape, X_test.shape

((1328, 10000), (331, 10000))

In [7]:
# Tiny two-row frame with the same layout as the .Matrix data:
# 'x' holds comma-separated features, 'y' holds the comma-separated class
x_demo_df = pd.DataFrame(
    [['1,0,1,0', '1,0'],
     ['0,0,0,1', '0,1']],
    columns=['x', 'y'],
)
x_demo_df

         x    y
0  1,0,1,0  1,0
1  0,0,0,1  0,1


In [13]:
# Zero-filled 2x4 float matrix that the demo rows are written into later
demo_shape = (2, 4)
X_demo = np.zeros(demo_shape)
X_demo

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [16]:
# Split each demo feature string on commas and write the values into the matching row of X_demo
for row_idx, feature_str in enumerate(x_demo_df.iloc[:, 0]):
    X_demo[row_idx] = feature_str.split(',')
X_demo

array([[1., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [10]:
# Show the raw comma-separated feature string of every demo row
for feature_str in x_demo_df.iloc[:, 0]:
    print(feature_str)


1,0,1,0
0,0,0,1


## SVM Classification

In [6]:
from sklearn.svm import SVC

# Build a linear-kernel SVM and fit it on the training matrix
# (fit returns the estimator itself, so the calls can be chained)
svm = SVC(kernel='linear').fit(X_train, y_train)

# Predict the test samples and report the fraction classified correctly
y_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print(f" SVM model accuracy: {svm_accuracy:.3f}")

 SVM model accuracy: 0.861


## Random Forest Classification

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Forest sizes to compare; random_state is fixed so only the number of trees varies
n_estimators = [10, 100, 500, 1000]
for n_trees in n_estimators:
    # Build a forest of n_trees trees and fit it on the training matrix
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=0).fit(X_train, y_train)

    # Score the fitted forest on the test matrix
    y_pred = clf.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    print(f" RF model accuracy(n_estimators={n_trees}): {rf_accuracy:.3f}")

 RF model accuracy(n_estimators=10): 0.861
 RF model accuracy(n_estimators=100): 0.864
 RF model accuracy(n_estimators=500): 0.876
 RF model accuracy(n_estimators=1000): 0.867


## K-Nearest Neighbors (KNN) Classification

In [10]:
from sklearn.neighbors import KNeighborsClassifier as knn

# Try every k from 1 to 10 under both neighbour-weighting schemes
weight_schemes = ['uniform', 'distance']
for n_neighbors in range(1, 11):
    for scheme in weight_schemes:
        # Fit a classifier for this (k, weights) combination
        clf = knn(n_neighbors=n_neighbors, weights=scheme).fit(X_train, y_train)
        # Score it on the test matrix
        y_pred = clf.predict(X_test)
        knn_accuracy = accuracy_score(y_test, y_pred)
        print(f" KNN model accuracy using k={n_neighbors}, weights={scheme}: {knn_accuracy:.3f}")
    

 KNN model accuracy using k=1, weights=uniform: 0.553
 KNN model accuracy using k=1, weights=distance: 0.553
 KNN model accuracy using k=2, weights=uniform: 0.616
 KNN model accuracy using k=2, weights=distance: 0.598
 KNN model accuracy using k=3, weights=uniform: 0.474
 KNN model accuracy using k=3, weights=distance: 0.474
 KNN model accuracy using k=4, weights=uniform: 0.523
 KNN model accuracy using k=4, weights=distance: 0.505
 KNN model accuracy using k=5, weights=uniform: 0.462
 KNN model accuracy using k=5, weights=distance: 0.462
 KNN model accuracy using k=6, weights=uniform: 0.480
 KNN model accuracy using k=6, weights=distance: 0.474
 KNN model accuracy using k=7, weights=uniform: 0.456
 KNN model accuracy using k=7, weights=distance: 0.459
 KNN model accuracy using k=8, weights=uniform: 0.468
 KNN model accuracy using k=8, weights=distance: 0.462
 KNN model accuracy using k=9, weights=uniform: 0.423
 KNN model accuracy using k=9, weights=distance: 0.423
 KNN model accuracy

## Gradient Boosting Classifier

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameter grid: every learning rate is paired with every ensemble size
learning_rate = [0.01, 0.03, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
n_estimators = [10, 20, 50, 100, 500]

for lr in learning_rate:
    for n_trees in n_estimators:
        # NOTE(review): max_features=2 limits each split to 2 candidate features, and the
        # recorded accuracies are all 0.647 - worth checking whether the model is
        # predicting a single class. TODO confirm.
        gbm_params = dict(
            n_estimators=n_trees,
            learning_rate=lr,
            max_features=2,
            max_depth=2,
            random_state=0,
        )

        # Fit a model for this (learning rate, ensemble size) combination
        clf = GradientBoostingClassifier(**gbm_params).fit(X_train, y_train)

        # Score it on the test matrix
        y_pred = clf.predict(X_test)
        gbm_accuracy = accuracy_score(y_test, y_pred)
        print(f" GBM model accuracy (n_estimators={n_trees}, learning rate={lr}): {gbm_accuracy:.3f}")

 GBM model accuracy (n_estimators=10, learning rate=0.01): 0.647
 GBM model accuracy (n_estimators=20, learning rate=0.01): 0.647
 GBM model accuracy (n_estimators=50, learning rate=0.01): 0.647
 GBM model accuracy (n_estimators=100, learning rate=0.01): 0.647
 GBM model accuracy (n_estimators=500, learning rate=0.01): 0.647
 GBM model accuracy (n_estimators=10, learning rate=0.03): 0.647
 GBM model accuracy (n_estimators=20, learning rate=0.03): 0.647
 GBM model accuracy (n_estimators=50, learning rate=0.03): 0.647
 GBM model accuracy (n_estimators=100, learning rate=0.03): 0.647
 GBM model accuracy (n_estimators=500, learning rate=0.03): 0.647
 GBM model accuracy (n_estimators=10, learning rate=0.05): 0.647
 GBM model accuracy (n_estimators=20, learning rate=0.05): 0.647
 GBM model accuracy (n_estimators=50, learning rate=0.05): 0.647
 GBM model accuracy (n_estimators=100, learning rate=0.05): 0.647
 GBM model accuracy (n_estimators=500, learning rate=0.05): 0.647
 GBM model accuracy

In [None]:
# NOTE(review): `grid_search` is not defined in any cell visible here, and this cell has no
# execution count. As written it would raise NameError on Restart & Run All.
# Presumably a placeholder for a hyper-parameter grid search - TODO implement or delete.
grid_search