## <font color='green'> Application of Support Vector Machine to Gene Expression Data (Khan.csv)

In [1]:
import os
os.chdir('../data')

import numpy as np
import pandas as pd
import math

# Khan gene-expression data: 83 tissue samples, each assigned to one of four
# cancer types and described by 2308 gene expression measurements.
raw0 = pd.read_csv('Khan.csv')

# Preview the first rows, then the (rows, columns) shape. There are far more
# feature columns than samples, i.e. the data is high-dimensional.
print(raw0.head(), raw0.shape, sep='\n')

  Unnamed: 0  c        V1        V2        V3        V4        V5        V6  \
0         V1  2  0.773344 -2.438405 -0.482562 -2.721135 -1.217058  0.827809   
1         V2  2 -0.078178 -2.415754  0.412772 -2.825146 -0.626236  0.054488   
2         V3  2 -0.084469 -1.649739 -0.241308 -2.875286 -0.889405 -0.027474   
3         V4  2  0.965614 -2.380547  0.625297 -1.741256 -0.845366  0.949687   
4         V5  2  0.075664 -1.728785  0.852626  0.272695 -1.841370  0.327936   

         V7        V8  ...     V2299     V2300     V2301     V2302     V2303  \
0  1.342604  0.057042  ... -0.238511 -0.027474 -1.660205  0.588231 -0.463624   
1  1.429498 -0.120249  ... -0.657394 -0.246284 -0.836325 -0.571284  0.034788   
2  1.159300  0.015676  ... -0.696352  0.024985 -1.059872 -0.403767 -0.678653   
3  1.093801  0.819736  ...  0.259746  0.357115 -1.893128  0.255107  0.163309   
4  1.251219  0.771450  ... -0.200404  0.061753 -2.273998 -0.039365  0.368801   

      V2304     V2305     V2306     V2307   

### <font color='green'> Select a kernel function and tune the penalty parameter "C" using CV

#### <font color='green'> i) Data Preparation

In [2]:
# Column 0 holds the sample id and column 1 the class label `c`; every column
# from index 2 onwards is a gene-expression feature.
X = raw0.iloc[:, 2:]
Y = raw0.iloc[:, 1]

import warnings
warnings.filterwarnings("ignore") # suppress warnings

from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
# - random_state pins the shuffle so the split (and every score computed from
#   it) is reproducible across kernel restarts.
# - stratify keeps the class proportions the same in both parts, which matters
#   with only 83 samples spread over four classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

#### <font color='green'> ii) Select a kernel function and tune the penalty parameter "C" using "GridSearchCV"
* SVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
* Available kernel functions: https://scikit-learn.org/stable/modules/svm.html#svm-kernels
* Precision & Recall: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html#sklearn.metrics.precision_recall_fscore_support

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC # support vector machines for classification (SVR is for regression)

# One sub-grid per kernel, so that kernel-specific options (degree, coef0) are
# only combined with the kernel they are listed with.
tuned_parameters = [
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'degree': [5, 10, 15], 'C': [1, 10, 100, 1000]},
    {'kernel': ['sigmoid'], 'coef0': [0, 1, 2], 'C': [1, 10, 100, 1000]},
]

scores = ['precision', 'recall']

# Run one full grid search per metric; each search uses 5-fold CV on the
# training set and is scored with the macro-averaged version of the metric.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score, end="\n\n")

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:", end="\n\n")
    print(clf.best_params_, end="\n\n")

    # Mean CV score and twice its standard deviation for every candidate
    print("Grid scores on train set:", end="\n\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    # Predict the held-out test set with the fitted search object
    print("Detailed classification report:", end="\n\n")
    print("The scores are computed on test set.", end="\n\n")
    print(classification_report(y_test, clf.predict(X_test)), end="\n\n")

# Tuning hyper-parameters for precision

Best parameters set found on train set:

{'C': 1, 'kernel': 'linear'}

Grid scores on train set:

0.983 (+/-0.067) for {'C': 1, 'kernel': 'rbf'}
0.990 (+/-0.040) for {'C': 10, 'kernel': 'rbf'}
0.990 (+/-0.040) for {'C': 100, 'kernel': 'rbf'}
0.990 (+/-0.040) for {'C': 1000, 'kernel': 'rbf'}
1.000 (+/-0.000) for {'C': 1, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 10, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 100, 'kernel': 'linear'}
1.000 (+/-0.000) for {'C': 1000, 'kernel': 'linear'}
0.855 (+/-0.230) for {'C': 1, 'degree': 5, 'kernel': 'poly'}
0.744 (+/-0.158) for {'C': 1, 'degree': 10, 'kernel': 'poly'}
0.634 (+/-0.180) for {'C': 1, 'degree': 15, 'kernel': 'poly'}
0.914 (+/-0.139) for {'C': 10, 'degree': 5, 'kernel': 'poly'}
0.819 (+/-0.156) for {'C': 10, 'degree': 10, 'kernel': 'poly'}
0.627 (+/-0.174) for {'C': 10, 'degree': 15, 'kernel': 'poly'}
0.914 (+/-0.139) for {'C': 100, 'degree': 5, 'kernel': 'poly'}
0.819 (+/-0.156) for {'C'

## <font color='green'> Classification of Web Documents Using Naive Bayes
* https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

#### <font color='green'> i) Import raw data (texts and their categories)
* 20 Newsgroups data:
    - http://qwone.com/~jason/20Newsgroups/
    - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups

In [4]:
from sklearn.datasets import fetch_20newsgroups
# fetch_20newsgroups is a function !

# The entire data contains 20 categories, but only these four are used here
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Strip everything that is not main text
remove = ('headers', 'footers', 'quotes')

# Load the 'train' and 'test' subsets with identical selection and shuffling options
data_train, data_test = (
    fetch_20newsgroups(subset=subset, categories=categories, remove=remove,
                       shuffle=True, random_state=42)
    for subset in ('train', 'test')
)

# Pull the documents (X) and their targets (Y) out of each dataset object
X_train, Y_train = data_train.data, data_train.target
X_test, Y_test = data_test.data, data_test.target

In [5]:
# Check how each category is indexed: display the list of category names
# for the loaded training data. Presumably the position of a name in this list
# is the integer used for that category in the target arrays.
data_train.target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [6]:
# Inspect what the vectorizer and classifiers will receive:
# one raw training document and the training label array.
print(X_train[0]) # first training document (raw text)
print(Y_train) # integer class labels (0-3)

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
[1 3 2 ... 1 0 1]


#### <font color='green'> ii) Convert texts (bags of words) to numerical vectors
* TfidfVectorizer 
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
    
* Alternatively,
    - CountVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
    - HashingVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html#sklearn.feature_extraction.text.HashingVectorizer
    
* Stop words: https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer 

Vectorizer=TfidfVectorizer(stop_words='english')

# Vectorize straight from the raw documents kept on the dataset objects rather
# than from X_train / X_test themselves. Overwriting a variable with a
# transformation of itself makes the cell fail when it is run a second time
# (the input would then already be a tf-idf matrix, not text).
X_train = Vectorizer.fit_transform(data_train.data)
X_test = Vectorizer.transform(data_test.data)

# !!!Caution: Use ".fit_transform()" for training data, but use ".transform()" for testing data
# This is to make sure the training and test sets have the same number of columns (features) 
# Here we are using the vectorizer trained for the training data to convert the testing data

# check the size of X_train and X_test: (documents, vocabulary terms)
print(X_train.shape)
print(X_test.shape)

(2034, 26576)
(1353, 26576)


In [8]:
# Print the first row of the vectorized training matrix. The output lists one
# entry per stored value as "(row, column)  weight".
print(X_train[0,:])

  (0, 21025)	0.14589314130201253
  (0, 3998)	0.07036762966055367
  (0, 5546)	0.14589314130201253
  (0, 10605)	0.16717988448915075
  (0, 20973)	0.09485258934884024
  (0, 19841)	0.06346990252225734
  (0, 2408)	0.0740617740444856
  (0, 14706)	0.04664380384555488
  (0, 20977)	0.09029017643192268
  (0, 23828)	0.2258173261964949
  (0, 21208)	0.10761272733713331
  (0, 15301)	0.10649668941393081
  (0, 21084)	0.06206087717654851
  (0, 9848)	0.10649668941393081
  (0, 22878)	0.11143515786946494
  (0, 13023)	0.13924444936699948
  (0, 14154)	0.04763446792799959
  (0, 8554)	0.10162931867025875
  (0, 18949)	0.13313300331371947
  (0, 18704)	0.10260670288726481
  (0, 19066)	0.4152868172245007
  (0, 17464)	0.22287031573892988
  (0, 18699)	0.0823823626717928
  (0, 7698)	0.11451045447542964
  (0, 11203)	0.07006583621645067
  (0, 20513)	0.14589314130201253
  (0, 20239)	0.13842893907483358
  (0, 10286)	0.44662488427974206
  (0, 1152)	0.36111859597627916
  (0, 5385)	0.10649668941393081
  (0, 18701)	0.1384289

#### <font color='green'> iii) Run NB
* https://scikit-learn.org/stable/modules/naive_bayes.html
* https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [9]:
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.metrics import classification_report

# alpha is a kind of a shrinkage parameter
NBres = NB(alpha=0.01)
NBres.fit(X_train, Y_train)

# Overall test-set score first, then the per-class precision/recall/F1 table
print(NBres.score(X_test, Y_test),
      classification_report(Y_test, NBres.predict(X_test)),
      sep='\n')

0.78640059127864
              precision    recall  f1-score   support

           0       0.68      0.66      0.67       319
           1       0.92      0.89      0.90       389
           2       0.80      0.90      0.85       394
           3       0.68      0.61      0.64       251

    accuracy                           0.79      1353
   macro avg       0.77      0.76      0.77      1353
weighted avg       0.78      0.79      0.78      1353



In [10]:
# Run SVC on the same data
from sklearn.svm import SVC

# Linear-kernel support vector classifier with penalty parameter C = 10
SVCres = SVC(kernel='linear', C=10)
SVCres.fit(X_train, Y_train)

# Overall test-set score first, then the per-class precision/recall/F1 table
print(SVCres.score(X_test, Y_test),
      classification_report(Y_test, SVCres.predict(X_test)),
      sep='\n')

0.7634885439763488
              precision    recall  f1-score   support

           0       0.69      0.57      0.62       319
           1       0.90      0.88      0.89       389
           2       0.75      0.89      0.82       394
           3       0.66      0.62      0.64       251

    accuracy                           0.76      1353
   macro avg       0.75      0.74      0.74      1353
weighted avg       0.76      0.76      0.76      1353



### <font color='darkred'> HW7: Similarly to HW6-2, optimize SVC and NB on the newsgroups data

* Select ten categories and import raw data under your categories. Follow the steps above to prepare datasets to run SVC and NB
* Use the function "GridSearchCV" to optimize SVC and NB
    - To optimize SVC, select a kernel function and tune "C" parameter
    - To optimize NB, tune "alpha" parameter
* Use both "precision" and "recall" to evaluate prediction performance

## Prep the Data

In [11]:
from sklearn.datasets import fetch_20newsgroups

# The ten newsgroup categories used for HW7
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
]

# Strip everything that is not main text
remove = ('headers', 'footers', 'quotes')

# Load the 'train' and 'test' subsets with identical selection and shuffling options
data_train, data_test = (
    fetch_20newsgroups(subset=subset, categories=categories, remove=remove,
                       shuffle=True, random_state=10)
    for subset in ('train', 'test')
)

# Pull the documents (X) and their targets (Y) out of each dataset object
X_train, Y_train = data_train.data, data_train.target
X_test, Y_test = data_test.data, data_test.target

In [12]:
# Check how each category is indexed: first confirm how many categories were
# loaded, then display their names (the bare last expression is rendered as
# the cell output).
print('Target names length: ', len(data_train.target_names))
data_train.target_names

Target names length:  10


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer 

Vectorizer=TfidfVectorizer(stop_words='english')

# Vectorize straight from the raw documents kept on the dataset objects rather
# than from X_train / X_test themselves. Overwriting a variable with a
# transformation of itself makes the cell fail when it is run a second time
# (the input would then already be a tf-idf matrix, not text).
X_train = Vectorizer.fit_transform(data_train.data)
X_test = Vectorizer.transform(data_test.data)

# !!!Caution: Use ".fit_transform()" for training data, but use ".transform()" for testing data
# This is to make sure the training and test sets have the same number of columns (features) 
# Here we are using the vectorizer trained for the training data to convert the testing data

# check the size of X_train and X_test: (documents, vocabulary terms)
print(X_train.shape)
print(X_test.shape)

(5790, 67164)
(3855, 67164)


### Run GridSearchCV on SVC

In [None]:
# Tune the penalty parameter C of a linear-kernel SVC with 3-fold CV, once per
# metric. GridSearchCV and SVC are imported by earlier cells.
scores = ['precision', 'recall']

params = {'kernel': ['linear'], 'C': [1, 10, 100]}

for score in scores:
    clf_svc = GridSearchCV(SVC(), params, cv=3, scoring='%s_macro' % score).fit(X_train, Y_train)
    print(f'SVC Best params ({score}): ', clf_svc.best_params_, '\n')
    means = clf_svc.cv_results_['mean_test_score']
    # The loop variable must not be named `params`: that would rebind the grid
    # to a single candidate (e.g. {'C': 100, 'kernel': 'linear'}) and make the
    # grid search for the next metric fail.
    for mean, candidate in zip(means, clf_svc.cv_results_['params']):
        print("%0.3f for %r \n" % (mean, candidate))


### Run GridSearchCV on NB

In [None]:
# Tune the smoothing parameter alpha of multinomial Naive Bayes with 5-fold CV,
# once per metric. GridSearchCV and NB (MultinomialNB) are imported by earlier
# cells.
scores = ['precision', 'recall']

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]}
for score in scores:
    clf_nb = GridSearchCV(NB(), params, cv=5, scoring='%s_macro' % score).fit(X_train, Y_train)
    # Report the NB search's own best parameters (not those of the SVC search)
    print(f'NB Best params ({score}): ', clf_nb.best_params_, '\n')
    means = clf_nb.cv_results_['mean_test_score']
    # The loop variable must not be named `params`: that would rebind the grid
    # to a single candidate (e.g. {'alpha': 10.0}) and make the grid search for
    # the next metric fail.
    for mean, candidate in zip(means, clf_nb.cv_results_['params']):
        print("%0.3f for %r \n" % (mean, candidate))