In [62]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import lasso_path
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

This MALDI-TOF dataset consists in:
A) A reference panel of 20 Gram positive and negative bacterial species covering 9 genera among which several species are known to be hard to discriminate by mass spectrometry (MALDI-TOF). Each species was represented by 11 to 60 mass spectra obtained from 7 to 20 bacterial strains, constituting altogether a dataset of 571 spectra obtained from 213 strains. The spectra were obtained according to the standard culture-based workflow used in clinical routine in which the microorganism was first grown on an agar plate for 24 to 48 hours, before a portion of colony was picked, spotted on a MALDI slide and a mass spectrum was acquired. 

*Input Pure Spectra*

In [63]:
df_pure_spectra_matrix = pd.read_csv('pure_spectra_matrix.csv', delimiter=';')
print(df_pure_spectra_matrix.head())
print(len(df_pure_spectra_matrix))

   0  0.1  0.2  0.3  0.4  0.5  44431.921875  0.6  0.7  0.8   ...    \
0  0    0  0.0  0.0  0.0    0      0.000000  0.0  0.0  0.0   ...     
1  0    0  0.0  0.0  0.0    0      0.000000  0.0  0.0  0.0   ...     
2  0    0  0.0  0.0  0.0    0      0.000000  0.0  0.0  0.0   ...     
3  0    0  0.0  0.0  0.0    0   4701.712402  0.0  0.0  0.0   ...     
4  0    0  0.0  0.0  0.0    0      0.000000  0.0  0.0  0.0   ...     

         0.1217  0.1218  0.1219  0.1220        0.1221  0.1222  0.1223  0.1224  \
0  27667.777344     0.0       0     0.0  35438.351562     0.0     0.0     0.0   
1  59820.968750     0.0       0     0.0   6384.590820     0.0     0.0     0.0   
2      0.000000     0.0       0     0.0      0.000000     0.0     0.0     0.0   
3   3913.969727     0.0       0     0.0      0.000000     0.0     0.0     0.0   
4      0.000000     0.0       0     0.0      0.000000     0.0     0.0     0.0   

   0.1225  0.1226  
0     0.0     0.0  
1     0.0     0.0  
2     0.0     0.0  
3     0.0   

*Check Pure Spectra Metadata*

In [64]:
df_pure_spectra_metadata = pd.read_csv('pure_spectra_metadata.csv', delimiter=';')
print(df_pure_spectra_metadata.head(10))
print(len(df_pure_spectra_metadata))

   Species  Strain
0  RTO.TQH       1
1  BUT.YZE       2
2  BUT.YZE       2
3  BUT.YZE       2
4  BUT.YZE       2
5  VVJ.KSF       3
6  VVJ.KSF       3
7  VVJ.KSF       3
8  VVJ.KSF       3
9  VVJ.KSF       4
571


*Input Mixed Spectra Metadata and Check*

In [65]:
df_mixed_spectra_metadata = pd.read_csv('mixed_spectra_metadata.csv', delimiter=';')
print(df_mixed_spectra_metadata)
print(len(df_mixed_spectra_metadata))

    Mixture_Label  Mixture_Id Species_1  Strain_1 Species_2  Strain_2  \
0               A           1   NYV.XSY       214   NYV.VCE        77   
1               A           1   NYV.XSY       214   NYV.VCE        77   
2               A           1   NYV.XSY       214   NYV.VCE        77   
3               A           1   NYV.XSY       214   NYV.VCE        77   
4               A           1   NYV.XSY       214   NYV.VCE        77   
5               A           1   NYV.XSY       214   NYV.VCE        77   
6               A           1   NYV.XSY       214   NYV.VCE        77   
7               A           1   NYV.XSY       214   NYV.VCE        77   
8               A           1   NYV.XSY       214   NYV.VCE        77   
9               A           1   NYV.XSY       214   NYV.VCE        77   
10              A           1   NYV.XSY       214   NYV.VCE        77   
11              A           1   NYV.XSY       214   NYV.VCE        77   
12              A           1   NYV.XSY       214  

B) Based on this reference panel, a dedicated in vitro mock-up mixture dataset was constituted. For that purpose we considered 10 pairs of species of various taxonomic proximity:
* 4 mixtures, labelled A, B, C and D, involved species that belong to the same genus,
* 2 mixtures, labelled E and F, involved species that belong to distinct genera, but to the same Gram type,
* 4 mixtures, labelled G, H, I and J, involved species that belong to distinct Gram types.
Each mixture was represented by 2 pairs of strains, which were mixed according to the following 9 concentration ratios : 1:0, 10:1, 5:1, 2:1, 1:1, 1:2, 1:5, 1:10, 0:1. Two replicate spectra were acquired for each concentration ratio and each couple of strains, leading altogether to a dataset of 360 spectra, among which 80 are actually pure sample spectra.

*Input Mixed Spectra and Check*

In [66]:
df_mixed_spectra_matrix = pd.read_csv('mixed_spectra_matrix.csv', delimiter=';', header=None)
print(df_mixed_spectra_matrix.head(360))
print(len(df_mixed_spectra_matrix))

     0     1           2     3     4     5     6             7     \
0       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
1       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
2       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
3       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
4       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
5       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
6       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
7       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
8       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
9       0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
10      0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
11      0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
12      0   0.0        0.00   0.0   0.0     0   0.0      0.000000   
13      0   0.0        0.00   0.0 

*Input Mixed Spectra*

In [67]:
df_mixed_spectra_metadata = pd.read_csv('mixed_spectra_metadata.csv', delimiter=';')
print(df_mixed_spectra_metadata.head(20))
print(len(df_mixed_spectra_metadata))

   Mixture_Label  Mixture_Id Species_1  Strain_1 Species_2  Strain_2  \
0              A           1   NYV.XSY       214   NYV.VCE        77   
1              A           1   NYV.XSY       214   NYV.VCE        77   
2              A           1   NYV.XSY       214   NYV.VCE        77   
3              A           1   NYV.XSY       214   NYV.VCE        77   
4              A           1   NYV.XSY       214   NYV.VCE        77   
5              A           1   NYV.XSY       214   NYV.VCE        77   
6              A           1   NYV.XSY       214   NYV.VCE        77   
7              A           1   NYV.XSY       214   NYV.VCE        77   
8              A           1   NYV.XSY       214   NYV.VCE        77   
9              A           1   NYV.XSY       214   NYV.VCE        77   
10             A           1   NYV.XSY       214   NYV.VCE        77   
11             A           1   NYV.XSY       214   NYV.VCE        77   
12             A           1   NYV.XSY       214   NYV.VCE      

*Train model and Predict with X_train and X_test*

In [68]:
X_train = df_pure_spectra_matrix
print(X_train.shape)
y_train = df_pure_spectra_metadata["Strain"][0:570]
X_test = df_mixed_spectra_matrix
enc = OneHotEncoder(sparse=False)

#columns : list, optional, default:None. If None, return all columns, otherwise, returns specified columns
y_train_reshape = y_train.as_matrix(columns=None)
y_train_reshape = y_train_reshape.reshape(-1, 1)
Y_transform = enc.fit_transform(y_train_reshape)

#print(y_train)
print("Transformed y_train to onehot encoding")
print(Y_transform)
print(Y_transform.shape)

(570, 1300)
Transformed y_train to onehot encoding
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
(570, 213)


  


Instantiate a Lasso regressor, fit the training data, and predict y_train

In [69]:
# Instantiate a lasso regressor: lassoreg
lassoreg = Lasso(normalize=True, positive=True, alpha = .00000001, max_iter=10000, selection="random")

# Fit the regressor to the data
print(X_train.shape)
lassoreg.fit(X_train,Y_transform)

# Compute the coefficients
lassoreg_coef = lassoreg.coef_

#print(lassoreg_coef)
y_pred_train = lassoreg.predict(X_train)
print("y_pred_train")
print(y_pred_train)

##Check that the sum of probabilities is 1
#Y_transform.sum(axis=1)

(570, 1300)
y_pred_train
[[ 8.30298686e-01  4.42669543e-01  4.39095455e-02 ... -1.24413118e-03
  -2.17751177e-03  1.30458694e-04]
 [ 2.44519698e-01  5.10904115e-01  3.34418805e-02 ...  1.28619651e-03
   8.55609262e-03 -4.87776134e-06]
 [ 5.62589606e-02  1.01389839e+00  8.99312477e-03 ...  3.05321259e-03
  -2.37894063e-04 -1.98961729e-06]
 ...
 [-3.19093971e-04 -1.05643425e-03  2.38492685e-02 ...  3.48799315e-02
   3.53476443e-01 -2.34008333e-06]
 [-6.94772704e-03 -2.94525872e-03 -5.17951910e-03 ... -2.38260287e-02
   2.43548890e-01  1.53030370e-05]
 [ 2.02333613e-02  4.38380021e-03  3.70914910e-02 ...  5.25018516e-02
   2.51612815e-01  9.99994317e-01]]


In [70]:
#Y_transform_get_dummies = pd.get_dummies(y_train, drop_first=True, prefix='category', columns=['category'])

#One-hot encoding and remove one variable
Y_transform_get_dummies = pd.get_dummies(y_train, drop_first=True)
print(Y_transform_get_dummies)

     2    3    4    5    6    7    8    9    10   11  ...   204  205  206  \
0      0    0    0    0    0    0    0    0    0    0 ...     0    0    0   
1      1    0    0    0    0    0    0    0    0    0 ...     0    0    0   
2      1    0    0    0    0    0    0    0    0    0 ...     0    0    0   
3      1    0    0    0    0    0    0    0    0    0 ...     0    0    0   
4      1    0    0    0    0    0    0    0    0    0 ...     0    0    0   
5      0    1    0    0    0    0    0    0    0    0 ...     0    0    0   
6      0    1    0    0    0    0    0    0    0    0 ...     0    0    0   
7      0    1    0    0    0    0    0    0    0    0 ...     0    0    0   
8      0    1    0    0    0    0    0    0    0    0 ...     0    0    0   
9      0    0    1    0    0    0    0    0    0    0 ...     0    0    0   
10     0    0    1    0    0    0    0    0    0    0 ...     0    0    0   
11     0    0    0    1    0    0    0    0    0    0 ...     0    0    0   

In [71]:
# Instantiate a lasso regressor: lassoreg
lassoreg = Lasso(normalize=True, positive=True, alpha = .00001, max_iter=10000, selection="random")

# Fit the regressor to the data
print(X_train.shape)
lassoreg.fit(X_train,Y_transform_get_dummies)

# Compute the coefficients
lassoreg_coef = lassoreg.coef_

#print(lassoreg_coef)
y_pred_train = lassoreg.predict(X_train)
print("y_pred_train")
print(y_pred_train)

(570, 1300)
y_pred_train
[[ 4.38632353e-01  4.61303009e-02  2.66631769e-03 ... -5.39446224e-03
  -2.33351810e-03  1.14233049e-03]
 [ 5.04992090e-01  3.60646649e-02  3.63894215e-03 ... -2.47138296e-03
   9.50972306e-03 -4.17220948e-07]
 [ 1.00899049e+00  1.28325213e-02 -4.36065375e-03 ... -1.78579151e-03
  -4.79186308e-03 -4.17220948e-07]
 ...
 [ 5.74962972e-04  2.37789079e-02 -4.47549208e-03 ...  3.44556865e-02
   3.49015187e-01 -4.17220948e-07]
 [-3.58756841e-03 -2.37853118e-03 -4.80120880e-03 ... -1.96934255e-02
   2.35845664e-01 -4.17220948e-07]
 [ 2.28036614e-03  3.59201672e-02 -7.64245762e-04 ...  5.47181310e-02
   2.46274777e-01  9.94305002e-01]]


Display bacteria strain predicted from X_train, indicate matches with Y_transform

In [72]:
print("Predicted bacteria strain from pure spectra using Lasso regression")
print(y_pred_train.argmax(axis=1) + 1)

print("Matches of strain predicted for training using Lasso Regression and actual for each sample:")
print(Y_transform.argmax(axis=1)+1)
comparison_train = np.equal(Y_transform.argmax(axis=1)+1,y_pred_train.argmax(axis=1)+1)
print(comparison_train)

#function to calculate accuracy of model's prediction
def accuracy(comparison):
    total_samples = len(comparison)
    print(total_samples)
    matches = np.sum(comparison)
    print(matches)
    print(float(matches)/float(total_samples))

accuracy(comparison_train)

Predicted bacteria strain from pure spectra using Lasso regression
[  1   1   1   1  74   2   6   2   2   3   3   2   4   4   5   5   6  22
   6   6   6 199 199   8   8  53   9   9  10  11  11  11  12 143  13  13
  13  14  14  14 137  15  15  16  16 171  17  17 104  17  18  19  20 135
  34  20  20  21  21 150  22  22   1  57  23  57 209  24  24  25  25   8
  25  26  26  26  26  28  28  28  28  28  29  30  30 125  30  31  32  32
  33  33  34  34  34  34  35  35  36  37  37  38  38  39  39  40  40  40
  40  41  41  41  42  41  42  43  43  43 130  44  45  45  46  46  46  46
  47  47  48  48  48  48  48 155  49  50  50  50  50  50  51 174  51  52
  52  52  53  53  53  54  54  55  55  55  55  56 106  56 211  57  57  57
  53  58  57  58  59  60   3  61  61  61  62  62  62  62  63  63  65  65
  65  65  65  65  65  66 154 154  67  68  68  68  68  69  69  69  70  71
  71  71  72  72  73  73  73  73  73  73  75  75  75  75  76  76  76  76
  17  77  77  78  79  79  80  81  81  82  83  83  83  83 

In [73]:
print("Predicted bacteria strain from pure spectra using Lasso regression")
print(y_pred_train.argmax(axis=1) + 1)

print("Matches of strain predicted for training using Lasso Regression and actual for each sample:")
print(Y_transform_get_dummies)
#print(Y_transform_get_dummies.idxmax(axis=1)+1)
#comparison_train = np.equal(Y_transform_get_dummies.idxmax(axis=1)+1,y_pred_train.argmax(axis=1)+1)
print(Y_transform_get_dummies.idxmax(axis=1))
comparison_train = np.equal(Y_transform_get_dummies.idxmax(axis=1),y_pred_train.argmax(axis=1))

print(comparison_train)

accuracy(comparison_train)

Predicted bacteria strain from pure spectra using Lasso regression
[  1   1   1   1  74   2   6   2   2   3   3   2   4   4   5   5   6  22
   6   6   6 199 199   8   8  53   9   9  10  11  11  11  12 143  13  13
  13  14  14  14 137  15  15  16  16 171  17  17 104  17  18  19  20 135
  34  20  20  21  21 150  22  22   1  57  23  57 209  24  24  25  25   8
  25  26  26  26  26  28  28  28  28  28  29  30  30 125  30  31  32  32
  33  33  34  34  34  34  35  35  36  37  37  38  38  39  39  40  40  40
  40  41  41  41  42  41  42  43  43  43 130  44  45  45  46  46  46  46
  47  47  48  48  48  48  48 155  49  50  50  50  50  50  51 174  51  52
  52  52  53  53  53  54  54  55  55  55  55  56 106  56 211  57  57  57
  53  58  57  58  59  60   3  61  61  61  62  62  62  62  63  63  65  65
  65  65  65  65  65  66 154 154  67  68  68  68  68  69  69  69  70  71
  71  71  72  72  73  73  73  73  73  73  75  75  75  75  76  76  76  76
  17  77  77  78  79  79  80  81  81  82  83  83  83  83 

In [74]:
#y_true = Y_transform.argmax(axis=1)+1
#y_prediction = y_pred_train.argmax(axis=1)+1

#cnf_matrix = confusion_matrix(y_true, y_prediction)
#print(cnf_matrix)
##[[1 1 3]
## [3 2 2]
## [1 3 1]]

#FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
#FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
#TP = np.diag(cnf_matrix)
#TN = cnf_matrix.sum() - (FP + FN + TP)

#FP = FP.astype(float)
#FN = FN.astype(float)
#TP = TP.astype(float)
#TN = TN.astype(float)

## Overall accuracy
#ACC = (TP+TN)/(TP+FP+FN+TN)

#print(ACC)

## Sensitivity, hit rate, recall, or true positive rate
#TPR = TP/(TP+FN)
## Specificity or true negative rate
#TNR = TN/(TN+FP) 
## Precision or positive predictive value
#PPV = TP/(TP+FP)
## Negative predictive value
#NPV = TN/(TN+FN)
## Fall out or false positive rate
#FPR = FP/(FP+TN)
## False negative rate
#FNR = FN/(TP+FN)
## False discovery rate
#FDR = FP/(TP+FP)

Predict strain from mixtures stored in X_test

In [75]:
print("Predict strain from X_test using Lasso Regression:")
y_pred_test = lassoreg.predict(X_test)

print("y_pred_test")
print(y_pred_test)

#print("The length of y_pred_test is: " + str(len(y_pred_test)))

print("The predicted strain in the mixture for each sample is:")
print(y_pred_test.argmax(axis=1))
y_pred_test = y_pred_test.argmax(axis=1)

#print("The length of the mixed spectra metadata is: " + str(len(df_mixed_spectra_metadata)))
y_actual = df_mixed_spectra_metadata['Strain_1']
#print((y_pred_test-1)==df_mixed_spectra_metadata['Strain_1'][1:] or (y_pred_test-1)==df_mixed_spectra_metadata['Strain_2'][1:])

comparison = np.equal(y_pred_test,y_actual)

print(comparison)

accuracy(comparison)

#print(np.sum(comparison))


#TP = 0
#FP = 0
#TN = 0
#FN = 0

#for i in range(len(y_pred_test)): 
#    if y_actual[i]==y_pred_test[i]==True:
#        TP += 1
#    if y_pred_test[i]==True and y_actual[i]!=y_pred_test[i]:
#        FP += 1
#    if y_actual[i]==y_pred_test[i]==False:
#        TN += 1
#    if y_pred_test[i]==False and y_actual[i]!=y_pred_test[i]:
#        FN += 1

#print(y_pred_test[359])

#print(TP/len(y_pred_test))
#print(FP/len(y_pred_test))
#print(TN/len(y_pred_test))
#print(FN/len(y_pred_test))

Predict strain from X_test using Lasso Regression:
y_pred_test
[[ 6.02265298e-03 -1.15932806e-02  8.52094502e-05 ...  9.56760663e-02
  -1.26672267e-02 -4.17220948e-07]
 [-3.64318288e-03  3.89729559e-03 -3.60110865e-03 ...  2.76240177e-02
  -1.67018709e-02 -4.17220948e-07]
 [ 6.09858473e-03 -1.34272926e-02  9.17981928e-04 ...  5.20433188e-02
  -1.68013710e-02 -4.17220948e-07]
 ...
 [-2.45014814e-03 -3.62153103e-03 -5.00958181e-03 ...  1.58832042e-02
  -1.16100000e-02 -4.17220948e-07]
 [-3.70052645e-03 -4.79456180e-03 -3.26402541e-03 ...  2.62415912e-02
  -1.38642438e-02 -4.17220948e-07]
 [-2.11977195e-03  5.50490612e-03 -3.21406902e-03 ...  1.31778375e-02
  -1.13883868e-02 -4.17220948e-07]]
The predicted strain in the mixture for each sample is:
[135 135 114 114 114 135 155 114  61  61  61  61  51  61 135  61  61  61
 155 155 178 178 178  67  13 155  61  13  61  61 148  61  51  61  61  51
  40 171  40 120  40  40  40  11  46 210  46  46  46  46  46  45 210  46
 120 142  53 142 142 120 1

In [76]:
#clf=RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1)
clf=RandomForestClassifier()

#Train the model using the training sets y_pred_test=clf.predict(X_test)
print(X_train.shape)
print(Y_transform.shape)
clf.fit(X_train,Y_transform)

y_pred_train=clf.predict(X_train)
print(y_pred_train)

y_pred_test=clf.predict(X_test)
print(y_pred_test)
print(y_pred_test.argmax(axis=1))
#print(df_mixed_spectra_metadata[1]==y_pred_train.argmax(axis=1))
print(df_mixed_spectra_metadata["Strain_1"][0:570] - 1 == y_pred_test.argmax(axis=1))
#print((y_train-1)==y_pred_train.argmax(axis=1))

(570, 1300)
(570, 213)
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Logistic Regression

In [80]:
X_train = df_pure_spectra_matrix
y_train = df_pure_spectra_metadata["Strain"][0:570]
X_test = df_mixed_spectra_matrix
y_train = y_train.astype('category')

# Instantiate a lasso regressor: lassoreg
logreg = LogisticRegression(penalty='l1')

# Fit the regressor to the data
print(X_train.shape)
print(y_train.shape)
logreg.fit(X_train,y_train)

#print(lassoreg_coef)
print(y_train)
y_pred_train = logreg.predict_proba(X_train)
print(y_pred_train)
y_pred_test = logreg.predict_proba(X_test)
print(y_pred_test)
print(df_mixed_spectra_metadata[:])

y_true = df_mixed_spectra_metadata["Strain_1"][0:570] - 1
y_prediction = y_pred_test.argmax(axis=1)

print(y_true == y_prediction)
print("The matching accuracy of predicted versus ground truth is:")
print(accuracy_score(y_true, y_prediction))

(570, 1300)
(570,)
0        1
1        2
2        2
3        2
4        2
5        3
6        3
7        3
8        3
9        4
10       4
11       5
12       5
13       5
14       6
15       6
16       7
17       7
18       7
19       7
20       7
21       7
22       8
23       8
24       9
25       9
26      10
27      10
28      11
29      12
      ... 
540    202
541    202
542    202
543    202
544    203
545    203
546    204
547    205
548    206
549    206
550    206
551    206
552    207
553    207
554    208
555    208
556    209
557    209
558    210
559    210
560    211
561    211
562    211
563    211
564    211
565    212
566    212
567    212
568    212
569    213
Name: Strain, Length: 570, dtype: category
Categories (213, int64): [1, 2, 3, 4, ..., 210, 211, 212, 213]
[[9.99968909e-01 1.06608926e-05 2.94293388e-07 ... 1.65695439e-07
  6.88677321e-13 9.44597563e-09]
 [7.51374641e-06 9.99965716e-01 6.23473828e-07 ... 3.18709510e-07
  1.13937576e-11 3.92402887e-08]
 [3.69

In [79]:
X_train = df_pure_spectra_matrix
y_train = df_pure_spectra_metadata["Strain"][0:570]
X_test = df_mixed_spectra_matrix
y_train = y_train.astype('category')

# Instantiate a lasso regressor: lassoreg
logreg = LogisticRegression(penalty='l1')

# Fit the regressor to the data
print(X_train.shape)
print(Y_transform.shape)
logreg.fit(X_train,Y_transform)

#print(lassoreg_coef)
print(y_transform)
y_pred_train = logreg.predict_proba(X_train)
print(y_pred_train)
y_pred_test = logreg.predict_proba(X_test)
print(y_pred_test)
print(df_mixed_spectra_metadata[:])

y_true = df_mixed_spectra_metadata["Strain_1"][0:570] - 1
y_prediction = y_pred_test.argmax(axis=1)

print(y_true == y_prediction)
print("The matching accuracy of predicted versus ground truth is:")
print(accuracy_score(y_true, y_prediction))

(570, 1300)
(570, 213)


ValueError: bad input shape (570, 213)

In [None]:
X_train = df_pure_spectra_matrix
y_train = df_pure_spectra_metadata["Strain"][0:570]
X_test = df_mixed_spectra_matrix

print(y_train)

# Instantiate a lasso regressor: lassoreg
lassoreg = Lasso(normalize=True, positive=True, alpha = .1)

# Fit the regressor to the data
lassoreg.fit(X_train,y_train)

# Compute the coefficients
lassoreg_coef = lassoreg.coef_
#print(lassoreg_coef)
y_pred_train = lassoreg.predict(X_train)
#print(y_pred_train)
y_pred_test = lassoreg.predict(X_test)
#print(y_pred_test)

# Plot the coefficients
plt.plot(range(len(lassoreg_coef)), lassoreg_coef)
plt.xticks(range(len(lassoreg_coef)), lassoreg_coef, rotation=60)
plt.margins(0.02)
plt.show()

# Plot the y_pred_test
# plt.plot(range(len(y_pred_test)), y_pred_test)
# plt.xticks(range(len(y_pred_test)), y_pred_test, rotation=60)
# plt.margins(0.02)
# plt.show()

print(y_pred_test)

# Import Lasso
#from sklearn.linear_model import Lasso

# Instantiate a lasso regressor: lasso
#lasso = Lasso(alpha = 0.4, normalize = True)

# Fit the regressor to the data
#lasso.fit(X, y)

# Compute and print the coefficients
#lasso_coef = lasso.coef_
#print(lasso_coef)

# Plot the coefficients
#plt.plot(range(len(df_columns)), lasso_coef)
#plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
#plt.margins(0.02)
#plt.show()

In [None]:
_, coef_path, _ = lasso_path(X_train,y_train,max_iter=1,positive=True)
print(coef_path)

*Calculate Length of y_pred_train and y_pred_test*

In [None]:
print(len(y_pred_train))
print(len(y_pred_test))

*Evaluate test data*

In [None]:
X_train = [[0, 2],[2,0]]
y_train = pd.Series([1, 2])
X_test = [[0, 2], [2, 0]]
y_train = y_train.astype('category')

lassoreg = Lasso(positive=True, alpha = 0.001, normalize=True)
lassoreg.fit(X_train,y_train)
#y_pred_train = lassoreg.predict(X_train)
#print(y_pred_train)
y_pred_test = lassoreg.predict(X_test)
print(y_pred_test)

In [None]:
X_train = [[0, 2],[2,0]]
y_train = pd.Series(["a", "b"])
X_test = [[0, 2], [2, 0]]
y_train = y_train.astype('category')

logreg = LogisticRegression(penalty='l1')
logreg.fit(X_train,y_train)
#y_pred_train = lassoreg.predict(X_train)
#print(y_pred_train)
y_pred_test = logreg.predict(X_test)
print(y_pred_test)
y_pred = logreg.predict_proba(X_test)
print(y_pred)

In [None]:
X_train = df_pure_spectra_matrix
y_train = df_pure_spectra_metadata["Strain"][0:570]
X_test = df_mixed_spectra_matrix
enc = OneHotEncoder(sparse=False)
y_train_reshape = y_train.as_matrix(columns=None)
y_train_reshape = y_train_reshape.reshape(-1, 1)
Y_transform = enc.fit_transform(y_train_reshape)

# Instantiate a lasso regressor: lassoreg
logreg = LogisticRegression(penalty='l1')

# Fit the regressor to the data
logreg.fit(X_train,y_train)

#print(lassoreg_coef)
print(y_train)
y_pred_train = logreg.predict(X_train)
print(y_pred_train)
y_pred_test = logreg.predict(X_test)
print(y_pred_test)
print(df_mixed_spectra_metadata[:])

In [None]:
# Generate some sparse data to play with
np.random.seed(42)

n_samples, n_features = 50, 200
X = np.random.randn(n_samples, n_features)
coef = 3 * np.random.randn(n_features)
inds = np.arange(n_features)
np.random.shuffle(inds)
coef[inds[10:]] = 0  # sparsify coef
y = np.dot(X, coef)

# add noise
y += 0.01 * np.random.normal(size=n_samples)

# Split data in train set and test set
n_samples = X.shape[0]
X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]

# #############################################################################
# Lasso

alpha = 0.1
lasso = Lasso(alpha=alpha, positive=True)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

# #############################################################################
# ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

plt.plot(enet.coef_, color='lightgreen', linewidth=2,
         label='Elastic net coefficients')
plt.plot(lasso.coef_, color='gold', linewidth=2,
         label='Lasso coefficients')
plt.plot(coef, '--', color='navy', label='original coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
          % (r2_score_lasso, r2_score_enet))
plt.show()