## Generative Models

***

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# import shap
# import statsmodels.api as sm
# import datetime
# from datetime import datetime, timedelta
# import scipy.stats
# import pandas_profiling
# from pandas_profiling import ProfileReport
# import graphviz

# import xgboost as xgb
# from xgboost import XGBClassifier, XGBRegressor
# from xgboost import to_graphviz, plot_importance

#from sklearn.experimental import enable_hist_gradient_boosting
#from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge
#from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor
#from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor


%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
# Apply the 'dark' style and the font scale in a single call: a bare
# sns.set(...) resets the style to its default ('darkgrid'), which would
# silently undo a preceding sns.set_style('dark').
sns.set(style='dark', font_scale=1.2)

plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)


#from sklearn.pipeline import Pipeline
#from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE, RFECV, SelectKBest, f_classif, f_regression, chi2

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import export_graphviz, plot_tree
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error,r2_score
from sklearn.metrics import accuracy_score
# plot_confusion_matrix / plot_precision_recall_curve / plot_roc_curve were
# removed in scikit-learn 1.2. Keep importing them where they still exist and
# fall back to the Display classes that replace them, so this cell does not
# raise ImportError on current releases.
try:
    from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.metrics import auc, f1_score, precision_score, recall_score, roc_auc_score


#from tpot import TPOTClassifier, TPOTRegressor
#from imblearn.under_sampling import RandomUnderSampler
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# import pickle
# from pickle import dump, load

# Use Folium library to plot values on a map.
#import folium

# Use Feature-Engine library

#import feature_engine.missing_data_imputers as mdi
#from feature_engine.outlier_removers import Winsorizer
#from feature_engine import categorical_encoders as ce


np.random.seed(0)

#from pycaret.classification import *
#from pycaret.clustering import *
#from pycaret.regression import *

pd.set_option('display.max_columns',100)
#pd.set_option('display.max_rows',100)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format','{:.2f}'.format)
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


In the following exercises, we are going to apply LDA and QDA to the tissue_gene_expression dataset from dslabs. We will start with simple examples based on this dataset and then develop a realistic example.

Create a dataset of samples from just cerebellum and hippocampus, two parts of the brain, and a predictor matrix with 10 randomly selected columns using the following code:

In [2]:
df = pd.read_csv("tissuex.csv")

In [3]:
df

Unnamed: 0,SAPCD1,HEMK1,PLCB1,SPI1,RAB1B,MSH4,IL18R1,OAZ2,FOXE3,C21orf62,y
0,6.70,7.34,4.99,5.99,10.49,5.35,5.36,10.23,5.96,5.84,cerebellum
1,7.30,7.28,4.81,5.82,9.94,5.55,5.66,9.84,5.97,6.07,cerebellum
2,7.00,7.24,5.03,5.73,10.14,5.21,5.48,10.06,6.21,5.77,cerebellum
3,6.69,7.21,4.91,6.01,10.25,5.65,5.35,9.91,6.05,5.85,cerebellum
4,7.08,7.53,5.04,6.01,10.33,5.39,5.28,10.03,6.01,5.97,cerebellum
...,...,...,...,...,...,...,...,...,...,...,...
64,7.00,7.17,4.98,6.70,9.94,5.02,5.57,9.75,5.89,5.87,hippocampus
65,7.09,7.45,4.88,6.53,10.30,5.08,5.75,9.97,5.91,5.96,hippocampus
66,6.98,7.32,4.84,6.35,10.02,5.42,5.53,9.70,5.95,5.68,hippocampus
67,7.15,7.32,5.02,6.47,10.04,5.45,5.69,9.72,6.16,5.67,hippocampus


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SAPCD1    69 non-null     float64
 1   HEMK1     69 non-null     float64
 2   PLCB1     69 non-null     float64
 3   SPI1      69 non-null     float64
 4   RAB1B     69 non-null     float64
 5   MSH4      69 non-null     float64
 6   IL18R1    69 non-null     float64
 7   OAZ2      69 non-null     float64
 8   FOXE3     69 non-null     float64
 9   C21orf62  69 non-null     float64
 10  y         69 non-null     object 
dtypes: float64(10), object(1)
memory usage: 6.1+ KB


## Data Preprocessing

### Replacing values

In [5]:
# Encode the tissue label as 0 (cerebellum) / 1 (anything else).
# Guarded so that re-running the cell is a no-op: once y is numeric, no row
# equals "cerebellum" any more and an unguarded re-encode would overwrite
# every label with 1.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df["y"] = (df["y"] != "cerebellum").astype("int64")

In [6]:
df

Unnamed: 0,SAPCD1,HEMK1,PLCB1,SPI1,RAB1B,MSH4,IL18R1,OAZ2,FOXE3,C21orf62,y
0,6.70,7.34,4.99,5.99,10.49,5.35,5.36,10.23,5.96,5.84,0
1,7.30,7.28,4.81,5.82,9.94,5.55,5.66,9.84,5.97,6.07,0
2,7.00,7.24,5.03,5.73,10.14,5.21,5.48,10.06,6.21,5.77,0
3,6.69,7.21,4.91,6.01,10.25,5.65,5.35,9.91,6.05,5.85,0
4,7.08,7.53,5.04,6.01,10.33,5.39,5.28,10.03,6.01,5.97,0
...,...,...,...,...,...,...,...,...,...,...,...
64,7.00,7.17,4.98,6.70,9.94,5.02,5.57,9.75,5.89,5.87,1
65,7.09,7.45,4.88,6.53,10.30,5.08,5.75,9.97,5.91,5.96,1
66,6.98,7.32,4.84,6.35,10.02,5.42,5.53,9.70,5.95,5.68,1
67,7.15,7.32,5.02,6.47,10.04,5.45,5.69,9.72,6.16,5.67,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SAPCD1    69 non-null     float64
 1   HEMK1     69 non-null     float64
 2   PLCB1     69 non-null     float64
 3   SPI1      69 non-null     float64
 4   RAB1B     69 non-null     float64
 5   MSH4      69 non-null     float64
 6   IL18R1    69 non-null     float64
 7   OAZ2      69 non-null     float64
 8   FOXE3     69 non-null     float64
 9   C21orf62  69 non-null     float64
 10  y         69 non-null     int64  
dtypes: float64(10), int64(1)
memory usage: 6.1 KB


### Features and Target

In [8]:
df.shape

(69, 11)

In [9]:
# Select the predictors and the target by column name instead of by
# hard-coded position, so the selection stays correct if columns are added
# or reordered in the CSV.
X = df.drop(columns="y")
y = df["y"]

In [10]:
X.values, y.values

(array([[ 6.70076089,  7.3369594 ,  4.992272  ,  5.99151703, 10.48583984,
          5.35277657,  5.35798734, 10.23359108,  5.95553234,  5.84093847],
        [ 7.30220205,  7.2821201 ,  4.81234552,  5.82192832,  9.94283793,
          5.55043486,  5.66160543,  9.84051386,  5.97028874,  6.06564579],
        [ 7.00103431,  7.23560069,  5.02594081,  5.72828806, 10.13948581,
          5.20621229,  5.48037712, 10.06097328,  6.2137753 ,  5.77111725],
        [ 6.68586801,  7.20634145,  4.91329496,  6.00500484, 10.25089   ,
          5.64928276,  5.34907856,  9.91384173,  6.05017281,  5.8516165 ],
        [ 7.08498411,  7.53014783,  5.03810677,  6.01043604, 10.33300897,
          5.38808167,  5.28221974, 10.02825228,  6.01078423,  5.96573157],
        [ 6.75953303,  7.38046132,  5.2555583 ,  5.986095  ,  9.93549852,
          5.36608562,  5.48854035, 10.10302818,  6.13497828,  5.76996056],
        [ 6.92463104,  7.30226394,  5.07801403,  5.87190483,  9.98483677,
          5.17413588,  5.3705891

In [11]:
model1 = LinearDiscriminantAnalysis()

In [12]:
model1.fit(X,y)

LinearDiscriminantAnalysis()

In [13]:
# Mean accuracy on the same X, y that model1 was fitted on above, i.e.
# training accuracy; no held-out data is involved.
model1.score(X,y)

0.9420289855072463

In this case, LDA fits two 10-dimensional normal distributions. Look at the fitted model: in the original R exercise this is the finalModel component of the result of train(), whose means component holds the estimated means of both distributions; in scikit-learn the same estimates are stored in the fitted model's `means_` attribute. Plot the mean vectors against each other and determine which predictors (genes) appear to be driving the algorithm.

Which TWO genes appear to be driving the algorithm (i.e. the two genes with the highest means)?

In [14]:
df.columns

Index(['SAPCD1', 'HEMK1', 'PLCB1', 'SPI1', 'RAB1B', 'MSH4', 'IL18R1', 'OAZ2', 'FOXE3', 'C21orf62', 'y'], dtype='object')

In [15]:
# Scores every column of X against y with a chi-squared test. k=10 equals the
# number of columns selected into X above, so nothing is filtered out; only
# the fitted scores_ are inspected in the next cell.
# NOTE(review): the question above asks about the class means estimated by
# LDA (model1.means_), which this chi-squared ranking does not examine;
# confirm this substitution is intended.
select_feature = SelectKBest(chi2, k=10).fit(X,y)

In [16]:
select_feature.scores_

array([0.00395592, 0.0180232 , 0.00092091, 0.53249138, 0.00073612,
       0.01988109, 0.0434535 , 0.21129651, 0.00444739, 0.10585621])

Repeat the exercise in Q1 with QDA.

In [17]:
model2 = QuadraticDiscriminantAnalysis()

In [18]:
model2.fit(X,y)

QuadraticDiscriminantAnalysis()

In [19]:
# Mean accuracy on the same X, y that model2 was fitted on above, i.e.
# training accuracy; no held-out data is involved.
model2.score(X,y)

0.9710144927536232

One thing we saw in the previous plots is that the values of the predictors correlate in both groups: some predictors are low in both groups and others are high in both groups. The mean value of each predictor, found in colMeans(x), is not informative or useful for prediction, and for purposes of interpretation it is often useful to center or scale each column. In the original R exercise this is achieved with the preProcess argument in train(). Re-run LDA with preProcess = "center". Note that accuracy does not change, but it is now easier to identify the predictors that differ more between groups than it was from the plot made in Q2.

Which TWO genes drive the algorithm after performing the scaling?

Now we are going to increase the complexity of the challenge slightly. Repeat the LDA analysis from Q5 but using all tissue types. Use the following code to create your dataset:

In [20]:
df2 = pd.read_csv("tissue.csv")

In [21]:
df2.head()

Unnamed: 0,x.MAML1,x.LHPP,x.SEPT10,x.B3GNT4,x.ZNF280D,x.SOX12,x.C21orf62,x.PER3,x.HOXA10,x.HOXC5,x.BLVRB,x.ZIM2,x.HEMK1,x.FAP,x.MAN1A1,x.CDA,x.HTR7P1,x.DALRD3,x.FIBP,x.TTTY15,x.SLC30A1,x.SHANK2,x.MSL2,x.UBOX5,x.DUSP13,x.GJB5,x.MTF2,x.PPP1CA,x.IGHMBP2,x.VEGFA,x.KANSL1L,x.FCN3,x.USP32P2,x.HIVEP3,x.HRH1,x.HDAC7,x.HTT,x.IDH3A,x.TLR3,x.F11R,x.MOAP1,x.ISOC2,x.CLIP3,x.FZD10,x.VOPP1,x.RPL4,x.NUDT2,x.RAB30,x.DBI,x.CCDC87,...,x.ZNHIT3,x.DBF4B,x.GLUD1,x.ADRB2,x.FBXW4,x.SPP1,x.F2R,x.SDF4,x.GALNT8,x.TMEM63A,x.BSCL2,x.GZMM,x.THSD4,x.PRLR,x.PLEKHJ1,x.TLE3,x.PANK2,x.SKP1,x.TOMM70,x.KIAA1324,x.PCDHB12,x.CPA4,x.TRPV2,x.CHCHD2,x.TRPC6,x.MYO1D,x.SLURP1,x.ALG9,x.ZC4H2,x.KIR2DL3,x.IVL,x.BAMBI,x.SLC7A6,x.SLC17A1,x.CCL3,x.PHF8,x.KIR3DL3,x.SARS2,x.PIP4K2C,x.S100A13,x.EPHA1,x.MFGE8,x.OAZ2,x.PCBP3,x.POLA1,x.KREMEN2,x.CYP7B1,x.LILRB3,x.GSAP,y
0,9.83,8.33,5.5,8.69,5.64,6.25,5.84,8.33,5.52,7.66,7.67,8.36,7.34,6.26,5.33,6.19,5.12,6.34,9.63,5.71,7.82,8.06,7.73,8.89,7.2,6.63,9.62,8.66,7.11,8.74,5.46,6.63,9.96,7.93,5.91,8.1,8.39,7.92,5.22,7.7,10.56,8.03,11.72,5.97,9.4,11.96,7.27,6.87,10.73,6.75,...,9.0,5.77,8.48,7.04,8.61,11.07,4.63,10.22,6.43,6.76,9.95,6.44,5.92,4.62,8.45,5.82,8.72,11.46,7.41,6.17,7.33,8.57,6.93,11.7,4.96,5.09,6.52,6.29,8.69,7.27,7.26,6.49,5.59,6.96,6.21,7.37,7.27,7.41,8.99,8.64,7.02,8.44,10.23,8.46,7.35,8.39,4.76,8.23,6.74,cerebellum
1,9.63,8.54,5.64,8.83,5.69,6.29,6.07,8.26,5.53,7.57,7.78,8.65,7.28,6.38,5.27,6.2,5.15,6.29,9.94,6.41,8.43,8.34,8.1,8.98,7.14,6.46,9.57,9.05,7.18,9.1,5.41,6.47,10.06,7.93,6.15,8.2,8.18,7.78,5.26,7.78,10.55,8.21,12.05,6.09,9.67,11.75,7.13,6.9,11.55,6.87,...,8.74,5.94,9.03,7.23,8.6,7.01,4.7,10.22,6.51,6.75,10.14,6.63,6.34,4.58,8.64,6.04,8.76,11.3,6.83,6.09,7.35,8.57,6.99,11.85,5.06,5.08,6.77,6.46,8.68,7.3,7.36,6.92,5.6,6.86,7.12,7.35,7.43,7.68,9.21,9.18,7.1,8.63,9.84,8.78,7.48,8.26,4.95,8.33,6.39,cerebellum
2,9.69,8.48,5.72,8.5,5.96,6.22,5.77,9.1,5.6,7.42,7.55,8.85,7.24,6.12,5.51,5.99,4.98,6.27,9.81,6.53,8.27,8.58,7.72,8.64,7.08,6.47,9.67,8.64,7.18,8.12,5.54,6.59,10.65,7.89,5.83,8.05,8.14,8.87,5.19,7.4,11.05,7.8,11.79,5.88,9.9,11.82,7.09,6.83,11.43,6.87,...,9.19,5.73,8.67,6.97,8.67,8.66,4.56,10.18,6.33,6.84,10.24,6.53,6.01,4.64,8.59,5.93,8.76,11.61,7.55,5.98,7.08,8.42,6.84,11.68,4.96,5.12,6.44,6.36,8.53,7.14,7.1,7.0,5.54,6.85,5.94,7.3,7.48,7.33,9.27,8.88,6.74,8.19,10.06,8.52,7.3,8.28,4.8,7.98,6.68,cerebellum
3,9.99,8.51,5.79,8.42,5.74,6.51,5.85,9.21,5.56,7.33,7.84,8.2,7.21,6.28,5.33,5.82,4.86,6.64,9.75,6.44,8.62,8.59,7.92,8.69,6.99,6.37,9.48,8.81,7.24,8.82,5.47,6.65,10.29,7.72,5.79,8.36,8.22,8.01,5.28,7.42,10.71,7.98,12.0,6.01,9.85,11.88,7.18,6.58,12.04,6.65,...,9.06,5.62,8.94,6.85,8.89,8.1,4.56,10.43,6.36,6.56,10.27,6.34,6.0,4.67,8.4,5.71,8.8,11.44,7.11,5.84,7.17,8.18,6.72,11.72,4.93,5.15,6.28,6.54,8.25,7.06,7.06,6.8,5.48,6.74,6.04,7.38,7.18,7.31,9.02,8.71,6.85,8.28,9.91,8.21,7.45,8.28,4.96,7.89,6.82,cerebellum
4,9.58,8.37,5.78,8.82,5.63,6.2,5.97,8.5,5.6,7.54,7.6,8.58,7.53,6.34,5.33,6.02,5.11,6.12,9.58,5.79,8.3,8.61,7.55,8.83,7.1,6.26,9.52,8.61,7.17,9.14,5.4,6.73,9.79,7.86,5.91,7.8,8.34,7.99,5.23,7.25,10.72,7.96,11.78,5.73,9.33,11.93,7.2,6.77,11.44,6.7,...,8.91,5.44,8.84,6.79,8.64,9.43,4.7,10.09,6.35,6.83,10.11,6.68,6.25,4.57,8.66,5.81,8.7,11.3,7.53,5.99,7.26,8.8,6.65,11.46,5.07,5.14,6.59,6.32,8.72,7.37,7.47,6.54,5.53,6.92,6.1,7.24,7.25,7.56,9.23,8.85,6.84,8.04,10.03,8.76,7.25,8.47,4.87,8.13,7.0,cerebellum


In [22]:
df2.shape

(189, 501)

In [23]:
df2.y.value_counts()

kidney         39
cerebellum     38
colon          34
hippocampus    31
liver          26
endometrium    15
placenta        6
Name: y, dtype: int64

In [24]:
le = LabelEncoder()

In [25]:
# Integer-encode the tissue names in place. Guarded so that re-running the
# cell does not re-fit the encoder on the already-encoded integers, which
# would replace the tissue names held in le.classes_ with the integer codes.
if not pd.api.types.is_numeric_dtype(df2["y"]):
    df2["y"] = le.fit_transform(df2["y"])

In [26]:
df2.head()

Unnamed: 0,x.MAML1,x.LHPP,x.SEPT10,x.B3GNT4,x.ZNF280D,x.SOX12,x.C21orf62,x.PER3,x.HOXA10,x.HOXC5,x.BLVRB,x.ZIM2,x.HEMK1,x.FAP,x.MAN1A1,x.CDA,x.HTR7P1,x.DALRD3,x.FIBP,x.TTTY15,x.SLC30A1,x.SHANK2,x.MSL2,x.UBOX5,x.DUSP13,x.GJB5,x.MTF2,x.PPP1CA,x.IGHMBP2,x.VEGFA,x.KANSL1L,x.FCN3,x.USP32P2,x.HIVEP3,x.HRH1,x.HDAC7,x.HTT,x.IDH3A,x.TLR3,x.F11R,x.MOAP1,x.ISOC2,x.CLIP3,x.FZD10,x.VOPP1,x.RPL4,x.NUDT2,x.RAB30,x.DBI,x.CCDC87,...,x.ZNHIT3,x.DBF4B,x.GLUD1,x.ADRB2,x.FBXW4,x.SPP1,x.F2R,x.SDF4,x.GALNT8,x.TMEM63A,x.BSCL2,x.GZMM,x.THSD4,x.PRLR,x.PLEKHJ1,x.TLE3,x.PANK2,x.SKP1,x.TOMM70,x.KIAA1324,x.PCDHB12,x.CPA4,x.TRPV2,x.CHCHD2,x.TRPC6,x.MYO1D,x.SLURP1,x.ALG9,x.ZC4H2,x.KIR2DL3,x.IVL,x.BAMBI,x.SLC7A6,x.SLC17A1,x.CCL3,x.PHF8,x.KIR3DL3,x.SARS2,x.PIP4K2C,x.S100A13,x.EPHA1,x.MFGE8,x.OAZ2,x.PCBP3,x.POLA1,x.KREMEN2,x.CYP7B1,x.LILRB3,x.GSAP,y
0,9.83,8.33,5.5,8.69,5.64,6.25,5.84,8.33,5.52,7.66,7.67,8.36,7.34,6.26,5.33,6.19,5.12,6.34,9.63,5.71,7.82,8.06,7.73,8.89,7.2,6.63,9.62,8.66,7.11,8.74,5.46,6.63,9.96,7.93,5.91,8.1,8.39,7.92,5.22,7.7,10.56,8.03,11.72,5.97,9.4,11.96,7.27,6.87,10.73,6.75,...,9.0,5.77,8.48,7.04,8.61,11.07,4.63,10.22,6.43,6.76,9.95,6.44,5.92,4.62,8.45,5.82,8.72,11.46,7.41,6.17,7.33,8.57,6.93,11.7,4.96,5.09,6.52,6.29,8.69,7.27,7.26,6.49,5.59,6.96,6.21,7.37,7.27,7.41,8.99,8.64,7.02,8.44,10.23,8.46,7.35,8.39,4.76,8.23,6.74,0
1,9.63,8.54,5.64,8.83,5.69,6.29,6.07,8.26,5.53,7.57,7.78,8.65,7.28,6.38,5.27,6.2,5.15,6.29,9.94,6.41,8.43,8.34,8.1,8.98,7.14,6.46,9.57,9.05,7.18,9.1,5.41,6.47,10.06,7.93,6.15,8.2,8.18,7.78,5.26,7.78,10.55,8.21,12.05,6.09,9.67,11.75,7.13,6.9,11.55,6.87,...,8.74,5.94,9.03,7.23,8.6,7.01,4.7,10.22,6.51,6.75,10.14,6.63,6.34,4.58,8.64,6.04,8.76,11.3,6.83,6.09,7.35,8.57,6.99,11.85,5.06,5.08,6.77,6.46,8.68,7.3,7.36,6.92,5.6,6.86,7.12,7.35,7.43,7.68,9.21,9.18,7.1,8.63,9.84,8.78,7.48,8.26,4.95,8.33,6.39,0
2,9.69,8.48,5.72,8.5,5.96,6.22,5.77,9.1,5.6,7.42,7.55,8.85,7.24,6.12,5.51,5.99,4.98,6.27,9.81,6.53,8.27,8.58,7.72,8.64,7.08,6.47,9.67,8.64,7.18,8.12,5.54,6.59,10.65,7.89,5.83,8.05,8.14,8.87,5.19,7.4,11.05,7.8,11.79,5.88,9.9,11.82,7.09,6.83,11.43,6.87,...,9.19,5.73,8.67,6.97,8.67,8.66,4.56,10.18,6.33,6.84,10.24,6.53,6.01,4.64,8.59,5.93,8.76,11.61,7.55,5.98,7.08,8.42,6.84,11.68,4.96,5.12,6.44,6.36,8.53,7.14,7.1,7.0,5.54,6.85,5.94,7.3,7.48,7.33,9.27,8.88,6.74,8.19,10.06,8.52,7.3,8.28,4.8,7.98,6.68,0
3,9.99,8.51,5.79,8.42,5.74,6.51,5.85,9.21,5.56,7.33,7.84,8.2,7.21,6.28,5.33,5.82,4.86,6.64,9.75,6.44,8.62,8.59,7.92,8.69,6.99,6.37,9.48,8.81,7.24,8.82,5.47,6.65,10.29,7.72,5.79,8.36,8.22,8.01,5.28,7.42,10.71,7.98,12.0,6.01,9.85,11.88,7.18,6.58,12.04,6.65,...,9.06,5.62,8.94,6.85,8.89,8.1,4.56,10.43,6.36,6.56,10.27,6.34,6.0,4.67,8.4,5.71,8.8,11.44,7.11,5.84,7.17,8.18,6.72,11.72,4.93,5.15,6.28,6.54,8.25,7.06,7.06,6.8,5.48,6.74,6.04,7.38,7.18,7.31,9.02,8.71,6.85,8.28,9.91,8.21,7.45,8.28,4.96,7.89,6.82,0
4,9.58,8.37,5.78,8.82,5.63,6.2,5.97,8.5,5.6,7.54,7.6,8.58,7.53,6.34,5.33,6.02,5.11,6.12,9.58,5.79,8.3,8.61,7.55,8.83,7.1,6.26,9.52,8.61,7.17,9.14,5.4,6.73,9.79,7.86,5.91,7.8,8.34,7.99,5.23,7.25,10.72,7.96,11.78,5.73,9.33,11.93,7.2,6.77,11.44,6.7,...,8.91,5.44,8.84,6.79,8.64,9.43,4.7,10.09,6.35,6.83,10.11,6.68,6.25,4.57,8.66,5.81,8.7,11.3,7.53,5.99,7.26,8.8,6.65,11.46,5.07,5.14,6.59,6.32,8.72,7.37,7.47,6.54,5.53,6.92,6.1,7.24,7.25,7.56,9.23,8.85,6.84,8.04,10.03,8.76,7.25,8.47,4.87,8.13,7.0,0


In [27]:
df2.y.value_counts()

4    39
0    38
1    34
3    31
5    26
2    15
6     6
Name: y, dtype: int64

In [28]:
# Select the predictors and the target by column name instead of by
# hard-coded position (0:500 / 500), so the selection does not depend on the
# exact number or order of gene columns in the CSV.
X = df2.drop(columns="y")
y = df2["y"]

In [29]:
X.values, y.values

(array([[ 9.82567961,  8.3271627 ,  5.4993819 , ...,  4.76001706,
          8.23018205,  6.74038468],
        [ 9.63124675,  8.54282676,  5.64429229, ...,  4.95237261,
          8.33250157,  6.39068256],
        [ 9.69054774,  8.47648577,  5.71718694, ...,  4.79623236,
          7.97819429,  6.67703781],
        ...,
        [ 9.3618059 ,  8.26365314, 10.29656824, ...,  4.88436628,
          8.1768516 ,  5.61143535],
        [ 9.3460355 ,  8.71853424,  9.66081321, ...,  4.9512614 ,
          8.63041198,  5.79046804],
        [ 9.44789324,  8.25538598,  8.46580862, ...,  4.99628241,
          8.70066461,  5.17747408]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   

In [30]:
model3 = LinearDiscriminantAnalysis()

In [31]:
model3.fit(X,y)

LinearDiscriminantAnalysis()

In [32]:
# Mean accuracy on the same X, y that model3 was fitted on above, i.e.
# training accuracy; no held-out data is involved. With 500 predictors and
# 189 rows (see df2.shape), a perfect score here is not evidence of how the
# model generalises.
model3.score(X,y)

1.0

#### Python code done by Dennis Lam