In [None]:
import pandas as pd
# Training frame; the first CSV column is used as the DataFrame index.
input_data = pd.read_csv('train.csv', index_col=0)

# 'test.csv' is read the same way and then passed through pd.get_dummies.
# NOTE(review): these dummy columns are derived from test.csv alone — confirm
# they line up with the columns produced for the training frame.
X_submission = pd.read_csv('test.csv', index_col=0).pipe(pd.get_dummies)

### Let's see what the data looks like

In [None]:
# Show the first rows of the training frame (DataFrame.head defaults to 5 rows).
input_data.head()

#### Is there any missing data?

In [None]:
# Number of missing entries in each column.
input_data.isna().sum()

##### One problem has solved itself :)

#### How are the types distributed?

In [None]:
# Row count for each distinct value of the `type` column.
input_data['type'].value_counts()

##### The distribution is almost equal - nice

#### Let's see how it looks on a PairGrid chart

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# White grid background for the seaborn figures drawn from here on.
sns.set_style('whitegrid')

# Pairwise plots of the variables in `input_data`, colored by `type`, with KDE
# curves on the diagonal.  `height` is the current name of the per-facet size
# keyword; the old `size` spelling is deprecated (renamed in seaborn 0.9) and
# triggers a warning.
sns.pairplot(input_data, hue="type", diag_kind='kde', height=3)

### Maybe color will help?

In [None]:
# Rows per (type, color) pair, counted on the `bone_length` column.
types_col = input_data.groupby(['type', 'color'])['bone_length'].count()

# Pivot to a type x color table.  A pair that never occurs in the data gets an
# explicit 0, so every row has one value per color and the bar heights always
# match the x positions (the raw grouped Series simply omits missing pairs).
counts = types_col.unstack(fill_value=0)
types = counts.index.values
col = counts.columns.values

# One panel per type; squeeze=False keeps `axes` an array even for a single
# type, and ravel() flattens it back to the 1-D layout indexed by position.
fig, axes = plt.subplots(nrows=1, ncols=len(types),
                         figsize=(5 * len(types), 5), squeeze=False)
axes = axes.ravel()

for ax, typ in zip(axes, types):
    ax.bar(col, counts.loc[typ].values)
    ax.set_title(typ)
    ax.set_xlabel('color')
    ax.set_ylabel('count')

plt.show()

##### As you can see, the colors are evenly distributed across the types, so I am afraid this feature will be of little use to us :(

## Data Preprocessing

### Encoding class labels

In [None]:
from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()

# Encode on a copy: a plain assignment would only alias `input_data`, so the
# raw frame's `type` column would be overwritten with integer codes and a
# re-run of this cell would refit the encoder on those codes instead of the
# original labels.
le_data = input_data.copy()
le_data['type'] = class_le.fit_transform(le_data['type'])
le_data.head()

### Encoding nominal features

In [None]:
# Expand the nominal columns into indicator columns.
le_data = pd.get_dummies(le_data)

# Move `type` to the last position: pop() removes the column and hands it
# back, and assigning it again appends it after all the other columns.
typ = le_data.pop('type')
le_data['type'] = typ

le_data.head()

### Partitioning the dataset into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label.  Selecting by name instead of a
# hard-coded column count keeps this correct if the set of dummy columns
# changes.
X, y = le_data.drop(columns='type'), le_data['type']

# 70/30 split.  A fixed random_state makes the split (and everything fitted on
# it below) reproducible across kernel restarts; stratify=y keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Row count for each label value in the training split.
y_train.value_counts()

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only ...
stdsc_1 = StandardScaler()
stdsc_1.fit(X_train)

# ... then apply that same fitted scaler to both splits.
X_train_stand = stdsc_1.transform(X_train)
X_test_stand = stdsc_1.transform(X_test)

## Feature extraction with PCA

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Fit PCA (all components kept) on the standardized training split and project
# both splits with the fitted model.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_stand)
X_test_pca = pca.transform(X_test_stand)

# Keep the ratios in a variable: a bare expression in the middle of a cell is
# discarded, and both plots below need the same values anyway.
explained_ratio = pca.explained_variance_ratio_
component_ids = range(1, len(explained_ratio) + 1)

# Bars show each component's share of the variance; the step line shows the
# running total.
plt.bar(component_ids, explained_ratio, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(component_ids, np.cumsum(explained_ratio), where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()


## Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit the discriminant model on the standardized training split and its labels ...
lda = LDA()
lda.fit(X_train_stand, y_train)

# ... then project both splits with the fitted model.
X_train_lda = lda.transform(X_train_stand)
X_test_lda = lda.transform(X_test_stand)

In [None]:
# Share of variance explained by each component of the fitted LDA.
lda.explained_variance_ratio_