In [14]:
# import the usual libraries
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans


from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import KFold, cross_val_score

In [9]:
df = pd.read_csv("data/data.csv",names=["A"+str(x) for x in range(1,17,1)])
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [10]:
#lets check the data types
df.dtypes

A1      object
A2      object
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14     object
A15      int64
A16     object
dtype: object

In [12]:
#lets check for null values
df.isnull().any()

A1     False
A2     False
A3     False
A4     False
A5     False
A6     False
A7     False
A8     False
A9     False
A10    False
A11    False
A12    False
A13    False
A14    False
A15    False
A16    False
dtype: bool

In [43]:
# lets encode all the worthy columns as categories
# operate on copy in case of mistakes
df2 = df.copy()
for col in df2.columns:
    if df2[col].dtype == 'object':
        df2[col] = df2[col].astype('category')
        df2[col] = df2[col].cat.codes
df2.rename(columns={'A16':'target'},inplace=True) # rename last column to target since it gound level truth
df2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,2,156,0.0,2,1,13,8,1.25,1,1,1,0,0,68,0,0
1,1,328,4.46,2,1,11,4,3.04,1,1,6,0,0,11,560,0
2,1,89,0.5,2,1,11,4,1.5,1,0,0,0,0,96,824,0
3,2,125,1.54,2,1,13,8,3.75,1,1,5,1,0,31,3,0
4,2,43,5.625,2,1,13,8,1.71,1,0,0,0,2,37,0,0


In [44]:
#lets check data types now
df2.dtypes

A1           int8
A2          int16
A3        float64
A4           int8
A5           int8
A6           int8
A7           int8
A8        float64
A9           int8
A10          int8
A11         int64
A12          int8
A13          int8
A14         int16
A15         int64
target       int8
dtype: object

In [45]:
#check the characteristics
df2.describe()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,1.66087,150.528986,4.758725,2.215942,1.466667,6.672464,5.994203,2.223406,0.523188,0.427536,2.4,0.457971,0.176812,59.392754,1017.385507,0.555072
std,0.509195,96.188946,4.978163,0.477087,0.860126,4.320266,2.594506,3.346513,0.499824,0.49508,4.86294,0.498592,0.557869,48.23167,5210.102598,0.497318
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,71.0,1.0,2.0,1.0,2.0,4.0,0.165,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0
50%,2.0,133.5,2.75,2.0,1.0,6.0,8.0,1.0,1.0,0.0,0.0,0.0,0.0,52.0,5.0,1.0
75%,2.0,226.0,7.2075,2.0,1.0,11.0,8.0,2.625,1.0,1.0,3.0,1.0,0.0,96.0,395.5,1.0
max,2.0,349.0,28.0,3.0,3.0,14.0,9.0,28.5,1.0,1.0,67.0,1.0,2.0,170.0,100000.0,1.0


In [60]:
# lets make X the feature matrix and y the target matrix
X = df2[df2.columns[:-1]]
y = df2['target']
# lets now scale the feature matrix
X = StandardScaler().fit_transform(X)
pca = PCA(0.95)
X2 = pca.fit_transform(X)
print(X2.shape,X.shape)
pca.explained_variance_ratio_
# sns.regplot(X2[:,0],X2[:,1])

(690, 13) (690, 15)


array([0.17981879, 0.12653995, 0.09355925, 0.0830736 , 0.07163962,
       0.06833095, 0.06360784, 0.06015657, 0.05730793, 0.04926579,
       0.04267368, 0.04061716, 0.03541381])