## Data Exploration

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
# load data
df = pd.read_csv('../drug_data.csv')
df.head()

Unnamed: 0,SampleID,Fly,Survival,Concentration,Line,Sex,Replicate,Drug,Number
0,1,0.229.Female.1.Trifluroperazine.1,96,0,229,Female,1,Trifluroperazine,1
1,2,0.229.Female.1.Trifluroperazine.10,96,0,229,Female,1,Trifluroperazine,10
2,3,0.229.Female.1.Trifluroperazine.11,96,0,229,Female,1,Trifluroperazine,11
3,4,0.229.Female.1.Trifluroperazine.12,96,0,229,Female,1,Trifluroperazine,12
4,5,0.229.Female.1.Trifluroperazine.13,96,0,229,Female,1,Trifluroperazine,13


In [4]:
df = df.drop(['SampleID','Fly'],axis=1)
df1 = df.pop('Survival') # remove column diagnosis and store it in df1
df['Survival'] = df1 # add df1 to df as a 'new' column

In [5]:
df.head()

Unnamed: 0,Concentration,Line,Sex,Replicate,Drug,Number,Survival
0,0,229,Female,1,Trifluroperazine,1,96
1,0,229,Female,1,Trifluroperazine,10,96
2,0,229,Female,1,Trifluroperazine,11,96
3,0,229,Female,1,Trifluroperazine,12,96
4,0,229,Female,1,Trifluroperazine,13,96


In [6]:
# select features and target:
df = np.array(df)

# features:
X = df[:,:-1]
l,n = X.shape
#print(l,n)

# target:
y = df[:,-1].astype(float)
# convert 1,0 to 1,-1:
#y = 2*y - 1

In [7]:
np.unique(y,return_counts=True)

(array([ 0., 16., 24., 48., 72., 96.]),
 array([  25,   25,  664,  829,  192, 1505]))

### Clean data

In [8]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False,categories='auto')

# Concentration
x0 = X[:,0]
#print(np.unique(x0))
x0[np.where(x0=='12_5')] = 12.5
#print(np.unique(x0))
x0 = x0.astype(float)
#print(np.unique(x0))

# line = X[:,1] = {229,703,900} --> 3
x1 = onehot_encoder.fit_transform(X[:,1].reshape(-1,1))

# sex = X[:,2] = {Female,Male} --> 2
#x2 = onehot_encoder.fit_transform(X[:,2].reshape(-1,1))
x2 = np.ones(l)
x2[X[:,2] == 'Female'] = -1.


# Replicate
x3 = X[:,3]
#print(np.unique(x3))
x3 = x3.astype(float)
#print(np.unique(x3))

# Drug = X[:,4] = {Trifluroperazine,Cdcl2} --> 2
#x4 = onehot_encoder.fit_transform(X[:,4].reshape(-1,1))
x4 = np.ones(l)
x4[X[:,4] == 'Trifluroperazine'] = -1.

# Number
x5 = X[:,5]
#print(np.unique(x5))
x5 = x5.astype(float)
#print(np.unique(x5))

In [9]:
# Combine every variables
Xnew = np.hstack([x0[:,np.newaxis],x1])
Xnew = np.hstack([Xnew,x2[:,np.newaxis]])
Xnew = np.hstack([Xnew,x3[:,np.newaxis]])
Xnew = np.hstack([Xnew,x4[:,np.newaxis]])
#Xnew = np.hstack([Xnew,x5[:,np.newaxis]])

Xnew.shape

(3240, 7)

In [10]:
X = Xnew

In [18]:
t1 = y==0

In [19]:
X1 = X[t1]
y1 = y[t1]

In [20]:
t2 = y==16
X2 = X[t2]
y2 = y[t2]

In [21]:
X = np.vstack([X1,X2])

In [22]:
X

array([[ 0. ,  1. ,  0. ,  0. ,  1. ,  1. , -1. ],
       [ 0. ,  0. ,  1. ,  0. , -1. ,  1. , -1. ],
       [12.5,  0. ,  1. ,  0. , -1. ,  2. , -1. ],
       [12.5,  0. ,  1. ,  0. , -1. ,  3. , -1. ],
       [12.5,  0. ,  0. ,  1. , -1. ,  2. , -1. ],
       [25. ,  1. ,  0. ,  0. , -1. ,  3. , -1. ],
       [25. ,  0. ,  0. ,  1. , -1. ,  3. , -1. ],
       [25. ,  0. ,  0. ,  1. ,  1. ,  1. , -1. ],
       [25. ,  0. ,  0. ,  1. ,  1. ,  2. , -1. ],
       [25. ,  0. ,  0. ,  1. ,  1. ,  3. , -1. ],
       [ 4. ,  1. ,  0. ,  0. ,  1. ,  1. ,  1. ],
       [ 4. ,  1. ,  0. ,  0. ,  1. ,  2. ,  1. ],
       [ 4. ,  0. ,  1. ,  0. ,  1. ,  2. ,  1. ],
       [ 4. ,  0. ,  0. ,  1. ,  1. ,  1. ,  1. ],
       [ 4. ,  0. ,  0. ,  1. ,  1. ,  1. ,  1. ],
       [ 4. ,  0. ,  0. ,  1. ,  1. ,  2. ,  1. ],
       [ 4. ,  0. ,  0. ,  1. ,  1. ,  3. ,  1. ],
       [ 5. ,  0. ,  1. ,  0. ,  1. ,  2. , -1. ],
       [ 5. ,  0. ,  0. ,  1. , -1. ,  3. , -1. ],
       [ 5. ,  0. ,  0. ,  1. ,

In [23]:
X.shape

(50, 7)

In [25]:
y = np.vstack([y1,y2])

In [26]:
y.shape

(2, 25)

In [27]:
y

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
        16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.]])

In [None]:
# convert Servival to -1 if < 72, 1 if =>72:
ynew = np.ones(l)
ynew[y < 72.] = -1.
y = ynew