# Sprint 2

## Introduction to Sprint - Machine Learning Scratch

### [Problem 1] Scratch of train_test_split

In [1]:
import numpy as np
import pandas as pd
from math import ceil

In [2]:
def scratch_train_test_split(X, y, train_size=0.8):
    """Split the validation data.
    Parameters
    ----------
    X : ndarray
      Training data (n_samples, n_features)
    y : ndarray
      Correct value (n_samples,)
    train_size : float
      Specify what percentage to use as train (0 < train_size < 1)
    Returns
    -------
    X_train : ndarray
      Training data (n_samples, n_features)
    X_test : ndarray
      Validation data (n_samples, n_features)
    y_train : ndarray
      Correct value of training data (n_samples,)
    y_test : ndarray
      Correct value of verification data (n_samples,)
    """
    # TODO
    if isinstance(y, range) or isinstance(y, list):
        y = np.array(y)
    assert X.shape[0] > 0 and y.shape[0] > 0, 'At least one row required in X and y'
    assert X.shape[0] == y.shape[0], '2 array must have same n_samples'
    assert train_size > 0 and train_size < 1, 'train_size must be in [0, 1]'
    
    n_samples = X.shape[0]
    n_train = ceil(n_samples * train_size)
    
    train_idx = np.random.choice(X.shape[0], size=n_train, replace=False)
    test_idx = [i for i in range(n_samples) if i not in train_idx]
    
    X_train = X[train_idx, :]
    y_train = y[train_idx]
    X_test = X[test_idx, :]
    y_test = y[test_idx]
    return X_train, X_test, y_train, y_test

In [3]:
X, y = np.arange(20).reshape((10, 2)), np.arange(10)
print(X)
print(y)

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]
[0 1 2 3 4 5 6 7 8 9]


In [4]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.6)
print('X_train\n', X_train)
print('y_train\n', y_train)
print('X_test\n', X_test)
print('y_test\n', y_test)

X_train
 [[ 4  5]
 [18 19]
 [ 2  3]
 [16 17]
 [10 11]
 [12 13]]
y_train
 [2 9 1 8 5 6]
X_test
 [[ 0  1]
 [ 6  7]
 [ 8  9]
 [14 15]]
y_test
 [0 3 4 7]


In [5]:
print(X.shape, y.shape)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(10, 2) (10,)
(6, 2) (6,) (4, 2) (4,)


In [6]:
# Simple data set 1 creation code
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, n_samples // 2)
f1 = np.random.multivariate_normal(f1, cov, n_samples // 2)
X1 = np.concatenate([f0, f1])
y1 = np.concatenate([
    np.full(n_samples // 2, 1),
    np.full(n_samples // 2, -1)
])

In [7]:
X1_train, X1_test, y1_train, y1_test = scratch_train_test_split(X1, y1)
print(X1.shape, y1.shape)
print(X1_train.shape, y1_train.shape, X1_test.shape, y1_test.shape)

(500, 2) (500,)
(400, 2) (400,) (100, 2) (100,)


In [8]:
# Simple data set 2 creation code
X2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [9]:
X2_train, X2_test, y2_train, y2_test = scratch_train_test_split(X2, y2)
print(X2.shape, y2.shape)
print(X2_train.shape, y2_train.shape, X2_test.shape, y2_test.shape)

(40, 2) (40,)
(32, 2) (32,) (8, 2) (8,)


### [Problem 2] Creating a code to solve the classification problem

In [10]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.datasets import load_iris

In [11]:
# Classification on iris dataset
X = pd.DataFrame(data=load_iris().data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
y = pd.DataFrame(load_iris().target, columns=[('Species')])
df = pd.concat([X, y], axis=1)
df = df[df.Species != 0]
X = df.drop(columns=['Species']).values
y = df['Species'].values

In [12]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

In [13]:
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

In [14]:
X_train_scaled, X_test_scaled = scale(X_train, X_test)

In [15]:
sgd = SGDClassifier(loss='log')
sgd.fit(X_train_scaled, y_train)
sgd.score(X_test_scaled, y_test)

0.9

In [16]:
svc = SVC()
svc.fit(X_train_scaled, y_train)
svc.score(X_test_scaled, y_test)

0.9

In [17]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_scaled, y_train)
dtc.score(X_test_scaled, y_test)

0.85

In [18]:
# Classification on simple data set 1 
# (scratch_train_test_split on the dataset has already been done
# in problem 1)
X1_train_scaled, X1_test_scaled = scale(X1_train, X1_test)

In [19]:
sgd = SGDClassifier(loss='log')
sgd.fit(X1_train_scaled, y1_train)
sgd.score(X1_test_scaled, y1_test)

1.0

In [20]:
svc = SVC()
svc.fit(X1_train_scaled, y1_train)
svc.score(X1_test_scaled, y1_test)

1.0

In [21]:
dtc = DecisionTreeClassifier()
dtc.fit(X1_train_scaled, y1_train)
dtc.score(X1_test_scaled, y1_test)

1.0

In [22]:
# Classification on simple data set 2
# (scratch_train_test_split on the dataset has already been done
# in problem 1)
X2_train_scaled, X2_test_scaled = scale(X2_train, X2_test)

In [23]:
sgd = SGDClassifier(loss='log')
sgd.fit(X2_train_scaled, y2_train)
sgd.score(X2_test_scaled, y2_test)

0.75

In [24]:
svc = SVC()
svc.fit(X2_train_scaled, y2_train)
svc.score(X2_test_scaled, y2_test)

0.625

In [25]:
dtc = DecisionTreeClassifier()
dtc.fit(X2_train_scaled, y2_train)
dtc.score(X2_test_scaled, y2_test)

0.375

In [26]:
df_house = pd.read_csv('train.csv')
df_house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [27]:
X_house = df_house[['GrLivArea', 'YearBuilt']].values
y_house = df_house['SalePrice'].values

In [28]:
X_house_train, X_house_test, y_house_train, y_house_test = scratch_train_test_split(X_house, y_house, train_size=0.8)
X_house_train_scaled, X_house_test_scaled = scale(X_house_train, X_house_test)

In [29]:
from sklearn.linear_model import SGDRegressor

sgd_house = SGDRegressor()
sgd_house.fit(X_house_train_scaled, y_house_train)
# Get coefficient of determination R^2
sgd_house.score(X_house_test_scaled, y_house_test)

0.6875579107432963

### [Problem 3] Creating a code to solve the regression problem

In [30]:
from sklearn.linear_model import LinearRegression

X_house_train, X_house_test, y_house_train, y_house_test = scratch_train_test_split(X_house, y_house, train_size=0.8)
lr = LinearRegression()
lr.fit(X_house_train_scaled, y_house_train)
lr.score(X_house_test_scaled, y_house_test)

0.004352933440801898