# [Problem 1]

In [1]:
import numpy as np
import pandas as pd

def scratch_train_test_split(X, y, train_size=0.8, shuffle = True):
    """Divide the validation data.
    Parameters
    ----------
    X : ndarray
      Training data (n_samples, n_features)
    y : ndarray
      Correct answer value (n_samples,)
    train_size : float
      Specify what percentage to use as a train (0 < train_size < 1)
    Returns
    -------
    X_train : ndarray
      Training data (n_samples, n_features)
    X_test : ndarray
      Validation data (n_samples, n_features)
    y_train : ndarray
      Correct answer value of training data (n_samples,)
    y_test : ndarray
      Correct value of verification data (n_samples,)
    """    
    train_samples_size = int(round(train_size * X.shape[0]))
    X_train = X[:train_samples_size,:]
    X_test = X[train_samples_size:,:]
    y_train = y[:train_samples_size]
    y_test = y[train_samples_size:]
    if shuffle:
        idx_c = np.random.choice(range(X.shape[0]), size=(train_samples_size,),replace=False)
        index = np.zeros(X.shape[0],dtype=bool)
        index[idx_c] = True
        rest = ~index
        X_train = X[index,:]
        X_test = X[rest,:]
        y_train = y[index]
        y_test = y[rest]
    return X_train, X_test, y_train, y_test

**Testing behavior of the self-made splitting method**

In [2]:
X, y = np.arange(100).reshape(50,2), np.arange(50)
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y,train_size=0.2)

In [3]:
print('X_train shape: {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_train shape: {}'.format(y_train.shape))
print('y_test shape: {}'.format(y_test.shape))

X_train shape: (10, 2)
X_test shape: (40, 2)
y_train shape: (10,)
y_test shape: (40,)


In [4]:
print('X_train:\n {}'.format(X_train))

X_train:
 [[ 6  7]
 [20 21]
 [22 23]
 [30 31]
 [48 49]
 [50 51]
 [54 55]
 [68 69]
 [78 79]
 [94 95]]


In [5]:
print('y_train: {}'.format(y_train))

y_train: [ 3 10 11 15 24 25 27 34 39 47]


In [6]:
print('X_test:\n {}'.format(X_test))

X_test:
 [[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]
 [24 25]
 [26 27]
 [28 29]
 [32 33]
 [34 35]
 [36 37]
 [38 39]
 [40 41]
 [42 43]
 [44 45]
 [46 47]
 [52 53]
 [56 57]
 [58 59]
 [60 61]
 [62 63]
 [64 65]
 [66 67]
 [70 71]
 [72 73]
 [74 75]
 [76 77]
 [80 81]
 [82 83]
 [84 85]
 [86 87]
 [88 89]
 [90 91]
 [92 93]
 [96 97]
 [98 99]]


In [7]:
print('y_test:\n {}'.format(y_test))

y_test:
 [ 0  1  2  4  5  6  7  8  9 12 13 14 16 17 18 19 20 21 22 23 26 28 29 30
 31 32 33 35 36 37 38 40 41 42 43 44 45 46 48 49]


**Iris dataset**

In [44]:
from sklearn.datasets import load_iris

In [45]:
iris = load_iris()

In [46]:
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

In [47]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [73]:
df = df[df['target'] != 2.0]
X_iris = np.array(df.iloc[:,0:4])
y_iris = np.array(df['target'])

In [74]:
X_iris_train, X_iris_test, y_iris_train, y_iris_test = scratch_train_test_split(X_iris,y_iris)

In [75]:
print('X_train shape: {}'.format(X_iris_train.shape))
print('X_test shape: {}'.format(X_iris_test.shape))
print('y_train shape: {}'.format(y_iris_train.shape))
print('y_test shape: {}'.format(y_iris_test.shape))

X_train shape: (80, 4)
X_test shape: (20, 4)
y_train shape: (80,)
y_test shape: (20,)


**Simple Dataset 1**

In [77]:
import numpy as np
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, n_samples // 2)
f1 = np.random.multivariate_normal(f1, cov, n_samples // 2)
X_sample_1 = np.concatenate([f0, f1])
y_sample_1 = np.concatenate([
    np.full(n_samples // 2, 1),
    np.full(n_samples // 2, -1)
])

In [78]:
X_train_sample_1, X_test_sample_1, y_train_sample_1, y_test_sample_1 = scratch_train_test_split(X_sample_1,y_sample_1)

In [63]:
print('X_train shape: {}'.format(X_train_sample_1.shape))
print('X_test shape: {}'.format(X_test_sample_1.shape))
print('y_train shape: {}'.format(y_train_sample_1.shape))
print('y_test shape: {}'.format(y_test_sample_1.shape))

X_train shape: (400, 2)
X_test shape: (100, 2)
y_train shape: (400,)
y_test shape: (100,)


**Sample Dataset 2**

In [79]:
X_sample_2 = np.array([
    [-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
    [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
    [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
    [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
    [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
    [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
    [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
    [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
    [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
    [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
    [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
    [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
    [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
    [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
    [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
    [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
    [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
    [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
    [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
    [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ],
])
y_sample_2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [65]:
X_train_sample_2, X_test_sample_2, y_train_sample_2, y_test_sample_2 = scratch_train_test_split(X,y)

In [66]:
print('X_train shape: {}'.format(X_train_sample_2.shape))
print('X_test shape: {}'.format(X_test_sample_2.shape))
print('y_train shape: {}'.format(y_train_sample_2.shape))
print('y_test shape: {}'.format(y_test_sample_2.shape))

X_train shape: (32, 2)
X_test shape: (8, 2)
y_train shape: (32,)
y_test shape: (8,)


# [Problem 2]

In [109]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [110]:
def classifier_and_estimator(model,X,y):
    """
        This function is used to perform easy training and estimating process
        ----
        Input:
            model: classifier model
            X: dataset
            y: labels set
        Output:
            accury score, precision score, recall score,f1 score
    """
    # Splitting the dataset
    X_train, X_test, y_train, y_test = scratch_train_test_split(X,y,train_size=0.75)
    
    clf = make_pipeline(StandardScaler(),model)
    clf.fit(X_train,y_train)
    
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    return accuracy, precision, recall, f1

In [111]:
def classification_solution(X,y):
    accuracy_list = []
    f1_list = []
    recall_list = []
    precision_list = []
    method_name = ['SGD Classifier','SVM','Decision Tree Classifier']
    models = [SGDClassifier(loss='log'),SVC(),DecisionTreeClassifier()]
    for model in models:
        accuracy, precision, recall, f1 = classifier_and_estimator(model, X, y)
        accuracy_list.append(accuracy)
        f1_list.append(f1)
        precision_list.append(precision)
        recall_list.append(recall)
    result = pd.DataFrame({
        'Accuracy' : accuracy_list,
        'Precision': precision_list,
        'Recall': recall_list,
        'F1': f1_list
    },index=method_name)
    print(result)

In [114]:
print('Iris Dataset solution')
classification_solution(X_iris,y_iris)

Iris Dataset solution
                          Accuracy  Precision  Recall   F1
SGD Classifier                 1.0        1.0     1.0  1.0
SVM                            1.0        1.0     1.0  1.0
Decision Tree Classifier       1.0        1.0     1.0  1.0


In [115]:
print('Sample Dataset 1 solution')
classification_solution(X_sample_1,y_sample_1)

Sample Dataset 1 solution
                          Accuracy  Precision  Recall   F1
SGD Classifier                 1.0        1.0     1.0  1.0
SVM                            1.0        1.0     1.0  1.0
Decision Tree Classifier       1.0        1.0     1.0  1.0


In [116]:
print('Iris Dataset solution')
classification_solution(X_sample_2,y_sample_2)

Iris Dataset solution
                          Accuracy  Precision    Recall        F1
SGD Classifier                 0.5   0.333333  0.666667  0.444444
SVM                            0.2   0.166667  0.250000  0.200000
Decision Tree Classifier       0.8   0.833333  0.833333  0.833333


## Regression Problem

In [123]:
from sklearn.linear_model import SGDRegressor

In [124]:
regression_data = pd.read_csv('train.csv')

In [125]:
regression_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [137]:
reg = make_pipeline(StandardScaler(), SGDRegressor() )
X = np.array(regression_data[['GrLivArea','YearBuilt']])
y = np.array(regression_data['SalePrice'])
X_train, X_test, y_train, y_test = scratch_train_test_split(X,y,train_size=0.75)

In [140]:
from sklearn.metrics import mean_squared_error
reg.fit(X_train,y_train)
pred = reg.predict(X_test)
mse = mean_squared_error(y_test,pred)
print('Mean square error for SDG Regression: {}'.format(mse))

Mean square error for SDG Regression: 1673432987.2005284


# [Problem 3]

In [141]:
from sklearn.linear_model import LinearRegression
def linear_train_estimate(X,y):
    X_train, X_test, y_train, y_test = scratch_train_test_split(X,y,train_size=0.75)
    reg = make_pipeline(StandardScaler(), LinearRegression() )
    reg.fit(X_train,y_train)
    pred = reg.predict(X_test)
    mse = mean_squared_error(y_test,pred)
    print('Mean square error for Linear Regression: {}'.format(mse)) 

In [146]:
linear_train_estimate(X,y)

Mean square error for Linear Regression: 2395562570.0367484
