# Scratch

#### - Run a Python File

In [1]:
# Run "hello.py" from the notebook with the IPython %run magic

%run hello.py

Hello, World!


#### - Use "argparse"

In [2]:
# Run "hello_argparse.py" without passing any command-line arguments

%run hello_argparse.py

Hello, World!
RESULT : 102.4


In [3]:
# Run "hello_argparse.py" again, this time passing explicit command-line options

%run hello_argparse.py --display --alpha 0.3 --text "Hello, argparse!" --num_iters 5

Hello, argparse!
0.6
1.2
2.4
4.8
9.6
RESULT : 9.6


## [Task 1] Create "train_test_split" of scikit-learn from Scratch

In [4]:
# Make the project's utility modules importable from this notebook.
# The guard keeps the cell idempotent: re-running it does not append the
# same entry to sys.path a second time.

import sys

UTILS_DIR = "../scratch/utils/"

if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

In [5]:
# Confirm that the utilities directory now appears in the module search path

sys.path

['/Users/kazukiegusa/.pyenv/versions/anaconda3-5.3.0/lib/python36.zip',
 '/Users/kazukiegusa/.pyenv/versions/anaconda3-5.3.0/lib/python3.6',
 '/Users/kazukiegusa/.pyenv/versions/anaconda3-5.3.0/lib/python3.6/lib-dynload',
 '',
 '/Users/kazukiegusa/.pyenv/versions/anaconda3-5.3.0/lib/python3.6/site-packages',
 '/Users/kazukiegusa/.pyenv/versions/anaconda3-5.3.0/lib/python3.6/site-packages/aeosa',
 '/Users/kazukiegusa/.pyenv/versions/anaconda3-5.3.0/lib/python3.6/site-packages/IPython/extensions',
 '/Users/kazukiegusa/.ipython',
 '../scratch/utils/']

In [6]:
# Import "train_test_split" created from scratch

from split import train_test_split

In [7]:
# Try the scratch train_test_split on a small toy dataset:
# five samples with two features each, and five labels

import numpy as np

X = np.arange(10).reshape(5, 2)
y = range(5)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [8]:
# Inspect the training features returned by the split

X_train

array([[8, 9],
       [4, 5],
       [2, 3],
       [0, 1]])

In [9]:
# NOTE(review): the recorded output for this cell is 1-D (`array([6, 7])`),
# while X_train keeps its 2-D row layout. TODO confirm that the scratch split
# preserves the (n_samples, n_features) shape when the test set has one row.
X_test

array([6, 7])

In [10]:
y_train

array([2, 1, 3, 0])

In [11]:
y_test

array([4])

# Create Pipelines

## [Task 2] Create Classification Pipelines

<br />

I am going to use logistic regression, SVM, and a decision tree, and prepare three kinds of datasets.

#### - First Dataset

<br />

The first dataset is the iris dataset. I use only two of its target classes, versicolor and virginica, to turn this task into a binary classification problem.

In [12]:
# Load the iris dataset bundled with scikit-learn

from sklearn.datasets import load_iris

iris = load_iris()

# Display the loaded object (its 'data' array is the first field shown)
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [13]:
# Combine the iris features and the species label into one DataFrame,
# then keep only the rows whose label is not 0 (two classes remain)

import pandas as pd

X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target, columns=["Species"])

Xy = pd.concat((X, y), axis="columns")

is_not_class0 = Xy["Species"] != 0
df = Xy[is_not_class0]

df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1
55,5.7,2.8,4.5,1.3,1
56,6.3,3.3,4.7,1.6,1
57,4.9,2.4,3.3,1.0,1
58,6.6,2.9,4.6,1.3,1
59,5.2,2.7,3.9,1.4,1


In [14]:
# Save the filtered iris DataFrame as a CSV file in the working directory.
# NOTE(review): to_csv writes the row index as an extra first column by default.
# The rows are still in the original iris order (no shuffling happened above),
# so that index may separate the two species on its own. TODO confirm that the
# pipeline script ignores the index column, or pass index=False here.

df.to_csv('iris_dataset.csv')

#### - Second Dataset

In [15]:
# Create a synthetic two-class dataset and convert it to a DataFrame

np.random.seed(seed=0)
n_samples = 500
n_per_class = n_samples // 2
mean_f0 = [-1, 2]
mean_f1 = [2, -1]
cov = [[1.0, 0.8], [0.8, 1.0]]

# Draw one Gaussian cluster per class (shared covariance, different means).
# The means have their own names so the sample arrays do not overwrite them.
f0 = np.random.multivariate_normal(mean_f0, cov, n_per_class)
f1 = np.random.multivariate_normal(mean_f1, cov, n_per_class)

X = np.concatenate((f0, f1))
# Label the f0 samples +1 and the f1 samples -1.
# The builtin int replaces the np.int alias, which NumPy 1.24 removed.
y = np.concatenate((np.ones(n_per_class), -np.ones(n_per_class))).astype(int)

# Shuffle features and labels with the same permutation so they stay aligned
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

df2 = pd.concat([pd.DataFrame(X, columns=["explanatory variable 1", "explanatory variable 2"]),
                 pd.DataFrame(y, columns=["objective variable"])], axis=1)

df2

Unnamed: 0,explanatory variable 1,explanatory variable 2,objective variable
0,0.772383,-2.291673,-1
1,-0.593349,1.667883,1
2,-2.076486,0.487468,1
3,0.119227,3.625380,1
4,-3.130006,-0.156732,1
5,-1.781098,1.222249,1
6,3.997710,1.251640,-1
7,2.156045,-0.385824,-1
8,1.947416,-1.296390,-1
9,1.587574,-1.859892,-1


In [16]:
# Save the second dataset as a CSV file in the working directory
# (to_csv also writes the row index as an extra first column by default)

df2.to_csv('simple_dataset1.csv')

#### - Third Dataset

In [17]:
# Hand-written dataset: 40 samples with two features each plus a binary label,
# wrapped in a single DataFrame

X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
              [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
              [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
              [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
              [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
              [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
              [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
              [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
              [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
              [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
              [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
              [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
              [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
              [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
              [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
              [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
              [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
              [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
              [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
              [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])

y = np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1])

# Assemble the features and the label column side by side
feature_frame = pd.DataFrame(X, columns=["explanatory variable 1", "explanatory variable 2"])
label_frame = pd.DataFrame(y, columns=["objective variable"])
df3 = pd.concat((feature_frame, label_frame), axis="columns")

df3

Unnamed: 0,explanatory variable 1,explanatory variable 2,objective variable
0,-0.44699,-2.8073,0
1,-1.4621,-2.4586,0
2,0.10645,1.9242,0
3,-3.5944,-4.0112,0
4,-0.9888,4.5718,0
5,-3.1625,-3.9606,0
6,0.56421,0.72888,0
7,-0.60216,8.4636,0
8,-0.61251,-0.75345,0
9,-0.73535,-2.2718,0


In [18]:
# Save the third dataset as a CSV file in the working directory
# (to_csv also writes the row index as an extra first column by default)

df3.to_csv('simple_dataset2.csv')

#### Validate the Classification Pipeline

In [19]:
# validate the classification pipeline on the iris dataset by logistic regression

%run ../scratch/model/classification_pipeline.py --dataset "/Users/kazukiegusa/git-dic/dic/ml/sprint2/iris_dataset.csv" --model "lr"

y_pred is [2 2 2 2 2 1 2 1 1 1 1 2 1 1 2 2 2 1 2 1]
score is 1.0




In [20]:
# iris_dataset by svm

%run ../scratch/model/classification_pipeline.py --dataset "/Users/kazukiegusa/git-dic/dic/ml/sprint2/iris_dataset.csv" --model "svm"

y_pred is [2 1 1 1 1 2 2 1 2 1 1 1 1 2 2 1 1 2 1 2]
score is 1.0




In [21]:
# iris_dataset by decition tree
%run ../scratch/model/classification_pipeline.py --dataset "/Users/kazukiegusa/git-dic/dic/ml/sprint2/iris_dataset.csv" --model "dt"

y_pred is [2 2 1 1 1 1 1 2 1 1 1 2 2 2 2 1 1 1 1 2]
score is 1.0




In [22]:
# simple_dataset1 by logistic regression
%run ../scratch/model/classification_pipeline.py --dataset "/Users/kazukiegusa/git-dic/dic/ml/sprint2/simple_dataset1.csv" --model "lr"

y_pred is [ 1 -1  1  1  1 -1  1  1  1  1  1 -1  1  1 -1 -1 -1  1  1 -1  1  1  1  1
 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1  1  1  1  1 -1  1  1  1 -1  1  1
  1  1 -1 -1 -1  1 -1 -1 -1  1 -1  1  1 -1  1 -1 -1  1  1  1 -1  1 -1 -1
  1  1  1 -1 -1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1  1  1 -1 -1  1  1  1
 -1  1 -1 -1]
score is 1.0




In [23]:
# simple_dataset1 by svm
%run ../scratch/model/classification_pipeline.py --dataset "/Users/kazukiegusa/git-dic/dic/ml/sprint2/simple_dataset1.csv" --model "svm"

y_pred is [ 1 -1 -1  1 -1  1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1 -1  1  1 -1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1 -1  1
  1  1  1  1  1 -1 -1 -1  1  1 -1  1  1  1 -1 -1 -1  1 -1  1 -1 -1  1 -1
 -1 -1  1  1 -1 -1  1  1 -1  1  1  1  1 -1 -1 -1  1  1  1  1  1  1 -1  1
 -1  1 -1 -1]
score is 0.85




## [Task 3] Create Regression Pipelines

<br />

I am going to create a linear regression pipeline that can be run as a Python script.

In [24]:
# Load the House Prices training data from a CSV file in the working directory

train = pd.read_csv('"House Prices- Advanced Regression Techniques".train.csv')

# Display the loaded DataFrame
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [25]:
# Pick two explanatory columns and the target column from the training data,
# then place them side by side in one DataFrame

feature_cols = ["GrLivArea", "YearBuilt"]
target_col = "SalePrice"

X = train[feature_cols]
y = train[target_col]

df4 = pd.concat((X, y), axis="columns")

df4

Unnamed: 0,GrLivArea,YearBuilt,SalePrice
0,1710,2003,208500
1,1262,1976,181500
2,1786,2001,223500
3,1717,1915,140000
4,2198,2000,250000
5,1362,1993,143000
6,1694,2004,307000
7,2090,1973,200000
8,1774,1931,129900
9,1077,1939,118000


In [26]:
# Save the house-prices subset as a CSV file in the working directory
# (to_csv also writes the row index as an extra first column by default)

df4.to_csv('house_prices_dataset.csv')

In [27]:
# validate the regression pipeline

%run ../scratch/model/regression_pipeline.py --dataset "/Users/kazukiegusa/git-dic/dic/ml/sprint2/house_prices_dataset.csv" --model "lr"



y_pred is [141623.69394132 133207.7250299  212210.17721045 150544.83600963
 198768.05501258 152092.94557994 154824.55355046 252352.73402379
 149909.42505477 234236.64215577 278130.68257494 107712.06467465
 167929.91399996 153614.93703282 307497.81130395 263009.13418179
 137078.20200105 305869.68769358 113564.60788564 202672.97439064
 181090.3417448   81209.16248359 141025.35081086 243673.10898345
 217460.13886652 182348.99001668 197705.0409191  284797.08717921
  90308.99293884 252087.45781002 213589.00060865 126178.8274529
 159298.89948484 201536.70784231 231172.66612846 238179.33928928
 129037.33395833 160447.41700671 201495.025844   172123.38263855
 101182.53144451 198854.4537617  324217.79422785 133968.48303357
 110444.79198369 129346.1036686  331931.3895573  206035.83094588
 214532.97480233 141053.51498515 220933.77435692 211729.9602111
 134457.76505326 198073.96167309 147110.33187368 189263.67428905
 210721.75874796 242109.27032205 133955.62242985 243485.82778516
  92005.64978695 