<a href="https://colab.research.google.com/github/farhadrahimiinfo/Machine_Learning/blob/main/Split_Your_Dataset_With_scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#One of the key aspects of supervised machine learning is model evaluation and validation.
#When you evaluate the predictive performance of your model,
# it’s essential that the process be unbiased. Using train_test_split() from the data science library scikit-learn,
# you can split your dataset into subsets that minimize the potential for bias in your evaluation and validation process.
import numpy as np


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix for the classification demo: the integers 1..24 arranged as
# 12 rows (samples) of 2 columns each.
MyData = np.arange(1, 25).reshape(-1, 2)
MyData

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12],
       [13, 14],
       [15, 16],
       [17, 18],
       [19, 20],
       [21, 22],
       [23, 24]])

In [None]:
# Binary class labels, one per row of MyData: six zeros and six ones.
MyData_sample = np.array(
    [0, 1, 1, 0, 1, 0,
     0, 1, 1, 0, 1, 0]
)
MyData_sample

array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

In [None]:
# train_test_split shuffles the samples at random and then cuts them into a
# training part and a test part. No test_size is passed here, so scikit-learn's
# default applies (25% of the samples go to the test part), and no random_state
# is passed, so the selection changes on every run.
# Data_train:        training part of the first sequence (MyData, the x values)
# Data_test:         test part of the first sequence (MyData, the x values)
# Data_sample_train: training part of the second sequence (MyData_sample, the y values)
# Data_sample_test:  test part of the second sequence (MyData_sample, the y values)
(Data_train, Data_test,
 Data_sample_train, Data_sample_test) = train_test_split(MyData, MyData_sample)


In [None]:
# Training rows of MyData picked by the unseeded split (the selection differs from run to run).
Data_train

array([[ 3,  4],
       [ 9, 10],
       [ 7,  8],
       [19, 20],
       [15, 16],
       [13, 14],
       [ 5,  6],
       [17, 18],
       [23, 24]])

In [None]:
# Test rows of MyData from the same unseeded split.
Data_test

array([[ 1,  2],
       [21, 22],
       [11, 12]])

In [None]:
# Labels from MyData_sample that belong to the training rows.
Data_sample_train

array([1, 1, 0, 0, 1, 0, 1, 1, 0])

In [None]:
# Labels from MyData_sample that belong to the test rows.
Data_sample_test

array([0, 1, 0])

In [None]:
# MyData_sample contains six zeros and six ones, but a plain random split does
# not have to preserve that balance (in the run recorded above, the three test
# labels were two zeros and a single one).
# To keep the class proportions (approximately) the same in the training and
# test parts, pass the label array through the stratify argument:
Data_train, Data_test, Data_sample_train, Data_sample_test = train_test_split(
    MyData,
    MyData_sample,
    test_size=0.33,
    random_state=4,
    stratify=MyData_sample,
)

# Data_sample_train and Data_sample_test now hold zeros and ones in the same
# ratio as the original MyData_sample array.



In [None]:
# Training rows of MyData from the stratified split (random_state=4).
Data_train

array([[21, 22],
       [ 1,  2],
       [15, 16],
       [13, 14],
       [17, 18],
       [19, 20],
       [23, 24],
       [ 3,  4]])

In [None]:
# Test rows of MyData from the stratified split.
Data_test

array([[11, 12],
       [ 7,  8],
       [ 5,  6],
       [ 9, 10]])

In [None]:
# Training labels from the stratified split.
Data_sample_train

array([1, 0, 1, 0, 1, 0, 0, 1])

In [None]:
# Test labels from the stratified split.
Data_sample_test

array([0, 0, 1, 1])

In [None]:
#Stratified splits are desirable in some cases, such as classifying an imbalanced dataset, that is, a dataset with a significant difference in the number of samples that belong to distinct classes.

#Finally, you can turn off data shuffling and random split with shuffle=False:

In [None]:
# shuffle=False switches off the random shuffling, so the rows are cut in their
# original order.
Data_train, Data_test, Data_sample_train, Data_sample_test = train_test_split(
    MyData,
    MyData_sample,
    test_size=0.33,
    shuffle=False,
)
# The first two-thirds of the samples in the original MyData and MyData_sample
# arrays are now the training set and the last third is the test set.
# No shuffling. No randomness.



In [None]:
#Supervised Machine Learning With train_test_split()
#Minimalist Example of Linear Regression
#In this example, you’ll apply what you’ve learned so far to solve a small regression problem.
#You’ll learn how to create datasets, split them into training and test subsets, and use them for linear regression.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [None]:
# Regression toy data: one feature column holding 0..19 (shape 20 x 1) and one
# target value per row.
MyData = np.arange(20).reshape(20, 1)
MyData_sample = np.array([
    5, 12, 11, 19, 30, 29, 23, 40, 51, 54,
    74, 62, 68, 73, 89, 84, 89, 101, 99, 106,
])

In [None]:
# Inspect the regression feature column (20 rows, 1 column).
MyData

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13],
       [14],
       [15],
       [16],
       [17],
       [18],
       [19]])

In [None]:
# Inspect the 20 regression target values.
MyData_sample

array([  5,  12,  11,  19,  30,  29,  23,  40,  51,  54,  74,  62,  68,
        73,  89,  84,  89, 101,  99, 106])

In [None]:
# The dataset has twenty observations (x-y pairs).
# test_size=8 is an integer, so it is an absolute count: eight observations go
# to the test set and the remaining twelve to the training set.
# random_state=0 pins the shuffle so the split is repeatable.
# NOTE: the target halves are spelled Data_Sample_* (capital S) from here on,
# unlike the Data_sample_* names used by the earlier cells.
(Data_train, Data_test,
 Data_Sample_train, Data_Sample_test) = train_test_split(
    MyData,
    MyData_sample,
    test_size=8,
    random_state=0,
)

In [None]:
# The twelve training observations of MyData (test_size=8 leaves 12 of 20).
Data_train

array([[ 4],
       [ 2],
       [ 5],
       [14],
       [ 9],
       [ 7],
       [16],
       [11],
       [ 3],
       [ 0],
       [15],
       [12]])

In [None]:
# The eight held-out test observations of MyData.
Data_test

array([[18],
       [ 1],
       [19],
       [ 8],
       [10],
       [17],
       [ 6],
       [13]])

In [None]:
# Target values paired with the training observations.
Data_Sample_train

array([30, 11, 29, 89, 54, 40, 89, 62, 19,  5, 84, 68])

In [None]:
# Target values paired with the test observations.
Data_Sample_test

array([ 99,  12, 106,  51,  74, 101,  23,  73])

In [None]:
# Fit the linear regression on the training part only; the test part stays
# unseen so it can be used for evaluation afterwards.
model = LinearRegression().fit(X=Data_train, y=Data_Sample_train)



In [None]:
#LinearRegression creates the object that represents the model, while .fit() trains, or fits, the model and returns it.
# With linear regression, fitting the model means determining the best intercept (model.intercept_) and slope (model.coef_) values of the regression line.



In [None]:
# Fitted intercept of the regression line.
model.intercept_

3.1617195496417523

In [None]:
# Fitted slope; an array with one entry because MyData has a single feature column.
model.coef_


array([5.53121801])

In [None]:
# You can check the goodness of fit on Data_train and Data_Sample_train, as this
# cell does, but that is not best practice: the model has already seen this data.
# An unbiased estimate of predictive performance comes from the test data, which
# the next cell scores for comparison.
model.score(X=Data_train, y=Data_Sample_train)


0.9868175024574795

In [None]:
# Score on the held-out test data: the unbiased estimate of predictive performance.
model.score(Data_test, Data_Sample_test)

0.9465896927715023

In [None]:
import numpy as np
import sklearn.model_selection as ms

import sklearn.datasets as dt
# Load the Iris dataset that ships with scikit-learn and separate it into the
# feature matrix X and the class-label vector Y.
IRIS = dt.load_iris()
X, Y = IRIS.data, IRIS.target


In [None]:
# 70/30 train/test split of the Iris data. No random_state is passed, so the
# rows selected change on every run; the next cell repeats the split with a
# fixed seed and overwrites these four variables.
(trX, teX,
 trY, teY) = ms.train_test_split(X, Y, train_size=0.7)

In [None]:
# The same 70/30 split, made repeatable by fixing random_state.
(trX, teX,
 trY, teY) = ms.train_test_split(
    X, Y,
    train_size=0.7,
    random_state=2,
)


In [None]:
# Sanity check: compare the shapes of the full data with its train and test parts.
# The "=" inside each f-string field prints the expression text followed by its value.
print(f'{X.shape = } -- {Y.shape = }')
print(f'{trX.shape = } -- {trY.shape = }')
print(f'{teX.shape = } -- {teY.shape = }')


X.shape = (150, 4) -- Y.shape = (150,)
trX.shape = (105, 4) -- trY.shape = (105,)
teX.shape = (45, 4) -- teY.shape = (45,)


In [None]:
# Three-way split done in two stages.
# Stage 1: 70% of the rows become the training set; the rest is held back in X2/Y2.
trX, X2, trY, Y2 = ms.train_test_split(
    X, Y,
    train_size=0.7,
    random_state=2,
)

# Stage 2: the held-back rows are halved into a validation set and a test set.
vaX, teX, vaY, teY = ms.train_test_split(
    X2, Y2,
    train_size=0.5,
    random_state=2,
)


In [None]:
# Sanity check: shapes of the full data and of the train, validation and test parts.
# The "=" inside each f-string field prints the expression text followed by its value.
print(f'{X.shape = } -- {Y.shape = }')
print(f'{trX.shape = } -- {trY.shape = }')
print(f'{vaX.shape = } -- {vaY.shape = }')
print(f'{teX.shape = } -- {teY.shape = }')


X.shape = (150, 4) -- Y.shape = (150,)
trX.shape = (105, 4) -- trY.shape = (105,)
vaX.shape = (22, 4) -- vaY.shape = (22,)
teX.shape = (23, 4) -- teY.shape = (23,)
