In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Train Test Split used to split our data into train and test sets
# First, we need to divide our data into features (X) and labels (y).
#  The dataframe gets divided into X_train,X_test , y_train and y_test. 
# X_train and y_train sets are used for training and fitting the model. 
# The X_test and y_test sets are used for testing the model if it's predicting the right outputs/labels. 
# we can explicitly test the size of the train and test sets. 
# It is suggested to keep our train sets larger than the test sets.

# Train set: The training dataset is a set of data that was utilized to fit the model.
#  The dataset on which the model is trained. This data is seen and learned by the model.

# Test set: The test dataset is a subset of the training dataset that is utilized to give an accurate evaluation of
#  a final model fit.

# validation set:  A validation dataset is a sample of data from your model's training set that is used to 
# estimate model performance while tuning the model's hyperparameters.

# by default, 25% of our data is test set and 75% data goes into training tests.



    Syntax: sklearn.model_selection.train_test_split()

    parameters:

        *arrays: sequence of indexables. Lists, numpy arrays, scipy-sparse matrices, and pandas dataframes are all valid inputs.
        test_size: int or float, by default None. If float, it should be between 0.0 and 1.0 and represent the percentage of the dataset to test split. If int is used, it refers to the total number of test samples. If the value is None, the complement of the train size is used. It will be set to 0.25 if train size is also None.
        train_size: int or float, by default None. 
        random_state : int,by default None. Controls how the data is shuffled before the split is implemented. For repeatable output across several function calls, pass an int.
         
        shuffle: boolean object , by default True. Whether or not the data should be shuffled before splitting. Stratify must be None if shuffle=False.
        stratify: array-like object , by default it is None. If None is selected, the data is stratified using these as class labels.


    returns: splitting: list



In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv('headbrain1.csv')

In [9]:
df.head()

Unnamed: 0,Head Size(cm^3),Brain Weight(grams)
0,4512,1530
1,3738,1297
2,4261,1335
3,3777,1282
4,4177,1590


In [21]:
X = df['Head Size(cm^3)']
y = df['Brain Weight(grams)']

In [22]:
X

0      4512
1      3738
2      4261
3      3777
4      4177
       ... 
232    3214
233    3394
234    3233
235    3352
236    3391
Name: Head Size(cm^3), Length: 237, dtype: int64

In [23]:
y

0      1530
1      1297
2      1335
3      1282
4      1590
       ... 
232    1110
233    1215
234    1104
235    1170
236    1120
Name: Brain Weight(grams), Length: 237, dtype: int64

In [24]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [25]:
x_test.shape

(60,)

In [26]:
x_train.shape

(177,)

In [27]:
y_test.shape

(60,)

In [28]:
y_train.shape

(177,)

In [29]:
y_train

114    1290
218    1142
10     1340
19     1400
132    1450
       ... 
230    1350
12     1355
68     1510
26     1490
204    1150
Name: Brain Weight(grams), Length: 177, dtype: int64

In [30]:
x_train

114    3383
218    3268
10     3443
19     4424
132    4046
       ... 
230    3685
12     3640
68     4430
26     4036
204    3067
Name: Head Size(cm^3), Length: 177, dtype: int64

In [31]:
df = pd.read_csv('Social_Network_Ads.csv')

In [32]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [33]:
df=df.iloc[:,2:]

In [34]:
df

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


In [35]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Purchased', axis=1), df['Purchased'], test_size=0.3, random_state=0)

In [36]:
X_train.shape

(280, 2)

In [37]:
X_test.shape

(120, 2)

In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# transform train and test sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [40]:
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [41]:
X_train_scaled

array([[-1.1631724 , -1.5849703 ],
       [ 2.17018137,  0.93098672],
       [ 0.0133054 ,  1.22017719],
       [ 0.20938504,  1.07558195],
       [ 0.40546467, -0.48604654],
       [-0.28081405, -0.31253226],
       [ 0.99370357, -0.8330751 ],
       [ 0.99370357,  1.8563962 ],
       [ 0.0133054 ,  1.24909623],
       [-0.86905295,  2.26126285],
       [-1.1631724 , -1.5849703 ],
       [ 2.17018137, -0.80415605],
       [-1.35925203, -1.46929411],
       [ 0.40546467,  2.2901819 ],
       [ 0.79762394,  0.75747245],
       [-0.96709276, -0.31253226],
       [ 0.11134522,  0.75747245],
       [-0.96709276,  0.55503912],
       [ 0.30742485,  0.06341534],
       [ 0.69958412, -1.26686079],
       [-0.47689368, -0.0233418 ],
       [-1.7514113 ,  0.3526058 ],
       [-0.67297331,  0.12125343],
       [ 0.40546467,  0.29476771],
       [-0.28081405,  0.06341534],
       [-0.47689368,  2.2901819 ],
       [ 0.20938504,  0.03449629],
       [ 1.28782302,  2.20342476],
       [ 0.79762394,

In [42]:


X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)



In [43]:
np.round(X_train.describe(), 1)

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,37.9,69807.1
std,10.2,34641.2
min,18.0,15000.0
25%,30.0,43000.0
50%,37.0,70500.0
75%,46.0,88000.0
max,60.0,150000.0


In [46]:
np.round(X_train_scaled.describe(), 1)

Unnamed: 0,Age,EstimatedSalary
count,280.0,280.0
mean,0.0,0.0
std,1.0,1.0
min,-1.9,-1.6
25%,-0.8,-0.8
50%,-0.1,0.0
75%,0.8,0.5
max,2.2,2.3
