## Data Scaling using MinMaxScaler

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
dataset = pd.read_csv('/content/amsPredictionSheet11-201010-101537.csv')
dataset.head()

Unnamed: 0,Attendance,MSE,HRS,ESE
0,70,10,17,42
1,92,7,20,39
2,67,3,18,32
3,82,16,13,50
4,80,9,10,44


In [3]:
dataset.describe()

Unnamed: 0,Attendance,MSE,HRS,ESE
count,73.0,73.0,73.0,73.0
mean,81.69863,9.780822,12.90411,46.630137
std,9.506847,3.047069,5.701278,6.617542
min,62.0,3.0,2.0,32.0
25%,75.0,8.0,8.0,42.0
50%,80.0,10.0,14.0,47.0
75%,90.0,11.0,18.0,50.0
max,98.0,17.0,22.0,64.0


In [4]:
scaler = MinMaxScaler()
dataset_val = dataset.values
dataset_scaled = scaler.fit_transform(dataset_val)
normalized_dataset = pd.DataFrame(dataset_scaled)
normalized_dataset.head()

Unnamed: 0,0,1,2,3
0,0.222222,0.5,0.75,0.3125
1,0.833333,0.285714,0.9,0.21875
2,0.138889,0.0,0.8,0.0
3,0.555556,0.928571,0.55,0.5625
4,0.5,0.428571,0.4,0.375


In [5]:
normalized_dataset.describe()

Unnamed: 0,0,1,2,3
count,73.0,73.0,73.0,73.0
mean,0.547184,0.484344,0.545205,0.457192
std,0.264079,0.217648,0.285064,0.206798
min,0.0,0.0,0.0,0.0
25%,0.361111,0.357143,0.3,0.3125
50%,0.5,0.5,0.6,0.46875
75%,0.777778,0.571429,0.8,0.5625
max,1.0,1.0,1.0,1.0


# Data Standardisation using StandardScaler

In [6]:
from sklearn.preprocessing import  StandardScaler
std_scaler = StandardScaler()
dataset_val_sc = dataset.values
dataset_std= std_scaler.fit_transform(dataset_val_sc)
standardized_dataset = pd.DataFrame(dataset_std)
standardized_dataset.head()

Unnamed: 0,0,1,2,3
0,-1.239064,0.072429,0.723388,-0.704518
1,1.091073,-0.918938,1.253227,-1.160996
2,-1.55681,-2.24076,0.900001,-2.226111
3,0.03192,2.055161,0.016936,0.512756
4,-0.179911,-0.258027,-0.512904,-0.4002


In [7]:
standardized_dataset.describe()

Unnamed: 0,0,1,2,3
count,73.0,73.0,73.0,73.0
mean,5.718409e-16,3.239418e-16,1.338351e-16,-5.201319e-16
std,1.00692,1.00692,1.00692,1.00692
min,-2.086386,-2.24076,-1.925809,-2.226111
25%,-0.7094874,-0.5884823,-0.8661302,-0.7045184
50%,-0.1799109,0.0724286,0.1935486,0.0562781
75%,0.879242,0.4028841,0.9000012,0.512756
max,1.726564,2.385617,1.606454,2.642986


## Converting Categorical Data to Numeric data through Replacing values

In [8]:
dataset = pd.read_csv('/content/stdcat-201010-101522 (1).csv')
dataset.head()

Unnamed: 0,RNO,State,Category,Gender
0,20150601,Maharashtra,OBC,F
1,20150602,Maharashtra,SC,M
2,20150603,Maharashtra,ST,M
3,20150604,JK,GEN,F
4,20150605,Nagaland,GEN,M


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   RNO       5 non-null      int64 
 1   State     5 non-null      object
 2   Category  5 non-null      object
 3   Gender    5 non-null      object
dtypes: int64(1), object(3)
memory usage: 288.0+ bytes


In [10]:
new_gender = {'F':0,'M':1}
dataset = dataset.replace({'Gender':new_gender})
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   RNO       5 non-null      int64 
 1   State     5 non-null      object
 2   Category  5 non-null      object
 3   Gender    5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [11]:
dataset.describe()

Unnamed: 0,RNO,Gender
count,5.0,5.0
mean,20150600.0,0.6
std,1.581139,0.547723
min,20150600.0,0.0
25%,20150600.0,0.0
50%,20150600.0,1.0
75%,20150600.0,1.0
max,20150600.0,1.0


In [12]:
dataset.head()

Unnamed: 0,RNO,State,Category,Gender
0,20150601,Maharashtra,OBC,0
1,20150602,Maharashtra,SC,1
2,20150603,Maharashtra,ST,1
3,20150604,JK,GEN,0
4,20150605,Nagaland,GEN,1


## Converting Categorical Data to Numeric data through LabelEncoder

In [13]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
dataset['State'] = lb.fit_transform(dataset['State'])

In [14]:
dataset.head()

Unnamed: 0,RNO,State,Category,Gender
0,20150601,1,OBC,0
1,20150602,1,SC,1
2,20150603,1,ST,1
3,20150604,0,GEN,0
4,20150605,2,GEN,1


## Using dummy variable

In [15]:
dataset = pd.get_dummies(dataset, columns=['Category'], prefix = ['Cat'])
dataset.head()

Unnamed: 0,RNO,State,Gender,Cat_GEN,Cat_OBC,Cat_SC,Cat_ST
0,20150601,1,0,0,1,0,0
1,20150602,1,1,0,0,1,0
2,20150603,1,1,0,0,0,1
3,20150604,0,0,1,0,0,0
4,20150605,2,1,1,0,0,0


In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   RNO      5 non-null      int64
 1   State    5 non-null      int64
 2   Gender   5 non-null      int64
 3   Cat_GEN  5 non-null      uint8
 4   Cat_OBC  5 non-null      uint8
 5   Cat_SC   5 non-null      uint8
 6   Cat_ST   5 non-null      uint8
dtypes: int64(3), uint8(4)
memory usage: 268.0 bytes


In [17]:
dataset.describe()

Unnamed: 0,RNO,State,Gender,Cat_GEN,Cat_OBC,Cat_SC,Cat_ST
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,20150600.0,1.0,0.6,0.4,0.2,0.2,0.2
std,1.581139,0.707107,0.547723,0.547723,0.447214,0.447214,0.447214
min,20150600.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20150600.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,20150600.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,20150600.0,1.0,1.0,1.0,0.0,0.0,0.0
max,20150600.0,2.0,1.0,1.0,1.0,1.0,1.0
