**HANDLING CATEGORICAL DATA**

In [None]:
# Step 1: Import Libraries

import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Step 2: Load Data
# Read the car data CSV into a DataFrame, then echo the frame itself
# followed by the summary statistics pandas computes for it.

datasets = pd.read_csv(
    filepath_or_buffer='/content/Exercise-CarData.csv - Exercise-CarData.csv.csv',
)
print("\nData :\n", datasets)
print("\nData statistics\n", datasets.describe())


Data :
       Unnamed: 0  Price   Age     KM  ... Automatic    CC  Doors  Weight
0              0  13500  23.0  46986  ...         0  2000  three    1165
1              1  13750  23.0  72937  ...         0  2000      3    1165
2              2  13950  24.0  41711  ...         0  2000      3    1165
3              3  14950  26.0  48000  ...         0  2000      3    1165
4              4  13750  30.0  38500  ...         0  2000      3    1170
...          ...    ...   ...    ...  ...       ...   ...    ...     ...
1431        1431   7500   NaN  20544  ...         0  1300      3    1025
1432        1432  10845  72.0     ??  ...         0  1300      3    1015
1433        1433   8500   NaN  17016  ...         0  1300      3    1015
1434        1434   7250  70.0     ??  ...         0  1300      3    1015
1435        1435   6950  76.0      1  ...         0  1600      5    1114

[1436 rows x 11 columns]

Data statistics
         Unnamed: 0         Price  ...           CC      Weight
count  1

In [None]:
# Step 3: Separate input and output attributes

# Input features: every row, every column but the final one
X = datasets.iloc[:, :-1].to_numpy()

# Output target: every row of the final column alone
Y = datasets.iloc[:, -1].to_numpy()

print("\n\nInput : \n", X)
print("\n\nOutput: \n", Y)



Input : 
 [[0 13500 23.0 ... 0 2000 'three']
 [1 13750 23.0 ... 0 2000 '3']
 [2 13950 24.0 ... 0 2000 '3']
 ...
 [1433 8500 nan ... 0 1300 '3']
 [1434 7250 70.0 ... 0 1300 '3']
 [1435 6950 76.0 ... 0 1600 '5']]


Output: 
 [1165 1165 1165 ... 1015 1015 1114]


In [None]:
# Step 4a: Apply LabelEncoder on the data
#          to convert fuel type names into numeric values
#
# The encoder is applied to the FuelType column (the categorical attribute the
# rest of this notebook works on) rather than to column 0, which is the
# 'Unnamed: 0' row counter and is already numeric.

le = LabelEncoder()

# Position of FuelType inside X. X was sliced from `datasets` by position, so
# this lookup is only valid while `datasets` still has its as-loaded column
# layout, i.e. run this cell before the cells that rebind `datasets`.
fuel_col = datasets.columns.get_loc('FuelType')

# Cast to str first: missing fuel types are float NaN mixed with strings, and
# the cast turns them into the single label 'nan' so the values sort cleanly.
X[:, fuel_col] = le.fit_transform(X[:, fuel_col].astype(str))
print("\n\nInput : \n", X)



Input : 
 [[0 13500 23.0 ... 0 2000 'three']
 [1 13750 23.0 ... 0 2000 '3']
 [2 13950 24.0 ... 0 2000 '3']
 ...
 [1433 8500 nan ... 0 1300 '3']
 [1434 7250 70.0 ... 0 1300 '3']
 [1435 6950 76.0 ... 0 1600 '5']]


In [None]:
# Step 4b: Use dummy variables from the pandas library
#          to create one indicator column per fuel type

dummy = pd.get_dummies(datasets['FuelType'])
print("\n\nDummy :\n", dummy)

# Remove Price and KM, then place the indicator columns in front of the
# remaining columns. FuelType itself is kept because later cells still read it.
# NOTE: `datasets` is rebound here, so this cell is not safe to run twice
# without reloading the data first.
datasets = pd.concat(
    [dummy, datasets.drop(columns=['Price', 'KM'])],
    axis='columns',
)
print("\n\nFinal Data :\n", datasets)



Dummy :
       CNG  Diesel  Petrol
0       0       1       0
1       0       1       0
2       0       1       0
3       0       1       0
4       0       1       0
...   ...     ...     ...
1431    0       0       1
1432    0       0       1
1433    0       0       1
1434    0       0       0
1435    0       0       1

[1436 rows x 3 columns]


Final Data :
       CNG  Diesel  Petrol  Unnamed: 0  ...  Automatic    CC Doors  Weight
0       0       1       0           0  ...          0  2000     3    1165
1       0       1       0           1  ...          0  2000     3    1165
2       0       1       0           2  ...          0  2000     3    1165
3       0       1       0           3  ...          0  2000     3    1165
4       0       1       0           4  ...          0  2000     3    1170
...   ...     ...     ...         ...  ...        ...   ...   ...     ...
1431    0       0       1        1431  ...          0  1300     3    1025
1432    0       0       1        1432  ...  

In [None]:
# Use the OneHotEncoder from scikit-learn on the FuelType column
onehotencoder = OneHotEncoder()
# fit_transform expects 2-D input, so the 1-D FuelType values are reshaped
# into a single column; the result is then expanded to a dense array
x = onehotencoder.fit_transform(
    datasets['FuelType'].values.reshape(-1, 1)
).toarray()

In [None]:
# Display the dense one-hot encoded FuelType matrix produced by the encoder
x

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [35]:
# Build a DataFrame from the one-hot matrix `x` and append it to `datasets`.
# The number of Fuel_<n> columns follows the width of `x` instead of a fixed
# count, so this works however many categories the encoder found.
fuel_columns = ["Fuel_" + str(i) for i in range(x.shape[1])]
# Reuse the row labels of `datasets` so the column-wise concat pairs rows by
# label even if the frame does not carry a default 0..n-1 index.
dfOneHot = pd.DataFrame(x, columns=fuel_columns, index=datasets.index)
df = pd.concat([datasets, dfOneHot], axis=1)
# Drop the original FuelType column now that it is one-hot encoded
df = df.drop(['FuelType'], axis=1)
# Print the first rows to verify
print(df.head())

   CNG  Diesel  Petrol  Unnamed: 0  ...  Fuel_0 Fuel_1  Fuel_2  Fuel_3
0    0       1       0           0  ...     0.0    1.0     0.0     0.0
1    0       1       0           1  ...     0.0    1.0     0.0     0.0
2    0       1       0           2  ...     0.0    1.0     0.0     0.0
3    0       1       0           3  ...     0.0    1.0     0.0     0.0
4    0       1       0           4  ...     0.0    1.0     0.0     0.0

[5 rows x 15 columns]
