In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The bare "seaborn" style name was deprecated in matplotlib 3.6 and removed in
# later releases in favour of "seaborn-v0_8". Prefer the new name when this
# matplotlib ships it and fall back to the legacy name otherwise.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [2]:
# load the raw iris data; the first row of the file supplies the column names
df = pd.read_csv(filepath_or_buffer="iris-dataset.csv", header=0)
# preview the first five rows
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# use the "Id" column as the row index (it is removed from the regular columns)
df = df.set_index(keys="Id", drop=True)

In [4]:
# show the data type pandas stores for every column of the frame
df.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [5]:
# collect every column name of the frame as a plain Python list
features = df.columns.tolist()
features

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

In [6]:
# A feature counts as categorical when it holds text: either the generic
# "object" dtype or the dedicated pandas string dtype. Comparing the dtype
# against "object" alone would label string-dtype columns as numeric, and the
# min-max scaling applied to numeric features would then fail on them.
# The check is evaluated once per feature and reused for both lists.
is_text_feature = {
    x: pd.api.types.is_object_dtype(df[x]) or pd.api.types.is_string_dtype(df[x])
    for x in features
}

# get numeric features
numeric_features = [x for x in features if not is_text_feature[x]]

# get categorical features
categorical_features = [x for x in features if is_text_feature[x]]

print(numeric_features, categorical_features)

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'] ['Species']


In [7]:
# number of missing entries in each numeric feature
df.loc[:, numeric_features].isna().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
dtype: int64

In [8]:
# number of missing entries in each categorical feature
df.loc[:, categorical_features].isna().sum()

Species    0
dtype: int64

In [9]:
# let us perform normalization for numeric features
# normalization scales all values to a range [0, 1]:
#     scaled = (value - min) / (max - min)
# The arithmetic is done on whole columns instead of element by element, and
# Series.min()/max() skip missing values (the builtin min()/max() give
# order-dependent results when NaN is present).
for x in numeric_features:
    min_val = df[x].min()
    value_range = df[x].max() - min_val
    centered = df[x] - min_val
    if value_range == 0:
        # constant column: every value equals the minimum, so dividing by the
        # zero range is undefined; map the column to 0.0 (missing values stay
        # missing) instead of failing
        df[x] = centered.astype("float64")
    else:
        df[x] = centered / value_range

In [10]:
# preview the first five rows of the frame
df.head(n=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.222222,0.625,0.067797,0.041667,Iris-setosa
2,0.166667,0.416667,0.067797,0.041667,Iris-setosa
3,0.111111,0.5,0.050847,0.041667,Iris-setosa
4,0.083333,0.458333,0.084746,0.041667,Iris-setosa
5,0.194444,0.666667,0.067797,0.041667,Iris-setosa


In [11]:
# list the distinct labels found in each categorical feature
for x in categorical_features:
    labels = df[x].unique()
    print(labels)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [12]:
# encode string labels to numeric values and record the label->code mapping
for x in categorical_features:
    # convert the feature to categorical
    df[x] = df[x].astype("category")
    # set new feature name
    new_feature = x + "Code"
    # encode the labels to numeric values
    df[new_feature] = df[x].cat.codes

    # create mapping: one "label->code" entry per distinct pair, kept in order
    # of first appearance. drop_duplicates avoids walking every row in Python
    # with a repeated list-membership test, and formatting the label (rather
    # than concatenating it) does not fail on a missing label.
    unique_pairs = df[[x, new_feature]].drop_duplicates()
    mappings = [
        f"{label}->{code}"
        for label, code in zip(unique_pairs[x], unique_pairs[new_feature])
    ]

    # save the mapping, one entry per line; the context manager closes the
    # file even if a write fails
    with open(x+"_"+new_feature+"_Mapping.txt", mode="w") as mapping_file:
        mapping_file.writelines(m + "\n" for m in mappings)

In [13]:
# preview the first five rows of the frame
df.head(n=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,SpeciesCode
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.222222,0.625,0.067797,0.041667,Iris-setosa,0
2,0.166667,0.416667,0.067797,0.041667,Iris-setosa,0
3,0.111111,0.5,0.050847,0.041667,Iris-setosa,0
4,0.083333,0.458333,0.084746,0.041667,Iris-setosa,0
5,0.194444,0.666667,0.067797,0.041667,Iris-setosa,0


In [14]:
# drop every categorical feature column from the frame in a single call
df = df.drop(columns=categorical_features)

df.head(n=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,SpeciesCode
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.222222,0.625,0.067797,0.041667,0
2,0.166667,0.416667,0.067797,0.041667,0
3,0.111111,0.5,0.050847,0.041667,0
4,0.083333,0.458333,0.084746,0.041667,0
5,0.194444,0.666667,0.067797,0.041667,0


In [15]:
# persist the processed frame to disk for the modelling stage;
# the "Id" index is written out as the first column
df.to_csv(path_or_buf="train.csv", index=True)