In [1]:
import pandas as pd
import numpy as np

Link to the Iris dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data

In [2]:
# The file has no header row, so read_csv with its default header handling uses
# the first record as the column labels (visible in the head() output below).
iris = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")

In [3]:
iris.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [4]:
# Work on a copy so the freshly loaded `iris` frame is left unmodified.
df = iris.copy()
df.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


If we simply rename the column headers to meaningful names, we lose one row of data, because the first record of the file was read as the header. We can preserve it by adding a new row built from the current header values, each cast to the datatype of its column (which can be fetched using `dataframe.dtypes`).

In [5]:
# Compare the first header label with the first data value: the label is a
# string, while the values underneath it are floats.
print(df.columns)
print(df.columns[0])
print(type(df.columns[0]))
# Use purely positional access. The row Series is labelled by the (string)
# column names, so an integer key on it relies on a deprecated positional fallback.
print(df.iloc[0, 0])
print(type(df.iloc[0, 0]))

Index(['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], dtype='object')
5.1
<class 'str'>
4.9
<class 'numpy.float64'>


In [6]:
#Type for pandas dataframe
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [7]:
df.shape

(149, 5)

In [8]:
# Replace the header labels (which are really the first data record) with short names.
# Presumably sl/sw = sepal length/width and pl/pw = petal length/width — confirm
# against the dataset description.
df.columns = ["sl","sw","pl","pw","flower-type"]

In [9]:
#Gives insight into the numeric features of the dataset
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
df.dtypes

sl             float64
sw             float64
pl             float64
pw             float64
flower-type     object
dtype: object

In [11]:
#Methods to access dataframe columns
print(type(df.sl))
print(type(df['sl']))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [12]:
df.isnull().sum()

sl             0
sw             0
pl             0
pw             0
flower-type    0
dtype: int64

In [13]:
df.iloc[1:4,2:4]

Unnamed: 0,pl,pw
1,1.3,0.2
2,1.5,0.2
3,1.4,0.2


In [14]:
# Remove the row labelled 0 and rebind `df` to the resulting frame.
df = df.drop(index=0)

In [15]:
df.head()

Unnamed: 0,sl,sw,pl,pw,flower-type
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa


In [16]:
#Gives label of the tuple at the index i of the dataframe
df.index[0]

1

In [17]:
# Look up the labels of the first two rows by position, then drop them by label.
df = df.drop(index=df.index[[0, 1]])
df.head()

Unnamed: 0,sl,sw,pl,pw,flower-type
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa


In [18]:
# Comparing a column to a scalar yields a boolean Series, usable as a row mask.
print(type(df.sl>5))
# Abandoned idea: pd.merge(df.sl>5, df.sw>4); the masks are combined with `&` in the next cell instead.

<class 'pandas.core.series.Series'>


In [19]:
df[(df.sl>5) & (df.sw>4)]

Unnamed: 0,sl,sw,pl,pw,flower-type
14,5.7,4.4,1.5,0.4,Iris-setosa
31,5.2,4.1,1.5,0.1,Iris-setosa
32,5.5,4.2,1.4,0.2,Iris-setosa


In [20]:
# Swap the hyphenated label for an underscore one so the column can be used
# with attribute access (df.flower_type) in the following cells.
df = df.rename(columns={"flower-type": "flower_type"})

In [21]:
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa


In [22]:
df[df.flower_type=="Iris-setosa"].shape

(46, 5)

In [23]:
df[df.flower_type=="Iris-setosa"].describe()

Unnamed: 0,sl,sw,pl,pw
count,46.0,46.0,46.0,46.0
mean,5.021739,3.436957,1.469565,0.247826
std,0.358957,0.387791,0.178723,0.111034
min,4.3,2.3,1.0,0.1
25%,4.8,3.2,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.7,1.6,0.3
max,5.8,4.4,1.9,0.6


In [24]:
df["flower_type"].value_counts()

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        46
Name: flower_type, dtype: int64

In [25]:
# Label 0 was dropped earlier, so assigning to .loc[0] appends a new row
# (setting with enlargement) rather than overwriting an existing one.
df.loc[0] = [1,2,3,4,"Iris-setosa"]

In [26]:
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica
0,1.0,2.0,3.0,4.0,Iris-setosa


In [27]:
# Renumber the rows 0..n-1 and discard the old labels (drop=True keeps them
# from being added back as a column).
df = df.reset_index(drop=True)

In [28]:
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,5.0,3.6,1.4,0.2,Iris-setosa
1,5.4,3.9,1.7,0.4,Iris-setosa
2,4.6,3.4,1.4,0.3,Iris-setosa
3,5.0,3.4,1.5,0.2,Iris-setosa
4,4.4,2.9,1.4,0.2,Iris-setosa


In [30]:
# drop() returns a new frame without 'sl'; df itself is unchanged.
df.drop('sl',axis=1).head()
# Alternative: `del df['sl']` removes the column from df in place.

Unnamed: 0,sw,pl,pw,flower_type
0,3.6,1.4,0.2,Iris-setosa
1,3.9,1.7,0.4,Iris-setosa
2,3.4,1.4,0.3,Iris-setosa
3,3.4,1.5,0.2,Iris-setosa
4,2.9,1.4,0.2,Iris-setosa


In [31]:
df["diff_pl_pw"] = df["pl"] - df["pw"]

In [32]:
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,5.0,3.6,1.4,0.2,Iris-setosa,1.2
1,5.4,3.9,1.7,0.4,Iris-setosa,1.3
2,4.6,3.4,1.4,0.3,Iris-setosa,1.1
3,5.0,3.4,1.5,0.2,Iris-setosa,1.3
4,4.4,2.9,1.4,0.2,Iris-setosa,1.2


In [33]:
# Reorder the columns so the derived diff_pl_pw feature sits with the other
# numeric columns and the flower_type label comes last.
df = df[["sl","sw","pl","pw","diff_pl_pw","flower_type"]]

In [34]:
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa
2,4.6,3.4,1.4,0.3,1.1,Iris-setosa
3,5.0,3.4,1.5,0.2,1.3,Iris-setosa
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa


### Handling NaN values

In [37]:
# Deliberately blank out rows 2-3 of the columns at positions 1-2 (sw and pl)
# to create missing values for the cells that follow.
df.iloc[2:4,1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa
2,4.6,,,0.3,1.1,Iris-setosa
3,5.0,,,0.2,1.3,Iris-setosa
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa


In [38]:
# Not in place: dropna() returns a new frame without the NaN rows; df itself
# still contains them.
df.dropna().head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa
5,4.9,3.1,1.5,0.1,1.4,Iris-setosa
6,5.4,3.7,1.5,0.2,1.3,Iris-setosa


In [40]:
df.describe()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw
count,147.0,145.0,145.0,147.0,147.0
mean,5.838095,3.037931,3.850345,1.244898,2.572789
std,0.911595,0.445825,1.728604,0.786546,1.077641
min,1.0,2.0,1.0,0.1,-1.0
25%,5.1,2.8,1.6,0.35,1.4
50%,5.8,3.0,4.4,1.3,3.0
75%,6.4,3.3,5.1,1.8,3.3
max,7.9,4.4,6.9,4.0,4.7


In [42]:
# Impute the missing `sw` values with the mean of the column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df.sw`: an in-place fill on an extracted column is
# chained assignment, which warns in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write.
df["sw"] = df["sw"].fillna(df["sw"].mean())

In [43]:
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa
2,4.6,3.037931,,0.3,1.1,Iris-setosa
3,5.0,3.037931,,0.2,1.3,Iris-setosa
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa


In [47]:
# Subset of the Iris-setosa rows only; its per-column statistics are used below
# to fill the missing `pl` values.
a = df[df["flower_type"]=="Iris-setosa"]
a.describe()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw
count,47.0,47.0,45.0,47.0,47.0
mean,4.93617,3.390976,1.504444,0.32766,1.174468
std,0.6857,0.443513,0.290732,0.55822,0.369764
min,1.0,2.0,1.0,0.1,-1.0
25%,4.8,3.1,1.4,0.2,1.1
50%,5.0,3.4,1.5,0.2,1.2
75%,5.2,3.7,1.6,0.3,1.4
max,5.8,4.4,3.0,4.0,1.7


In [49]:
# Impute the missing `pl` values with the mean `pl` of the Iris-setosa subset `a`
# (the rows blanked out earlier are Iris-setosa rows).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df.pl`: an in-place fill on an extracted column is
# chained assignment, which warns in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write.
df["pl"] = df["pl"].fillna(a["pl"].mean())

In [50]:
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa
2,4.6,3.037931,1.504444,0.3,1.1,Iris-setosa
3,5.0,3.037931,1.504444,0.2,1.3,Iris-setosa
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa


### Handling Strings

In [51]:
# Add a synthetic string column for the encoding demo; assigning a scalar
# broadcasts the same value to every row.
df["Gender"] = "Female"
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type,Gender
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa,Female
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa,Female
2,4.6,3.037931,1.504444,0.3,1.1,Iris-setosa,Female
3,5.0,3.037931,1.504444,0.2,1.3,Iris-setosa,Female
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa,Female


In [52]:
# Mark the first ten rows as "Male". Rows are still selected by position, but
# the column is resolved by name so the assignment does not depend on "Gender"
# happening to sit at position 6.
df.iloc[0:10, df.columns.get_loc("Gender")] = "Male"

In [53]:
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type,Gender
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa,Male
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa,Male
2,4.6,3.037931,1.504444,0.3,1.1,Iris-setosa,Male
3,5.0,3.037931,1.504444,0.2,1.3,Iris-setosa,Male
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa,Male


In [56]:
df["Gender"].value_counts()

Female    137
Male       10
Name: Gender, dtype: int64

In [63]:
def male_ohe(s):
    """Return the 0/1 indicator for the "Male" category.

    Gives 1 when `s` equals "Male" (exact, case-sensitive comparison) and 0
    for any other value.
    """
    return 1 if s == "Male" else 0

In [66]:
def female_ohe(s):
    """Return the 0/1 indicator for the "Female" category.

    Gives 1 when `s` equals "Female" (exact, case-sensitive comparison) and 0
    for any other value.
    """
    return 1 if s == "Female" else 0

In [67]:
# Apply male_ohe to every Gender value to build a 0/1 indicator column.
# NOTE(review): the saved output below also shows a `Sex` column that no visible
# cell creates — presumably left over from a deleted cell; it would not appear
# after Restart & Run All.
df["Male"] = df["Gender"].apply(male_ohe)
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type,Gender,Sex,Male
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa,Male,0,1
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa,Male,0,1
2,4.6,3.037931,1.504444,0.3,1.1,Iris-setosa,Male,0,1
3,5.0,3.037931,1.504444,0.2,1.3,Iris-setosa,Male,0,1
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa,Male,0,1


In [68]:
# Apply female_ohe to every Gender value to build the complementary 0/1 indicator column.
# NOTE(review): the column label is spelled "FeMale" (capital M) — confirm whether that is intended.
df["FeMale"] = df["Gender"].apply(female_ohe)

In [69]:
df.head()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw,flower_type,Gender,Sex,Male,FeMale
0,5.0,3.6,1.4,0.2,1.2,Iris-setosa,Male,0,1,0
1,5.4,3.9,1.7,0.4,1.3,Iris-setosa,Male,0,1,0
2,4.6,3.037931,1.504444,0.3,1.1,Iris-setosa,Male,0,1,0
3,5.0,3.037931,1.504444,0.2,1.3,Iris-setosa,Male,0,1,0
4,4.4,2.9,1.4,0.2,1.2,Iris-setosa,Male,0,1,0
