In [1]:
# Load the shop dataset from the CSV file into a DataFrame.
import pandas as pd
df = pd.read_csv('supershop.csv')

In [2]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Measures of Central Tendency

In [3]:
df.Transport.mean()

215331.73244897963

In [4]:
df.Transport.median()

214634.81

In [5]:
# Pearson correlation between the numeric columns only. The frame still contains
# the string-valued 'Area' column, and pandas 2.x raises instead of silently
# dropping non-numeric columns, so select the numeric columns explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
Marketing Spend,1.0,0.230437,0.718574,0.937948
Administration,0.230437,1.0,0.009534,0.200717
Transport,0.718574,0.009534,1.0,0.782578
Profit,0.937948,0.200717,0.782578,1.0


In [6]:
df.describe()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
count,50.0,50.0,49.0,50.0
mean,73721.6156,121344.6396,215331.732449,112012.6392
std,45902.256482,28017.802755,119665.39155,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,134050.07,90138.9025
50%,73051.08,122699.795,214634.81,107978.19
75%,101602.8,144842.18,299737.29,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [7]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [8]:
#df.isnull()

# Handle NaN values

In [9]:
df.Transport.mean()

215331.73244897963

In [10]:
# Replace missing Transport entries with the column mean
# (the median would be an equally valid fill value).
df['Transport'] = df['Transport'].fillna(value=df['Transport'].mean())

In [11]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [12]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [13]:
#df.isnull()

# Encoding in ML

In [14]:
x = df.columns

# Label Encoder

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()

In [17]:
# Swap the area names for the integer labels produced by the LabelEncoder.
df['Area'] = le.fit_transform(df.Area)

In [18]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# Label Encoder using Loop

In [19]:
# Reload the CSV so the loop-based example starts from the original,
# string-valued 'Area' column rather than the already-encoded one.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [20]:
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Label-encode every non-numeric column in place; numeric columns are skipped.
# Comparing a dtype with the abstract `np.number` depended on a deprecated NumPy
# coercion and is not a reliable numeric test (a numeric column that fails the
# comparison would be label-encoded and its values destroyed), so use pandas'
# public dtype check instead.
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    df[column] = le.fit_transform(df[column])
    

In [21]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [22]:
# Reload the CSV to undo the label encoding applied by the previous loop.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Another way: detect numeric columns with `is_numeric_dtype`

In [23]:
# Import from the public pandas API; `pandas.core.*` is an internal module path
# that is not guaranteed to stay stable between releases.
from pandas.api.types import is_numeric_dtype

In [24]:
# Walk the columns and label-encode only those whose dtype is not numeric.
for column in df.columns:
    if not is_numeric_dtype(df[column]):
        df[column] = le.fit_transform(df[column])

In [25]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# One Hot Encoding

In [26]:
# Reload the CSV so one-hot encoding works on the original area names.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [27]:
# One indicator column per area. pandas 2.0+ returns boolean dummies by default,
# so request integers explicitly to get 0/1 values on every pandas version.
dummy = pd.get_dummies(df['Area'], dtype=int)

In [28]:
dummy.head()

Unnamed: 0,Ctg,Dhaka,Rangpur
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [29]:
# Same encoding, but drop the first category so the remaining indicators are not
# perfectly collinear. `dtype=int` keeps the 0/1 values on pandas 2.0+, where the
# default dummy dtype is bool.
dummy = pd.get_dummies(df['Area'], drop_first=True, dtype=int)

In [30]:
dummy.head()

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [31]:
# Remove the original text column; its information now lives in `dummy`.
df = df.drop(columns='Area')

In [32]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [33]:
# Place the indicator columns side by side with the remaining columns.
df2 = pd.concat(objs=[df, dummy], axis='columns')

In [34]:
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


In [35]:
# Feature matrix: every column except the 'Profit' target.
x = df2.drop(columns='Profit')

In [36]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,1,0
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,1,0
4,142107.34,91391.77,366168.42,0,1


In [37]:
y = df2['Profit']

In [38]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [39]:
# Reload the CSV to restore the 'Area' column that was dropped from df above.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [40]:
columns = ['Area']

# Append one 0/1 indicator column per category for each listed column; the
# original column is kept. `dtype=int` is passed because pandas 2.0+ would
# otherwise return boolean indicator columns.
for col in columns:
    one_hot = pd.get_dummies(df[col], dtype=int)
    df = pd.concat((df, one_hot), axis=1)

In [41]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Ctg,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0,1,0
1,162597.7,151377.59,443898.53,Ctg,191792.06,1,0,0
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0,0,1
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0,1,0
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0,0,1


In [42]:
# Reload the CSV to discard the indicator columns appended by the previous loop.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [43]:
columns = ['Area']

# For each listed column: build 0/1 indicators without the first category,
# append them, and drop the original column. `dtype=int` is passed because
# pandas 2.0+ would otherwise return boolean indicator columns.
for col in columns:
    one_hot = pd.get_dummies(df[col], drop_first=True, dtype=int)
    df = pd.concat((df, one_hot), axis=1).drop(col, axis=1)

In [44]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


# Replace Function

In [45]:
# Reload the CSV so 'Area' holds the original names again before replacing them.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [46]:
# Substitute each area name with a hand-picked integer code.
df['Area'] = df['Area'].replace({'Dhaka': 3, 'Ctg': 5, 'Rangpur': 1})

In [47]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,3,192261.83
1,162597.7,151377.59,443898.53,5,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,3,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


# Ordinal Encoder

In [48]:
# Reload the CSV to undo the manual replacement of the 'Area' values.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [49]:
df.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [50]:
city = ['Dhaka', 'Ctg', 'Rangpur']

In [51]:
from sklearn.preprocessing import OrdinalEncoder

In [52]:
ordinal = OrdinalEncoder(categories=[city])

In [53]:
encoded = ordinal.fit_transform(df[['Area']])

In [54]:
# Wrap the encoded array in a DataFrame that shares df's index, so that assigning
# the column back to df lines up row for row even if df's index is not 0..n-1.
newdata_frame = pd.DataFrame(encoded, columns=['NewArea'], index=df.index)

In [55]:
newdata_frame.head()

Unnamed: 0,NewArea
0,0.0
1,1.0
2,2.0
3,0.0
4,2.0


In [56]:
# Overwrite the text column with its ordinal codes.
df['Area'] = newdata_frame['NewArea']

In [57]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


# Ordinal Encoder using Loop

In [58]:
# Reload the CSV so the loop below encodes the original area names.
import pandas as pd
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [59]:
cols = ['Area']

# Ordinal-encode each listed column, using the column's own unique values as
# the category list.
for col in cols:
    unq = df[col].unique()
    df[col] = OrdinalEncoder(categories=[unq]).fit(df[[col]]).transform(df[[col]])

In [60]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94
