In [87]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import category_encoders as ce

In [88]:
# Load the Agora dataset from the working directory and preview the first rows.
df = pd.read_csv("agora.csv")
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [89]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [90]:
# Two candidate fill values for the missing Transport entry.
median = df["Transport"].median()
mean = df["Transport"].mean()

In [91]:
# Show the median of the Transport column.
median

214634.81

In [92]:
# Show the mean of the Transport column for comparison with the median.
mean

215331.7324489796

In [93]:
# Impute the missing Transport value with the column median.
df["Transport"] = df["Transport"].fillna(value=median)

In [94]:
# Distinct categories present in the Area column.
df["Area"].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [95]:
# Area is still a text (object) column at this point.
df["Area"].head(n=5)

0      Dhaka
1        Ctg
2    Rangpur
3      Dhaka
4    Rangpur
Name: Area, dtype: object

# Encoding

In [96]:
# Manual encoding without any encoder class: replace each city name with a
# hand-picked integer code.
df["Area"] = df["Area"].replace({"Dhaka": 3, "Ctg": 2, "Rangpur": 1})
df["Area"].head()

0    3
1    2
2    1
3    3
4    1
Name: Area, dtype: int64

In [97]:
# Preview the frame now that Area holds the manually assigned integer codes.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,3,192261.83
1,162597.7,151377.59,443898.53,2,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,3,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [98]:
# Label encoding: reload the raw data so Area holds the original city names
# again, then let LabelEncoder assign one integer per distinct value.
# (A mid-cell df.head() was removed: only a cell's last expression is shown,
# so it had no effect.)
df = pd.read_csv("agora.csv")
label = LabelEncoder()
# In this run the codes come out as Ctg -> 0, Dhaka -> 1, Rangpur -> 2.
df["Area"] = label.fit_transform(df["Area"])
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [99]:
# To label-encode several columns at once, loop over the non-numeric ones:
# for column in df.columns:
#     if df[column].dtype == np.number:
#         continue
#     df[column] = LabelEncoder().fit_transform(df[column])
# NOTE(review): `dtype == np.number` may not recognise every numeric dtype;
# pd.api.types.is_numeric_dtype(df[column]) looks like the safer check —
# confirm before enabling this loop.

In [100]:
# One-hot encoding: start again from the raw CSV so Area is text.
df = pd.read_csv("agora.csv")
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [101]:
# One indicator column per Area category.
pd.get_dummies(data=df["Area"]).head(n=5)

Unnamed: 0,Ctg,Dhaka,Rangpur
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [102]:
# drop_first removes the first category's column (Ctg here); a row with zeros
# in both remaining columns therefore stands for Ctg.
dummy_variables = pd.get_dummies(data=df["Area"], drop_first=True)
dummy_variables.head(n=5)

Unnamed: 0,Dhaka,Rangpur
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [103]:
# get_dummies returned new frames; df itself still has the text Area column.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [104]:
# Remove the text Area column; the indicator columns will take its place.
new_df = df.drop(columns=["Area"])

In [105]:
# Check that only the numeric columns remain.
new_df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [106]:
# Place the indicator columns to the right of the numeric features.
df = pd.concat(objs=[new_df, dummy_variables], axis="columns")
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


In [107]:
# Ordinal encoding: start again from the raw CSV so Area is text.
df = pd.read_csv("agora.csv")
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [108]:
# The position in this list fixes the code: Dhaka -> 0, Ctg -> 1, Rangpur -> 2.
city_list = ["Dhaka", "Ctg", "Rangpur"]
ordinal = OrdinalEncoder(categories=[city_list])
# The encoder always takes a 2-D input (number of samples x number of
# features), hence the double brackets selecting a one-column DataFrame.
encoded_values = ordinal.fit_transform(df[["Area"]])

In [109]:
# Wrap the encoded 2-D array in a one-column DataFrame named Area.
new_area = pd.DataFrame(data=encoded_values, columns=["Area"])
new_area

Unnamed: 0,Area
0,0.0
1,1.0
2,2.0
3,0.0
4,2.0
5,0.0
6,1.0
7,2.0
8,0.0
9,1.0


In [110]:
# Overwrite the text Area column with its ordinal codes (aligned on the index).
df["Area"] = new_area["Area"]

In [111]:
# Area now holds the ordinal codes as floats.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


In [112]:
# Hashing encoder: start again from the raw CSV so Area is text.
df = pd.read_csv("agora.csv")
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [117]:
# Hash the Area column into three output components.
encoders = ce.HashingEncoder(cols=["Area"], n_components=3)

In [118]:
# Fit and apply the encoder: Area is replaced by the col_* hash columns and
# the remaining columns pass through unchanged.
encoders.fit_transform(df)

Unnamed: 0,col_0,col_1,col_2,Marketing Spend,Administration,Transport,Profit
0,0,1,0,114523.61,136897.8,471784.1,192261.83
1,0,0,1,162597.7,151377.59,443898.53,191792.06
2,1,0,0,153441.51,101145.55,407934.54,191050.39
3,0,1,0,144372.41,118671.85,383199.62,182901.99
4,1,0,0,142107.34,91391.77,366168.42,166187.94
5,0,1,0,131876.9,99814.71,362861.36,156991.12
6,0,0,1,134615.46,147198.87,127716.82,156122.51
7,1,0,0,130298.13,145530.06,323876.68,155752.6
8,0,1,0,120542.52,148718.95,311613.29,152211.77
9,0,0,1,123334.88,108679.17,304981.62,149759.96


In [119]:
# With only two hash components for three categories, collisions occur and
# information is lost; in this run every city ends up in col_1.
encoders = ce.HashingEncoder(cols=["Area"], n_components=2)
encoders.fit_transform(df)

Unnamed: 0,col_0,col_1,Marketing Spend,Administration,Transport,Profit
0,0,1,114523.61,136897.8,471784.1,192261.83
1,0,1,162597.7,151377.59,443898.53,191792.06
2,0,1,153441.51,101145.55,407934.54,191050.39
3,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,142107.34,91391.77,366168.42,166187.94
5,0,1,131876.9,99814.71,362861.36,156991.12
6,0,1,134615.46,147198.87,127716.82,156122.51
7,0,1,130298.13,145530.06,323876.68,155752.6
8,0,1,120542.52,148718.95,311613.29,152211.77
9,0,1,123334.88,108679.17,304981.62,149759.96
