In [55]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [56]:
url = 'https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day40-iterative-imputer/50_Startups.csv'
df = np.round(pd.read_csv(url)[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
df.to_csv('trai.csv')
np.random.seed(9)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,17.0,14.0,47.0,19.0
1,16.0,15.0,44.0,19.0
2,15.0,10.0,41.0,19.0
3,14.0,12.0,38.0,18.0
4,14.0,9.0,37.0,17.0


In [57]:
df = df.iloc[:,0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,16.0,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.0,38.0
4,14.0,9.0,37.0
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [58]:
df.iloc[1,0] = np.NaN
df.iloc[3,1] = np.NaN
df.iloc[-1,-1] = np.NaN

In [59]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,,38.0
4,14.0,9.0,37.0


In [60]:
# Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [61]:
# 0th Iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,7.163265,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.163265,38.0
4,14.0,9.0,37.0
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [62]:
# Remove the col1 imputed value
df1 = df0.copy()

df1.iloc[1,0] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.163265,38.0
4,14.0,9.0,37.0
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [63]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
0,14.0,47.0
2,10.0,41.0
3,12.163265,38.0
4,9.0,37.0


In [64]:
y = df1.iloc[[0,2,3,4],0]
y

0    17.0
2    15.0
3    14.0
4    14.0
Name: R&D Spend, dtype: float64

In [65]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))



array([15.90294324])

In [66]:
df1.iloc[1,0] = 23.14

In [67]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,23.14,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.163265,38.0
4,14.0,9.0,37.0
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [68]:
# Remove the col2 imputed value

df1.iloc[3,1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,23.14,15.0,44.0
2,15.0,10.0,41.0
3,14.0,,38.0
4,14.0,9.0,37.0
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [69]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
0,17.0,47.0
1,23.14,44.0
2,15.0,41.0
4,14.0,37.0


In [70]:
y = df1.iloc[[0,1,2,4],1]
y

0    14.0
1    15.0
2    10.0
4     9.0
Name: Administration, dtype: float64

In [71]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([9.02655196])

In [72]:
df1.iloc[3,1] = 11.06

In [73]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,23.14,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.06,38.0
4,14.0,9.0,37.0
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [74]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,23.14,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.06,38.0
4,14.0,9.0,
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [75]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
0,17.0,14.0
1,23.14,15.0
2,15.0,10.0
3,14.0,11.06


In [76]:
y = df1.iloc[0:4,-1]
y

0    47.0
1    44.0
2    41.0
3    38.0
Name: Marketing Spend, dtype: float64

In [77]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([37.55706491])

In [78]:
df1.iloc[4,-1] = 31.56

In [79]:
# After 1st Iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,23.14,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.06,38.0
4,14.0,9.0,31.56
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [80]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,15.976735,0.0,0.0
2,0.0,0.0,0.0
3,0.0,-1.103265,0.0
4,0.0,0.0,-5.44
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [81]:
df2 = df1.copy()

df2.iloc[1,0] = np.NaN

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.06,38.0
4,14.0,9.0,31.56
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [82]:
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))



array([16.48489598])

In [83]:
df2.iloc[1,0] = 23.78

In [84]:
df2.iloc[3,1] = np.NaN
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([10.02642414])

In [85]:
df2.iloc[3,1] = 11.22

In [86]:
df2.iloc[4,-1] = np.NaN

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))



array([37.59261491])

In [87]:
df2.iloc[4,-1] = 31.56

In [88]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,23.78,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.22,38.0
4,14.0,9.0,31.56
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [89]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,0.64,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.16,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [90]:
df3 = df2.copy()

df3.iloc[1,0] = np.NaN

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.22,38.0
4,14.0,9.0,31.56
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [91]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))



array([16.35591793])

In [92]:
df3.iloc[1,0] = 24.57

In [93]:
df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))



array([10.06575135])

In [94]:
df3.iloc[3,1] = 11.37

In [95]:
df3.iloc[4,-1] = np.NaN

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))



array([37.63453865])

In [96]:
df3.iloc[4,-1] = 45.53

In [97]:
df2.iloc[3,1] = 11.22

In [98]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,24.57,15.0,44.0
2,15.0,10.0,41.0
3,14.0,11.37,38.0
4,14.0,9.0,45.53
5,13.0,10.0,36.0
6,13.0,15.0,13.0
7,13.0,15.0,32.0
8,12.0,15.0,31.0
9,12.0,11.0,30.0


In [99]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,0.79,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.15,0.0
4,0.0,0.0,13.97
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0
