##### **1. Base Line (Mean Prediction)**
##### **2. Data Cleaning & Preparation**
##### **3. Linear Regression Model**
##### **4. Feature Engg.**
##### **5. Tunning**
##### **6. Cross Validation**
##### **7. Residual Analysis**

In [36]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [37]:
data = {
    'StudyHours': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Attendance': [60, 65, 70, 72, 75, 80, 85, 88, 90],
    'PrevScore': [40, 45, 50, 55, 60, 65, 70, 75, 80],
    'FinalMarks': [50, 55, 60, 65, 70, 75, 78, 85, 88]
}
df=pd.DataFrame(data)
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [59]:
# Check The Missing Values Row by Row
for index, row in df.iterrows():
    print(f"Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}")

Row 0: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 1: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 2: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 3: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 4: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 5: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 6: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 7: Missing = False, Details = {'StudyHours': False, 'Attendance': False, 'PrevScore': False, 'FinalMarks': False}
Row 8: Missing = False, Details = {'StudyHours': False, 

In [60]:
y=df['FinalMarks']
y

0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64

In [39]:
y_pred_baseline=np.repeat(y.mean(),len(y))

In [40]:
mae=mean_absolute_error(y,y_pred_baseline)
mae

10.716049382716049

In [41]:
mse=mean_squared_error(y,y_pred_baseline)
mse

154.02469135802468

In [42]:
rmse=np.sqrt(mse)
print(rmse)

12.410668449282847


In [43]:
r2=r2_score(y,y_pred_baseline)
print(f'Baseline MAE: \t {mae:.2f},\nRMSE: \t {rmse:.2f},\nR-Squared: \t {r2:.2f}')

Baseline MAE: 	 10.72,
RMSE: 	 12.41,
R-Squared: 	 0.00


In [44]:
# Clean The Data
print(df.isnull().sum())

StudyHours    0
Attendance    0
PrevScore     0
FinalMarks    0
dtype: int64


In [63]:
df=df.fillna(df.mean())
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [85]:
# Data Prep, Find Out Input,Output
X=df[['StudyHours','Attendance','PrevScore']]
y=df['FinalMarks']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=6,random_state=0)

In [86]:
# Model Creation
model=LinearRegression()

In [87]:
# Train Model
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [88]:
print('Original Final Marks')
print(y)
y_all_predict=model.predict(X)
y_all_predict

Original Final Marks
0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64


array([50.        , 55.47619048, 60.95238095, 65.        , 69.52380952,
       75.        , 80.47619048, 85.        , 89.04761905])

In [89]:
y_pred=model.predict(X_test)
print(y_pred)

[85.         60.95238095 55.47619048 69.52380952 89.04761905 80.47619048]


In [90]:
mae=mean_absolute_error(y_test,y_pred)
print(mae)

0.9047619047619051


In [91]:
mse=mean_squared_error(y_test, y_pred)
print(mse)

1.4315948601662913


In [92]:
rmse=np.sqrt(mse)
print(mse)

1.4315948601662913


In [93]:
r2=r2_score(y_test,y_pred)
print(r2)

0.9904062890979176


In [94]:
print(f'Baseline MAE: \t {mae:.2f},\nRMSE: \t {rmse:.2f},\nR-Squared: \t {r2:.2f}')

Baseline MAE: 	 0.90,
RMSE: 	 1.20,
R-Squared: 	 0.99


In [98]:
# Comparison Predict vs Original
comparison = pd.DataFrame({
    'Original Final Marks': y.values.flatten(),
    'Predicted Final Marks': y_all_predict
})
 
# Display neatly
print("Comparison of Original vs Predicted Marks:")
print()
display(comparison)

Comparison of Original vs Predicted Marks:



Unnamed: 0,Original Final Marks,Predicted Final Marks
0,50,50.0
1,55,55.47619
2,60,60.952381
3,65,65.0
4,70,69.52381
5,75,75.0
6,78,80.47619
7,85,85.0
8,88,89.047619


##### Base Line (Mean Prediction)
1. Data Cleaning & Preparation
2. Linear Regression Model
3. Find Out MAE, RMSE, R-Squared
4. Data Set as Follows:
5. data={
'Area': [850, 900, 1000, 1100, 1200, 1500, 16000, 1800, 2000],
'Bedrooms' : [1, 2, 2, 2, 3, 3, 2, 4, 4],
'Age' : [1 ,1, 3, 2, 1, 2, 2, 1, 2].,
'PriceIn100k' : [5, 6, 7, 7, 70, 9, 9, 11,None]}

In [117]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [127]:
data={
'Area':[850, 900, 1000, 1100, 1200, 1500, 16000, 1800, 2000],
'Bedrooms':[1, 2, 2, 2, 3, 3, 2, 4, 4],
'Age':[1 ,1, 3, 2, 1, 2, 2, 1, 2],
'PriceIn100k':[5, 6, 7, 7, 70, 9, 9, 11,None]}

df=pd.DataFrame(data)
df

Unnamed: 0,Area,Bedrooms,Age,PriceIn100k
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,70.0
5,1500,3,2,9.0
6,16000,2,2,9.0
7,1800,4,1,11.0
8,2000,4,2,


In [128]:
# Check The Missing Values Row by Row
for index, row in df.iterrows():
    print(f"Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}")

Row 0: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 1: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 2: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 3: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 4: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 5: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 6: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 7: Missing = False, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': False}
Row 8: Missing = True, Details = {'Area': False, 'Bedrooms': False, 'Age': False, 'PriceIn100k': True}


In [129]:
print(df.isnull().sum())

Area           0
Bedrooms       0
Age            0
PriceIn100k    1
dtype: int64


In [130]:
df = df.dropna()

In [143]:
df['PriceIn100k'].fillna(df['PriceIn100k'].mean(), inplace=True)
print("\nMissing Values Handled:")
print(df.isnull().sum())


Missing Values Handled:
Area           0
Bedrooms       0
Age            0
PriceIn100k    0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PriceIn100k'].fillna(df['PriceIn100k'].mean(), inplace=True)


In [144]:
X=df[['Area','Bedrooms','Age']]
y=df['PriceIn100k']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=6,random_state=0)

In [145]:
model=LinearRegression()

In [146]:
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [147]:
y_pred=model.predict(X_test)
print(y_pred)

[-2939.30057444   110.66485928   130.99932223   -51.99864446
    90.33242964   141.16587593]


In [148]:
mae=mean_absolute_error(y_test, y_pred)
print(mae)

576.5769509968408


In [149]:
mse=mean_squared_error(y_test, y_pred)
print(mse)

1458050.2966069272


In [150]:
rmse=np.sqrt(mse)
print(mse)

1458050.2966069272


In [151]:
r2=r2_score(y_test,y_pred)
print(r2)

-372267.16083581117


In [152]:
print(f'Baseline MAE: \t {mae:.2f},\nRMSE: \t {rmse:.2f},\nR-Squared: \t {r2:.2f}')

Baseline MAE: 	 576.58,
RMSE: 	 1207.50,
R-Squared: 	 -372267.16
