Prediction using supervised ML

Goal: predict students' scores from the number of hours they studied.

In [None]:
# Importing libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Load the student hours/scores dataset from its public CSV into a DataFrame.
url = 'https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv'
df = pd.read_csv(url)

# Preview the first rows as the cell's last expression so the notebook renders
# a rich table, instead of printing the entire frame as plain text.
df.head()

In [None]:
# Scatter-plot hours against scores to inspect the relationship between the
# two variables before fitting a model.
df.plot(x='Hours', y='Scores', style='o')
plt.title('Hours vs Scores')
plt.xlabel('Hours')
plt.ylabel('Scores')
# show must be *called*: a bare `plt.show` only evaluates to the function
# object, so the cell would echo its repr instead of rendering via show().
plt.show()

In [None]:
# Pairwise correlation between the columns; Pearson is pandas' default
# method, spelled out here for the reader.
df.corr(method='pearson')

In [None]:
# Feature matrix: the 'Hours' column reshaped to one column, i.e. (n_samples, 1).
x = df['Hours'].to_numpy().reshape(-1, 1)

# Targets: the 'Scores' column, reshaped to one column in the same way.
y = df['Scores'].to_numpy().reshape(-1, 1)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a linear regressor and fit it on the training split in one step;
# fit() returns the estimator itself, so the calls can be chained.
reg = LinearRegression().fit(x_train, y_train)

print('Model Trained')

In [None]:
# Predict scores for the held-out inputs with the fitted regressor.
y_pred = reg.predict(x_test)
# Display the shape of the held-out targets as the cell's output.
y_test.shape

In [None]:
# Put the held-out targets next to the model's predictions, one row per test
# sample; both arrays are flattened to 1-D so each becomes a plain column.
new_df = pd.DataFrame(
    {
        'Actual': y_test.flatten(),
        'Predicted': y_pred.flatten(),
    }
)
new_df

In [None]:
# Predicted score for a student who studies 9.25 hours per day.
# The input is built directly as a 1x1 array (one sample, one feature).
hours = np.array([[9.25]])
predicted = reg.predict(hours)

print(f'No of Hours: {hours.flatten()[0]}')
print(f'Marks predicted: {predicted.flatten()[0]}')

In [None]:
# Evaluate on the held-out split.
# RMSE is the square root of the mean squared error between targets and predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report the estimator's score (R^2) first, then the RMSE.
print("R^2 : {}".format(reg.score(x_test, y_test)))
print("Root Mean Squared Error: {}".format(rmse))