## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Read the dataset 

In [2]:
# Source CSV for the Instagram reach dataset, kept in one named place.
DATA_URL = "https://raw.githubusercontent.com/WidhyaOrg/datasets/master/instagram_reach.csv"
df = pd.read_csv(DATA_URL)

## Print the first 5 rows 

In [3]:
# Show the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


## Drop the unnecessary columns 

In [4]:
# Remove the identifier and free-text columns that the later cells do not use.
unused_columns = ['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Hashtags']
df = df.drop(columns=unused_columns)


In [5]:
# Confirm which columns remain after the drop.
df.head(n=5)

Unnamed: 0,Followers,Time since posted,Likes
0,1600,11 hours,139
1,880,2 hours,23
2,255,2 hours,25
3,340,3 hours,49
4,304,3 hours,30


## Create a new column for time (int data type)

In [6]:
# "Time since posted" holds strings such as "11 hours": split once on the first
# space and keep the leading number as an integer.
time_parts = df['Time since posted'].str.split(" ", n=1, expand=True)
df['Time'] = time_parts[0].astype(int)
df.head()

Unnamed: 0,Followers,Time since posted,Likes,Time
0,1600,11 hours,139,11
1,880,2 hours,23,2
2,255,2 hours,25,2
3,340,3 hours,49,3
4,304,3 hours,30,3


## Drop the column time since posted 

In [7]:
# DataFrame.drop returns a new frame rather than modifying `df`, so the result
# must be assigned back for the column to actually be removed.
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns=['Time since posted'], errors='ignore')
df

Unnamed: 0,Followers,Likes,Time
0,1600,139,11
1,880,23,2
2,255,25,2
3,340,49,3
4,304,30,3
...,...,...,...
95,614,31,3
96,450,42,3
97,182,10,3
98,2039,222,3


In [8]:
# Order the columns as features first, target (Likes) last; the split below
# relies on the target being the final column.
model_columns = ['Followers', 'Time', 'Likes']
df = df.loc[:, model_columns]
df.head()

Unnamed: 0,Followers,Time,Likes
0,1600,11,139
1,880,2,23
2,255,2,25
3,340,3,49
4,304,3,30


## Convert the data into X input and y labels 
### Split the data into test set (20%) and training set (80%)

In [9]:
# Convert to a NumPy array: every column but the last is a feature,
# the last column is the target.
df1 = df.values
X = df1[:, :-1]
y = df1[:, -1]

# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

## Standardize the X input (zero mean, unit variance)

In [10]:
# Fit the scaler on the training split only, then apply that same transform to
# both splits so no test-set statistics leak into the scaling.
# (The second positional argument of fit_transform is `y`, not a second matrix
# to scale, so passing X_test there never scaled the test split.)
scaler = StandardScaler().fit(X_train)
scaled_features = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE: the model cells in this notebook fit and score on the unscaled
# X_train / X_test. To use the scaled inputs instead, pass scaled_features /
# X_test_scaled there and run new samples through scaler.transform before predicting.


## Train the Model : Linear Regression

In [11]:
# Ordinary least-squares fit on the training split; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)
print("Training completed")

Training completed


## Evaluate the model

In [12]:
# model.score is evaluated on each split and shown as a percentage.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Training Score: ', round(train_score * 100, 2), '%')
print('Testing Score: ', round(test_score * 100, 2), '%')

Training Score:  35.83 %
Testing Score:  61.84 %


In [13]:
# Predicted target values for the held-out test rows; reused by the metric cells below.
y_predict= model.predict(X_test)


In [14]:
# R² (coefficient of determination) of the predictions on the test split.
# This is a regression goodness-of-fit measure, not a classification accuracy,
# so the printed label says R2 rather than "Accuracy".
acc = metrics.r2_score(y_test, y_predict)  # name kept for any later use
print("R2 Score of Model (test set): ", round(acc*100,2),'%')

Accuracy Score of Model:  61.84 %


In [15]:
# Error metrics of the test-set predictions against the true targets.
mae = metrics.mean_absolute_error(y_test, y_predict)
mse = metrics.mean_squared_error(y_test, y_predict)  # computed once, reused for RMSE
print('Mean Absolute Error:', round(mae, 2))
print('Mean Squared Error:', round(mse, 2))
print('Root Mean Squared Error:', round(np.sqrt(mse), 2))

Mean Absolute Error: 18.5
Mean Squared Error: 541.01
Root Mean Squared Error: 23.26


## Predict the likes for an account with 300 followers, 10 hours after posting


In [16]:
# One sample in the same column order as the training features: [Followers, Time].
sample = [[300, 10]]
predicted_likes = model.predict(sample)
print("The predicted number of likes will be : ", predicted_likes)


The predicted number of likes will be :  [108.75693473]
