In [60]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.multioutput import MultiOutputRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [61]:
df = pd.read_csv("D:\\Insta_Reach\\data\\insta1.csv")

In [62]:
df.head()

Unnamed: 0.1,Unnamed: 0,Username,Caption,Followers,Hashtags,Likes,Time since posted (hours)
0,0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,139,11
1,1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,23,2
2,2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,25,2
3,3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,49,3
4,4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,30,3


In [63]:
df.columns

Index(['Unnamed: 0', 'Username', 'Caption', 'Followers', 'Hashtags', 'Likes',
       'Time since posted (hours)'],
      dtype='object')

In [64]:
df.shape

(94, 7)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 94 non-null     int64 
 1   Username                   94 non-null     object
 2   Caption                    94 non-null     object
 3   Followers                  94 non-null     int64 
 4   Hashtags                   94 non-null     object
 5   Likes                      94 non-null     int64 
 6   Time since posted (hours)  94 non-null     int64 
dtypes: int64(4), object(3)
memory usage: 5.3+ KB


In [66]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [67]:
df.head()

Unnamed: 0,Username,Caption,Followers,Hashtags,Likes,Time since posted (hours)
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,139,11
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,23,2
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,25,2
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,49,3
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,30,3


In [68]:
X = df.drop(['Time since posted (hours)', 'Likes'], axis = 1)
y = df[['Time since posted (hours)', 'Likes']]

In [69]:
X.head()

Unnamed: 0,Username,Caption,Followers,Hashtags
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...


In [70]:
y.head()

Unnamed: 0,Time since posted (hours),Likes
0,11,139
1,2,23
2,2,25
3,3,49
4,3,30


In [71]:
def evaluate_clf(true, predicted):

    '''
    This Function Takes in true values and predicted values 
    Returns : MAE, MSE, RMSE, R2SCORE
    '''
    MAE = mean_absolute_error(true, predicted)
    MSE = mean_squared_error(true, predicted)
    RMSE = np.sqrt(MSE)
    R2SCORE = r2_score(true, predicted)*100
    return MAE, MSE, RMSE, R2SCORE

In [72]:
models = {
    "Linear Regression" : LinearRegression(),
    "Ridge Regression" : Ridge(),
    "Lasso Regression" : Lasso(),
    "ElasticNet Regression" : ElasticNet()
}

In [73]:
def evaluate_models(X, y, models):
    '''
    This Function takes in X and y and models dictionary as input
    It splits the data into Train Test Split
    Iterates through the given model dictionary and evaluates the metrics
    Returns : Dataframe which contains report of all models metrics
    '''

    # Seperate Dataset into Train & Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

    # Applying 
    encoder = OrdinalEncoder()
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    Model_List = []
    MAE_Train_List = []
    MSE_Train_List = []
    RSME_Train_List = [] 
    R2SCORE_Train_List = []
    MAE_Test_List = []
    MSE_Test_List = []
    RSME_Test_List = [] 
    R2SCORE_Test_List = []
    R2SCORE_DIFF = []


    for i in range(len(list(models))):
        model = list(models.values())[i]
        wrapper = MultiOutputRegressor(model)
        wrapper.fit(X_train, y_train)

        # Make Prediction
        y_train_pred = wrapper.predict(X_train)
        y_test_pred = wrapper.predict(X_test)

        # Training Set Performance
        Model_Train_MAE, Model_Train_MSE, Model_Train_RMSE, Model_Train_R2SCORE = evaluate_clf(y_train, y_train_pred)

        MAE_Train_List.append(Model_Train_MAE)
        MSE_Train_List.append(Model_Train_MSE)
        RSME_Train_List.append(Model_Train_RMSE)
        R2SCORE_Train_List.append(Model_Train_R2SCORE)


        # Test Set Performance
        Model_Test_MAE, Model_Test_MSE, Model_Test_RMSE, Model_Test_R2SCORE = evaluate_clf(y_test, y_test_pred)

        MAE_Test_List.append(Model_Test_MAE)
        MSE_Test_List.append(Model_Test_MSE)
        RSME_Test_List.append(Model_Test_RMSE)
        R2SCORE_Test_List.append(Model_Test_R2SCORE)


        # R2SCORE Train-Test Difference
        R2SCORE_DIFF.append(max(Model_Train_R2SCORE, Model_Test_R2SCORE) - min(Model_Train_R2SCORE, Model_Test_R2SCORE))

        print(list(models.keys())[i])
        Model_List.append(list(models.keys())[i])

        print(f"Model Performance For Training Set")
        print(f"- MAE : {Model_Train_MAE}")
        print(f"- MSE : {Model_Train_MSE}")
        print(f"- RMSE : {Model_Train_RMSE}")
        print(f"- R2SCORE : {Model_Train_R2SCORE}")

        print("*"*100)

        print(f"Model Performance For Testing Set")
        print(f"- MAE : {Model_Test_MAE}")
        print(f"- MSE : {Model_Test_MSE}")
        print(f"- RMSE : {Model_Test_RMSE}")
        print(f"- R2SCORE : {Model_Test_R2SCORE}")

        print("*"*100)
        print('\n')

    report = pd.DataFrame(list(zip(Model_List, MAE_Train_List, MAE_Test_List, MSE_Train_List, MSE_Test_List, RSME_Train_List, RSME_Test_List, R2SCORE_Train_List, R2SCORE_Test_List, R2SCORE_DIFF)), columns = ['Model Name', 'MAE_Train', 'MAE_Test',
                                                                                                                                                                                                               'MSE_Train', 'MSE_Test', 'RMSE_Train', 'RMSE_Test', 'R2SCORE_Train', 'R2SCORE_Test', 'R2SCOREDIFF']).sort_values(by = 'R2SCORE_Test', ascending = False)
    
    return report





In [75]:
report = evaluate_models(X, y, models)

Linear Regression
Model Performance For Training Set
- MAE : 18.240949792478272
- MSE : 1412.5513898702325
- RMSE : 37.583924620377694
- R2SCORE : 15.677491894328565
****************************************************************************************************
Model Performance For Testing Set
- MAE : 28.830944649516972
- MSE : 2233.3618809558116
- RMSE : 47.258458300666256
- R2SCORE : -114.66946601479057
****************************************************************************************************


Ridge Regression
Model Performance For Training Set
- MAE : 18.240803456252458
- MSE : 1412.5513902770451
- RMSE : 37.58392462578975
- R2SCORE : 15.677491876555683
****************************************************************************************************
Model Performance For Testing Set
- MAE : 28.830024749874415
- MSE : 2233.2645702232167
- RMSE : 47.25742873055216
- R2SCORE : -114.66227951783783
**********************************************************************

In [76]:
report

Unnamed: 0,Model Name,MAE_Train,MAE_Test,MSE_Train,MSE_Test,RMSE_Train,RMSE_Test,R2SCORE_Train,R2SCORE_Test,R2SCOREDIFF
2,Lasso Regression,18.216041,28.756228,1412.561262,2225.958157,37.584056,47.180061,15.636121,-113.855622,129.491742
3,ElasticNet Regression,18.222988,28.760879,1412.556084,2226.036775,37.583987,47.180894,15.665901,-114.099614,129.765515
1,Ridge Regression,18.240803,28.830025,1412.55139,2233.26457,37.583925,47.257429,15.677492,-114.66228,130.339771
0,Linear Regression,18.24095,28.830945,1412.55139,2233.361881,37.583925,47.258458,15.677492,-114.669466,130.346958


In [77]:
final_model = Lasso()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)



encoder = OrdinalEncoder()
X_train = encoder.fit_transform(X_train)
X_test = encoder.fit_transform(X_test)

wrapper = MultiOutputRegressor(final_model)

final_model = wrapper.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

print(f"Final Lasso Regression Accuracy Score (Train) : {final_model.score(X_train, y_train)}")

print(f"Final Lasso Regression Accuracy Score (Test) : {r2_score(y_pred, y_test)}")

Final Lasso Regression Accuracy Score (Train) : 0.15636120776328588
Final Lasso Regression Accuracy Score (Test) : -178.4879363610653


In [82]:
# Let's Predict

print(X_test[2])
y_pred = final_model.predict([X_test[0]])
y_pred

print(f"Time since posted : {y_pred[0][0]} hours")
print(f" No. of Likes: {y_pred[0][1]} ")

[ 1. 13.  9.  7.]
Time since posted : 1.6652258482327578 hours
 No. of Likes: -5.147234384015931 
