In [2]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.multioutput import MultiOutputRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
df = pd.read_csv("D:\\Insta_Reach\\data\\insta1.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Username,Caption,Followers,Hashtags,Likes,Time since posted (hours)
0,0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,139,11
1,1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,23,2
2,2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,25,2
3,3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,49,3
4,4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,30,3


In [5]:
df.columns

Index(['Unnamed: 0', 'Username', 'Caption', 'Followers', 'Hashtags', 'Likes',
       'Time since posted (hours)'],
      dtype='object')

In [6]:
df.shape

(94, 7)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 94 non-null     int64 
 1   Username                   94 non-null     object
 2   Caption                    94 non-null     object
 3   Followers                  94 non-null     int64 
 4   Hashtags                   94 non-null     object
 5   Likes                      94 non-null     int64 
 6   Time since posted (hours)  94 non-null     int64 
dtypes: int64(4), object(3)
memory usage: 5.3+ KB


In [8]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [9]:
df.head()

Unnamed: 0,Username,Caption,Followers,Hashtags,Likes,Time since posted (hours)
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,139,11
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,23,2
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,25,2
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,49,3
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,30,3


In [10]:
X = df.drop(['Time since posted (hours)', 'Likes'], axis = 1)
y = df[['Time since posted (hours)', 'Likes']]

In [11]:
X.head()

Unnamed: 0,Username,Caption,Followers,Hashtags
0,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...
1,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...
2,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...
3,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...
4,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...


In [12]:
y.head()

Unnamed: 0,Time since posted (hours),Likes
0,11,139
1,2,23
2,2,25
3,3,49
4,3,30


**CATEGORICAL AND NUMERICAL COLUMNS OF DATASET**

In [21]:
cat_cols = X.columns[X.dtypes == 'object']
cat_cols

Index(['Username', 'Caption', 'Hashtags'], dtype='object')

In [22]:
num_cols = X.columns[X.dtypes != 'object']
num_cols

Index(['Followers'], dtype='object')

**FEATURE TRANSFORMATION**

In [23]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

**PIPELINES**

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

**Categorical Pipeline**

In [25]:
cat_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy ='most_frequent')),
        ('encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1))
         
    ]
)

In [26]:
cat_pipeline

**Numerical Pipeline**

In [27]:
num_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ]
)

In [28]:
num_pipeline

**COMBINE TWO PIPELINES**

In [30]:
preprocessor = ColumnTransformer(
    transformers = [

    ('cat_pipeline', cat_pipeline, cat_cols),
    ('num_pipeline', num_pipeline, num_cols)
    ],
    remainder = 'drop'
)


In [31]:
preprocessor

In [32]:
final_pipeline = Pipeline(
    steps = [
        ('preprocessor', preprocessor)
    ]
)

In [33]:
final_pipeline

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

In [38]:
print(f"X_train shape = {X_train.shape}")
print(f"X_test shape = {X_test.shape}")
print(f"y_train shape = {y_train.shape}")
print(f"y_test shape = {y_test.shape}")

X_train shape = (75, 4)
X_test shape = (19, 4)
y_train shape = (75, 2)
y_test shape = (19, 2)


In [37]:
final_pipeline.fit_transform(X_train)

array([[ 6.70000000e+01,  3.60000000e+01,  7.10000000e+01,
        -7.68162267e-01],
       [ 3.90000000e+01,  3.50000000e+01,  4.20000000e+01,
        -1.40883086e-02],
       [ 6.30000000e+01,  2.90000000e+01,  6.40000000e+01,
         1.96500638e-01],
       [ 3.70000000e+01,  7.20000000e+01,  5.80000000e+01,
        -1.29958210e-01],
       [ 3.10000000e+01,  5.30000000e+01,  5.40000000e+01,
        -1.45591450e-01],
       [ 1.90000000e+01,  3.40000000e+01,  3.00000000e+01,
        -3.46064771e-01],
       [ 2.40000000e+01,  5.80000000e+01,  1.30000000e+01,
         1.14553030e+00],
       [ 6.20000000e+01,  4.60000000e+01,  8.00000000e+00,
        -5.65849742e-01],
       [ 4.60000000e+01,  1.90000000e+01,  2.70000000e+01,
        -5.54814513e-01],
       [ 5.00000000e+01,  1.60000000e+01,  2.90000000e+01,
        -8.95986999e-01],
       [ 2.00000000e+00,  2.80000000e+01,  4.50000000e+01,
        -9.04153068e-02],
       [ 2.10000000e+01,  5.90000000e+01,  7.00000000e+01,
      

In [39]:
X_train = pd.DataFrame(final_pipeline.fit_transform(X_train), columns = final_pipeline.get_feature_names_out())

In [40]:
X_train.head()

Unnamed: 0,cat_pipeline__Username,cat_pipeline__Caption,cat_pipeline__Hashtags,num_pipeline__Followers
0,67.0,36.0,71.0,-0.768162
1,39.0,35.0,42.0,-0.014088
2,63.0,29.0,64.0,0.196501
3,37.0,72.0,58.0,-0.129958
4,31.0,53.0,54.0,-0.145591


In [41]:
X_test = pd.DataFrame(final_pipeline.transform(X_test), columns = final_pipeline.get_feature_names_out())

In [42]:
X_test.head()

Unnamed: 0,cat_pipeline__Username,cat_pipeline__Caption,cat_pipeline__Hashtags,num_pipeline__Followers
0,-1.0,-1.0,-1.0,-0.669765
1,-1.0,-1.0,-1.0,0.050284
2,-1.0,-1.0,-1.0,-0.419633
3,-1.0,-1.0,-1.0,1.459115
4,48.0,-1.0,-1.0,0.564342


In [14]:
def evaluate_clf(true, predicted):

    '''
    This Function Takes in true values and predicted values 
    Returns : MAE, MSE, RMSE, R2SCORE
    '''
    MAE = mean_absolute_error(true, predicted)
    MSE = mean_squared_error(true, predicted)
    RMSE = np.sqrt(MSE)
    R2SCORE = r2_score(true, predicted)*100
    return MAE, MSE, RMSE, R2SCORE

In [15]:
models = {
    "Linear Regression" : LinearRegression(),
    "Ridge Regression" : Ridge(),
    "Lasso Regression" : Lasso(),
    "ElasticNet Regression" : ElasticNet()
}

In [43]:
def evaluate_models(X, y, models):
    '''
    This Function takes in X and y and models dictionary as input
    It splits the data into Train Test Split
    Iterates through the given model dictionary and evaluates the metrics
    Returns : Dataframe which contains report of all models metrics
    '''

    Model_List = []
    MAE_Train_List = []
    MSE_Train_List = []
    RSME_Train_List = [] 
    R2SCORE_Train_List = []
    MAE_Test_List = []
    MSE_Test_List = []
    RSME_Test_List = [] 
    R2SCORE_Test_List = []
    R2SCORE_DIFF = []


    for i in range(len(list(models))):
        model = list(models.values())[i]
        wrapper = MultiOutputRegressor(model)
        wrapper.fit(X_train, y_train)

        # Make Prediction
        y_train_pred = wrapper.predict(X_train)
        y_test_pred = wrapper.predict(X_test)

        # Training Set Performance
        Model_Train_MAE, Model_Train_MSE, Model_Train_RMSE, Model_Train_R2SCORE = evaluate_clf(y_train, y_train_pred)

        MAE_Train_List.append(Model_Train_MAE)
        MSE_Train_List.append(Model_Train_MSE)
        RSME_Train_List.append(Model_Train_RMSE)
        R2SCORE_Train_List.append(Model_Train_R2SCORE)


        # Test Set Performance
        Model_Test_MAE, Model_Test_MSE, Model_Test_RMSE, Model_Test_R2SCORE = evaluate_clf(y_test, y_test_pred)

        MAE_Test_List.append(Model_Test_MAE)
        MSE_Test_List.append(Model_Test_MSE)
        RSME_Test_List.append(Model_Test_RMSE)
        R2SCORE_Test_List.append(Model_Test_R2SCORE)


        # R2SCORE Train-Test Difference
        R2SCORE_DIFF.append(max(Model_Train_R2SCORE, Model_Test_R2SCORE) - min(Model_Train_R2SCORE, Model_Test_R2SCORE))

        print(list(models.keys())[i])
        Model_List.append(list(models.keys())[i])

        print(f"Model Performance For Training Set")
        print(f"- MAE : {Model_Train_MAE}")
        print(f"- MSE : {Model_Train_MSE}")
        print(f"- RMSE : {Model_Train_RMSE}")
        print(f"- R2SCORE : {Model_Train_R2SCORE}")

        print("*"*100)

        print(f"Model Performance For Testing Set")
        print(f"- MAE : {Model_Test_MAE}")
        print(f"- MSE : {Model_Test_MSE}")
        print(f"- RMSE : {Model_Test_RMSE}")
        print(f"- R2SCORE : {Model_Test_R2SCORE}")

        print("*"*100)
        print('\n')

    report = pd.DataFrame(list(zip(Model_List, MAE_Train_List, MAE_Test_List, MSE_Train_List, MSE_Test_List, RSME_Train_List, RSME_Test_List, R2SCORE_Train_List, R2SCORE_Test_List, R2SCORE_DIFF)), columns = ['Model Name', 'MAE_Train', 'MAE_Test',
                                                                                                                                                                                                               'MSE_Train', 'MSE_Test', 'RMSE_Train', 'RMSE_Test', 'R2SCORE_Train', 'R2SCORE_Test', 'R2SCOREDIFF']).sort_values(by = 'R2SCORE_Test', ascending = False)
    
    return report





In [44]:
report = evaluate_models(X, y, models)

Linear Regression
Model Performance For Training Set
- MAE : 18.7019230576008
- MSE : 1445.69966559752
- RMSE : 38.02235744397656
- R2SCORE : 15.309348797365752
****************************************************************************************************
Model Performance For Testing Set
- MAE : 27.654681845137517
- MSE : 1979.1782831095416
- RMSE : 44.48795660748582
- R2SCORE : -92.61658776977451
****************************************************************************************************


Ridge Regression
Model Performance For Training Set
- MAE : 18.68296479726837
- MSE : 1445.7292684760444
- RMSE : 38.02274672450748
- R2SCORE : 15.30766746194579
****************************************************************************************************
Model Performance For Testing Set
- MAE : 27.55967942253493
- MSE : 1972.6322816013417
- RMSE : 44.414325184576896
- R2SCORE : -92.06076395422956
********************************************************************************

In [45]:
report

Unnamed: 0,Model Name,MAE_Train,MAE_Test,MSE_Train,MSE_Test,RMSE_Train,RMSE_Test,R2SCORE_Train,R2SCORE_Test,R2SCOREDIFF
3,ElasticNet Regression,18.247237,25.128705,1466.503814,1821.144186,38.294958,42.674866,12.708413,-76.684897,89.393311
2,Lasso Regression,18.619296,27.088741,1446.760799,1945.197827,38.036309,44.104397,10.90854,-83.566243,94.474783
1,Ridge Regression,18.682965,27.559679,1445.729268,1972.632282,38.022747,44.414325,15.307667,-92.060764,107.368431
0,Linear Regression,18.701923,27.654682,1445.699666,1979.178283,38.022357,44.487957,15.309349,-92.616588,107.925937


In [47]:
final_model = ElasticNet()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)



encoder = OrdinalEncoder()
X_train = encoder.fit_transform(X_train)
X_test = encoder.fit_transform(X_test)

wrapper = MultiOutputRegressor(final_model)

final_model = wrapper.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

print(f"Final ElasticNet Regression Accuracy Score (Train) : {final_model.score(X_train, y_train)}")

print(f"Final ElasticNet Regression Accuracy Score (Test) : {r2_score(y_pred, y_test)}")

Final ElasticNet Regression Accuracy Score (Train) : 0.1566590147809418
Final ElasticNet Regression Accuracy Score (Test) : -171.30206573528238


In [48]:
# Let's Predict

print(X_test[2])
y_pred = final_model.predict([X_test[0]])
y_pred

print(f"Time since posted : {y_pred[0][0]} hours")
print(f" No. of Likes: {y_pred[0][1]} ")

[ 1. 13.  9.  7.]
Time since posted : 1.646719431476879 hours
 No. of Likes: -5.144279982101029 
