In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def bad_line(x):
    print(x)
    return None

dtypes = {
    'Tweet':'object',
    'Date':'object',
    'time':'object',
    'Day of week':'object',
    'Cashtags':'object',
    'Hashtags':'object',
    'Language':'object',
    'Location':'bool',
    'Mentioned_users':'bool',
    'Followers':'float64',
    'Following':'float64',
    'User_created_date':'object',
    'Listed_count':'float64',
    'Favourite_count':'float64',
    'Tweet_count':'float64',
    'Verified':'bool',
    'Average_favourite_count':'float64',
    'account_age':'float64',
    'Likes':'float64',
    'Comments':'float64',
    'Retweets':'float64',
    'Views':'float64',
    'clean_tweet':'object',
    'subjectivity':'float64',
    'polarity':'float64',
    'sentiment':'object',
    'topics':'int64'
}

df_data = pd.read_csv('/content/drive/Othercomputers/My Laptop (1)/year4/fyp_repo/social_lifter/data_science/data_set/final_data/data_sentiment_final_lemmatized_single_topic_detected.csv', 
                      on_bad_lines=bad_line, 
                      engine='python',
                      dtype=dtypes)



In [4]:
data_back = df_data.copy()

In [35]:
df_data = data_back.copy()

In [5]:
# Drop the columns we won't use for prediction
df_data.drop(['Tweet', 'Date', 'Listed_count', 'Favourite_count', 'Tweet_count', 'User_created_date', 'Views','Language','clean_tweet'], axis=1, inplace=True)
df_data.shape

(1150394, 18)

In [6]:
df_data.dropna(subset=['Average_favourite_count'], inplace=True)
df_data = df_data[df_data['Likes'] != 0.0]
df_data.shape

(565272, 18)

In [7]:
# Convert "Cashtags" column to boolean
df_data['Cashtags'] = np.where(df_data['Cashtags'].notnull(), 1, 0)

# Convert "Hashtags" column to boolean
df_data['Hashtags'] = np.where(df_data['Hashtags'].notnull(), 1, 0)
df_data["Location"] = df_data["Location"].astype(int)
df_data["Mentioned_users"] = df_data["Mentioned_users"].astype(int)
df_data["Verified"] = df_data["Verified"].astype(int)
df_data["Day of week"] = df_data["Day of week"].astype(str)

In [8]:
# Define the mapping from day of week string to integer
day_of_week_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}

# Apply the mapping to the 'Day of week' column
df_data['Day of week'] = df_data['Day of week'].map(day_of_week_map)

df_data['time'] = pd.to_datetime(df_data['time']).dt.hour

In [9]:
# Define the mapping from sentiment string to integer
sentiment_map = {'positive': 0, 'negative': 1, 'neutral': 2}

# Apply the mapping to the 'Day of week' column
df_data['sentiment'] = df_data['sentiment'].map(sentiment_map)

In [10]:
X = df_data.drop(["Likes", "Comments", "Retweets"], axis=1)
y_likes = df_data["Likes"]
y_comments = df_data["Comments"]
y_retweets = df_data["Retweets"]

In [11]:
# Split the data into train and test sets for each target variable separately
X_train_likes, X_test_likes, y_train_likes, y_test_likes = train_test_split(X, y_likes, test_size=0.2, random_state=42)
X_train_comments, X_test_comments, y_train_comments, y_test_comments = train_test_split(X, y_comments, test_size=0.2, random_state=42)
X_train_retweets, X_test_retweets, y_train_retweets, y_test_retweets = train_test_split(X, y_retweets, test_size=0.2, random_state=42)

#Linear Regression

In [12]:
# Train separate linear regression models for each target variable
linear_reg_likes = LinearRegression()
linear_reg_likes.fit(X_train_likes, y_train_likes)
y_pred_likes_lr = linear_reg_likes.predict(X_test_likes)

In [13]:
from sklearn.metrics import mean_absolute_percentage_error
mse_lr = metrics.mean_squared_error(y_test_likes, y_pred_likes_lr)
mae_lr = metrics.mean_absolute_error(y_test_likes, y_pred_likes_lr)
r2_lr = np.sqrt(metrics.mean_squared_error(y_test_likes, y_pred_likes_lr))
mape = mean_absolute_percentage_error(y_test_likes, y_pred_likes_lr)


# Mean Absolute Error (MAE) - Likes -LR: 356254.8215719718
# Mean Squared Error (MSE) - Likes -LR: 22.857607123775978
# Root Mean Squared Error (RMSE) - Likes -LR: 596.8708583705288
# Mean absolute percentage error(MAPE) - Likes -LR: 28590730499220212.00%
print('Mean Absolute Error (MAE) - Likes -LR:', mse_lr)
print('Mean Squared Error (MSE) - Likes -LR:', mae_lr)
print('Root Mean Squared Error (RMSE) - Likes -LR:', r2_lr)
print(f"Mean absolute percentage error(MAPE) - Likes -LR: {mape:.2f}%")

Mean Absolute Error (MAE) - Likes -LR: 147469.56354420012
Mean Squared Error (MSE) - Likes -LR: 42.058856631083884
Root Mean Squared Error (RMSE) - Likes -LR: 384.0176604587348
Mean absolute percentage error(MAPE) - Likes -LR: 12.69%


In [14]:
# Train separate linear regression models for each target variable
linear_reg_comments = LinearRegression()
linear_reg_comments.fit(X_train_comments, y_train_comments)
y_pred_comments_lr = linear_reg_comments.predict(X_test_comments)

In [15]:
mse_lr_comments = metrics.mean_squared_error(y_test_comments, y_pred_comments_lr)
mae_lr_comments = metrics.mean_absolute_error(y_test_comments, y_pred_comments_lr)
r2_lr_comments = np.sqrt(metrics.mean_squared_error(y_test_comments, y_pred_comments_lr))
mape_comments = mean_absolute_percentage_error(y_test_comments, y_pred_comments_lr)


# Mean Absolute Error (MAE) - comments -LR: 321.7946548380683
# Mean Squared Error (MSE) - comments -LR: 1.3326986116464201
# Root Mean Squared Error (RMSE) - comments -LR: 17.93863581318458
# Mean absolute percentage error(MAPE) - comments -LR: 2794541878057321.50%
print('Mean Absolute Error (MAE) - comments -LR:', mse_lr_comments)
print('Mean Squared Error (MSE) - comments -LR:', mae_lr_comments)
print('Root Mean Squared Error (RMSE) - comments -LR:', r2_lr_comments)
print(f"Mean absolute percentage error(MAPE) - comments -LR: {mape_comments:.2f}%")

Mean Absolute Error (MAE) - comments -LR: 441.25746427223106
Mean Squared Error (MSE) - comments -LR: 2.280502378575699
Root Mean Squared Error (RMSE) - comments -LR: 21.00612920726308
Mean absolute percentage error(MAPE) - comments -LR: 4098343047057356.50%


In [None]:
# Train separate linear regression models for each target variable
linear_reg_retweets = LinearRegression()
linear_reg_retweets.fit(X_train, y_train_retweets)
y_pred_retweets_lr = linear_reg_retweets.predict(X_test)

In [None]:


mse_lr_retweets = metrics.mean_squared_error(y_test_retweets, y_pred_retweets_lr)
mae_lr_retweets = metrics.mean_absolute_error(y_test_retweets, y_pred_retweets_lr)
r2_lr_retweets = np.sqrt(metrics.mean_squared_error(y_test_retweets, y_pred_retweets_lr))
mape_retweets = mean_absolute_percentage_error(y_test_retweets, y_pred_retweets_lr)


# Mean Absolute Error (MAE) - retweets -LR: 5671.938106205176
# Mean Squared Error (MSE) - retweets -LR: 4.682655377350737
# Root Mean Squared Error (RMSE) - retweets -LR: 75.31227062175975
# Mean absolute percentage error(MAPE) - retweets -LR: 9566594420307914.00
print('Mean Absolute Error (MAE) - retweets -LR:', mse_lr_retweets)
print('Mean Squared Error (MSE) - retweets -LR:', mae_lr_retweets)
print('Root Mean Squared Error (RMSE) - retweets -LR:', r2_lr_retweets)
print(f"Mean absolute percentage error(MAPE) - retweets -LR: {mape_retweets:.2f}%")

Mean Absolute Error (MAE) - retweets -LR: 5669.33639052204
Mean Squared Error (MSE) - retweets -LR: 4.7398249092547
Root Mean Squared Error (RMSE) - retweets -LR: 75.29499578671906
Mean absolute percentage error(MAPE) - retweets -LR: 9694604481492128.00%


#DecisionTreeRegressor

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
dt_reg_likes = DecisionTreeRegressor(random_state=42,max_depth=8, min_samples_split=8, min_samples_leaf=3, max_features= 'sqrt')
dt_reg_likes.fit(X_train_likes, y_train_likes)
y_pred_likes_dt = dt_reg_likes.predict(X_test_likes)

In [18]:
mae_dt_likes = metrics.mean_absolute_error(y_test_likes, y_pred_likes_dt)
mse_dt_likes = metrics.mean_squared_error(y_test_likes, y_pred_likes_dt)
r2_dt_likes = np.sqrt(metrics.mean_squared_error(y_test_likes, y_pred_likes_dt))
mape_dt_likes = mean_absolute_percentage_error(y_test_likes, y_pred_likes_dt)


# Mean Absolute Error (MAE) - Likes -dt: 22.320914147574396
# Mean Squared Error (MSE) - Likes -dt: 364359.32831504033
# Root Mean Squared Error (RMSE) - Likes -dt: 603.6218421454282
# Mean absolute percentage error(MAPE) - Likes -dt: 25872854608848556.00%
print('Mean Absolute Error (MAE) - Likes -dt:', mae_dt_likes)
print('Mean Squared Error (MSE) - Likes -dt:', mse_dt_likes)
print('Root Mean Squared Error (RMSE) - Likes -dt:', r2_dt_likes)
print(f"Mean absolute percentage error(MAPE) - Likes -dt: {mape_dt_likes:.2f}%")

Mean Absolute Error (MAE) - Likes -dt: 40.32950947240271
Mean Squared Error (MSE) - Likes -dt: 171663.5068395288
Root Mean Squared Error (RMSE) - Likes -dt: 414.32294993100345
Mean absolute percentage error(MAPE) - Likes -dt: 10.10%


In [19]:
import pickle

# Save the model to a file
with open('dt_reg_likes.pkl', 'wb') as f:
    pickle.dump(dt_reg_likes, f)

In [None]:
dt_reg_comments = DecisionTreeRegressor(random_state=42,max_depth=6, min_samples_split=2, min_samples_leaf=2, max_features= 'sqrt')
dt_reg_comments.fit(X_train, y_train_comments)
y_pred_comments_dt = dt_reg_comments.predict(X_test)

In [None]:
mae_dt_comments = metrics.mean_absolute_error(y_test_comments, y_pred_comments_dt)
mse_dt_comments = metrics.mean_squared_error(y_test_comments, y_pred_comments_dt)
r2_dt_comments = np.sqrt(metrics.mean_squared_error(y_test_comments, y_pred_comments_dt))
mape_dt_comments = mean_absolute_percentage_error(y_test_comments, y_pred_comments_dt)



# Mean Absolute Error (MAE) - comments -dt: 1.3546444937668691
# Mean Squared Error (MSE) - comments -dt: 315.8137877876347
# Root Mean Squared Error (RMSE) - comments -dt: 17.771150435119125
# Mean absolute percentage error(MAPE) - comments -dt: 2365343339570260.00%
print('Mean Absolute Error (MAE) - comments -dt:', mae_dt_comments)
print('Mean Squared Error (MSE) - comments -dt:', mse_dt_comments)
print('Root Mean Squared Error (RMSE) - comments -dt:', r2_dt_comments)
print(f"Mean absolute percentage error(MAPE) - comments -dt: {mape_dt_comments:.2f}%")

Mean Absolute Error (MAE) - comments -dt: 1.3546444937668691
Mean Squared Error (MSE) - comments -dt: 315.8137877876347
Root Mean Squared Error (RMSE) - comments -dt: 17.771150435119125
Mean absolute percentage error(MAPE) - comments -dt: 2365343339570260.00%


In [None]:
import pickle

# Save the model to a file
with open('dt_reg_comments.pkl', 'wb') as f:
    pickle.dump(dt_reg_comments, f)

In [None]:
dt_reg_retweets = DecisionTreeRegressor(random_state=42,max_depth=2, min_samples_split=2, min_samples_leaf=1, max_features= 'auto')
dt_reg_retweets.fit(X_train, y_train_retweets)
y_pred_retweets_dt = dt_reg_retweets.predict(X_test)



In [None]:
mae_dt_retweets = metrics.mean_absolute_error(y_test_retweets, y_pred_retweets_dt)
mse_dt_retweets = metrics.mean_squared_error(y_test_retweets, y_pred_retweets_dt)
r2_dt_retweets = np.sqrt(metrics.mean_squared_error(y_test_retweets, y_pred_retweets_dt))
mape_dt_retweets = mean_absolute_percentage_error(y_test_retweets, y_pred_retweets_dt)



# Mean Absolute Error (MAE) - retweets -dt: 3.323547578816166
# Mean Squared Error (MSE) - retweets -dt: 2706.6633371757343
# Root Mean Squared Error (RMSE) - retweets -dt: 52.025602708433226
print('Mean Absolute Error (MAE) - retweets -dt:', mae_dt_retweets)
print('Mean Squared Error (MSE) - retweets -dt:', mse_dt_retweets)
print('Root Mean Squared Error (RMSE) - retweets -dt:', r2_dt_retweets)
print(f"Mean absolute percentage error(MAPE) - retweets -dt: {mape_dt_retweets:.2f}%")

Mean Absolute Error (MAE) - retweets -dt: 4.486796101450401
Mean Squared Error (MSE) - retweets -dt: 5571.917991424511
Root Mean Squared Error (RMSE) - retweets -dt: 74.64528110620597
Mean absolute percentage error(MAPE) - retweets -dt: 7391445736721221.00%


In [None]:
import pickle

# Save the model to a file
with open('dt_reg_retweets.pkl', 'wb') as f:
    pickle.dump(dt_reg_retweets, f)

#RandomForestRegressor

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
rf_reg_likes = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_likes.fit(X_train_likes, y_train_likes)
y_pred_likes_rf = rf_reg_likes.predict(X_test_likes)

In [22]:
mae_rf_likes = metrics.mean_absolute_error(y_test_likes, y_pred_likes_rf)
mse_rf_likes = metrics.mean_squared_error(y_test_likes, y_pred_likes_rf)
r2_rf_likes = np.sqrt(metrics.mean_squared_error(y_test_likes, y_pred_likes_rf))
mape_rf_likes = mean_absolute_percentage_error(y_test_likes, y_pred_likes_rf)



# Mean Absolute Error (MAE) - Likes -rf: 20.05182340666568
# Mean Squared Error (MSE) - Likes -rf: 332818.47495836805
# Root Mean Squared Error (RMSE) - Likes -rf: 576.9042164505023
# Mean absolute percentage error(MAPE) - Likes -rf: 10237827442376686.00%
print('Mean Absolute Error (MAE) - Likes -rf:', mae_rf_likes)
print('Mean Squared Error (MSE) - Likes -rf:', mse_rf_likes)
print('Root Mean Squared Error (RMSE) - Likes -rf:', r2_rf_likes)
print(f"Mean absolute percentage error(MAPE) - Likes -rf: {mape_rf_likes:.2f}%")


Mean Absolute Error (MAE) - Likes -rf: 37.16257643580907
Mean Squared Error (MSE) - Likes -rf: 175227.9894642277
Root Mean Squared Error (RMSE) - Likes -rf: 418.60242410218757
Mean absolute percentage error(MAPE) - Likes -rf: 6.37%


In [23]:
import pickle
# Save the model to a file
with open('rf_reg_likes.pkl', 'wb') as f:
    pickle.dump(rf_reg_likes, f)

In [None]:
rf_reg_comments = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_comments.fit(X_train, y_train_comments)
y_pred_comments_rf = rf_reg_comments.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE) - comments -rf:', metrics.mean_absolute_error(y_test_comments, y_pred_comments_rf))
print('Mean Squared Error (MSE) - comments -rf:', metrics.mean_squared_error(y_test_comments, y_pred_comments_rf))
print('Root Mean Squared Error (RMSE) - comments -rf:', np.sqrt(metrics.mean_squared_error(y_test_comments, y_pred_comments_rf)))

Mean Absolute Error (MAE) - comments -rf: 1.3451760434978772
Mean Squared Error (MSE) - comments -rf: 304.9145981046793
Root Mean Squared Error (RMSE) - comments -rf: 17.46180397624138


In [None]:

# Save the model to a file
with open('rf_reg_comments.pkl', 'wb') as f:
    pickle.dump(rf_reg_comments, f)

In [None]:
rf_reg_retweets = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_retweets.fit(X_train, y_train_retweets)
y_pred_retweets_rf = rf_reg_retweets.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE) - retweets -rf:', metrics.mean_absolute_error(y_test_retweets, y_pred_retweets_rf))
print('Mean Squared Error (MSE) - retweets -rf:', metrics.mean_squared_error(y_test_retweets, y_pred_retweets_rf))
print('Root Mean Squared Error (RMSE) - retweets -rf:', np.sqrt(metrics.mean_squared_error(y_test_retweets, y_pred_retweets_rf)))
# Mean Absolute Error (MAE) - retweets -rf: 5.835093441733218
# Mean Squared Error (MSE) - retweets -rf: 2851.8090350816547
# Root Mean Squared Error (RMSE) - retweets -rf: 53.4023317382458

Mean Absolute Error (MAE) - retweets -rf: 4.002782245322752
Mean Squared Error (MSE) - retweets -rf: 5864.138913943106
Root Mean Squared Error (RMSE) - retweets -rf: 76.57766589511009


In [None]:
# Save the model to a file
with open('rf_reg_retweets.pkl', 'wb') as f:
    pickle.dump(rf_reg_retweets, f)

#RNN

In [None]:
X_train.shape

(920212, 15)

In [26]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_percentage_error

# create the GRU RNN model
model = Sequential()
model.add(GRU(50, input_shape=(X_train_likes.shape[1], 1)))
model.add(Dense(1))

# compile the model
model.compile(loss='mse', optimizer=Adam(lr=0.001))

# train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(X_train_likes, y_train_likes, validation_split=0.2, epochs=10, batch_size=256, callbacks=[early_stopping])






Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# evaluate the model on the test set
y_pred_likes = model.predict(X_test_likes)
mape = mean_absolute_percentage_error(y_test_likes, y_pred_likes)
print('MAPE: {:.2f}%'.format(mape*100))

MAPE: 1384.34%


#XGBoost

In [28]:
import xgboost as xgb

In [29]:
xgb_model_likes = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
# Fit the model to the training data
xgb_model_likes.fit(X_train_likes, y_train_likes)
y_pred_likes_xg = xgb_model_likes.predict(X_test_likes)

In [30]:
print('Mean Absolute Error (MAE) - likes - xg :', metrics.mean_absolute_error(y_test_likes, y_pred_likes_xg))
print('Mean Squared Error (MSE) - likes - xg :', metrics.mean_squared_error(y_test_retweets, y_pred_likes_xg))
print('Root Mean Squared Error (RMSE) - likes - xg :', np.sqrt(metrics.mean_squared_error(y_test_retweets, y_pred_likes_xg)))

mae_xg_likes = metrics.mean_absolute_error(y_test_likes, y_pred_likes_xg)
mse_xg_likes = metrics.mean_squared_error(y_test_likes, y_pred_likes_xg)
r2_xg_likes = np.sqrt(metrics.mean_squared_error(y_test_likes, y_pred_likes_xg))
mape_xg_likes = mean_absolute_percentage_error(y_test_likes, y_pred_likes_xg)



# Mean Absolute Error (MAE) - Likes -rf: 20.05182340666568
# Mean Squared Error (MSE) - Likes -rf: 332818.47495836805
# Root Mean Squared Error (RMSE) - Likes -rf: 576.9042164505023
# Mean absolute percentage error(MAPE) - Likes -rf: 10237827442376686.00%
print('Mean Absolute Error (MAE) - Likes -rf:', mae_xg_likes)
print('Mean Squared Error (MSE) - Likes -rf:', mae_xg_likes)
print('Root Mean Squared Error (RMSE) - Likes -rf:', mae_xg_likes)
print(f"Mean absolute percentage error(MAPE) - Likes -rf: {mae_xg_likes:.2f}%")

Mean Absolute Error (MAE) - likes - xg : 36.50740184487171
Mean Squared Error (MSE) - likes - xg : 96510.28439069098
Root Mean Squared Error (RMSE) - likes - xg : 310.6610442116793
Mean Absolute Error (MAE) - Likes -rf: 36.50740184487171
Mean Squared Error (MSE) - Likes -rf: 36.50740184487171
Root Mean Squared Error (RMSE) - Likes -rf: 36.50740184487171
Mean absolute percentage error(MAPE) - Likes -rf: 36.51%


In [None]:
xgb_model_comments = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
# Fit the model to the training data
xgb_model_comments.fit(X_train, y_train_comments)
y_pred_comments_xg = xgb_model_comments.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE) - comments - xg :', metrics.mean_absolute_error(y_test_comments, y_pred_comments_xg))
print('Mean Squared Error (MSE) - comments - xg :', metrics.mean_squared_error(y_test_comments, y_pred_comments_xg))
print('Root Mean Squared Error (RMSE) - comments - xg :', np.sqrt(metrics.mean_squared_error(y_test_comments, y_pred_comments_xg)))

Mean Absolute Error (MAE) - comments - xg : 1.6754527255308496
Mean Squared Error (MSE) - comments - xg : 282.45990436812184
Root Mean Squared Error (RMSE) - comments - xg : 16.80654349853419


In [None]:
xgb_model_retweets = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
# Fit the model to the training data
xgb_model_retweets.fit(X_train, y_train_retweets)
y_pred_retweets_xg = xgb_model_retweets.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE) - retweets - xg :', metrics.mean_absolute_error(y_test_retweets, y_pred_retweets_xg))
print('Mean Squared Error (MSE) - retweets - xg :', metrics.mean_squared_error(y_test_retweets, y_pred_retweets_xg))
print('Root Mean Squared Error (RMSE) - retweets - xg :', np.sqrt(metrics.mean_squared_error(y_test_retweets, y_pred_retweets_xg)))

Mean Absolute Error (MAE) - retweets - xg : 5.770601299611658
Mean Squared Error (MSE) - retweets - xg : 2512.758071409142
Root Mean Squared Error (RMSE) - retweets - xg : 50.12741835970751


In [31]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [32]:
# Define the input shape
input_shape = (X_train_likes.shape[1],)


In [33]:
# Define the model architecture
nn_model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=input_shape),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1)
])

In [34]:
# Compile the model
nn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.MeanSquaredError(),
    metrics=['mse']
)


In [None]:

# Fit the model on the training data
nn_likes_model = nn_model.fit(
    X_train_likes, y_train_likes,
    batch_size=32,
    epochs=100,
    validation_split=0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
y_pred_likes_nn = nn_model.predict(X_test_likes)



In [None]:
print('Mean Absolute Error (MAE) - likes - nn :', metrics.mean_absolute_error(y_test_likes, y_pred_likes_nn))
print('Mean Squared Error (MSE) - likes - nn :', metrics.mean_squared_error(y_test_likes, y_pred_likes_nn))
print('Root Mean Squared Error (RMSE) - likes - nn :', np.sqrt(metrics.mean_squared_error(y_test_likes, y_pred_likes_nn)))

Mean Absolute Error (MAE) - likes - nn : 21.53940425492583
Mean Squared Error (MSE) - likes - nn : 20380.56872076232
Root Mean Squared Error (RMSE) - likes - nn : 142.76052928159913


In [None]:

# Fit the model on the training data
nn_comments_model = nn_model.fit(
    X_train, y_train_comments,
    batch_size=32,
    epochs=100,
    validation_split=0.2
)

In [None]:
y_pred_comments_nn = nn_comments_model.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE) - comments - nn :', metrics.mean_absolute_error(y_test_comments, y_pred_comments_nn))
print('Mean Squared Error (MSE) - comments - nn :', metrics.mean_squared_error(y_test_comments, y_pred_comments_nn))
print('Root Mean Squared Error (RMSE) - comments - nn :', np.sqrt(metrics.mean_squared_error(y_test_comments, y_pred_comments_nn)))

#Hyper parameter Tuning

In [None]:

# Assuming df_data_dt__tune is your dataset
fraction = 0.7
sampled_data = df_data_dt__tune.sample(frac=fraction)


In [None]:
X_sampled_data = sampled_data.drop(["Likes", "Comments", "Retweets"], axis=1)
y_likes_sampled_data = sampled_data["Likes"]
y_comments_sampled_data = sampled_data["Comments"]
y_retweets_sampled_data = sampled_data["Retweets"]

In [None]:
# Split the data into train and test sets for each target variable separately
X_train_sampled_data_likes, X_test_sampled_data, y_train_likes_sampled_data, y_test_likes_sampled_data = train_test_split(X_sampled_data, y_likes_sampled_data, test_size=0.2, random_state=42)
# X_train_sampled_data, X_test_sampled_data, y_train_comments_sampled_data, y_test_comments_sampled_data = train_test_split(X_sampled_data, y_comments_sampled_data, test_size=0.2, random_state=42)
# X_train_sampled_data, X_test_sampled_data, y_train_retweets_sampled_data, y_test_retweets_sampled_data = train_test_split(X_sampled_data, y_retweets_sampled_data, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the GridSearchCV object
grid_dt_reg = GridSearchCV(estimator=dt_reg_likes, param_grid=params, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_dt_reg.fit(X_train_sampled_data_likes, y_train_likes_sampled_data)

# Print the best parameters and the corresponding score
print(f"Best parameters: {grid_dt_reg.best_params_}")
print(f"Best score: {grid_dt_reg.best_score_}")
# Best parameters: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}
# Best score: 0.0005189076130399872
# Best parameters: {'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
# Best score: -0.005519235262729394
# Best parameters: {'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2}
# Best score: 0.005725634947679081
# Best parameters: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8}
# Best score: 0.0008394816587610432
# Best parameters: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 8}
# Best score: 0.06012973362010281

Best parameters: {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2}
Best score: -0.00076444238494211


In [None]:

params = {
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the GridSearchCV object
grid_dt_reg_comments = GridSearchCV(estimator=dt_reg_comments, param_grid=params, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_dt_reg_comments.fit(X_train_sampled_data, y_train_comments_sampled_data)

# Print the best parameters and the corresponding score
print(f"Best parameters: {grid_dt_reg_comments.best_params_}")
print(f"Best score: {grid_dt_reg_comments.best_score_}")

In [None]:

params = {
    'max_depth': [2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the GridSearchCV object
grid_dt_reg_retweets = GridSearchCV(estimator=dt_reg_retweets, param_grid=params, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_dt_reg_retweets.fit(X_train_sampled_data, y_train_retweets_sampled_data)

# Print the best parameters and the corresponding score
print(f"Best parameters: {grid_dt_reg_retweets.best_params_}")
print(f"Best score: {grid_dt_reg_retweets.best_score_}")



Best parameters: {'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score: -0.0033295605747529766


In [None]:
X = df_data_linear.drop(["Likes", "Comments", "Retweets"], axis=1)
y = df_data_linear[["Likes", "Comments", "Retweets"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)


In [None]:
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 10.48666741719933
Mean Squared Error (MSE): 8153.077703609054
Root Mean Squared Error (RMSE): 90.29439464113514


In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

In [None]:
y_pred = dt_reg.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 9.482577789256046
Mean Squared Error (MSE): 13993.063254857669
Root Mean Squared Error (RMSE): 118.2922789317108


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)


In [None]:
y_pred = rf_reg.predict(X_test)

In [None]:
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 8.722498306152056
Mean Squared Error (MSE): 8003.332198067025
Root Mean Squared Error (RMSE): 89.46134471416705


In [None]:
# Initialize LabelEncoder object
label_encoder = LabelEncoder()

# Convert categorical features to numerical values
day_of_week_encoded = label_encoder.fit_transform(["Monday"])
language_encoded = label_encoder.fit_transform(["English"])
clean_tweet_encoded = label_encoder.fit_transform(["These are the basic steps to create a RandomForestRegressor model to predict the number"])
sentiment_encoded = label_encoder.fit_transform(["Positive"])
key_words_encoded = label_encoder.fit_transform(["bank"])

In [None]:
X_test.dtypes

time                         int64
Day of week                  int64
Cashtags                     int64
Hashtags                     int64
Language                     int64
Location                     int64
Mentioned_users              int64
Followers                  float64
Following                  float64
Verified                     int64
Average_favourite_count    float64
account_age                float64
clean_tweet                  int64
subjectivity               float64
polarity                   float64
sentiment                    int64
topics                       int64
key_words                    int64
dtype: object

In [None]:
input_df = pd.DataFrame({
    "time": [0],  # add missing value
    "Day of week": [day_of_week_encoded[0]],
    "Cashtags": [0],  # add missing value
    "Hashtags": [0],  # add missing value
    "Language": [language_encoded[0]],
    "Location": [0],  # add missing value
    "Mentioned_users": [0],  # add missing value
    "Followers": [5.0],
    "Following": [5.0],
    "Verified": [1],
    "Average_favourite_count": [500.0],
    "account_age": [365.0],
    "clean_tweet": [clean_tweet_encoded[0]],
    "subjectivity": [0.5],
    "polarity": [0.2],
    "sentiment": [sentiment_encoded[0]],
    "topics": [0],  # add missing value
    "key_words": [key_words_encoded[0]]
})


In [None]:
# Make prediction using the RandomForestRegressor model
y_pred = rf_reg.predict(input_df)

# Print predicted values for likes, comments, and retweets
print("Predicted likes: ", y_pred[0][0])
print("Predicted comments: ", y_pred[0][1])
print("Predicted retweets: ", y_pred[0][2])

Predicted likes:  12.28
Predicted comments:  1.91
Predicted retweets:  0.83


In [None]:
import joblib

# Save label encoders
joblib.dump(label_encoder, "label_encoder_day_of_week.pkl")
joblib.dump(label_encoder, "label_encoder_language.pkl")
joblib.dump(label_encoder, "label_encoder_clean_tweet.pkl")
joblib.dump(label_encoder, "label_encoder_sentiment.pkl")
joblib.dump(label_encoder, "label_encoder_key_words.pkl")

['label_encoder_key_words.pkl']

In [None]:

# Save the trained model for Likes
joblib.dump(rf_reg, 'rf_reg_model.pkl')

['rf_reg_model.pkl']