In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def bad_line(x):
    print(x)
    return None

df_data = pd.read_csv('/content/drive/Othercomputers/My Laptop (1)/year4/final_project/data_science/data_set/topic_data_1.csv',  on_bad_lines=bad_line, engine='python')


df_data.dtypes

Tweet                       object
Date                        object
time                        object
Day of week                 object
Cashtags                    object
Hashtags                    object
Language                    object
Location                      bool
Mentioned_users               bool
Followers                  float64
Following                  float64
User_created_date           object
Listed_count               float64
Favourite_count            float64
Tweet_count                float64
Verified                      bool
Average_favourite_count    float64
account_age                float64
Likes                      float64
Comments                   float64
Retweets                   float64
Views                      float64
clean_tweet                 object
subjectivity               float64
polarity                   float64
sentiment                   object
topics                       int64
key_words                   object
dtype: object

In [4]:
df_data_linear = df_data.copy()

In [5]:
# Convert "Cashtags" column to boolean
df_data_linear['Cashtags'] = np.where(df_data_linear['Cashtags'].notnull(), 1, 0)

# Convert "Hashtags" column to boolean
df_data_linear['Hashtags'] = np.where(df_data_linear['Hashtags'].notnull(), 1, 0)

df_data_linear['Location'] = np.where(df_data_linear['Location'].notnull(), 1, 0)

df_data_linear['Mentioned_users'] = np.where(df_data_linear['Mentioned_users'].notnull(), 1, 0)

df_data_linear['Verified'] = np.where(df_data_linear['Verified'].notnull(), 1, 0)

In [6]:
# Drop the columns we won't use for prediction
df_data_linear.drop(['Tweet', 'Date', 'Listed_count', 'Favourite_count', 'Tweet_count', 'User_created_date', 'Views'], axis=1, inplace=True)

In [7]:
df_data_linear['time'] = pd.to_datetime(df_data_linear['time']).dt.hour

In [8]:
df_data_linear.dtypes

time                         int64
Day of week                 object
Cashtags                     int64
Hashtags                     int64
Language                    object
Location                     int64
Mentioned_users              int64
Followers                  float64
Following                  float64
Verified                     int64
Average_favourite_count    float64
account_age                float64
Likes                      float64
Comments                   float64
Retweets                   float64
clean_tweet                 object
subjectivity               float64
polarity                   float64
sentiment                   object
topics                       int64
key_words                   object
dtype: object

In [9]:
label_encoder = LabelEncoder()
df_data_linear["Day of week"] = label_encoder.fit_transform(df_data_linear["Day of week"])
df_data_linear["Language"] = label_encoder.fit_transform(df_data_linear["Language"])
df_data_linear["clean_tweet"] = label_encoder.fit_transform(df_data_linear["clean_tweet"])
df_data_linear["sentiment"] = label_encoder.fit_transform(df_data_linear["sentiment"])
df_data_linear["key_words"] = label_encoder.fit_transform(df_data_linear["key_words"])

In [10]:
X = df_data_linear.drop(["Likes", "Comments", "Retweets"], axis=1)
y = df_data_linear[["Likes", "Comments", "Retweets"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)


In [12]:
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 10.48666741719933
Mean Squared Error (MSE): 8153.077703609054
Root Mean Squared Error (RMSE): 90.29439464113514


In [13]:
from sklearn.tree import DecisionTreeRegressor

In [14]:
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

In [15]:
y_pred = dt_reg.predict(X_test)

In [16]:
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 9.482577789256046
Mean Squared Error (MSE): 13993.063254857669
Root Mean Squared Error (RMSE): 118.2922789317108


In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)


In [19]:
y_pred = rf_reg.predict(X_test)

In [20]:
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 8.722498306152056
Mean Squared Error (MSE): 8003.332198067025
Root Mean Squared Error (RMSE): 89.46134471416705


In [21]:
# Initialize LabelEncoder object
label_encoder = LabelEncoder()

# Convert categorical features to numerical values
day_of_week_encoded = label_encoder.fit_transform(["Monday"])
language_encoded = label_encoder.fit_transform(["English"])
clean_tweet_encoded = label_encoder.fit_transform(["These are the basic steps to create a RandomForestRegressor model to predict the number"])
sentiment_encoded = label_encoder.fit_transform(["Positive"])
key_words_encoded = label_encoder.fit_transform(["bank"])

In [22]:
X_test.dtypes

time                         int64
Day of week                  int64
Cashtags                     int64
Hashtags                     int64
Language                     int64
Location                     int64
Mentioned_users              int64
Followers                  float64
Following                  float64
Verified                     int64
Average_favourite_count    float64
account_age                float64
clean_tweet                  int64
subjectivity               float64
polarity                   float64
sentiment                    int64
topics                       int64
key_words                    int64
dtype: object

In [213]:
input_df = pd.DataFrame({
    "time": [0],  # add missing value
    "Day of week": [day_of_week_encoded[0]],
    "Cashtags": [0],  # add missing value
    "Hashtags": [0],  # add missing value
    "Language": [language_encoded[0]],
    "Location": [0],  # add missing value
    "Mentioned_users": [0],  # add missing value
    "Followers": [5.0],
    "Following": [5.0],
    "Verified": [1],
    "Average_favourite_count": [500.0],
    "account_age": [365.0],
    "clean_tweet": [clean_tweet_encoded[0]],
    "subjectivity": [0.5],
    "polarity": [0.2],
    "sentiment": [sentiment_encoded[0]],
    "topics": [0],  # add missing value
    "key_words": [key_words_encoded[0]]
})


In [214]:
# Make prediction using the RandomForestRegressor model
y_pred = rf_reg.predict(input_df)

# Print predicted values for likes, comments, and retweets
print("Predicted likes: ", y_pred[0][0])
print("Predicted comments: ", y_pred[0][1])
print("Predicted retweets: ", y_pred[0][2])

Predicted likes:  12.28
Predicted comments:  1.91
Predicted retweets:  0.83


In [10]:
import joblib

# Save label encoders
joblib.dump(label_encoder, "label_encoder_day_of_week.pkl")
joblib.dump(label_encoder, "label_encoder_language.pkl")
joblib.dump(label_encoder, "label_encoder_clean_tweet.pkl")
joblib.dump(label_encoder, "label_encoder_sentiment.pkl")
joblib.dump(label_encoder, "label_encoder_key_words.pkl")

['label_encoder_key_words.pkl']

In [216]:

# Save the trained model for Likes
joblib.dump(rf_reg, 'rf_reg_model.pkl')

['rf_reg_model.pkl']