#Importing libaries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

#Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def bad_line(x):
    print(x)
    return None

df_data = pd.read_csv('/content/drive/Othercomputers/My Laptop (1)/year4/final_project/data_science/data_set/topic_data_1.csv',  on_bad_lines=bad_line, engine='python')


df_data.dtypes

Tweet                       object
Date                        object
time                        object
Day of week                 object
Cashtags                    object
Hashtags                    object
Language                    object
Location                      bool
Mentioned_users               bool
Followers                  float64
Following                  float64
User_created_date           object
Listed_count               float64
Favourite_count            float64
Tweet_count                float64
Verified                      bool
Average_favourite_count    float64
account_age                float64
Likes                      float64
Comments                   float64
Retweets                   float64
Views                      float64
clean_tweet                 object
subjectivity               float64
polarity                   float64
sentiment                   object
topics                       int64
key_words                   object
dtype: object

In [None]:
df_data_linear = df_data.copy()

In [None]:
# Convert "Cashtags" column to boolean
df_data_linear['Cashtags'] = np.where(df_data_linear['Cashtags'].notnull(), True, False)

# Convert "Hashtags" column to boolean
df_data_linear['Hashtags'] = np.where(df_data_linear['Hashtags'].notnull(), True, False)


In [None]:
# Drop the columns we won't use for prediction
df_data_linear.drop(['Tweet', 'Date', 'Listed_count', 'Favourite_count', 'Tweet_count', 'User_created_date', 'Views'], axis=1, inplace=True)


#Cleaning the data for Linear regression


# Convert 'time' column to numerical

In [None]:
df_data_linear['time'] = pd.to_datetime(df_data_linear['time']).dt.hour

# Encode categorical variables

In [None]:
cat_cols = ['Day of week', 'Language', 'sentiment', 'topics', 'key_words', 'time']
for col in cat_cols:
    label_encoder = LabelEncoder()
    df_data_linear[col] = label_encoder.fit_transform(df_data_linear[col])


In [None]:
# Create feature matrix with one-hot encoded categorical variables and token counts
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(df_data_linear['clean_tweet'])
X_text = vectorizer.transform(df_data_linear['clean_tweet'])
cat_cols = ['Cashtags', 'Hashtags', 'Location', 'Mentioned_users', 'Verified', 'Day of week', 'Language', 'sentiment', 'topics', 'key_words', 'time']
cat_transformer = OneHotEncoder()
cat_transformer.fit(df_data_linear[cat_cols])
X_cat = cat_transformer.transform(df_data_linear[cat_cols])
X_num = StandardScaler().fit_transform(df_data_linear.select_dtypes(include=np.number))
X = hstack([X_cat, X_text, X_num])



In [None]:

# Create target vectors
y_likes = df_data_linear['Likes']
y_comments = df_data_linear['Comments']
y_retweets = df_data_linear['Retweets']


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_likes_train, y_likes_test, y_comments_train, y_comments_test, y_retweets_train, y_retweets_test = train_test_split(X, y_likes, y_comments, y_retweets, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Create a Linear Regression model for likes
lr_likes = LinearRegression()

# Fit the model on the training data for likes
lr_likes.fit(X_train, y_likes_train)

# Predict on the testing data for likes
y_likes_pred = lr_likes.predict(X_test)

# Evaluate the model using mean squared error and R-squared for likes
mse_likes = mean_squared_error(y_likes_test, y_likes_pred)
r2_likes = r2_score(y_likes_test, y_likes_pred)

print('Mean squared error (Likes):', mse_likes)
print('R-squared (Likes):', r2_likes)

# Create a Linear Regression model for comments
lr_comments = LinearRegression()

# Fit the model on the training data for comments
lr_comments.fit(X_train, y_comments_train)

# Predict on the testing data for comments
y_comments_pred = lr_comments.predict(X_test)

# Evaluate the model using mean squared error and R-squared for comments
mse_comments = mean_squared_error(y_comments_test, y_comments_pred)
r2_comments = r2_score(y_comments_test, y_comments_pred)

print('Mean squared error (Comments):', mse_comments)
print('R-squared (Comments):', r2_comments)

# Create a Linear Regression model for retweets
lr_retweets = LinearRegression()

# Fit the model on the training data for retweets
lr_retweets.fit(X_train, y_retweets_train)

# Predict on the testing data for retweets
y_retweets_pred = lr_retweets.predict(X_test)

# Evaluate the model using mean squared error and R-squared for retweets
mse_retweets = mean_squared_error(y_retweets_test, y_retweets_pred)
r2_retweets = r2_score(y_retweets_test, y_retweets_pred)

print('Mean squared error (Retweets):', mse_retweets)
print('R-squared (Retweets):', r2_retweets)

Mean squared error (Likes): 1.775328852541558e-05
R-squared (Likes): 0.9999999991289109
Mean squared error (Comments): 7.281810200888075e-08
R-squared (Comments): 0.9999999997215915
Mean squared error (Retweets): 1.5945309175491014e-06
R-squared (Retweets): 0.9999999995967612


In [None]:
import joblib

# Save the trained model for Likes
joblib.dump(lr_likes, 'lr_likes_model.pkl')


# Save the trained model for Comments
joblib.dump(lr_comments, 'lr_comments_model.pkl')

# Save the trained model for Retweets
joblib.dump(lr_retweets, 'lr_retweets_model.pkl')

['lr_retweets_model.pkl']

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Create a Decision Tree model for likes
dt_likes = DecisionTreeRegressor()

# Fit the model on the training data for likes
dt_likes.fit(X_train, y_likes_train)

# Predict on the testing data for likes
y_likes_pred = dt_likes.predict(X_test)

# Evaluate the model using mean squared error and R-squared for likes
mse_likes = mean_squared_error(y_likes_test, y_likes_pred)
r2_likes = r2_score(y_likes_test, y_likes_pred)

print('Mean squared error (Likes):', mse_likes)
print('R-squared (Likes):', r2_likes)


Mean squared error (Likes): 18.897371045882018
R-squared (Likes): 0.9990727749866369


In [None]:

# Create a Decision Tree model for comments
dt_comments = DecisionTreeRegressor()

# Fit the model on the training data for comments
dt_comments.fit(X_train, y_comments_train)

# Predict on the testing data for comments
y_comments_pred = dt_comments.predict(X_test)

# Evaluate the model using mean squared error and R-squared for comments
mse_comments = mean_squared_error(y_comments_test, y_comments_pred)
r2_comments = r2_score(y_comments_test, y_comments_pred)

print('Mean squared error (Comments):', mse_comments)
print('R-squared (Comments):', r2_comments)


Mean squared error (Comments): 2.6180535765175263
R-squared (Comments): 0.9899902839690496


In [None]:

# Create a Decision Tree model for retweets
dt_retweets = DecisionTreeRegressor()

# Fit the model on the training data for retweets
dt_retweets.fit(X_train, y_retweets_train)

# Predict on the testing data for retweets
y_retweets_pred = dt_retweets.predict(X_test)

# Evaluate the model using mean squared error and R-squared for retweets
mse_retweets = mean_squared_error(y_retweets_test, y_retweets_pred)
r2_retweets = r2_score(y_retweets_test, y_retweets_pred)

print('Mean squared error (Retweets):', mse_retweets)
print('R-squared (Retweets):', r2_retweets)



Mean squared error (Retweets): 46.23774579652323
R-squared (Retweets): 0.9883069990393644


In [None]:

# Save the trained model for Likes
joblib.dump(dt_likes, 'dt_likes_model.pkl')


# Save the trained model for Comments
joblib.dump(dt_comments, 'dt_comments_model.pkl')

# Save the trained model for Retweets
joblib.dump(dt_retweets, 'dt_retweets_model.pkl')

['dt_retweets_model.pkl']

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest model for Likes
rf_likes = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data for likes
rf_likes.fit(X_train, y_likes_train)

# Predict on the testing data for likes
y_likes_pred = rf_likes.predict(X_test)

# Evaluate the model using mean squared error and R-squared for likes
mse_likes = mean_squared_error(y_likes_test, y_likes_pred)
r2_likes = r2_score(y_likes_test, y_likes_pred)

print('Mean squared error (Likes):', mse_likes)
print('R-squared (Likes):', r2_likes)

# Create a Random Forest model for Comments
rf_comments = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data for comments
rf_comments.fit(X_train, y_comments_train)

# Predict on the testing data for comments
y_comments_pred = rf_comments.predict(X_test)

# Evaluate the model using mean squared error and R-squared for comments
mse_comments = mean_squared_error(y_comments_test, y_comments_pred)
r2_comments = r2_score(y_comments_test, y_comments_pred)

print('Mean squared error (Comments):', mse_comments)
print('R-squared (Comments):', r2_comments)

# Create a Random Forest model for Retweets
rf_retweets = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data for retweets
rf_retweets.fit(X_train, y_retweets_train)

# Predict on the testing data for retweets
y_retweets_pred = rf_retweets.predict(X_test)

# Evaluate the model using mean squared error and R-squared for retweets
mse_retweets = mean_squared_error(y_retweets_test, y_retweets_pred)
r2_retweets = r2_score(y_retweets_test, y_retweets_pred)

print('Mean squared error (Retweets):', mse_retweets)
print('R-squared (Retweets):', r2_retweets)


Mean squared error (Likes): 4.1064778889997156
R-squared (Likes): 0.9997985101204682
Mean squared error (Comments): 7.33630169920205
R-squared (Comments): 0.9719508044506591
Mean squared error (Retweets): 18.673644432174424
R-squared (Retweets): 0.9952776473307139


In [None]:

# Save the trained model for Likes
joblib.dump(rf_likes, 'rf_likes_model.pkl')

# Save the trained model for Comments
joblib.dump(rf_comments, 'rf_comments_model.pkl')

# Save the trained model for Retweets
joblib.dump(rf_retweets, 'rf_retweets_model.pkl')

In [None]:
label_encoder = LabelEncoder()
df_data_linear['Day of week'] = label_encoder.fit_transform(df_data_linear['Day of week'])
df_data_linear['Language'] = label_encoder.fit_transform(df_data_linear['Language'])
df_data_linear['sentiment'] = label_encoder.fit_transform(df_data_linear['sentiment'])
df_data_linear['topics'] = label_encoder.fit_transform(df_data_linear['topics'])
df_data_linear['key_words'] = label_encoder.fit_transform(df_data_linear['key_words'])


# create an instance of CountVectorizer with desired settings
vectorizer = CountVectorizer(stop_words='english')

# fit the vectorizer to the clean_tweet column
vectorizer.fit(df_data_linear['clean_tweet'])

# transform the clean_tweet column into a matrix of token counts
X_text = vectorizer.transform(df_data_linear['clean_tweet'])

# One-hot encode boolean variables

In [None]:
transformer = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), ['Cashtags', 'Hashtags', 'Location', 'Mentioned_users', 'Verified'])], remainder='passthrough')
df_data_linear = pd.DataFrame(transformer.fit_transform(df_data_linear))

In [None]:
# concatenate the one-hot encoded features, text features, and numerical features
X = np.concatenate((transformer.transform(df_data_linear), X_text.toarray(), df_data_linear[['Followers', 'Friends', 'time', 'Day of week', 'Language', 'sentiment']].values), axis=1)

# scale the numerical features using StandardScaler
scaler = StandardScaler()
X[:, -6:] = scaler.fit_transform(X[:, -6:])

# set the target variables (Likes, Comments, and Retweets)
y = df_data_linear[['Likes', 'Comments', 'Retweets']].values

ValueError: ignored