# **BERT Model for Popularity Prediction**
---

# **Connecting to drive**

In [None]:
from google.colab import drive

# Mount Google Drive at the lowercase path used by every later cell
# ('/content/drive/...'); mounting at "/content/Drive" creates a second, unused mount.
drive.mount("/content/drive")

Mounted at /content/Drive


In [None]:
import os

# Mount Google Drive so the files stored there are reachable from this runtime.
drive.mount('/content/drive')

# Name of the dataset file to look for inside the Drive.
file_name = 'final_data.csv'

# Directory where the search for `file_name` starts.
root_dir = '/content/drive/My Drive/'

# Function to recursively search for the file in all directories and subdirectories
def find_file(directory, target_name=None):
    """Return the first directory under ``directory`` that contains the target file.

    Parameters
    ----------
    directory : str
        Root of the directory tree to search.
    target_name : str, optional
        Name of the file to look for. When omitted, the notebook-level
        ``file_name`` is used, which keeps existing ``find_file(root)`` calls working.

    Returns
    -------
    str or None
        Path of the directory holding the file, or ``None`` when no directory
        in the tree contains it (including when ``directory`` does not exist).
    """
    if target_name is None:
        target_name = file_name

    # os.walk silently skips folders it cannot list and does not descend into
    # symlinked directories, so one unreadable or cyclic folder cannot abort
    # the whole search.
    for current_dir, _subdirs, entries in os.walk(directory):
        if target_name in entries and os.path.isfile(os.path.join(current_dir, target_name)):
            return current_dir
    return None

# Locate the folder holding the dataset, starting from the Drive root
file_directory = find_file(root_dir)

# Report the outcome of the search
if not file_directory:
    print("File not found in Google Drive.")
else:
    print("File directory:", file_directory)

Mounted at /content/drive
File directory: /content/drive/My Drive/AI_Desicion_Scineces2_endterm


In [None]:
# Work from the folder located by the search above (instead of a hard-coded
# absolute path) so relative paths such as 'final_data.csv' resolve.
if file_directory is None:
    raise FileNotFoundError(f"{file_name!r} was not found under {root_dir!r}")
os.chdir(file_directory)

In [None]:
#Basic libraries
import pandas as pd
import numpy as np


#NLTK libraries
import nltk
import re
import string
from wordcloud import WordCloud,STOPWORDS
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

#Visualization libraries
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Other miscellaneous libraries
from scipy import interp
from itertools import cycle
import cufflinks as cf
from collections import defaultdict
from collections import Counter
from imblearn.over_sampling import SMOTE

# **Reading the data**
---

In [None]:
# Load the dataset into a DataFrame; the relative path resolves against the
# working directory selected with os.chdir in the previous cell.
df = pd.read_csv('final_data.csv')

In [None]:
df.head()

Unnamed: 0,IDLink,Title,Headline,Topic,PublishDate,SentimentTitle,SentimentHeadline,Facebook,GooglePlus,LinkedIn,PublishTime,Weekday,Facebook_scaled,GooglePlus_scaled,LinkedIn_scaled,SentimentTitle_Category,SentimentHeadline_Category,Source_type,Hour
0,80690.0,"Monday, 29 Feb 2016","RAMALLAH, February 25, 2016 (WAFA) - Palestine...",palestine,2016-02-28,0.0,-0.005906,1.0,1.0,1.0,14:03:00,Sunday,-0.64969,-0.617774,-0.563154,neutral,negative,D,14
1,28854.0,Buffett: Politicians 'Dead Wrong' on Economy,Warren Buffett has a message for presidential ...,economy,2016-02-28,0.051031,-0.037921,0.0,0.0,0.0,19:17:00,Sunday,0.0,0.0,0.0,positive,negative,D,19
2,81052.0,"Monday, 29 Feb 2016","RAMALLAH, February 29, 2016 (WAFA) - The Gover...",palestine,2016-03-01,0.0,0.048546,1.0,1.0,1.0,09:29:00,Tuesday,-0.64969,-0.617774,-0.563154,neutral,positive,D,9
3,80994.0,"Tuesday, 1 Mar 2016","RAMALLAH, February 29, 2016 (WAFA) - The Gover...",palestine,2016-03-01,-0.243068,0.048546,1.0,1.0,1.0,00:15:00,Tuesday,-0.64969,-0.617774,-0.563154,negative,positive,D,0
4,946.0,Microsoft Takes Six Billion Dollars From Android,"A long time ago, Microsoft MSFT +0.00% purchas...",microsoft,2015-11-01,0.0,0.115928,0.0,0.0,0.0,00:00:00,Sunday,0.0,0.0,0.0,neutral,positive,D,0


In [None]:
df.shape

(92808, 19)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92808 entries, 0 to 92807
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   IDLink                      92808 non-null  float64
 1   Title                       92808 non-null  object 
 2   Headline                    92808 non-null  object 
 3   Topic                       92808 non-null  object 
 4   PublishDate                 92808 non-null  object 
 5   SentimentTitle              92808 non-null  float64
 6   SentimentHeadline           92808 non-null  float64
 7   Facebook                    92808 non-null  float64
 8   GooglePlus                  92808 non-null  float64
 9   LinkedIn                    92808 non-null  float64
 10  PublishTime                 92808 non-null  object 
 11  Weekday                     92808 non-null  object 
 12  Facebook_scaled             92808 non-null  float64
 13  GooglePlus_scaled           928

## **Sampling data**
- to keep it manageable

In [None]:
# Fraction of the rows to keep (0.05 keeps 5% of the data)
sample_percentage = 0.05

# Draw a simple random sample; the fixed seed makes the draw repeatable
sample_data = df.sample(random_state=42, frac=sample_percentage)

In [None]:
# Join title and headline with a space; an empty separator fuses the last word
# of the title with the first word of the headline (e.g. "...EconomyWarren ...").
sample_data['text'] = sample_data['Title'] + ' ' + sample_data['Headline']

# **Preprocessing**
---

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import re

# Fetch the NLTK resources used by the preprocessing cells: tokenizer data,
# the stop-word list and WordNet. 'punkt_tab' is the tokenizer data that newer
# NLTK releases load in word_tokenize in place of 'punkt'; requesting both
# keeps the cell working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Define stopwords
stop_words = set(stopwords.words('english'))

# Define stemmer
stemmer = PorterStemmer()

# Define lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize, drop English stop words, then lemmatize and stem each
    remaining token.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and NaN are treated as empty text; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' for empty input).
    """
    # Missing values (None / float NaN) would otherwise crash on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercase, then keep letters only (punctuation and digits become spaces)
    text = re.sub('[^a-zA-Z]', ' ', text.lower())

    # Tokenize once and filter stop words; the tokens are reused directly
    # rather than being joined and tokenized a second time
    tokens = [w for w in word_tokenize(text) if w not in stop_words]

    # Lemmatize first, then stem the lemma
    return ' '.join(stemmer.stem(lemmatizer.lemmatize(w)) for w in tokens)

In [None]:
# Clean the combined text column, then preview the first rows with the rich
# DataFrame display instead of printing the whole frame.
sample_data['text'] = sample_data['text'].apply(preprocess_text)
sample_data.head()

         IDLink                                              Title  \
57406   39210.0  Job report shows upswing in labor statistic, b...   
7781     5687.0  China Reshapes Energy Sector, Shifts Towards M...   
19846   69591.0  Thanks, Obama: Highest Earners' Tax Rates Rose...   
59320   40399.0  Japan economy recovering despite slowing emerg...   
17606   11527.0  FP Watchlist: Canadian economy stalls unexpect...   
...         ...                                                ...   
2638     5353.0  Microsoft's Nadella Makes New Security Push Wi...   
6812     5001.0  Black Friday UK: Sony's PS4 sale and Microsoft...   
87352  102385.0  Rep. Darrell Issa Spent 7 Years Lying About Ob...   
76345   51784.0  Microsoft Unveils Plan To Combat 'Terrorist Co...   
43712   27591.0  UPDATE 1-Irish economy frames voter debate as ...   

                                                Headline      Topic  \
57406  The report shows that the economy and labor-fo...    economy   
7781   Chinese Pr

#### **Dropping and one-hot encoding columns**

In [None]:
# Columns not used downstream: the identifier, the raw text fields (already
# merged into 'text'), the publication date, the numeric sentiment scores and
# the unscaled Facebook / GooglePlus / LinkedIn values.
irrelevant_columns = [
    'IDLink', 'Title', 'Headline', 'PublishDate', 'Facebook',
    'SentimentTitle', 'SentimentHeadline', 'GooglePlus', 'LinkedIn',
]
sample_data = sample_data.drop(columns=irrelevant_columns)


In [None]:
# One-hot encode every categorical feature
categorical_columns = [
    'Topic',
    'SentimentTitle_Category',
    'SentimentHeadline_Category',
    'Source_type',
    'Weekday',
]
sample_data = pd.get_dummies(sample_data, columns=categorical_columns)

# Show the resulting shape and the first rows
print(sample_data.shape)
sample_data.head()

(4640, 27)


Unnamed: 0,PublishTime,Facebook_scaled,GooglePlus_scaled,LinkedIn_scaled,Hour,text,Topic_economy,Topic_microsoft,Topic_obama,Topic_palestine,...,Source_type_B,Source_type_C,Source_type_D,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
57406,18:01:17,-0.615655,-0.617774,-0.563154,18,job report show upsw labor statist economi sti...,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7781,10:33:00,-0.64969,-0.617774,-0.563154,10,china reshap energi sector shift toward market...,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
19846,00:53:42,2.328406,2.386772,2.434875,0,thank obama highest earner tax rate rose sharp...,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
59320,02:13:19,-0.64969,-0.617774,-0.563154,2,japan economi recov despit slow emerg market b...,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
17606,17:20:20,-0.615655,-0.617774,-0.563154,17,fp watchlist canadian economi stall unexpected...,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [None]:
# Regression targets: the scaled Facebook / LinkedIn / GooglePlus columns
target_columns = ['Facebook_scaled', 'LinkedIn_scaled', 'GooglePlus_scaled']

X = sample_data.drop(columns=target_columns)  # features: every column except the targets
y = sample_data[target_columns]               # the three target columns together

### **Splitting the dataset**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4640 entries, 57406 to 43712
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Facebook_scaled    4640 non-null   float64
 1   LinkedIn_scaled    4640 non-null   float64
 2   GooglePlus_scaled  4640 non-null   float64
dtypes: float64(3)
memory usage: 145.0 KB


# **BERT MODEL**

In [None]:
!pip install transformers



In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def _encode_with_special_tokens(text):
    """Encode one string into BERT token ids, special tokens included."""
    return tokenizer.encode(text, add_special_tokens=True)


# Encode the text column of each split into lists of token ids
tokenized_text_train = X_train['text'].apply(_encode_with_special_tokens)
tokenized_text_test = X_test['text'].apply(_encode_with_special_tokens)

In [None]:
# Step 3: Define the BERT Model
# Load the pre-trained 'bert-base-uncased' weights as a TensorFlow model; it is
# called as the encoder inside the regression network defined further down.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Number of token ids in every encoded training text
sequence_lengths = tokenized_text_train.map(len)

# Fixed model input length: the 95th percentile of the training lengths,
# truncated to an integer. Longer sequences are cut to this length when padded.
max_seq_length = int(np.percentile(sequence_lengths, 95))

In [None]:
# Regression network: BERT encoder followed by a small dense head
input_ids = Input(shape=(max_seq_length,), dtype=tf.int32)

# The id sequences are post-padded with 0, which is also the [PAD] id of
# 'bert-base-uncased'. Derive the attention mask from the ids (1 = real token,
# 0 = padding) so BERT ignores the padding instead of attending to it. Building
# the mask inside the model keeps the single-input interface used by fit/predict.
attention_mask = tf.keras.layers.Lambda(
    lambda ids: tf.cast(tf.not_equal(ids, 0), tf.int32),
    name='attention_mask',
)(input_ids)

bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]  # per-token hidden states
x = Flatten()(bert_output)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(3)(x)  # Output layer with 3 units for 3 target variables

model = Model(inputs=input_ids, outputs=output)

In [None]:
# Step 4: Compile the Model
# Adam with a learning rate of 1e-5; mean squared error is the training loss
# for the three regression outputs.
optimizer = Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss='mean_squared_error')

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenized_text_train is a pandas Series of token-id lists. Pad or truncate each
# one at the end ('post') so every training sequence has length max_seq_length.
padded_sequences_train = pad_sequences(tokenized_text_train, maxlen=max_seq_length, padding='post', truncating='post')


In [None]:
# Train for 7 epochs with batches of 32; validation_split=0.2 reserves 20% of
# the training data for validation.
model.fit(padded_sequences_train, y_train, epochs=7, batch_size=32, validation_split=0.2)

Epoch 1/7




Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x7fa8a93464d0>

In [None]:
# Pad or truncate the encoded test texts (tokenized_text_test) at the end ('post')
# to the same max_seq_length that was used for the training sequences.
padded_sequences_test = pad_sequences(tokenized_text_test, maxlen=max_seq_length, padding='post', truncating='post')


In [None]:
# Step 6: Evaluate the Model
# Predict the three targets for the held-out test sequences
y_pred = model.predict(padded_sequences_test)

# Score the predictions against the true test targets
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

for label, value in (("Mean Squared Error (MSE)", mse), ("R-squared (R2)", r2)):
    print(f"{label}: {value}")

Mean Squared Error (MSE): 0.7774585444947104
R-squared (R2): 0.10051964715246682


# **Model Performance:**
---

**Popularity prediction:**

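- The BERT model appeared to improve with each epoch, so training for more epochs might give better results. On the test set it reached an MSE of about 0.78 and an R² of about 0.10, so it currently explains only a small share of the variance in the targets.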

