In [1]:
!pip install bertopic

[0m

In [None]:
import pandas as pd
from bertopic import BERTopic
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages named below. The preprocessing function in this
# notebook calls word_tokenize, stopwords.words and WordNetLemmatizer, which
# presumably rely on these packages being present — confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Load the Instagram dataset from a hard-coded absolute path.
# NOTE(review): "/content/" looks like a Colab upload location — confirm before running elsewhere.
data = pd.read_csv("/content/Instagram.csv")
# Bare expression so the notebook renders the loaded frame.
data

In [None]:
# Force both text columns to plain strings for the text-cleaning step.
# Missing values are replaced with "" first: astype('str') on its own would turn a NaN
# into the literal text "nan", which would then be tokenized and modeled as a real word.
data["Hashtags"] = data["Hashtags"].fillna("").astype('str')
data["Captions"] = data["Captions"].fillna("").astype('str')

In [None]:
# Show how many rows carry each distinct value of the "Clickbait" column.
data["Clickbait"].value_counts()

In [None]:
# Lazily-built NLTK helpers shared by every clean_and_preprocess call.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return (stop_words, lemmatizer), creating them on first use only."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _NLP_RESOURCES["stop_words"], _NLP_RESOURCES["lemmatizer"]


def clean_and_preprocess(text):
    """Normalize a piece of text for downstream embedding / topic modeling.

    Steps, in order: lowercase, delete every character that is not an ASCII
    letter or whitespace, tokenize with ``word_tokenize``, drop English
    stopwords, lemmatize each remaining token, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned tokens joined by a single space (may be an empty string).
    """
    # The stopword set and the lemmatizer do not depend on `text`; they are built
    # once and reused instead of being re-created for every row.
    stop_words, lemmatizer = _nlp_resources()

    # 1. Convert text to lowercase
    text = text.lower()

    # 2. Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Tokenize the text into words
    tokens = word_tokenize(text)

    # 4 + 5. Drop stopwords, then lemmatize the surviving words
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # 6. Join the preprocessed tokens back into a clean text
    return ' '.join(kept)

In [None]:
# Pull the two text columns out of the frame as plain Python lists.
Hashtags = data["Hashtags"].tolist()
Captions = data["Captions"].tolist()

In [None]:
# Run every hashtag string and every caption through clean_and_preprocess,
# keeping the results in the same row order as `data`.
cleaned_Hashtags_ = [clean_and_preprocess(Hashtags[row]) for row in range(len(data))]
cleaned_Captions_ = [clean_and_preprocess(Captions[row]) for row in range(len(data))]

# Created empty here; nothing in this cell fills it.
similarity_score = []

In [None]:
# Load BertTokenizer and BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize and encode the video titles using BERT
encoded_captions = [tokenizer(title, padding=True, truncation=True, return_tensors='pt') for title in cleaned_Captions_]

# Extract embeddings from BERT for titles
caption_embeddings = []
for encoded_caption in encoded_captions:
    with torch.no_grad():
        outputs = model(**encoded_caption)
    caption_embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

In [None]:
# Stack the title embeddings into a numpy array
# Stack the per-caption embedding vectors into one numpy array (one entry per caption along axis 0).
caption_embeddings_matrix = np.stack(caption_embeddings)

In [None]:
# Tokenize and encode the video transcripts using BERT
encoded_hashtags = [tokenizer(hashtag, padding=True, truncation=True, return_tensors='pt') for hashtag in cleaned_Hashtags_]

# Extract embeddings from BERT for transcripts
hashtag_embeddings = []
for encoded_hashtag in encoded_hashtags:
    with torch.no_grad():
        outputs = model(**encoded_hashtag)
    hashtag_embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

# Stack the transcript embeddings into a numpy array
hashtag_embeddings_matrix = np.stack(hashtag_embeddings)

In [None]:
# Perform topic modeling using BERTopic for titles
caption_bertopic = BERTopic(min_topic_size=2)  # Increase the min_topic_size if needed
caption_topics, _ = caption_bertopic.fit_transform(cleaned_Captions_)

# Perform topic modeling using BERTopic for transcripts
hashtag_bertopic = BERTopic(min_topic_size=2)  # Increase the min_topic_size if needed
hashtag_topics, _ = hashtag_bertopic.fit_transform(cleaned_Hashtags_)

# Create DataFrames to display the results
caption_results = pd.DataFrame({'Video Title': cleaned_Captions_, 'Topic (Caption)': caption_topics})
hashtag_results = pd.DataFrame({'Transcript': cleaned_Hashtags_, 'Topic (Hashtag)': hashtag_topics})

In [None]:
# Display the cleaned captions alongside their assigned topic ids.
caption_results

In [None]:
# Display the cleaned hashtags alongside their assigned topic ids.
hashtag_results

In [None]:
# Predicted label per row: 0 when the caption topic id equals the hashtag topic id, otherwise 1.
label_list = [
    0 if caption_results["Topic (Caption)"][row] == hashtag_results["Topic (Hashtag)"][row] else 1
    for row in range(len(data))
]

In [None]:
# Ground-truth labels: the "Clickbait" column of the loaded dataset.
true_labels = data["Clickbait"]

In [None]:
# Pair each predicted label with its ground-truth label once, then tally the four outcomes.
outcome_pairs = [(label_list[row], true_labels[row]) for row in range(len(true_labels))]

# Predicted 1: correct when the true label is 1, wrong when it is 0.
True_clickbait = sum(1 for predicted, actual in outcome_pairs if predicted == 1 and actual == 1)
False_clickbait = sum(1 for predicted, actual in outcome_pairs if predicted == 1 and actual == 0)

# Predicted 0: correct when the true label is 0, wrong when it is 1.
True_nonclickbait = sum(1 for predicted, actual in outcome_pairs if predicted == 0 and actual == 0)
False_nonclickbiat = sum(1 for predicted, actual in outcome_pairs if predicted == 0 and actual == 1)

In [None]:
# Print the four outcome tallies, one "name value" line each.
for outcome_name, outcome_count in (
    ("True_clickbait", True_clickbait),
    ("False_clickbait", False_clickbait),
    ("True_nonclickbait", True_nonclickbait),
    ("False_nonclickbiat", False_nonclickbiat),
):
    print(outcome_name, outcome_count)

In [None]:
# Accuracy = correctly labelled rows divided by all counted rows.
correct_predictions = True_clickbait + True_nonclickbait
counted_rows = True_clickbait + False_clickbait + True_nonclickbait + False_nonclickbiat
accuracy = correct_predictions / counted_rows
accuracy

In [None]:
# Precision = correct clickbait predictions / all rows predicted as clickbait.
# If no row was predicted as clickbait the ratio is undefined; report 0.0 rather than
# stopping the notebook with a ZeroDivisionError.
predicted_clickbait = True_clickbait + False_clickbait
Precision = True_clickbait / predicted_clickbait if predicted_clickbait else 0.0
Precision

In [None]:
# Recall = correct clickbait predictions / all rows whose true label is clickbait.
# If there are no true clickbait rows the ratio is undefined; report 0.0 rather than
# stopping the notebook with a ZeroDivisionError.
actual_clickbait = True_clickbait + False_nonclickbiat
Recall = True_clickbait / actual_clickbait if actual_clickbait else 0.0
Recall

In [None]:
# Work on a copy so that adding the feature column does not silently modify `data`.
scores_df = data.copy()

# "Bert Feature" is 1 when a row's hashtag topic id differs from its caption topic id, else 0.
# The comparison is positional and the whole column is assigned in one step: writing
# element by element through `scores_df["Bert Feature"].iloc[i] = ...` is chained
# assignment, which pandas warns about and which does not write through to the frame
# when Copy-on-Write is enabled.
topics_differ = (
    hashtag_results["Topic (Hashtag)"].to_numpy()
    != caption_results["Topic (Caption)"].to_numpy()
)
scores_df["Bert Feature"] = topics_differ.astype(int)

# Alternative, wider column drop kept for reference:
#scores_df = scores_df.drop(columns=["Captions", "Hashtags", "SearchedTag", "LengthOfHashtags", "LengthOfCaptions", "URLInclusion", "BodySnap", "Marketing", "ProductOnly", "NonFashion", "Face", "Logo", "BrandLogo", "Smile", "Outdoor", "MentionInclusion", "EmojiCount", "EmojiExistence", "EmojiPortion", "Top100HashOfInsta", "Top100HashWithinData", "Top100ComentionedHashPair", "Selfie"])

# Drop rows with any missing value, then remove the raw text columns before modeling.
scores_df = scores_df.dropna()
scores_df = scores_df.drop(columns=["Captions", "Hashtags", "SearchedTag"])
scores_df


In [None]:
# Show how many rows carry each value of the "Bert Feature" column.
scores_df["Bert Feature"].value_counts()

In [None]:
#SVM Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Logistic Regression Imports
from sklearn import linear_model

#KNN Imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

def train_svm(df, df_xcols, df_ycol, kernel_type, test_size=0.3, random_state=None):
    """Fit an SVC on a random train/test split of ``df`` and print test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    df_xcols : list of str
        Names of the feature columns.
    df_ycol : str
        Name of the target column.
    kernel_type : str
        Value passed to ``SVC(kernel=...)``.
    test_size : float, default 0.3
        Fraction of rows held out for testing (the default keeps the 70/30 split).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` gives a different split on
        every call; pass an integer to make the printed metrics reproducible.

    Returns
    -------
    sklearn.svm.SVC
        The fitted classifier.
    """
    features = df[df_xcols]
    target = df[df_ycol]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    clf = SVC(kernel=kernel_type)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Report the held-out metrics
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))
    print("Recall:",metrics.recall_score(y_test, y_pred))
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))

    return clf

def train_logr(df, feature_cols, df_ycol, test_size=0.3, random_state=None):
    """Fit a logistic regression on a random train/test split of ``df`` and print test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    feature_cols : list of str
        Names of the feature columns.
    df_ycol : str
        Name of the target column.
    test_size : float, default 0.3
        Fraction of rows held out for testing (the default keeps the 70/30 split).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` gives a different split on
        every call; pass an integer to make the printed metrics reproducible.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted classifier (created with ``max_iter=1000``).
    """
    features = df[feature_cols]
    target = df[df_ycol]

    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    logr = linear_model.LogisticRegression(max_iter = 1000)
    # The training target is flattened to a 1-D array before fitting.
    logr.fit(X_train, y_train.values.ravel())
    y_pred = logr.predict(X_test)

    # Report the held-out metrics
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))
    print("Recall:",metrics.recall_score(y_test, y_pred))
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))

    return logr

def train_KNN(df, feature_cols, df_ycol, neighbors, test_size=0.3, random_state=None):
    """Fit a k-nearest-neighbours classifier on a random split of ``df`` and print test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    feature_cols : list of str
        Names of the feature columns.
    df_ycol : str
        Name of the target column.
    neighbors : int
        Value passed to ``KNeighborsClassifier(n_neighbors=...)``.
    test_size : float, default 0.3
        Fraction of rows held out for testing (the default keeps the 70/30 split).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` gives a different split on
        every call; pass an integer to make the printed metrics reproducible.

    Returns
    -------
    sklearn.neighbors.KNeighborsClassifier
        The fitted classifier.
    """
    # Create feature and target arrays
    features = df[feature_cols]
    target = df[df_ycol]

    # Split into training and test set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Report the held-out metrics
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))
    print("Recall:",metrics.recall_score(y_test, y_pred))
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))

    return knn

In [None]:
# SVM (RBF kernel) on the engagement columns plus the topic-mismatch feature.
# Stored under its own name rather than `model`, which already refers to the BERT
# encoder loaded earlier in this notebook and is still needed if the embedding cells are re-run.
svm_bert_model = train_svm(scores_df, ["Likes", "Comments", "Followings", "Followers", "Bert Feature"], 'Clickbait', 'rbf')

In [None]:
# Logistic regression on the engagement columns plus the topic-mismatch feature.
# Stored under its own name rather than `model`, which already refers to the BERT
# encoder loaded earlier in this notebook and is still needed if the embedding cells are re-run.
logr_bert_model = train_logr(scores_df, ["Likes", "Comments", "Followings", "Followers", "Bert Feature"], 'Clickbait')

In [None]:
# 7-nearest-neighbours on the engagement columns plus the topic-mismatch feature.
# Stored under its own name rather than `model`, which already refers to the BERT
# encoder loaded earlier in this notebook and is still needed if the embedding cells are re-run.
knn_bert_model = train_KNN(scores_df, ["Likes", "Comments", "Followings", "Followers", "Bert Feature"], 'Clickbait', 7)

In [None]:
# Baseline frame: the same rows as scores_df with the "Bert Feature" column removed.
noscores_df = scores_df.drop(columns=["Bert Feature"])

In [None]:
# Baseline SVM (RBF kernel) on the engagement columns only.
# Stored under its own name rather than `model`, which already refers to the BERT
# encoder loaded earlier in this notebook and is still needed if the embedding cells are re-run.
svm_baseline_model = train_svm(noscores_df, ["Likes", "Comments", "Followings", "Followers"], 'Clickbait', 'rbf')

In [None]:
# Baseline logistic regression on the engagement columns only.
# Stored under its own name rather than `model`, which already refers to the BERT
# encoder loaded earlier in this notebook and is still needed if the embedding cells are re-run.
logr_baseline_model = train_logr(noscores_df, ["Likes", "Comments", "Followings", "Followers"], 'Clickbait')

In [None]:
# Baseline 7-nearest-neighbours on the engagement columns only.
# Stored under its own name rather than `model`, which already refers to the BERT
# encoder loaded earlier in this notebook and is still needed if the embedding cells are re-run.
knn_baseline_model = train_KNN(noscores_df, ["Likes", "Comments", "Followings", "Followers"], 'Clickbait', 7)