In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [36]:
import os, sys
import re
import json
import glob
import datetime
from collections import Counter

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import nltk

from nltk.corpus import stopwords
from wordcloud import WordCloud
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer


In [3]:
# Work from the project root so the `src` package imported below resolves.
# The check makes this cell idempotent: an unconditional os.chdir('..') would
# climb one more directory every time the cell is re-run.
if not os.path.isdir('src'):
    os.chdir('..')

In [32]:
from src.loader import NewsDataLoader

In [5]:
from src.config import cfg

Output File: news_output.csv
Path: data
News Data: rating.csv
Traffic Data: trafiic.csv
Domain Location Data: domains_location.csv


In [33]:
import src.utils as utils

In [7]:
# Instantiate the project's data loader with the path taken from the config object.
news_data_loader = NewsDataLoader(cfg.path)

In [8]:
# Fetch the news data through the loader; the later cells read their text
# columns from this frame.
original_news_df = news_data_loader.get_news_data()

In [9]:
# List the column names available in the loaded news data.
original_news_df.columns

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'article', 'title_sentiment'],
      dtype='object')

In [10]:
# Count the missing values in each column.
original_news_df.isna().sum()

article_id             0
source_id          40585
source_name            0
author              2163
title                  0
description           10
url                    0
url_to_image        3451
published_at           0
content                0
category              21
article                0
title_sentiment        0
dtype: int64

In [37]:

# Download the NLTK 'punkt' tokenizer data into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [64]:
# Pull out the two text columns whose keywords are compared further down.
test_title = original_news_df['title']
test_content = original_news_df['content']

# Print the number of entries in each column (title first, then content) to
# confirm the two can be paired row by row.
for text_column in (test_title, test_content):
    print(text_column.count())

# NOTE: cleaning the content with utils.clean_text was tried here and is
# currently disabled.

58356
58356


In [28]:

# KeyBERT on its own is great, but we also need a vectorizer (passed to
# extract_keywords) to get keywords that sound grammatically correct.
kw_model = KeyBERT()

In [65]:
# Extract keyphrases for every title, then for every content body. Each call
# gets its own freshly constructed keyphrase vectorizer.
title_vectorizer = KeyphraseCountVectorizer()
title_keywords = kw_model.extract_keywords(
    docs=test_title,
    vectorizer=title_vectorizer,
)

content_vectorizer = KeyphraseCountVectorizer()
content_keywords = kw_model.extract_keywords(
    docs=test_content,
    vectorizer=content_vectorizer,
)

def flatten_and_extract_keywords(list_of_tuples):
    """Return the first element of every entry, preserving order.

    Args:
        list_of_tuples: Iterable of indexable entries; in this notebook each
            entry is one item of a keyword-extraction result, with the
            keyword text in position 0.

    Returns:
        list: The position-0 element of each entry, in input order.
    """
    keywords = []
    for entry in list_of_tuples:
        keywords.append(entry[0])
    return keywords

def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are de-duplicated into sets; the score is the size of their
    intersection divided by the size of their union.

    Args:
        list1: Iterable of hashable items (e.g. keyword strings).
        list2: Iterable of hashable items to compare against ``list1``.

    Returns:
        float: A value between 0.0 and 1.0. If both inputs are empty the
        union is empty and the ratio is undefined; 0.0 is returned in that
        case instead of raising ``ZeroDivisionError``.
    """
    set1 = set(list1)
    set2 = set(list2)
    union = set1 | set2
    if not union:
        # Neither side has any items, so there is no overlap to measure.
        return 0.0
    return len(set1 & set2) / len(union)

# Calculate Jaccard similarity for each title/content keyword pair.
# Only the first few pairs are printed as a sanity check: printing every pair
# would emit one long line per article and flood the notebook output. The full
# set of scores is still collected in `similarities`.
PREVIEW_PAIRS = 10
similarities = []
for pair_index, (title_list, content_list) in enumerate(zip(title_keywords, content_keywords)):
    title_keywords_flat = flatten_and_extract_keywords(title_list)
    content_keywords_flat = flatten_and_extract_keywords(content_list)
    similarity = jaccard_similarity(title_keywords_flat, content_keywords_flat)
    similarities.append(similarity)
    if pair_index < PREVIEW_PAIRS:
        print(f"Jaccard similarity: {similarity} for title keywords: {title_keywords_flat} and content keywords: {content_keywords_flat}")

# Bar chart of the similarity score for every title/content pair, drawn with
# the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
pair_positions = range(len(similarities))
ax.bar(pair_positions, similarities)
ax.set_xlabel('Pair Index')
ax.set_ylabel('Jaccard Similarity')
ax.set_title('Jaccard Similarity between Title and Content Keywords')
plt.show()