In [26]:
import pandas as pd

In [27]:
# Load the news articles; later cells read the 'title', 'summary', 'link' and 'published' columns.
data=pd.read_csv('news_data.csv')
# Preview the first rows to sanity-check the columns.
data.head()

Unnamed: 0,source,title,link,summary,published
0,BBC News,Analysis: Biden’s pardon for son shows preside...,https://www.bbc.com/news/articles/c4gplr65prno,Joe Biden may be criticised for protecting his...,"Mon, 02 Dec 2024 03:23:57 GMT"
1,BBC News,BBC faces pressure to pause MasterChef over Wa...,https://www.bbc.com/news/articles/cvgx5ngze3xo,Rupa Huq said the continued broadcast of the s...,"Mon, 02 Dec 2024 10:23:30 GMT"
2,BBC News,Jeremy Bowen: Syria's rebel offensive is aston...,https://www.bbc.com/news/articles/cvgpdpgz4kdo,The country's president has already shown he i...,"Sun, 01 Dec 2024 21:16:05 GMT"
3,BBC News,‘Italian’ purees in UK supermarkets likely to ...,https://www.bbc.com/news/articles/crezlw4y152o,Some products described as “Italian” appear to...,"Sun, 01 Dec 2024 22:01:42 GMT"
4,BBC News,Georgia's PM hits back as protests and resigna...,https://www.bbc.com/news/articles/cp878819l8wo,After days of demonstrations and public resign...,"Mon, 02 Dec 2024 09:27:11 GMT"


In [28]:
# Count missing values per column to see which text fields need imputing.
data.isnull().sum()

source        0
title         0
link          0
summary      17
published     0
dtype: int64

In [29]:
# Fill missing summaries with the article title so every row has some text to vectorise.
# Note: this overwrites data['summary'] in place.
data['summary'] = data['summary'].fillna(data['title'])
# Re-check the per-column null counts after filling.
data.isnull().sum()

source       0
title        0
link         0
summary      0
published    0
dtype: int64

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF to convert the article summaries to a matrix of numbers.
# stop_words='english' asks the vectorizer to ignore its built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vocabulary on the summaries and build the TF-IDF matrix (one row per article).
tfidf_matrix = tfidf.fit_transform(data['summary'])

# Check the shape of the TF-IDF matrix: (number of articles, vocabulary size).
print(tfidf_matrix.shape)


(245, 4837)


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between articles based on their summaries.
# The result is square: entry [i, j] compares the summaries of rows i and j of `data`.
# `cosine_sim` is a notebook-level name that later cells pass to the recommenders.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print the cosine similarity matrix shape to confirm the computation
print(cosine_sim.shape)


(245, 245)


In [32]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_tfidf_and_svd(data, n_components=100):
    """Build an article-to-article cosine-similarity matrix in a reduced TF-IDF space.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'title' and 'summary' columns. Missing values in either
        column are treated as empty text.
    n_components : int, default 100
        Requested number of SVD components. It is capped at the vocabulary
        size, because TruncatedSVD rejects more components than features.

    Returns
    -------
    numpy.ndarray
        Square matrix whose entry [i, j] is the cosine similarity between the
        reduced vectors of rows i and j of `data`.
    """
    # Combine title and summary for a richer representation of each article.
    # Filling first keeps a single missing field from turning the whole
    # concatenated document into NaN, which the vectorizer cannot process.
    corpus = data['title'].fillna('') + " " + data['summary'].fillna('')

    # Convert the text to TF-IDF vectors, ignoring English stop words.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Small corpora can have fewer distinct terms than the requested number of
    # components; cap the request instead of failing inside TruncatedSVD.
    n_components = min(n_components, tfidf_matrix.shape[1])

    # Reduce dimensionality; the fixed random_state keeps results repeatable.
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    reduced_matrix = svd.fit_transform(tfidf_matrix)

    # Compute cosine similarity between all articles in the reduced space.
    cosine_sim = cosine_similarity(reduced_matrix, reduced_matrix)

    return cosine_sim

In [33]:
def recommend_articles_svd(title, cosine_sim, top_n=5, articles=None):
    """Recommend the articles most similar to the one with the given title.

    Parameters
    ----------
    title : str
        Exact title to look up. If several rows share it, the first is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Row i is read as the similarities of the
        article at row *position* i of the article table.
    top_n : int, default 5
        Maximum number of recommendations to return.
    articles : pandas.DataFrame, optional
        Table with 'title' and 'link' columns. Defaults to the notebook-level
        `data` frame, which is what the original version always used.

    Returns
    -------
    pandas.DataFrame or str
        The 'title' and 'link' columns of the recommended rows, most similar
        first, or the string "Article not found in dataset." when no row has
        the requested title.
    """
    if articles is None:
        articles = data

    # Work with row positions, not index labels: cosine_sim and .iloc are both
    # positional, so labels would be wrong for any non-default index.
    matches = [pos for pos, hit in enumerate(articles['title'] == title) if hit]

    if not matches:
        return "Article not found in dataset."

    idx = matches[0]  # Use the first match if the title is duplicated

    # Rank every *other* article by similarity, highest first. The queried
    # article is excluded explicitly: skipping "the first sorted entry" is
    # wrong when another article ties with it and sorts ahead of it.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep the positions of the top N most similar articles.
    article_indices = [pos for pos, _ in ranked[:max(top_n, 0)]]

    # Return the top N most similar articles with their titles and links.
    return articles.iloc[article_indices][['title', 'link']]

In [35]:
# Look up recommendations for one article by its exact title.
# NOTE(review): `cosine_sim` is whatever the kernel currently holds. In the cells
# visible here it is only assigned from the plain TF-IDF matrix, and
# compute_tfidf_and_svd() is defined but never called — confirm which matrix
# this example is meant to use.
article_title = "Joe Biden issues ‘full and unconditional’ pardon to son Hunter"
recommended_articles = recommend_articles_svd(article_title, cosine_sim)
# The function returns a message string when the title is not found, so print covers both cases.
print(recommended_articles)

                                                 title  \
115  With his pardon of son Hunter, Joe Biden deliv...   
223  Biden Issues a ‘Full and Unconditional Pardon’...   
0    Analysis: Biden’s pardon for son shows preside...   
224  Analysis: In Pardoning Hunter, Biden Sounds a ...   
152  UK house prices rise at fastest rate in nearly...   

                                                  link  
115  https://www.theguardian.com/us-news/2024/dec/0...  
223  https://www.nytimes.com/2024/12/01/us/politics...  
0       https://www.bbc.com/news/articles/c4gplr65prno  
224  https://www.nytimes.com/2024/12/01/us/politics...  
152  https://www.theguardian.com/money/2024/dec/02/...  


In [36]:
import numpy as np
# Article titles become the column labels of the interaction matrix.
article_titles = data['title'].tolist()

# Simulate synthetic user interactions (random 0 or 1 for interaction)
num_users = 100  # Simulate 100 users
num_articles = len(article_titles)

# Randomly generate interactions (1 = interacted, 0 = not interacted).
# The fixed seed makes the synthetic matrix repeatable; randint's upper bound is exclusive.
np.random.seed(42)
interaction_matrix = np.random.randint(0, 2, size=(num_users, num_articles))

# Create a DataFrame for the interaction matrix: rows are "user1".."user<num_users>", columns are titles.
user_ids = [f"user{i+1}" for i in range(num_users)]
interaction_df = pd.DataFrame(interaction_matrix, index=user_ids, columns=article_titles)

print(interaction_df)

         Analysis: Biden’s pardon for son shows presidents now act differently  \
user1                                                    0                       
user2                                                    1                       
user3                                                    1                       
user4                                                    0                       
user5                                                    1                       
...                                                    ...                       
user96                                                   1                       
user97                                                   0                       
user98                                                   1                       
user99                                                   0                       
user100                                                  0                       

         BBC fa

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute Cosine Similarity between users based on their interactions.
# Each row of interaction_df is one user's 0/1 vector over all articles.
user_similarity = cosine_similarity(interaction_df)

# Convert similarity matrix into a DataFrame for better readability;
# both axes are labelled with the user ids so it can be indexed by user.
user_similarity_df = pd.DataFrame(user_similarity, index=interaction_df.index, columns=interaction_df.index)
print(user_similarity_df)


            user1     user2     user3     user4     user5     user6     user7  \
user1    1.000000  0.516019  0.539474  0.500913  0.533500  0.510126  0.496380   
user2    0.516019  1.000000  0.587855  0.446959  0.463967  0.537654  0.484698   
user3    0.539474  0.587855  1.000000  0.473408  0.472970  0.537295  0.518505   
user4    0.500913  0.446959  0.473408  1.000000  0.504741  0.490287  0.390925   
user5    0.533500  0.463967  0.472970  0.504741  1.000000  0.523042  0.421199   
...           ...       ...       ...       ...       ...       ...       ...   
user96   0.434519  0.465891  0.467025  0.407277  0.480464  0.475191  0.448435   
user97   0.547774  0.534884  0.564951  0.497558  0.504666  0.506028  0.434557   
user98   0.469172  0.498318  0.546176  0.428442  0.481051  0.500067  0.424565   
user99   0.530617  0.510159  0.541681  0.433588  0.451982  0.528473  0.481223   
user100  0.433469  0.553504  0.553134  0.460466  0.495726  0.481531  0.421199   

            user8     user9

In [41]:
def recommend_for_user(user_id, interaction_df, user_similarity_df, top_n=3):
    """Recommend unseen articles to a user from other users' interactions.

    Each article is scored by the similarity-weighted average of the other
    users' interactions with it. Only articles the user has not interacted
    with (value 0 in their row) are eligible.

    Parameters
    ----------
    user_id : label
        Row label of the user in `interaction_df`, and row/column label in
        `user_similarity_df`.
    interaction_df : pandas.DataFrame
        Users x articles matrix of interaction values.
    user_similarity_df : pandas.DataFrame
        Users x users similarity matrix.
    top_n : int, default 3
        Maximum number of article labels to return.

    Returns
    -------
    list
        Column labels of the highest-scoring unseen articles, best first.
    """
    # Similarity of every other user to this one, most similar first.
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)

    # Exclude the user itself (remove diagonal)
    similar_users = similar_users.drop(user_id)

    # Interaction rows of the other users, in the same order as their weights.
    similar_users_articles = interaction_df.loc[similar_users.index]

    total_similarity = similar_users.sum()
    if pd.notna(total_similarity) and total_similarity != 0:
        # Weighted average interaction per article (weights = similarity).
        article_scores = similar_users_articles.T.dot(similar_users) / total_similarity
    else:
        # No usable weights: dividing would produce NaN scores and an
        # arbitrary ranking, so fall back to the plain average interaction.
        article_scores = similar_users_articles.mean(axis=0)

    # Keep only articles the user has not interacted with, then take the best.
    not_seen = interaction_df.loc[user_id] == 0
    recommended_articles = article_scores[not_seen].sort_values(ascending=False).head(top_n)

    return recommended_articles.index.tolist()

# Example of recommending articles for user5 (top_n is left at its default).
recommended_articles = recommend_for_user('user5', interaction_df, user_similarity_df)
print("Recommended articles for user5:", recommended_articles)

Recommended articles for user5: ['Weather tracker: lake-effect snow blankets parts of north-east US', "Here's what you need to know if you haven't filed your return yet — and even if you have", 'Swansea mother ‘traumatised’ by arrest under Terrorism Act']


In [50]:
# 1. Parse the 'published' strings into datetimes (overwrites the column in place).
#    errors='coerce' turns values that cannot be parsed into NaT instead of raising.
data['published'] = pd.to_datetime(data['published'], errors='coerce')

# 2. Create a 'time_of_day' column based on the hour of publication
def get_time_of_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Parameters
    ----------
    hour : number or missing value
        Hour of the day. No timezone conversion happens here; the label
        describes whatever clock the hour was taken from.

    Returns
    -------
    str or None
        'morning' for 5 <= hour < 12, 'afternoon' for 12 <= hour < 17,
        'evening' for 17 <= hour < 21, and 'night' for any other hour.
        Returns None when the hour is missing (None/NaN), so rows with an
        unparseable publication date are not silently labelled 'night'.
    """
    # A missing hour fails every range comparison and would otherwise fall
    # through to the final 'night' branch.
    if pd.isna(hour):
        return None

    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

# Apply the function to get the time of day.
# The hour is read directly from the parsed 'published' timestamps and stored in a new column.
data['time_of_day'] = data['published'].dt.hour.apply(get_time_of_day)

# 3. Contextual recommendation based on current time of day
def recommend_based_on_time(current_time_of_day, data, top_n=5):
    """Recommend articles published in the given time-of-day bucket.

    The articles whose 'time_of_day' equals `current_time_of_day` are
    vectorised by their 'summary' text (TF-IDF). The last row of that subset
    is the seed, and the other rows are ranked by cosine similarity to it.

    Parameters
    ----------
    current_time_of_day : str
        Value to match in the 'time_of_day' column (e.g. 'evening').
    data : pandas.DataFrame
        Must contain 'time_of_day' and 'summary' columns.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows of the filtered subset most similar to the seed, best first.
        None (after printing a message) when no article matches.

    Notes
    -----
    The seed is the last row in frame order. That is only the most recently
    published article if `data` is sorted by publication time.
    """
    # Filter articles by the current time of day
    filtered_data = data[data['time_of_day'] == current_time_of_day]

    # If there are no articles for the current time of day, report and stop.
    if filtered_data.empty:
        print(f"No articles found for {current_time_of_day}!")
        return None

    # Vectorise only the filtered summaries, so positions in the similarity
    # matrix line up with row positions in filtered_data.
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(filtered_data['summary'])

    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Seed = last row of the subset, as a row position. Using the position
    # directly also works when the index contains duplicate labels.
    seed_pos = len(filtered_data) - 1

    # Rank every other row by similarity to the seed. The seed is excluded
    # explicitly rather than by dropping the first sorted entry, which would
    # be wrong if a duplicate summary ties with it and sorts ahead of it.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[seed_pos]) if pos != seed_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    article_positions = [pos for pos, _ in ranked[:max(top_n, 0)]]

    return filtered_data.iloc[article_positions]

# Example: ask for recommendations in the 'evening' context.
current_time_of_day = 'evening'  # In a live system this would come from the current clock time
recommended_articles = recommend_based_on_time(current_time_of_day, data)

# Report the outcome: a fallback message when nothing came back, otherwise the key columns.
if recommended_articles is None:
    print("No articles available for the selected time of day.")
else:
    print("Recommended Articles based on time of day:")
    print(recommended_articles[['title', 'published', 'time_of_day']])

Recommended Articles based on time of day:
                                                 title           published  \
195  Rebels behind Aleppo’s surprise fall took adva... 2024-12-01 19:10:19   
134  The Guardian view on a race for missile suprem... 2024-12-01 17:30:01   
130  Ella Baron on the 600 Brazilians deported from... 2024-12-01 17:15:18   
150  New plan would ‘transform’ end of life care fo... 2024-12-01 20:45:03   
158  Weather blamed for small boat arrivals under L... 2024-12-01 18:00:01   

    time_of_day  
195     evening  
134     evening  
130     evening  
150     evening  
158     evening  


In [52]:
import random

# Step 1: Simulate locations for articles (assigning random locations to each article)
locations = ['New York', 'London', 'Tokyo', 'Berlin', 'Sydney']
# Use a dedicated, seeded generator so the assignment (and therefore the
# recommendations below) is the same on every run, without touching the
# global `random` state.
location_rng = random.Random(42)
data['location'] = [location_rng.choice(locations) for _ in range(len(data))]

# Step 2: Simulate user location (for example, the user is in 'New York')
user_location = 'New York'

# Step 3: Define a function to get recommendations based on cosine similarity
def get_recommendations(idx, cosine_sim, top_n=5, articles=None):
    """Return the rows most similar to the article at row position `idx`.

    Parameters
    ----------
    idx : int
        Row position of the seed article (used for both `cosine_sim` and `.iloc`).
    cosine_sim : 2-D array-like
        Pairwise similarity matrix aligned with the rows of the article table.
    top_n : int, default 5
        Maximum number of rows to return.
    articles : pandas.DataFrame, optional
        Article table to select from. Defaults to the notebook-level `data`
        frame, which is what the original version always used.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows, ordered from most to least similar, never
        including the seed row itself.
    """
    if articles is None:
        articles = data

    # Rank all other articles by similarity (descending). The seed is removed
    # explicitly: dropping "the first sorted entry" returns the seed itself
    # whenever another article ties with it and sorts ahead.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Row positions of the top N similar articles.
    article_indices = [pos for pos, _ in ranked[:max(top_n, 0)]]
    return articles.iloc[article_indices]

# Step 4: Filter articles based on user location
def recommend_based_on_location(user_location, data, cosine_sim, top_n=5):
    """Recommend articles similar to the last article tagged with the user's location.

    Parameters
    ----------
    user_location : str
        Value to match in the 'location' column.
    data : pandas.DataFrame
        Must contain a 'location' column; its rows are assumed to line up
        positionally with `cosine_sim`.
    cosine_sim : 2-D array-like
        Pairwise article similarity matrix.
    top_n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame or None
        Whatever get_recommendations() returns for the seed article, or None
        (after printing a message) when no article has the given location.

    Notes
    -----
    The seed is the last matching row in frame order. That is only the most
    recent article if `data` is sorted by publication time.
    """
    # Row positions (not index labels) of the articles in the user's location.
    # get_recommendations indexes cosine_sim positionally, so passing a label
    # would pick the wrong row for any non-default index.
    matching_positions = [
        pos for pos, hit in enumerate(data['location'] == user_location) if hit
    ]

    if not matching_positions:
        print("No articles found for the given location.")
        return None

    # Seed the recommendations with the last matching article.
    seed_pos = matching_positions[-1]
    return get_recommendations(seed_pos, cosine_sim, top_n)

# Step 5: Test the location-based recommendation.
# `cosine_sim` is the notebook-level similarity matrix currently bound in the kernel.
recommended_articles_location = recommend_based_on_location(user_location, data, cosine_sim)

# Display the recommendations (None means no article matched the location).
if recommended_articles_location is not None:
    # Print one line per recommended row: title followed by its link.
    for idx, article in recommended_articles_location.iterrows():
        print(f"Recommended Article: {article['title']} - {article['link']}")


Recommended Article: Biden Issues a ‘Full and Unconditional Pardon’ of His Son Hunter Biden - https://www.nytimes.com/2024/12/01/us/politics/biden-pardon-son-hunter.html
Recommended Article: Kash Patel Would Bring Bravado and Baggage to F.B.I. Role - https://www.nytimes.com/2024/12/01/us/politics/kash-patel-bravado-baggage-fbi.html
Recommended Article: Weather blamed for small boat arrivals under Labour passing 20,000 - https://www.theguardian.com/uk-news/2024/dec/01/weather-blamed-for-small-boat-arrivals-rising-to-20000-under-labour
Recommended Article: Former world snooker champion Griffiths dies aged 77 - https://www.bbc.com/sport/snooker/articles/c5y78g8828xo
Recommended Article: Analysis: Biden’s pardon for son shows presidents now act differently - https://www.bbc.com/news/articles/c4gplr65prno
