In [12]:
import os
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
import plotly.express as px

In [13]:
# Load the dataset
# Root folder of the BBC News Summary dataset. Set the BBC_NEWS_SUMMARY_DIR
# environment variable to point at a local copy; the original hard-coded
# location is kept as the fallback so existing setups keep working.
dataset_path = os.environ.get(
    'BBC_NEWS_SUMMARY_DIR',
    'C:\\Users\\sshah77\\Desktop\\5660 project\\BBC News Summary',
)
# Articles and reference summaries live in sibling folders under a nested
# 'BBC News Summary' directory inside the dataset root.
articles_path = os.path.join(dataset_path, 'BBC News Summary', 'News Articles')
summaries_path = os.path.join(dataset_path, 'BBC News Summary', 'Summaries')

In [14]:
# Empty containers for the raw article texts and their reference summaries
articles, summaries = [], []

In [15]:
# Read every article together with its reference summary, category by category.
# Empty the lists first so re-running this cell does not append duplicates.
articles.clear()
summaries.clear()

# sorted() makes the load order deterministic (os.listdir returns entries in
# arbitrary order), so previews and scores are reproducible across machines.
for category in sorted(os.listdir(articles_path)):
    category_articles_path = os.path.join(articles_path, category)
    category_summaries_path = os.path.join(summaries_path, category)

    # Skip stray files sitting next to the category folders; listing them
    # as directories would raise.
    if not os.path.isdir(category_articles_path):
        continue

    for file_name in sorted(os.listdir(category_articles_path)):
        file_path = os.path.join(category_articles_path, file_name)
        summary_file_name = file_name  # Use the same file name for the summary file
        summary_file_path = os.path.join(category_summaries_path, summary_file_name)

        # Files are decoded as latin-1 for both articles and summaries.
        with open(file_path, 'r', encoding='latin-1') as article_file:
            article = article_file.read()
        with open(summary_file_path, 'r', encoding='latin-1') as summary_file:
            summary = summary_file.read()

        # Append only after both reads succeeded so that articles[i] and
        # summaries[i] always refer to the same document.
        articles.append(article)
        summaries.append(summary)


In [16]:
# Generate extractive summaries by ranking sentences on similarity centrality.
# NOTE: this is a simplified, TextRank-style heuristic. Each sentence is scored
# by the sum of its bag-of-words cosine similarities to all sentences of the
# article; no iterative PageRank step is run.
top_n = 3  # number of top-scoring sentences kept per summary


def summarize_article(article, top_n=top_n):
    """Return an extractive summary made of the `top_n` highest-scoring sentences.

    Parameters
    ----------
    article : str
        Raw article text.
    top_n : int
        Maximum number of sentences to keep.

    Returns
    -------
    str
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (not in their original article order). An article
        with no sentences yields an empty string.
    """
    # Tokenize the article into sentences
    sentences = sent_tokenize(article)
    if not sentences:
        return ""

    try:
        # The vocabulary is fitted per article, so a fresh vectorizer is needed each time.
        sentence_vectors = CountVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no usable tokens are found (empty vocabulary). Without
        # any similarity signal, fall back to the leading sentences.
        return " ".join(sentences[:top_n])

    # Row sums of the pairwise similarity matrix act as the sentence scores.
    sentence_scores = cosine_similarity(sentence_vectors).sum(axis=1)

    # sorted() is stable, so equally scored sentences keep their article order.
    ranked_sentences = sorted(
        zip(sentences, sentence_scores), key=lambda pair: pair[1], reverse=True
    )
    return " ".join(sentence for sentence, _ in ranked_sentences[:top_n])


generated_summaries = [summarize_article(article) for article in articles]


In [17]:
# Preview the first few generated summaries, one per line
preview_count = 6
print("Generated Summary:")
for preview_text in generated_summaries[:preview_count]:
    print(preview_text)

Generated Summary:
But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.
Dollar gains on Greenspan speech

The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise. In the meantime, the US Federal Reserve's decision on 2 February to boost interest rates by a quarter of a point - the sixth such move in as many months - has opened up a differential with European rates. The recent falls have partly been the result of big budget deficits, as well as the US's yawning cur

In [18]:
# Score the generated summaries against the reference summaries with ROUGE;
# avg=True requests scores averaged over all pairs.
rouge = Rouge()
scores = rouge.get_scores(
    generated_summaries,  # hypotheses
    summaries,            # references
    avg=True,
)

In [19]:
# Pull the 'f' (F-score) entry out of each averaged ROUGE metric
rouge_1_score, rouge_2_score, rouge_l_score = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)

In [20]:
# Report the three ROUGE scores, one per line
print(
    f"ROUGE-1: {rouge_1_score}",
    f"ROUGE-2: {rouge_2_score}",
    f"ROUGE-L: {rouge_l_score}",
    sep="\n",
)

ROUGE-1: 0.527848093361438
ROUGE-2: 0.42860668364064525
ROUGE-L: 0.519328796849987


In [21]:
# Tabulate the ROUGE scores (one row per metric) for plotting
metric_labels = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
metric_values = [rouge_1_score, rouge_2_score, rouge_l_score]
scores_df = pd.DataFrame({'ROUGE Metric': metric_labels, 'Score': metric_values})

In [22]:
# Draw the ROUGE scores as a bar chart, one bar per metric
bar_options = dict(x='ROUGE Metric', y='Score', title='ROUGE Scores')
fig = px.bar(scores_df, **bar_options)
fig.show()