In [None]:
'''
MFG 598: Final Project

Automatic Text Summarization using PageRank Algorithm and Cosine Similarity Matrix

Faisal Ali Khan
'''

In [None]:
# Import necessary libraries

import csv                                              # Reading and writing CSV files
import os                                               # Interacting with the operating system
import re                                               # Regular expressions for text cleaning

import matplotlib.pyplot as plt                         # Visualization for creating graphs and charts
import networkx as nx                                   # Graphs and networks (Pager Rank Algoritm)
import nltk                                             # Natural Language Toolkit for text processing and analysis
import numpy as np                                      # Numerical computing
import pandas as pd                                     # Data manipulation and analysis
from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords

In [None]:
# Download the stopwords corpus from NLTK so that stopwords.words() can load it
nltk.download('stopwords')
# List of commonly used English words that contribute little to the meaning of a text.
# Note: generate_summary() loads its own copy of this list rather than reading this variable.
stop_words = stopwords.words('english')

In [None]:
# Takes a file path as input and returns a list of sentences from the file, each as a list of words

def read_folder(file_path):
    """Read a text file and return its sentences as lists of words.

    The text is split into sentences on the literal ". " separator. Within each
    sentence every character that is not an ASCII letter is replaced by a space
    and the result is split on whitespace, so punctuation, digits and newlines
    never end up inside a word.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sentences, each sentence being a list of word strings.
        Sentences that contain no letters at all are dropped, so an empty file
        yields an empty list.
    """
    with open(file_path, "r") as f:
        filedata = f.read()

    sentences = []                                                              # list of lists of words, one inner list per sentence
    for sentence in filedata.split(". "):
        # str.replace() matches its first argument literally, so a real regex
        # substitution is needed to strip the non-alphabetic characters.
        words = re.sub(r"[^a-zA-Z]", " ", sentence).split()
        # Skip fragments with no words; they would become all-zero vectors later.
        if words:
            sentences.append(words)
    return sentences

In [None]:
# Compute similarity between two sentences based on words they share

def sentence_similarity(sentence_1, sententence_2, stopwords=None):
    """Return the cosine similarity between two tokenized sentences.

    Both sentences are lowercased, words found in ``stopwords`` are ignored,
    and the remaining words are turned into word-count vectors over their
    combined vocabulary. The result is the cosine of the angle between the
    two count vectors.

    Args:
        sentence_1: First sentence, as a sequence of word strings.
        sententence_2: Second sentence, as a sequence of word strings. The
            parameter name (including its spelling) is kept unchanged so the
            function signature stays the same.
        stopwords: Optional collection of lowercase words to ignore.

    Returns:
        A float similarity score. It is 0.0 when the sentences share no
        counted words, and also when either sentence has no counted words at
        all (empty, or made up only of stop words), instead of a NaN from a
        division by zero.
    """
    ignored = set() if stopwords is None else set(stopwords)

    # Lowercase first, then drop the ignored words (the stop list is lowercase)
    words_1 = [w.lower() for w in sentence_1]
    words_2 = [w.lower() for w in sententence_2]
    kept_1 = [w for w in words_1 if w not in ignored]
    kept_2 = [w for w in words_2 if w not in ignored]

    # Map every distinct counted word to a vector position (constant-time lookup)
    vocabulary = {}
    for word in kept_1 + kept_2:
        vocabulary.setdefault(word, len(vocabulary))

    # Count the number of occurrences of each word in each sentence
    vector_1 = np.zeros(len(vocabulary))
    vector_2 = np.zeros(len(vocabulary))
    for word in kept_1:
        vector_1[vocabulary[word]] += 1
    for word in kept_2:
        vector_2[vocabulary[word]] += 1

    # A zero-length vector has no direction; treat it as "not similar"
    norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    if norm_product == 0:
        return 0.0

    # Cosine similarity between the two sentence vectors
    return float(np.dot(vector_1, vector_2) / norm_product)

In [None]:
# Generate a similarity matrix for a list of sentences

def gen_sim_matrix(sentences, stop_words):
    """Build the pairwise similarity matrix for a list of sentences.

    Args:
        sentences: Indexable collection of tokenized sentences.
        stop_words: Words passed to sentence_similarity() to be ignored.

    Returns:
        A square NumPy array with one row and column per sentence. Entry
        [i][j] holds sentence_similarity(sentences[i], sentences[j], stop_words);
        the diagonal is left at zero because a sentence is never compared
        with itself.
    """
    count = len(sentences)
    similarity_matrix = np.zeros((count, count))

    # Every ordered pair of distinct positions, in row-major order
    index_pairs = (
        (row, col)
        for row in range(count)
        for col in range(count)
        if row != col
    )
    for row, col in index_pairs:
        similarity_matrix[row][col] = sentence_similarity(sentences[row], sentences[col], stop_words)

    return similarity_matrix

In [None]:
# Generate summary for a given file


def generate_summary(file_name, top_n=5):
    """Generate an extractive summary of a text file.

    The file is split into sentences, a pairwise similarity matrix is built,
    the matrix is turned into a graph, and PageRank scores are computed on
    that graph. The highest-scoring sentences form the summary.

    Args:
        file_name: Path to the file to be summarized.
        top_n: Maximum number of sentences to include (default 5). If the
            file has fewer sentences, all of them are used; values of zero
            or below produce an empty summary.

    Returns:
        The selected sentences joined with ". ", ordered from highest to
        lowest PageRank score (not in document order). An empty string is
        returned when the file yields no sentences.
    """
    # Load the list of stop words
    stop_words = stopwords.words('english')

    # Read the contents of the file and split it into a list of sentences
    sentences = list(map(tuple, read_folder(file_name)))
    if not sentences:
        return ""

    # Generate a similarity matrix based on the sentence pairs
    sentence_similarity_matrix = gen_sim_matrix(sentences, stop_words)

    # Create a graph from the similarity matrix
    sentences_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)

    # Compute a PageRank score for each sentence based on the graph
    scores = nx.pagerank(sentences_similarity_graph)

    # Rank the sentences by their PageRank scores, best first
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Slicing clamps to the number of available sentences, so short documents
    # no longer raise IndexError; max() keeps a negative top_n from slicing
    # from the end of the list.
    selected = ranked_sentence[:max(top_n, 0)]
    summarize_text = [" ".join(sentence) for _, sentence in selected]

    # Combine the selected sentences into a single string and return the summary
    return ". ".join(summarize_text)


In [None]:
# Summarize all text files in a given folder

def summarize_files_in_folder(folder_path):
    """Summarize every ".txt" file in a folder.

    Args:
        folder_path: Path to the folder containing the text files.

    Returns:
        A pandas DataFrame with the columns "Filename", "Summary" and "Grade",
        one row per ".txt" file, ordered by filename. "Grade" is filled with
        empty strings so it can be completed later.
    """
    records = []

    # Each filename is recorded together with its own summary, so entries that
    # are not ".txt" files can neither shift nor break the "Filename" column.
    # Sorting makes the row order independent of the directory listing order.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            records.append({"Filename": filename, "Summary": generate_summary(file_path)})

    # Build the DataFrame in one step; the explicit columns keep the schema
    # stable even when no text file was found.
    df = pd.DataFrame(records, columns=["Filename", "Summary"])
    df["Grade"] = ""

    # Return the DataFrame with a fixed column order
    return df[["Filename", "Summary", "Grade"]]


In [None]:
def grade_summaries(df):
    """Interactively collect a grade for every summary in ``df``.

    For each row the filename and the summary are printed, then the user is
    prompted for a grade, which is written into that row's "Grade" column.
    The DataFrame is modified in place and also returned.

    Note: a function with the same name is defined again in a later cell of
    this notebook; once that cell runs, it replaces this definition.
    """
    for label, record in df.iterrows():
        print("Filename:", record["Filename"])
        print(record["Summary"])
        df.loc[label, "Grade"] = input("Enter grade for this summary: ")
    return df

In [None]:
# Grade summaries in a DataFrame

def grade_summaries(df):
    """Prompt the user for a grade for each summary and store it in ``df``.

    Args:
        df: DataFrame containing the summaries to grade; it must have the
            columns "Filename", "Summary" and "Grade".

    Returns:
        The same DataFrame object, with the entered grades written to the
        "Grade" column (the input is updated in place).
    """
    prompt = "Enter grade for this summary: "

    # Show one summary at a time and record the answer on the matching row
    for row_label, entry in df.iterrows():
        print("Filename:", entry["Filename"])
        print(entry["Summary"])
        answer = input(prompt)
        df.loc[row_label, "Grade"] = answer

    return df


In [None]:
# Define the path to the folder containing the text files to summarize
# NOTE(review): hard-coded absolute path (looks like a Colab location) - adjust when running elsewhere
folder_path = "/content/textfiles"

# Summarize the ".txt" files in the folder and collect the summaries in a DataFrame
df = summarize_files_in_folder(folder_path)

# Prompt the user to grade the summaries in the DataFrame
# (interactive: waits for keyboard input once per summary)
df = grade_summaries(df)

# Save the graded DataFrame to "summaries.csv" in the working directory, without the index column
df.to_csv("summaries.csv", index=False)


In [None]:
# Data Visualization

# Load the graded summaries back from the CSV file
df = pd.read_csv('summaries.csv')

# Pick out the two columns that drive the chart
filenames = df['Filename']
grades = df['Grade']

# One bar per file, with the bar height given by that file's grade
plt.bar(filenames, grades)

# Title and axis labels so the figure can be read on its own
plt.title('Summary Grades')
plt.xlabel('Filename')
plt.ylabel('Grade')

# Turn the filename labels vertical so they do not overlap
plt.xticks(rotation=90)

# Render the figure
plt.show()

