In [2]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from heapq import nlargest

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
os.chdir('/Users/viviando/Desktop/MSADS/ads599_capstone/climate-lang-processing')
df = pd.read_csv('data/news_cleaned.csv')

In [9]:
sample_df = df.sample(50)

### Extractive summarization, where an exact sentence (determined to be the most important) from the text is extracted to be the summary. 

In [10]:
def summarize_text(text):
    # Tokenize sentences
    sentences = sent_tokenize(text)

    # Tokenize words
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Calculate word frequency
    word_freq = FreqDist(words)

    # Calculate sentence scores based on word frequency
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                if sentence not in sentence_scores:
                    sentence_scores[sentence] = word_freq[word]
                else:
                    sentence_scores[sentence] += word_freq[word]

    # Select top 30% sentences with highest scores as summary
    summary_sentences = nlargest(int(0.3*len(sentences)), sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)

    return summary

# Apply summarization function to each snippet in the DataFrame
sample_df['summary'] = sample_df['snippet'].apply(summarize_text)

# Display the original snippet and its summary
for index, row in sample_df.iterrows():
    print("Original Text:")
    print(row['snippet'])
    print("\nSummary:")
    print(row['summary'])
    print("\n")

Original Text:
wrong. i have to go because wayne has agreed with juan in two of the three blocks so far, we're going to have to leave it there. my head is spinning. thanks to michelle fields and juan williams for joining us this week. coming up snowed in and sick of it. hey, global warming crowd, i got

Summary:
i have to go because wayne has agreed with juan in two of the three blocks so far, we're going to have to leave it there.


Original Text:
and i followed grimm's response to hurricane sandy and traced his trajectory on the issue of climate change and something amazing happened over the course of our time with him. republican michael grimm went from being a climate change denialist, someone who rejected

Summary:



Original Text:
that climate change exists than a blizzard that it doesn't. unless a blizzard hits miami. we have a panel of experts to help us understand the science and consequences of climate change and yes, ideas to break the political paralysis over it. kate marv

### Summarization using entity recognition

In [11]:
import spacy
import pandas as pd


# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def summarize_text_with_spacy(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract the main subject (entity) from the text
    main_subjects = []
    for ent in doc.ents:
        if ent.label_ in ["ORG", "PERSON", "GPE"]:  # Consider only organizations, people, and geopolitical entities
            main_subjects.append(ent.text)

    # If no named entities are found, use the root of the sentence
    if not main_subjects:
        for sent in doc.sents:
            main_subjects.append(sent.root.text)

    # Return the main subjects as a summarized text
    return ", ".join(main_subjects)

# Apply summarization function to each snippet in the DataFrame
sample_df['summary'] = sample_df['snippet'].apply(summarize_text_with_spacy)

# Display the original snippet and its summary
for index, row in sample_df.iterrows():
    print("Original Text:")
    print(row['snippet'])
    print("\nSummary:")
    print(row['summary'])
    print("\n")


Original Text:
wrong. i have to go because wayne has agreed with juan in two of the three blocks so far, we're going to have to leave it there. my head is spinning. thanks to michelle fields and juan williams for joining us this week. coming up snowed in and sick of it. hey, global warming crowd, i got

Summary:
michelle fields, juan williams


Original Text:
and i followed grimm's response to hurricane sandy and traced his trajectory on the issue of climate change and something amazing happened over the course of our time with him. republican michael grimm went from being a climate change denialist, someone who rejected

Summary:
grimm, michael grimm


Original Text:
that climate change exists than a blizzard that it doesn't. unless a blizzard hits miami. we have a panel of experts to help us understand the science and consequences of climate change and yes, ideas to break the political paralysis over it. kate marvel is a scientist at

Summary:
miami, kate


Original Text:
it is up on

### Abstractive summarization

In [12]:
import spacy
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def summarize_text_without_entities(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract the most important words or phrases using dependency parsing
    main_ideas = []
    for sent in doc.sents:
        root = None
        for token in sent:
            if token.dep_ == "ROOT":
                root = token
                break
        if root:
            # Add the root token and its direct children to the main ideas
            main_ideas.append(root.text)
            for child in root.children:
                if child.dep_ != "punct":  # Exclude punctuation
                    main_ideas.append(child.text)

    # Return the main ideas as a summarized text
    return " ".join(main_ideas[:4])  # Limit to 3-4 words

# Apply summarization function to each snippet in the DataFrame
sample_df['summary'] = sample_df['snippet'].apply(summarize_text_without_entities)

# Display the original snippet and its summary
for index, row in sample_df.iterrows():
    print("Original Text:")
    print(row['snippet'])
    print("\nSummary:")
    print(row['summary'])
    print("\n")


Original Text:
wrong. i have to go because wayne has agreed with juan in two of the three blocks so far, we're going to have to leave it there. my head is spinning. thanks to michelle fields and juan williams for joining us this week. coming up snowed in and sick of it. hey, global warming crowd, i got

Summary:
wrong have i go


Original Text:
and i followed grimm's response to hurricane sandy and traced his trajectory on the issue of climate change and something amazing happened over the course of our time with him. republican michael grimm went from being a climate change denialist, someone who rejected

Summary:
followed and i response


Original Text:
that climate change exists than a blizzard that it doesn't. unless a blizzard hits miami. we have a panel of experts to help us understand the science and consequences of climate change and yes, ideas to break the political paralysis over it. kate marvel is a scientist at

Summary:
exists change than hits


Original Text:
it is up on