In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from math import ceil
import re
import csv
from sklearn.model_selection import train_test_split

In [87]:
# Load the scraped articles; the code below reads the 'title' and 'content' columns.
df = pd.read_csv("news.csv")

# Forward-fill missing cells from the previous row. DataFrame.ffill() is the
# supported spelling; fillna(method="ffill") is deprecated since pandas 2.1.
# NOTE(review): this copies the previous article's text into rows whose content
# is missing -- confirm that duplication is intended.
df = df.ffill()

# Keep a handle on the article texts before the column is renamed below.
contents = df["content"]

# The title is not used by the rest of the notebook.
df = df.drop(columns="title")

# Keep the untouched text next to the derived columns under an explicit name.
df = df.rename(columns={"content": "original_content"})

# Placeholder columns, filled in by the summarisation step:
#   new_content         - the top-ranked sentences (the summary)
#   removed_lines       - every sentence that was not selected
#   top_sentence_tf_idf - score of the best-ranked sentence
df.insert(1, 'new_content', '')
df.insert(2, 'removed_lines', '')
df.insert(3, 'top_sentence_tf_idf', '')
print(contents)

0      After reaching his hotel in the city, RM revea...
1      RM aka Kim Namjoon was the first member to joi...
2      Billie Eilish's concert was held in Seoul, Sou...
3      BTS ARMY y'all would be missing the members a ...
4      BTS member Kim Seokjin aka Jin has the capacit...
                             ...                        
805    BTS has conquered the world with their group r...
806    Today marks 700 days since BTS' worldwide hand...
807    BTS' youngest member Jungkook came online on W...
808    BTS' eldest member Jin has shared pictures and...
809    After a lot of teasing, Benny Blanco’s collabo...
Name: content, Length: 810, dtype: object


In [88]:
# Share of each article's sentences (ranked by TF-IDF score) kept as its summary.
TOP_SENTENCE_FRACTION = 0.1


def clean_article_text(text):
    """Repair scraping artefacts and make sentence boundaries explicit.

    Replaces the mis-decoded quote sequences with "'", drops stray "Â",
    turns "&nbsp;" into a space and converts double spaces into ". ",
    because some scraped articles lost their periods and only have double
    whitespace between sentences.
    """
    # Regex alternation is tried left to right, so the three-character
    # sequences must come before the bare "â€" prefix; otherwise "â€œ" is
    # consumed as "â€" and a stray "œ" is left behind.
    text = re.sub(r"â€™|â€˜|â€œ|â€", "'", text)
    text = re.sub(r"Â", "", text)
    text = re.sub(r"&nbsp;", " ", text)
    return text.replace("  ", ". ")


def join_sentences(sentences):
    """Join the non-blank sentences with ". " after trimming whitespace."""
    trimmed = (sentence.strip() for sentence in sentences)
    return ". ".join(sentence for sentence in trimmed if sentence)


def summarize_article(content):
    """Split one article into its top-ranked sentences and the remainder.

    Each sentence is scored by the sum of the TF-IDF weights of its words
    (the vectorizer is fitted on the sentences of this article only). The
    best TOP_SENTENCE_FRACTION of the sentences form the summary; all
    others are the removed lines. Both texts keep the sentences in their
    original document order.

    Returns:
        (summary, removed_lines, top_score) where top_score is the score of
        the best-ranked sentence, or 0.0 when nothing could be scored.
    """
    if not isinstance(content, str):
        # A missing value in the first row is not filled by a forward fill;
        # treat it as an empty article instead of failing inside re.sub.
        return "", "", 0.0

    sentences = clean_article_text(content).split(".")

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, i.e. when
        # no sentence contains a token. Nothing can be ranked, so keep all
        # of the text as "removed" rather than aborting the whole run.
        return "", join_sentences(sentences), 0.0

    # The row sums come back as an (n_sentences, 1) matrix; flatten it into
    # plain floats so ranking does not compare 1x1 matrices.
    scores = [row[0] for row in tfidf_matrix.sum(axis=1).tolist()]

    # Highest score first; ties go to the later sentence.
    ranked = sorted(
        range(len(sentences)),
        key=lambda idx: (scores[idx], idx),
        reverse=True,
    )

    top_count = ceil(len(sentences) * TOP_SENTENCE_FRACTION)

    # Sort the selected positions, not the sentence strings, so the output
    # reads in document order instead of alphabetical order.
    top_positions = sorted(ranked[:top_count])
    rest_positions = sorted(ranked[top_count:])

    summary = join_sentences(sentences[idx] for idx in top_positions)
    removed_content = join_sentences(sentences[idx] for idx in rest_positions)
    return summary, removed_content, scores[ranked[0]]


# Summarise every article, then fill each derived column in one assignment
# (rows stay aligned because `contents` is a column of `df`).
article_results = [summarize_article(content) for content in contents]
df['new_content'] = [summary for summary, _, _ in article_results]
df['removed_lines'] = [removed for _, removed, _ in article_results]
df['top_sentence_tf_idf'] = [top_score for _, _, top_score in article_results]

In [89]:
# Persist the processed frame (original text, summary, removed lines, top score)
# as the cleaned dataset; the row index is not written.
cleaned_csv_path = 'cleaned_news_set.csv'
df.to_csv(cleaned_csv_path, index=False)

In [90]:
# Read the cleaned dataset back from disk so the split works on exactly what was saved.
data = pd.read_csv('cleaned_news_set.csv')

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
)

# Write each partition to its own CSV file, without the row index.
for subset, csv_path in ((train, 'train_set.csv'), (test, 'test_set.csv')):
    subset.to_csv(csv_path, index=False)