In [30]:
import pandas as pd

In [31]:
data = pd.read_csv("news_summary.csv", encoding='latin1')  

In [32]:
data.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


#### columns in the dataset
author: Name of the author.

date: Date of publication.

headlines: Headline of the news article. 

read_more: URL link for further reading.

text: Summarized version of the article.

ctext: Complete article text.

In [33]:
data.isnull().sum()


author         0
date           0
headlines      0
read_more      0
text           0
ctext        118
dtype: int64

In [34]:
data['ctext'] = data['ctext'].fillna("No content available")


In [35]:
data.isnull().sum()

author       0
date         0
headlines    0
read_more    0
text         0
ctext        0
dtype: int64

In [36]:
import re
import nltk
from nltk.tokenize import word_tokenize


# Preprocessing

### Cleaning

In [37]:
# Function to clean text
def clean_text(text):
    # Remove non-alphabetic characters (e.g., punctuation, digits)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Apply text cleaning on 'ctext' (complete article) and 'text' (summary)
data['ctext'] = data['ctext'].apply(clean_text)
data['text'] = data['text'].apply(clean_text)

# Verify cleaning
print(data[['ctext', 'text']].head())

                                               ctext  \
0  the daman and diu administration on wednesday ...   
1  from her special numbers to tvappearances boll...   
2  the indira gandhi institute of medical science...   
3  lashkaretaibas kashmir commander abu dujana wa...   
4  hotels in mumbai and other indian cities are t...   

                                                text  
0  the administration of union territory daman an...  
1  malaika arora slammed an instagram user who tr...  
2  the indira gandhi institute of medical science...  
3  lashkaretaibas kashmir commander abu dujana wh...  
4  hotels in maharashtra will train their staff t...  


### Tokenization

In [38]:
import spacy

# Loading spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Function to tokenize using spaCy
def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Apply tokenization to the data
data['ctext_tokens'] = data['ctext'].apply(spacy_tokenizer)
data['text_tokens'] = data['text'].apply(spacy_tokenizer)


In [39]:
print(data['ctext'].head())


0    the daman and diu administration on wednesday ...
1    from her special numbers to tvappearances boll...
2    the indira gandhi institute of medical science...
3    lashkaretaibas kashmir commander abu dujana wa...
4    hotels in mumbai and other indian cities are t...
Name: ctext, dtype: object


### Lowercasing

In [40]:
#Converting all text to lowercase to make the text consistent and reduce the size of the vocabulary.
data['ctext'] = data['ctext'].str.lower()
data['text'] = data['text'].str.lower()


### Handling Contractions

In [41]:
#Contractions are shortened forms of words or phrases created by omitting certain
#letters and replacing them with an apostrophe (') like cannot --> can't
#Expanding contractions can be important for text preprocessing, especially for NLP tasks, 
#as it ensures consistency in the text and helps models understand the full form of words.
import contractions
#This imports the contractions library, which provides a fix() function to expand contractions in a string.

##### 
The apply() method is used to apply a function to every element in the data['ctext'] column. 


The lambda function (lambda x: contractions.fix(x)) passes each string x from the column to the contractions.fix() function.

In [42]:
data['ctext'] = data['ctext'].apply(lambda x: contractions.fix(x))
data['text'] = data['text'].apply(lambda x: contractions.fix(x))

### Lemmatization

##### Lemmatization is the process of reducing words to their base or root form (known as a lemma) while ensuring that the resulting word is valid in the language. 
why?

+ Reduces Inflectional Variants: Words like running, runs, and ran are reduced to run, improving consistency in the dataset.

+ Improves Model Performance: Helps in reducing dimensionality and improves the efficiency of machine learning or NLP models.

+ Preserves Meaning: Unlike stemming, lemmatization ensures the root word is meaningful in the language.

In [43]:
from nltk.stem import WordNetLemmatizer
#The WordNetLemmatizer is part of the Natural Language Toolkit (nltk) library and is specifically
#designed to perform lemmatization by leveraging the WordNet lexical database.

In [44]:
lemmatizer = WordNetLemmatizer()
#Creates an instance of the WordNetLemmatizer class, which provides the lemmatize() method.

In [45]:
def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [46]:
data['ctext'] = data['ctext'].apply(lemmatize_text)
data['text'] = data['text'].apply(lemmatize_text)

### Truncation

##### 
+ Truncation ensures that overly long sequences are cut down to a manageable size, reducing memory usage and improving training efficiency.
+ Padding ensures that shorter sequences are extended to a fixed length, so they align with longer sequences during batch processing.

For text summarization:

+ Articles (input) might need a longer maximum sequence length (e.g., 512 tokens).
+ Summaries (output) typically require shorter lengths (e.g., 128 tokens).
+ Processing overly long texts can be computationally expensive.
+ Limiting the length reduces memory and computation costs.


##### 
In text summarization tasks, input sequences (articles) and output sequences (summaries) often have varying lengths. Neural networks require inputs of uniform size, so truncation and padding are necessary preprocessing steps.


In [47]:
#Defining the maximum lengths
max_article_len = 512
max_summary_len = 128

In [48]:
data['ctext'] = data['ctext'].apply(lambda x: " ".join(x.split()[:max_article_len]))
data['text'] = data['text'].apply(lambda x: " ".join(x.split()[:max_summary_len]))
#splitting the lines then only taking the specified number of words 

### Removing rare words

##### 
+ Rare words (those that appear only once) can often be unimportant for many NLP tasks, especially in text classification or topic modeling.
+ Removing these words helps reduce the noise in the data and can improve model performance.
+ By reducing the dimensionality (i.e., removing rare words), the size of the vocabulary decreases, which can speed up training and reduce overfitting.
+ In some cases, rare words may be outliers or irrelevant for the analysis, so removing them can help focus on the more important and consistent terms.

In [49]:
from collections import Counter

# Get word frequencies
all_words = " ".join(data['ctext']).split()
word_freq = Counter(all_words)  #returns a dictionary of all words with their frequencies

# Remove rare words
rare_words = set(word for word, freq in word_freq.items() if freq == 1)
data['ctext'] = data['ctext'].apply(lambda x: " ".join(word for word in x.split() if word not in rare_words))

### Text Vectorization

##### 
+ Converting the cleaned text into numerical format that the model can understand.
+ Essential because neural networks require inputs as numerical data. Text needs to be converted into sequences of integers where each integer represents a specific word or token in your vocabulary.
+ Padding is the process of adding extra tokens (usually zero) to the end (or sometimes the beginning) of a sequence to make all sequences in the dataset have the same length. These padding tokens don’t carry any meaningful information but are used solely to standardize the length of all sequences in the dataset.

In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [58]:
#Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on both articles (ctext) and summaries (text)
tokenizer.fit_on_texts(data['ctext'].tolist() + data['text'].tolist())

# Convert the texts to sequences of integers
data['ctext_seq'] = tokenizer.texts_to_sequences(data['ctext'])
data['text_seq'] = tokenizer.texts_to_sequences(data['text'])

In [59]:
# Define the maximum lengths
max_article_len = 512
max_summary_len = 128

# Apply padding to make all sequences the same length
padded_ctext_seq = pad_sequences(data['ctext_seq'], maxlen=max_article_len, padding='post', truncating='post')
padded_text_seq = pad_sequences(data['text_seq'], maxlen=max_summary_len, padding='post', truncating='post')

# padded sequences for training
data['padded_ctext_seq'] = list(padded_ctext_seq)
data['padded_text_seq'] = list(padded_text_seq)

##### 
+ Tokenizer: This is a class from Keras' tensorflow.keras.preprocessing.text module. It's used to convert text into a sequence of integers. Each integer corresponds to a word in the dataset's vocabulary. The Tokenizer object will create a mapping between words and integers based on their frequency in the training data.
+ fit_on_texts: This method is used to learn the word indices (integer mappings) from a list of texts.
+  It processes all the words in both the ctext (articles) and text (summaries) columns, counting the frequency of each word. Words that appear more frequently will have a lower index, while rare words will have a higher index. The fit_on_texts method creates a word-to-integer mapping, called the word index.
+  texts_to_sequences: This method converts each text (article or summary) into a sequence of integers based on the word-to-integer mapping learned earlier in the fit_on_texts() method.
+  For each word in an article or summary, it replaces the word with its corresponding integer value from the vocabulary built by fit_on_texts().
+  pad_sequences: This function ensures that all sequences (of articles and summaries) have the same length. It handles both truncation (shortening long sequences) and padding (extending short sequences).
+  maxlen=max_article_len: Ensures the sequences are at most 512 words long (articles).
maxlen=max_summary_len: Ensures the sequences are at most 128 words long (summaries).
+ padding='post': This means that if the sequence is shorter than the desired length, zeros will be added at the end of the sequence. If padding='pre', zeros would be added at the beginning instead.
truncating='post': If the sequence is longer than the desired length, it will be truncated (cut off) from the end. If truncating='pre', truncation would happen from the beginning.
+ Padding and truncation ensure that all sequences have a uniform length. Deep learning models typically expect fixed-size inputs, and padding ensures that all sequences match the specified length.
+ After applying padding and truncation, the padded sequences are stored in the DataFrame as data['padded_ctext_seq']
(for articles) and data['padded_text_seq'] (for summaries).



##### Summary of the Workflow
+ Text Tokenization: Convert the raw text (articles and summaries) into sequences of integers, where each integer represents a word.
+ Padding and Truncation: Ensure that all sequences (both input articles and output summaries) have the same length by either truncating longer sequences or padding shorter ones.
+ Preprocessing: Once the sequences are tokenized and padded, they are stored in the DataFrame for use in your model.