# Gwar Project

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import tarfile
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import datasets
from datasets import load_dataset
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from multiprocessing import Pool
from nltk.stem import WordNetLemmatizer

# Loading CNN Dataset

In [2]:
directory_path = '/Users/garimasingh/Desktop/Data Analyst Process/Project/Parquet/'
file_name = 'CNNdataset.parquet'
file_path = os.path.join(directory_path, file_name)

df = pd.read_parquet(file_path)

In [3]:
df.head(15)

Unnamed: 0,article,highlights,id
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,a4942dd663020ca54575471657a0af38d82897d6
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,4157bc4da185971e2742f349d69a037343bc0d95
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,60736693e3b1b32d14337a317190c6606e879a85
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",2e6613d531843515bf5401286cc3e45c4df530d2
5,(CNN)My vote for Father of the Year goes to Cu...,Ruben Navarrette: Schilling deserves praise fo...,fbc5ac3a3a7bb6c4d628cfbeef92b67bb18562f9
6,"(CNN)Another one for the ""tourists behaving ba...",Two American women arrested for carving initia...,d093aa07380f75e63265793fcaa171772a6f4616
7,(CNN)Following last year's successful U.K. tou...,It will be a first time for the tour stateside...,0485e4f199828cd03857391cbe573142193ca953
8,(CNN)A shooting at a bar popular with expatria...,A jihadist group claims responsibility in an a...,c6df0bbd0b5eaa67e1f2b70892bf0a5dfb761895
9,(CNN)Manchester United defender Jonny Evans an...,Alleged incident happened in match at St James...,4a323dd3a1be975d93e941361082937cd2aafc88


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311971 entries, 0 to 311970
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   article     311971 non-null  object
 1   highlights  311971 non-null  object
 2   id          311971 non-null  object
dtypes: object(3)
memory usage: 7.1+ MB


# Data Cleaning

In [5]:
# Function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))  
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

# 1. Remove duplicates
df = df.drop_duplicates()

# 2. Remove rows where any column is null (if null values are not expected)
df = df.dropna()

# 3. Text normalization - removing special characters and trimming excess whitespace
df['article'] = df['article'].str.replace('[^\w\s]', '', regex=True).str.strip()
df['highlights'] = df['highlights'].str.replace('[^\w\s]', '', regex=True).str.strip()

# 4. Remove stopwords from 'article' and 'highlights'
df['article'] = df['article'].apply(remove_stopwords)
df['highlights'] = df['highlights'].apply(remove_stopwords)

In [6]:
df.head(15)

Unnamed: 0,article,highlights,id
0,CNNShare gift multiplied That may sound like e...,Zully Broussard decided give kidney stranger A...,a4942dd663020ca54575471657a0af38d82897d6
1,CNNOn 6th April 1996 San Jose Clash DC United ...,The 20th MLS season begins weekend League chan...,4157bc4da185971e2742f349d69a037343bc0d95
2,CNNFrench striker Bafetimbi Gomis history fain...,Bafetimbi Gomis collapses within 10 minutes ki...,60736693e3b1b32d14337a317190c6606e879a85
3,CNNIt act frustration perhaps commonly associa...,Rory McIlroy throws club water WGC Cadillac Ch...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,CNNA Pennsylvania community pulling together s...,Cayman Naib 13 hasnt heard since Wednesday Pol...,2e6613d531843515bf5401286cc3e45c4df530d2
5,CNNMy vote Father Year goes Curt Schilling The...,Ruben Navarrette Schilling deserves praise tak...,fbc5ac3a3a7bb6c4d628cfbeef92b67bb18562f9
6,CNNAnother one tourists behaving badly file Tw...,Two American women arrested carving initials C...,d093aa07380f75e63265793fcaa171772a6f4616
7,CNNFollowing last years successful UK tour Pri...,It first time tour stateside First show Louisv...,0485e4f199828cd03857391cbe573142193ca953
8,CNNA shooting bar popular expatriates Mali Sat...,A jihadist group claims responsibility audio r...,c6df0bbd0b5eaa67e1f2b70892bf0a5dfb761895
9,CNNManchester United defender Jonny Evans Newc...,Alleged incident happened match St James Park ...,4a323dd3a1be975d93e941361082937cd2aafc88


# Lemmetizing

In [7]:
lemmatizer = WordNetLemmatizer()

def lemmetize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

In [8]:
df['article'] = df['article'].apply(lemmetize)
df['highlights'] = df['highlights'].apply(lemmetize)

In [9]:
df.head(15)

Unnamed: 0,article,highlights,id
0,CNNShare gift multiplied That may sound like e...,Zully Broussard decided give kidney stranger A...,a4942dd663020ca54575471657a0af38d82897d6
1,CNNOn 6th April 1996 San Jose Clash DC United ...,The 20th MLS season begin weekend League chang...,4157bc4da185971e2742f349d69a037343bc0d95
2,CNNFrench striker Bafetimbi Gomis history fain...,Bafetimbi Gomis collapse within 10 minute kick...,60736693e3b1b32d14337a317190c6606e879a85
3,CNNIt act frustration perhaps commonly associa...,Rory McIlroy throw club water WGC Cadillac Cha...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,CNNA Pennsylvania community pulling together s...,Cayman Naib 13 hasnt heard since Wednesday Pol...,2e6613d531843515bf5401286cc3e45c4df530d2
5,CNNMy vote Father Year go Curt Schilling The f...,Ruben Navarrette Schilling deserves praise tak...,fbc5ac3a3a7bb6c4d628cfbeef92b67bb18562f9
6,CNNAnother one tourist behaving badly file Two...,Two American woman arrested carving initial Co...,d093aa07380f75e63265793fcaa171772a6f4616
7,CNNFollowing last year successful UK tour Prin...,It first time tour stateside First show Louisv...,0485e4f199828cd03857391cbe573142193ca953
8,CNNA shooting bar popular expatriate Mali Satu...,A jihadist group claim responsibility audio re...,c6df0bbd0b5eaa67e1f2b70892bf0a5dfb761895
9,CNNManchester United defender Jonny Evans Newc...,Alleged incident happened match St James Park ...,4a323dd3a1be975d93e941361082937cd2aafc88


In [10]:
# Remove "CNN" from headlines
df['article'] = df['article'].str.replace('^CNN', '', regex=True)
df['article'] = df['article'].str.strip()

In [11]:
df.head(15)

Unnamed: 0,article,highlights,id
0,Share gift multiplied That may sound like esot...,Zully Broussard decided give kidney stranger A...,a4942dd663020ca54575471657a0af38d82897d6
1,On 6th April 1996 San Jose Clash DC United str...,The 20th MLS season begin weekend League chang...,4157bc4da185971e2742f349d69a037343bc0d95
2,French striker Bafetimbi Gomis history faintin...,Bafetimbi Gomis collapse within 10 minute kick...,60736693e3b1b32d14337a317190c6606e879a85
3,It act frustration perhaps commonly associated...,Rory McIlroy throw club water WGC Cadillac Cha...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,A Pennsylvania community pulling together sear...,Cayman Naib 13 hasnt heard since Wednesday Pol...,2e6613d531843515bf5401286cc3e45c4df530d2
5,My vote Father Year go Curt Schilling The form...,Ruben Navarrette Schilling deserves praise tak...,fbc5ac3a3a7bb6c4d628cfbeef92b67bb18562f9
6,Another one tourist behaving badly file Two Am...,Two American woman arrested carving initial Co...,d093aa07380f75e63265793fcaa171772a6f4616
7,Following last year successful UK tour Prince ...,It first time tour stateside First show Louisv...,0485e4f199828cd03857391cbe573142193ca953
8,A shooting bar popular expatriate Mali Saturda...,A jihadist group claim responsibility audio re...,c6df0bbd0b5eaa67e1f2b70892bf0a5dfb761895
9,Manchester United defender Jonny Evans Newcast...,Alleged incident happened match St James Park ...,4a323dd3a1be975d93e941361082937cd2aafc88


# Loading Xsum Dataset

In [12]:
_CITATION = """
@article{Narayan2018DontGM,
  title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},
  author={Shashi Narayan and Shay B. Cohen and Mirella Lapata},
  journal={ArXiv},
  year={2018},
  volume={abs/1808.08745}
}
"""

_DESCRIPTION = """
Extreme Summarization (XSum) Dataset.
There are three features:
  - document: Input news article.
  - summary: One sentence summary of the article.
  - id: BBC ID of the article.
"""

# From https://github.com/EdinburghNLP/XSum/issues/12
_URL_DATA = "/Users/garimasingh/Desktop/Data Analyst Process/Project/Parquet/XSUM-EMNLP18-Summary-Data-Original.tar.gz"
_URL_SPLITS = (
    "https://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"
)

_DOCUMENT = "document"
_SUMMARY = "summary"
_ID = "id"

_REMOVE_LINES = set(
    [
        "Share this with\n",
        "Email\n",
        "Facebook\n",
        "Messenger\n",
        "Twitter\n",
        "Pinterest\n",
        "WhatsApp\n",
        "Linkedin\n",
        "LinkedIn\n",
        "Copy this link\n",
        "These are external links and will open in a new window\n",
    ]
)


class Xsum(datasets.GeneratorBasedBuilder):
    """Extreme Summarization (XSum) Dataset."""

    # Version 1.2.0 expands coverage, includes ids, and removes web contents.
    VERSION = datasets.Version("1.2.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    _DOCUMENT: datasets.Value("string"),
                    _SUMMARY: datasets.Value("string"),
                    _ID: datasets.Value("string"),
                }
            ),
            supervised_keys=(_DOCUMENT, _SUMMARY),
            homepage="https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        files_to_download = {"data": _URL_DATA, "splits": _URL_SPLITS}
        downloaded_files = dl_manager.download(files_to_download)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "split_path": downloaded_files["splits"],
                    "split_name": "train",
                    "data_dir": "bbc-summary-data",
                    "files": dl_manager.iter_archive(downloaded_files["data"]),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "split_path": downloaded_files["splits"],
                    "split_name": "validation",
                    "data_dir": "bbc-summary-data",
                    "files": dl_manager.iter_archive(downloaded_files["data"]),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "split_path": downloaded_files["splits"],
                    "split_name": "test",
                    "data_dir": "bbc-summary-data",
                    "files": dl_manager.iter_archive(downloaded_files["data"]),
                },
            ),
        ]

    def _generate_examples(self, split_path, split_name, data_dir, files):
        """Yields examples."""

        with open(split_path, "r", encoding="utf-8") as f:
            split_ids = json.load(f)
        split_ids = {k: set(v) for k, v in split_ids.items()}

        for path, f in files:
            if not split_ids[split_name]:
                break
            elif path.startswith(data_dir) and path.endswith(".summary"):
                i = os.path.basename(path).split(".")[0]
                if i in split_ids[split_name]:
                    split_ids[split_name].remove(i)
                    text = "".join(
                        [
                            line.decode("utf-8")
                            for line in f.readlines()
                            if line.decode("utf-8") not in _REMOVE_LINES and line.strip()
                        ]
                    )
                    # Each file follows below format:
                    # [SN]URL[SN]
                    # http://somelink
                    #
                    # [SN]TITLE[SN]
                    # some intro
                    #
                    # [SN]FIRST-SENTENCE[SN]
                    # some intro
                    #
                    # [SN]RESTBODY[SN]
                    # text line.
                    # another text line.
                    # "another text line."

                    # According to the following issue, FIRST-SENTENCE
                    # is the reference summary and TITLE is unused:
                    # https://github.com/EdinburghNLP/XSum/issues/22
                    segs = text.split("[SN]")
                    yield i, {_DOCUMENT: segs[8].strip(), _SUMMARY: segs[6].strip(), _ID: i}

In [13]:
# Load the Xsum dataset
xsum_dataset = load_dataset("xsum")

# Convert the dataset splits to pandas DataFrames
train_df = pd.DataFrame(xsum_dataset["train"])
validation_df = pd.DataFrame(xsum_dataset["validation"])
test_df = pd.DataFrame(xsum_dataset["test"])

# Display information about the DataFrames
print("Train Dataset Info:")
print(train_df.info())
print("\nValidation Dataset Info:")
print(validation_df.info())
print("\nTest Dataset Info:")
print(test_df.info())

Found cached dataset xsum (/Users/garimasingh/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

Train Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204045 entries, 0 to 204044
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   document  204045 non-null  object
 1   summary   204045 non-null  object
 2   id        204045 non-null  object
dtypes: object(3)
memory usage: 4.7+ MB
None

Validation Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11332 entries, 0 to 11331
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  11332 non-null  object
 1   summary   11332 non-null  object
 2   id        11332 non-null  object
dtypes: object(3)
memory usage: 265.7+ KB
None

Test Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11334 entries, 0 to 11333
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  11334 non-null  object
 1   summary   11

In [14]:
xsum_dataset = load_dataset("xsum")

full_df = pd.concat([pd.DataFrame(split) for split in xsum_dataset.values()])

print("Full Dataset Info:")
print(full_df.info())

Found cached dataset xsum (/Users/garimasingh/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

Full Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 226711 entries, 0 to 11333
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   document  226711 non-null  object
 1   summary   226711 non-null  object
 2   id        226711 non-null  object
dtypes: object(3)
memory usage: 6.9+ MB
None


In [15]:
full_df.head(15)

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


# Cleaning the dataset

In [16]:
# 1. Remove duplicates
full_df = full_df.drop_duplicates()

# 2. Handle missing values (assuming any row with a missing column should be removed)
full_df = full_df.dropna()

# 3. Text normalization - removing extra spaces and special characters if necessary
full_df['document'] = full_df['document'].str.replace('[^\w\s]', '', regex=True).str.strip()
full_df['summary'] = full_df['summary'].str.replace('[^\w\s]', '', regex=True).str.strip()

# 4. Remove stopwords from 'document' and 'summary'
full_df['document'] = full_df['document'].apply(remove_stopwords)
full_df['summary'] = full_df['summary'].apply(remove_stopwords)

In [17]:
full_df.head(15)

Unnamed: 0,document,summary,id
0,The full cost damage Newton Stewart one areas ...,Cleanup operations continuing across Scottish ...,35232142
1,A fire alarm went Holiday Inn Hope Street 0420...,Two tourist buses destroyed fire suspected ars...,40143035
2,Ferrari appeared position challenge final laps...,Lewis Hamilton stormed pole position Bahrain G...,35951548
3,John Edward Bates formerly Spalding Lincolnshi...,A former Lincolnshire Police officer carried s...,36266422
4,Patients staff evacuated Cerahpasa hospital We...,An armed man locked room psychiatric hospital ...,38826984
5,Simone Favaro got crucial try last move game f...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,Veronica Vanessa ChangoAlverez 31 killed anoth...,A man links car involved fatal bus stop crash ...,20836172
7,Belgian cyclist Demoitie died collision motorb...,Welsh cyclist Luke Rowe says changes sport mus...,35932467
8,Gundogan 26 told BBC Sport see finishing line ...,Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened 0720 GMT junction A127 Prog...,A jogger hit unmarked police car responding em...,30358490


# Lemmetize

In [18]:
full_df['document'] = full_df['document'].apply(lemmetize)
full_df['summary'] = full_df['summary'].apply(lemmetize)

In [19]:
full_df.head(15)

Unnamed: 0,document,summary,id
0,The full cost damage Newton Stewart one area w...,Cleanup operation continuing across Scottish B...,35232142
1,A fire alarm went Holiday Inn Hope Street 0420...,Two tourist bus destroyed fire suspected arson...,40143035
2,Ferrari appeared position challenge final lap ...,Lewis Hamilton stormed pole position Bahrain G...,35951548
3,John Edward Bates formerly Spalding Lincolnshi...,A former Lincolnshire Police officer carried s...,36266422
4,Patients staff evacuated Cerahpasa hospital We...,An armed man locked room psychiatric hospital ...,38826984
5,Simone Favaro got crucial try last move game f...,Defending Pro12 champion Glasgow Warriors bagg...,34540833
6,Veronica Vanessa ChangoAlverez 31 killed anoth...,A man link car involved fatal bus stop crash s...,20836172
7,Belgian cyclist Demoitie died collision motorb...,Welsh cyclist Luke Rowe say change sport must ...,35932467
8,Gundogan 26 told BBC Sport see finishing line ...,Manchester City midfielder Ilkay Gundogan say ...,40758845
9,The crash happened 0720 GMT junction A127 Prog...,A jogger hit unmarked police car responding em...,30358490


# Regularization

In [20]:
def regularize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Optional: remove digits
    # text = re.sub(r'\d+', '', text)
    return text

# Apply regularization to both dataframes
df['article'] = df['article'].apply(regularize_text)
df['highlights'] = df['highlights'].apply(regularize_text)
full_df['document'] = full_df['document'].apply(regularize_text)
full_df['summary'] = full_df['summary'].apply(regularize_text)

In [21]:
df.head() #CNNDataset

Unnamed: 0,article,highlights,id
0,share gift multiplied that may sound like esot...,zully broussard decided give kidney stranger a...,a4942dd663020ca54575471657a0af38d82897d6
1,on 6th april 1996 san jose clash dc united str...,the 20th mls season begin weekend league chang...,4157bc4da185971e2742f349d69a037343bc0d95
2,french striker bafetimbi gomis history faintin...,bafetimbi gomis collapse within 10 minute kick...,60736693e3b1b32d14337a317190c6606e879a85
3,it act frustration perhaps commonly associated...,rory mcilroy throw club water wgc cadillac cha...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,a pennsylvania community pulling together sear...,cayman naib 13 hasnt heard since wednesday pol...,2e6613d531843515bf5401286cc3e45c4df530d2


In [22]:
full_df.head() #XsumDataset

Unnamed: 0,document,summary,id
0,the full cost damage newton stewart one area w...,cleanup operation continuing across scottish b...,35232142
1,a fire alarm went holiday inn hope street 0420...,two tourist bus destroyed fire suspected arson...,40143035
2,ferrari appeared position challenge final lap ...,lewis hamilton stormed pole position bahrain g...,35951548
3,john edward bates formerly spalding lincolnshi...,a former lincolnshire police officer carried s...,36266422
4,patients staff evacuated cerahpasa hospital we...,an armed man locked room psychiatric hospital ...,38826984


# Transformation of Data

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import IncrementalPCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

In [25]:
# Apply TF-IDF Vectorization across both DataFrames with the same vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
df_tfidf = tfidf_vectorizer.fit_transform(df['article'])
full_df_tfidf = tfidf_vectorizer.transform(full_df['document']) 
print("Numerical conversion of df:",df_tfidf.shape)
print("Numerical conversion of full_df:",full_df_tfidf.shape)

# Apply SVD for dimensionality reduction
svd = TruncatedSVD(n_components=100)
df_reduced = svd.fit_transform(df_tfidf)  
full_df_reduced = svd.transform(full_df_tfidf)  

print("Reduced shape for df:", df_reduced.shape)
print("Reduced shape for full_df:", full_df_reduced.shape)

Numerical conversion of df: (311971, 1000)
Numerical conversion of full_df: (226711, 1000)
Reduced shape for df: (311971, 100)
Reduced shape for full_df: (226711, 100)


In [26]:
# Adding new features such as text length
df['article_length'] = df['article'].apply(len)
df['highlights_length'] = df['highlights'].apply(len)
full_df['document_length'] = full_df['document'].apply(len)
full_df['summary_length'] = full_df['summary'].apply(len)

In [27]:
df.head()

Unnamed: 0,article,highlights,id,article_length,highlights_length
0,share gift multiplied that may sound like esot...,zully broussard decided give kidney stranger a...,a4942dd663020ca54575471657a0af38d82897d6,2964,118
1,on 6th april 1996 san jose clash dc united str...,the 20th mls season begin weekend league chang...,4157bc4da185971e2742f349d69a037343bc0d95,5322,151
2,french striker bafetimbi gomis history faintin...,bafetimbi gomis collapse within 10 minute kick...,60736693e3b1b32d14337a317190c6606e879a85,1760,215
3,it act frustration perhaps commonly associated...,rory mcilroy throw club water wgc cadillac cha...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad,1306,110
4,a pennsylvania community pulling together sear...,cayman naib 13 hasnt heard since wednesday pol...,2e6613d531843515bf5401286cc3e45c4df530d2,1786,86


In [28]:
full_df.head()

Unnamed: 0,document,summary,id,document_length,summary_length
0,the full cost damage newton stewart one area w...,cleanup operation continuing across scottish b...,35232142,1587,98
1,a fire alarm went holiday inn hope street 0420...,two tourist bus destroyed fire suspected arson...,40143035,566,73
2,ferrari appeared position challenge final lap ...,lewis hamilton stormed pole position bahrain g...,35951548,3317,92
3,john edward bates formerly spalding lincolnshi...,a former lincolnshire police officer carried s...,36266422,1070,96
4,patients staff evacuated cerahpasa hospital we...,an armed man locked room psychiatric hospital ...,38826984,741,94


# Splitting datasets into test, train and validation

CNN Dataset

In [29]:
# Splitting data into training set (90%) and the remaining 10%
train, remaining = train_test_split(df, test_size=0.1, random_state=42)

# Splitting the remaining data into validation and test sets (50% each of remaining data, which equals 5% each of total data)
validation, test = train_test_split(remaining, test_size=0.5, random_state=42)

# Saving datasets to CSV files
train.to_csv('/Users/garimasingh/Desktop/Data Analyst Process/Project/CNN Dataset/CNNTrain.csv', index=False)
validation.to_csv('/Users/garimasingh/Desktop/Data Analyst Process/Project/CNN Dataset/CNNValidation.csv', index=False)
test.to_csv('/Users/garimasingh/Desktop/Data Analyst Process/Project/CNN Dataset/CNNTest.csv', index=False)

Xsum dataset

In [30]:
# Splitting data into training set (90%) and the remaining 10%
train, remaining = train_test_split(full_df, test_size=0.1, random_state=42)

# Splitting the remaining data into validation and test sets (50% each of remaining data, which equals 5% each of total data)
validation, test = train_test_split(remaining, test_size=0.5, random_state=42)

# Define the path to save the files
output_path = '/Users/garimasingh/Desktop/Data Analyst Process/Project/Xsum dataset/'

# Saving datasets to CSV files
train.to_csv(output_path + 'XsumTrain.csv', index=False)
validation.to_csv(output_path + 'XsumValidation.csv', index=False)
test.to_csv(output_path + 'XsumTest.csv', index=False)