## Import libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle

# import clean text data
import re
import html
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import unicodedata
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')



# Data preprocessing
from sklearn.model_selection import train_test_split
from datasets import Dataset

#model
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

* Read dataset

In [None]:

# Load the summarization dataset from CSV and preview the first rows.
df=pd.read_csv('text_summarizer_data.csv')
df.head()

## EDA

In [None]:
# Show the dataset dimensions as (rows, columns).
df.shape

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Drop the 'Unnamed: 0' column in place
# (presumably a leftover index column from the CSV export - confirm).
df.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows in place; the index is not reset afterwards.
df.drop_duplicates(inplace=True)

In [None]:
# Re-check the duplicate count after dropping duplicates.
df.duplicated().sum()

In [None]:
# Number of characters in the fifth row's content.
# Positional .iloc is used because drop_duplicates() does not reset the index,
# so the label 4 may no longer exist and df['content'][4] could raise KeyError.

len(df['content'].iloc[4])

In [None]:
# Number of whitespace-separated tokens in the fifth row's content.
# Positional .iloc is used because drop_duplicates() does not reset the index,
# so the label 4 may no longer exist and df['content'][4] could raise KeyError.

len(df['content'].iloc[4].split())

### `Visualization`

In [None]:
# Distribution of content lengths, measured in characters.
# Missing entries yield NaN lengths, which plt.hist cannot bin, so drop them.
length_statment = df['content'].str.len().dropna()

plt.figure(figsize=(10, 6))
plt.hist(length_statment, bins=50)
plt.title('Length of statement')
# The x-axis shows character counts, not the content itself.
plt.xlabel('Content length (characters)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Distribution of title lengths, measured in characters.
# Missing entries yield NaN lengths, which plt.hist cannot bin, so drop them.
length_statment = df['title'].str.len().dropna()

plt.figure(figsize=(10, 6))
plt.hist(length_statment, bins=50)
plt.title('Length of title')
# The x-axis shows character counts, not the title itself.
plt.xlabel('Title length (characters)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Join every content entry into one corpus string for the word cloud.
all_text = ' '.join(df['content'].astype(str))

# Build the word cloud to see which words occur most often in the corpus.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
)
wordcloud.generate(all_text)

# Display the rendered cloud without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Join every title into one corpus string for the word cloud.
all_text = ' '.join(df['title'].astype(str))

# Build the word cloud to see which words occur most often in the titles.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
)
wordcloud.generate(all_text)

# Display the rendered cloud without axes.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:

# VADER needs its lexicon resource; without it SentimentIntensityAnalyzer()
# raises LookupError. The download is skipped when already up to date.
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Compute the VADER compound sentiment score for each title and content.
df['sentiment_title'] = df['title'].apply(lambda x: sid.polarity_scores(x)['compound'])
df['sentiment_content'] = df['content'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Scatter plot of title sentiment against content sentiment.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['sentiment_title'], y=df['sentiment_content'])
plt.title('Sentiment Analysis of Titles vs. Content')
plt.xlabel('Sentiment of Titles')
plt.ylabel('Sentiment of Content')
plt.show()


### `Clean Data`

In [None]:
def remove_special_chars(text):
    """Lower-case *text* and repair leftover markup and escape artifacts.

    Broken HTML entity fragments (``#39;``, ``amp;``, ...), ``<br />`` tags,
    escaped newlines and quotes, ``<unk>`` and ``@.@`` / ``@-@`` tokens are
    rewritten. Remaining HTML entities are then unescaped, and runs of two or
    more spaces are collapsed into a single space.
    """
    # Applied sequentially, top to bottom; the order is significant because
    # the final backslash rule must run after the '\\n' and '\\"' rules.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    cleaned = text.lower()
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return re.sub(r'  +', ' ', html.unescape(cleaned))


def remove_non_ascii(text):
    """Return *text* with non-ASCII characters removed.

    The string is NFKD-normalized first, so accented letters keep their
    ASCII base letter; anything that still cannot be encoded as ASCII is
    dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def to_lowercase(text):
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered



def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character deleted."""
    # Mapping each punctuation character to None deletes it on translate().
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)


def replace_numbers(text):
    """Return *text* with every run of digits removed.

    Despite the name, digits are deleted rather than replaced by words.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)


def remove_whitespaces(text):
    """Return *text* without leading or trailing whitespace."""
    return text.lstrip().rstrip()


# Lazily populated cache of the NLTK English stop words.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(words):
    """Return the tokens of *words* that are not English stop words.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of the tokens, in their original order, that do not appear in
        NLTK's English stop-word list. Matching is exact (case-sensitive).
    """
    # The stop-word set is cached so it is not re-read and rebuilt for every
    # call, which matters when this runs once per DataFrame row.
    stop_words = _english_stop_words()
    return [word for word in words if word not in stop_words]



def stem_words(words):
    """Return the Porter stem of each token in *words*, preserving order."""
    porter = PorterStemmer()
    return list(map(porter.stem, words))

def text2words(text):
    """Split *text* into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def normalize_text(text):
    """Run the full cleaning pipeline on *text* and return the cleaned string.

    String-level steps run first (special characters, non-ASCII, punctuation,
    lower-casing, digit removal). The text is then tokenized, stop words are
    removed, the remaining tokens are stemmed, and the tokens are re-joined
    with single spaces.
    """
    # Steps that take a string and return a string, in application order.
    string_steps = (
        remove_special_chars,
        remove_non_ascii,
        remove_punctuation,
        to_lowercase,
        replace_numbers,
    )
    for step in string_steps:
        text = step(text)

    # Steps that take a token list and return a token list.
    tokens = text2words(text)
    for step in (remove_stopwords, stem_words):
        tokens = step(tokens)

    return ' '.join(tokens)

In [None]:
# Run the cleaning pipeline over both text columns, overwriting them.
for column in ('content', 'title'):
    df[column] = df[column].apply(normalize_text)

In [None]:
# Inspect the cleaned content of the row labelled 0.
df['content'][0]

In [None]:
# Inspect the cleaned title of the row labelled 0.
df['title'][0]

In [None]:
# Count the whitespace-separated words in each cleaned text column.
df['content_word_count'] = df['content'].map(lambda cleaned: len(cleaned.split()))
df['title_word_count'] = df['title'].map(lambda cleaned: len(cleaned.split()))

In [None]:
# Take the 95th percentile of the word counts as the length caps.
# NOTE(review): these caps count whitespace-separated words, but
# preprocess_data passes them to the tokenizer as max_length, which counts
# tokens (and the input also carries the 'summarize: ' prefix) - confirm the
# resulting truncation is acceptable.
max_content_len, max_title_len = (
    int(df[count_column].quantile(0.95))
    for count_column in ('content_word_count', 'title_word_count')
)

In [None]:
# Report the chosen length caps.
print(f"Max content length: {max_content_len}")
print(f"Max title length: {max_title_len}")

In [None]:
# Build the columns used for T5: the input is the cleaned content prefixed
# with 'summarize: ', and the target is the cleaned title.
df['input_text'] = 'summarize: ' + df['content']
df['target_text'] = df['title']

In [None]:
# Split into train / validation / test: 30% is held out and then halved,
# giving roughly 70% / 15% / 15%. The fixed random_state keeps the split
# reproducible.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
# Convert each pandas split into a `datasets.Dataset`.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame)
    for frame in (train_df, val_df, test_df)
)

In [None]:
# Load the pretrained 't5-small' tokenizer.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
def preprocess_data(examples):
    """Tokenize inputs and targets for T5 fine-tuning.

    Uses the module-level ``tokenizer``, ``max_content_len`` and
    ``max_title_len``.

    Args:
        examples: Mapping with ``'input_text'`` and ``'target_text'`` entries,
            as supplied by ``Dataset.map``: lists of strings when
            ``batched=True``, single strings otherwise.

    Returns:
        The tokenizer output for the inputs, with an added ``'labels'`` entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100 so the loss ignores them.
    """
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=max_content_len, truncation=True, padding="max_length")

    # `text_target` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(text_target=targets, max_length=max_title_len, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        """Replace pad token ids with -100, the index ignored by the loss."""
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    # Ground truth. Without masking, the model would also be trained (and
    # evaluated) on predicting the padding that fills every short title.
    label_ids = labels['input_ids']
    if isinstance(targets, str):
        # Non-batched call: a single flat list of ids.
        model_inputs['labels'] = _mask_padding(label_ids)
    else:
        model_inputs['labels'] = [_mask_padding(ids) for ids in label_ids]
    return model_inputs

In [None]:
# Tokenize every split in batches, keeping only the columns produced by
# preprocess_data (all original columns are removed).
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_data, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Load the pretrained 't5-small' model for conditional generation.
model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [None]:

# Training hyperparameters for fine-tuning.
training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=3,
    save_steps=500,  # same interval as eval_steps
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # a lower eval_loss is better
    gradient_accumulation_steps=2,  # 16 x 2 = 32 samples per device per optimizer step
    warmup_steps=500,
)

In [None]:
# Add early stopping with a patience of 3.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [None]:
# Set up the Trainer with the model, training arguments, train/validation
# datasets, and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping_callback]
)

In [None]:
# Start the fine-tuning process.
trainer.train()

In [None]:
# Evaluate the model on the held-out test split and report its loss.
test_results = trainer.evaluate(test_dataset)
print(f"Test Loss: {test_results['eval_loss']}")

In [None]:
# Save the tokenizer and the model to local directories.
tokenizer.save_pretrained("summarization_toknizer")
model.save_pretrained("summarization_model")
