# In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Summarize the text column of a tabular (CSV) dataset with a pretrained Hugging Face
# T5 model. Note: this script only runs inference; no fine-tuning is performed here.

# Load the pretrained "t5-base" checkpoint and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Run on the GPU when one is available, otherwise stay on the CPU. Tensors passed
# to model.generate() must be on the same device as the model.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the data and keep only the first 1000 rows.
df = pd.read_csv("https://raw.githubusercontent.com/priya-dwivedi/Deep-Learning/master/data/india-news-headlines.csv")
df = df.head(1000)

def get_summary(text) -> str:
    """Summarize ``text`` using the module-level ``tokenizer`` and ``model``.

    The text is whitespace-normalized, prefixed with ``"summarize: "``,
    encoded, and passed to ``model.generate`` with 4-beam search. Both the
    normalized input and the generated summary are printed.

    Args:
        text: The text to summarize. Non-string values (for example the NaN
            that pandas uses for missing cells) and blank strings are
            accepted and produce an empty summary.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        ``text`` is not a string or contains only whitespace.
    """
    # Missing cells arrive as float NaN / None; there is nothing to summarize.
    if not isinstance(text, str):
        return ""

    # Collapse every run of whitespace (spaces, tabs, newlines) into a single
    # space and drop leading/trailing whitespace.
    preprocess_text = " ".join(text.split())
    if not preprocess_text:
        return ""

    t5_prepared_Text = "summarize: " + preprocess_text
    print("original text preprocessed: \n", preprocess_text)

    # Truncate over-long inputs to 512 tokens.
    # NOTE(review): 512 is presumably the tokenizer's input limit for
    # "t5-base" -- confirm against tokenizer.model_max_length.
    # The encoded ids are moved to whatever device the model currently lives
    # on, so this works on both CPU and GPU without a device mismatch.
    tokenized_text = tokenizer.encode(
        t5_prepared_Text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)

    # summarize
    # NOTE(review): min_length=30 forces at least 30 generated tokens even for
    # very short inputs -- confirm this is intended.
    summary_ids = model.generate(
        tokenized_text,
        num_beams=4,
        no_repeat_ngram_size=2,
        min_length=30,
        max_length=100,
        early_stopping=True,
    )

    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Summarized text: \n", output)
    return output

# Generate a summary for every headline and store it in a new "summary" column.
headlines = df["headline_text"]
df["summary"] = headlines.apply(get_summary)

# Write the results to disk, leaving out the DataFrame index.
output_csv = "news_summary.csv"
df.to_csv(output_csv, index=False)

# Read the saved file back and preview its first rows (the preview is only
# displayed when this is the last expression of a notebook cell).
df = pd.read_csv(output_csv)
df.head()

# original text preprocessed:
#  2 killed in clash between supporters of rival candidates in poll-bound madhya pradesh