In [None]:
!pip install bert-extractive-summarizer
!pip install sentence_transformers

In [2]:
import pandas as pd
from summarizer import Summarizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

##Text Summarisation

In [3]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Datasets/daily_mail_news.csv')[:5]
df.head()

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,00a665151b89a53e5a08a389df8334f4106494c2,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...


In [6]:
# Create a new column 'extractive_summary' to store the extractive summary
extractive_summarizer = Summarizer()
df['extractive_summary'] = df['article'].apply(lambda x: extractive_summarizer(x))

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [7]:
# Sentence Embeddings using BERT
sentence_bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [8]:
# Generate embeddings for each sentence
df['sentence_embeddings'] = df['article'].apply(lambda x: sentence_bert_model.encode([x])[0])

In [9]:
# Calculate similarity scores using cosine similarity
cosine_similarities = cosine_similarity(df['sentence_embeddings'].tolist(), df['sentence_embeddings'].tolist())

# Select Top Sentences based on similarity scores
num_top_sentences = 3
df['top_sentences'] = ''
for i in range(len(df)):
    similarity_scores = list(enumerate(cosine_similarities[i]))
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_sentences_indices = [score[0] for score in sorted_scores[1:num_top_sentences+1]]
    df.at[i, 'top_sentences'] = ' '.join([df['article'][j] for j in top_sentences_indices])

In [10]:
# Abstractive Summarization using 'bert-extractive-summarizer'
abstractive_summarizer = Summarizer()
# Create a new column 'abstractive_summary' to store the abstractive summary
df['abstractive_summary'] = df['article'].apply(lambda x: abstractive_summarizer(x))



In [11]:
df_show = df[['article', 'extractive_summary', 'top_sentences', 'abstractive_summary']]
df_show.head()


Unnamed: 0,article,extractive_summary,top_sentences,abstractive_summary
0,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th...",A middle-school teacher in China has inked hun...,"Sally Forrest, an actress-dancer who graced th..."
1,A middle-school teacher in China has inked hun...,A middle-school teacher in China has inked hun...,"Sally Forrest, an actress-dancer who graced th...",A middle-school teacher in China has inked hun...
2,A man convicted of killing the father and sist...,A man convicted of killing the father and sist...,"Sally Forrest, an actress-dancer who graced th...",A man convicted of killing the father and sist...
3,Avid rugby fan Prince Harry could barely watch...,Avid rugby fan Prince Harry could barely watch...,A Triple M Radio producer has been inundated w...,Avid rugby fan Prince Harry could barely watch...
4,A Triple M Radio producer has been inundated w...,A Triple M Radio producer has been inundated w...,Avid rugby fan Prince Harry could barely watch...,A Triple M Radio producer has been inundated w...


##Sentiment Analysis

In [None]:
!pip install transformers
!pip install torch
!pip install datasets

In [23]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [34]:
dataset = load_dataset("imdb")
df = dataset["train"].to_pandas()

In [35]:
df = df[:8000]
df['label'] = df['label'].apply(lambda x: 0 if x == "neg" else 1)

In [36]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [37]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation=True, padding=True)

In [38]:
def tokenize_data(df, tokenizer, max_length=256):
    input_ids = []
    attention_masks = []

    for text in df['text']:
        encoding = tokenizer(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df['label'].tolist())

    return TensorDataset(input_ids, attention_masks, labels)

In [39]:
train_dataset = tokenize_data(train_df, tokenizer)
val_dataset = tokenize_data(val_df, tokenizer)

In [40]:
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [41]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [43]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(true_labels, predictions)
classification_report_result = classification_report(true_labels, predictions)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report_result)

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       800

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800

