In [203]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
#!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [204]:
!pip install transformers



In [205]:
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint

from transformers import pipeline

In [206]:
df = pd.read_csv('BBC News Train.csv')

In [207]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [208]:
labels = set(df['Category'])
labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [209]:
# Pick a label
label = 'business'

In [211]:
texts = df['Text']
texts.head()

0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens in ...
3    lifestyle  governs mobile choice  faster  bett...
4    enron bosses in $168m payout eighteen former e...
Name: Text, dtype: object

In [212]:
np.random.seed(1234)

In [213]:
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]

In [214]:
print(textwrap.fill(doc, replace_whitespace=False, fix_sentence_endings=True))

stars pay tribute to actor davis hollywood stars including spike lee
burt reynolds and oscar nominee alan alda have paid tribute to actor
ossie davis at a funeral in new york.  veteran star ossie davis  a
well-known civil rights activist  died in miami at the age of 87 on 4
february 2005. friends and family  including actress ruby dee his wife
of 56 years  gathered at the riverside church on saturday.  also
present at the service was former us president bill clinton and singer
harry belafonte  who gave the eulogy.  he would have been a very good
president of the united states   said mr clinton.  like most of you
here  he gave more to me than i gave to him.   the 87-year-old was
found dead last weekend in his hotel room in florida  where he was
making a film.  police said that he appeared to have died of natural
causes.  davis made his acting debut in 1950 in no way out starring
sidney poiter.  he frequently collaborated with director spike lee
starring in seven lee films including jung

In [215]:
mlm = pipeline('fill-mask')

No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [218]:
mlm('stars pay <mask> to actor davis hollywood stars including spike lee')


[{'score': 0.7043274641036987,
  'token': 6430,
  'token_str': ' tribute',
  'sequence': 'stars pay tribute to actor davis hollywood stars including spike lee'},
 {'score': 0.2804144322872162,
  'token': 20898,
  'token_str': ' homage',
  'sequence': 'stars pay homage to actor davis hollywood stars including spike lee'},
 {'score': 0.003258502809330821,
  'token': 25432,
  'token_str': ' heed',
  'sequence': 'stars pay heed to actor davis hollywood stars including spike lee'},
 {'score': 0.002301377011463046,
  'token': 1503,
  'token_str': ' attention',
  'sequence': 'stars pay attention to actor davis hollywood stars including spike lee'},
 {'score': 0.001203577034175396,
  'token': 2098,
  'token_str': ' respect',
  'sequence': 'stars pay respect to actor davis hollywood stars including spike lee'}]

In [219]:
text = 'Shares in <mask> and plane-making ' + \
  'giant Bombardier have fallen to a 10-year low following the departure ' + \
  'of its chief executive and two members of the board.'

mlm(text)

[{'score': 0.6640910506248474,
  'token': 11016,
  'token_str': ' Airbus',
  'sequence': 'Shares in Airbus and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.'},
 {'score': 0.2614683508872986,
  'token': 6722,
  'token_str': ' Boeing',
  'sequence': 'Shares in Boeing and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.'},
 {'score': 0.02363520674407482,
  'token': 15064,
  'token_str': ' aerospace',
  'sequence': 'Shares in aerospace and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.'},
 {'score': 0.014581655152142048,
  'token': 8537,
  'token_str': ' airlines',
  'sequence': 'Shares in airlines and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the

In [220]:
text = 'Shares in train and plane-making ' + \
  'giant Bombardier have fallen to a 10-year low following the <mask> ' + \
  'of its chief executive and two members of the board.'

pprint(mlm(text))

[{'score': 0.5513899326324463,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the resignation of its chief '
              'executive and two members of the board.',
  'token': 6985,
  'token_str': ' resignation'},
 {'score': 0.21090494096279144,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief executive '
              'and two members of the board.',
  'token': 5824,
  'token_str': ' departure'},
 {'score': 0.13042041659355164,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departures of its chief '
              'executive and two members of the board.',
  'token': 25624,
  'token_str': ' departures'},
 {'score': 0.036515600979328156,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-y

In [221]:
# Prompt for the MLM
prompt = "boys are very <mask>"

# Generate 10 top prompts
result = mlm(prompt, top_k=10)

# Print the generated prompts
for item in result:
    print(item['sequence'])

boys are very polite
boys are very smart
boys are very nice
boys are very naughty
boys are very good
boys are very cute
boys are very lucky
boys are very stupid
boys are very cool
boys are very respectful


##based on previous file and code, can you write a function and script that automatically masks and replaces words in the whole file. Words that have to be replaced are the 5 most important words per text entry, based on using tf-idf

In [120]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, DistilBertTokenizer

In [226]:
# Load dataset
df = pd.read_csv('BBC News Train.csv')

In [227]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [228]:
# Initialize tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [229]:
# Function to tokenize and truncate text to fit the DistilBERT model constraints
def tokenize_truncate(text):
    tokens = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_string(tokens[:512])  # Convert back to string after truncation


In [235]:
# Apply tokenizer and truncate texts
df['truncated_text'] = df['Text'].apply(tokenize_truncate)

In [236]:
# Set up the TF-IDF vectorizer and fit it on the truncated texts
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['truncated_text'])

# Get feature names to locate the highest TF-IDF score per text
feature_names = vectorizer.get_feature_names_out()

In [237]:
# Function to find the word with the highest TF-IDF score in each document
def get_key_word(row_index, tfidf_matrix, feature_names):
    row_data = tfidf_matrix.getrow(row_index)
    max_tfidf_idx = row_data.indices[np.argmax(row_data.data)]
    return feature_names[max_tfidf_idx]

In [238]:
# Replace the highest TF-IDF score word with [MASK]
df['masked_text'] = df.apply(lambda row: row['truncated_text'].replace(get_key_word(row.name, tfidf_matrix, feature_names), '[MASK]', 1), axis=1)


In [239]:
# Initialize the fill-mask pipeline
fill_mask = pipeline('fill-mask', model='distilbert-base-uncased')

In [240]:
# Function to apply the mask filling
def fill_masked_word(masked_text):
    try:
        result = fill_mask(masked_text)
        return result[0]['sequence']  # Returning the most likely replacement
    except Exception as e:
        return str(e)  # Handle errors possibly due to no mask in text or other issues

In [241]:
# Apply the mask filling to the masked texts
df['filled_text'] = df['masked_text'].head(5).apply(fill_masked_word)

In [242]:
df.head(5)

Unnamed: 0,ArticleId,Text,Category,truncated_text,masked_text,filled_text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex - boss launches defence lawyers de...,[MASK] ex - boss launches defence lawyers defe...,former ex - boss launches defence lawyers defe...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,german business confidence slides german busin...,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,bbc poll indicates economic gloom citizens in ...,The size of tensor a (514) must match the size...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,lifestyle governs mobile choice faster better ...,The size of tensor a (514) must match the size...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in $ 168m payout eighteen former ...,[MASK] bosses in $ 168m payout eighteen former...,corporate bosses in $ 168m payout eighteen for...


 segment the text into manageable chunks (each not exceeding the maximum token length of 512 for DistilBERT), and within each of these segments, identify and mask the most important word using the TF-IDF metric. We then need to handle the continuation of the text beyond these segments similarly, ensuring that no important content is lost.

 Segment the text into chunks each up to 512 tokens long.
Compute TF-IDF for each segment.
Mask the most important word in each segment.
Process each segment separately for filling the [MASK].
Recombine the segments into the final, modified text.

In [243]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, DistilBertTokenizer

In [244]:
# Load dataset
df = pd.read_csv('BBC News Train.csv')

In [245]:
# Initialize tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [246]:
# Function to segment text into chunks of up to 512 tokens
def segment_text(text, max_length=512):
    tokens = tokenizer.tokenize(text)
    segments = []
    for i in range(0, len(tokens), max_length):
        segments.append(tokenizer.convert_tokens_to_string(tokens[i:i+max_length]))
    return segments

In [247]:
# Set up the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

In [248]:
# Function to mask the most important word in a segment
def mask_important_word(segment):
    tfidf_matrix = vectorizer.fit_transform([segment])
    feature_names = vectorizer.get_feature_names_out()
    max_tfidf_idx = np.argmax(tfidf_matrix[0].toarray())
    key_word = feature_names[max_tfidf_idx]
    return segment.replace(key_word, '[MASK]', 1), key_word

In [251]:
# Initialize the fill-mask pipeline
fill_mask = pipeline('fill-mask', model='distilbert-base-uncased')

In [252]:
# Process each text
def process_text(text):
    segments = segment_text(text)
    masked_filled_segments = []
    masked_words = []

    for segment in segments:
        if '[MASK]' not in segment:  # Ensure we don't reprocess already masked segments
            masked_segment, key_word = mask_important_word(segment)
            masked_words.append(key_word)  # Collect masked words
        else:
            masked_segment = segment
            masked_words.append('[Existing MASK]')  # Placeholder if MASK already processed

        # Fill the mask
        try:
            filled_segment = fill_mask(masked_segment)[0]['sequence']
        except Exception as e:
            filled_segment = masked_segment  # Use masked segment if error occurs
        masked_filled_segments.append(filled_segment)

    return ''.join(masked_filled_segments), ', '.join(masked_words)

In [253]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [257]:
df_short = df.head(100)

In [258]:
# Apply the processing to each text and split results into two new columns
df_short['processed_results'] = df_short['Text'].apply(process_text)
df_short['filled_text'] = df_short['processed_results'].apply(lambda x: x[0])
df_short['masked_words'] = df_short['processed_results'].apply(lambda x: x[1])

# Clean up the DataFrame by removing the temporary column
df_short.drop(columns=['processed_results'], inplace=True)

# Display the original text, filled text, and masked words
#print(df[['text', 'filled_text', 'masked_words']].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['processed_results'] = df_short['Text'].apply(process_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['filled_text'] = df_short['processed_results'].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['masked_words'] = df_short['processed_results'].a

In [260]:
df_short

Unnamed: 0,ArticleId,Text,Category,filled_text,masked_words
0,1833,worldcom ex-boss launches defence lawyers defe...,business,former ex - boss launches defence lawyers defe...,worldcom
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,economy
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"world, 15"
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,"phone, cameras"
4,917,enron bosses in $168m payout eighteen former e...,business,corporate bosses in $ 168m payout eighteen for...,enron
...,...,...,...,...,...
95,1962,durex maker ssl awaits firm bid uk condom make...,business,durex maker condom awaits firm bid uk condom m...,ssl
96,1474,bmw to recall faulty diesel cars bmw is to rec...,business,attempting to recall faulty diesel cars bmw is...,bmw
97,603,pension hitch for long-living men male life ex...,business,pension hitch for long - living men male life ...,years
98,1220,ryanair in $4bn boeing plane deal budget airli...,business,bringing in $ 4bn boeing plane deal budget air...,ryanair


In [262]:
# Filter and display texts that were originally longer than 512 tokens
df_short['token_count'] = df_short['Text'].apply(lambda x: len(tokenizer.tokenize(x)))
long_texts_df = df_short[df_short['token_count'] > 512]
#print(long_texts_df[['text', 'filled_text', 'masked_words']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['token_count'] = df_short['Text'].apply(lambda x: len(tokenizer.tokenize(x)))


In [263]:
long_texts_df

Unnamed: 0,ArticleId,Text,Category,filled_text,masked_words,token_count
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...,"world, 15",578
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,"phone, cameras",695
8,2034,car giant hit by mercedes slump a slump in pro...,business,car giant hit by mercedes slump a slump in pro...,"euros, 2005",545
10,1683,blair rejects iraq advice calls tony blair has...,politics,blair rejects iraq [MASK] calls tony blair has...,"advice, said",619
16,342,u2 s desire to be number one u2 who have won ...,entertainment,u2 s desire to be number one u2 who have won t...,"band, band",962
25,1561,china had role in yukos split-up china lent ru...,business,china had role in yukos split - up china lent ...,"rosneft, deal",602
26,702,peer-to-peer nets here to stay peer-to-peer ...,tech,peer - to - peer nets here to stay peer - to -...,"p2p, p2p",985
27,1026,henman to face saulnier test british number on...,sport,henman to face saulnier test british number on...,"open, russia, france",1218
39,1532,security scares spark browser fix microsoft is...,tech,security scares spark [MASK] fix microsoft is ...,"browser, 2004",544
43,1394,viewers to be able to shape tv imagine editing...,tech,viewers to be able to shape [MASK] imagine edi...,"tv, make",656
