In [6]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

#Tokenization and Lowercasing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\esman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\esman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\esman\AppData\Roaming\nltk_data...


In [4]:
df = pd.read_csv("./bbc_news.csv")

In [5]:
df.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [21]:
df.shape

(26695, 5)

In [None]:
random.

In [22]:
# Drop one or more columns
columns_to_drop = ['title', 'pubDate', 'guid', 'link']
df = df.drop(columns=columns_to_drop)

# Display the DataFrame after dropping columns
print("\nDataFrame after dropping columns:")
df


DataFrame after dropping columns:


Unnamed: 0,description
0,The Ukrainian president says the country will ...
1,"Jeremy Bowen was on the frontline in Irpin, as..."
2,One of the world's biggest fertiliser firms sa...
3,The parents of the Manchester Arena bombing's ...
4,Consumers are feeling the impact of higher ene...
...,...
26690,Pep Guardiola says his Manchester City side ca...
26691,Liverpool and Scotland legend Sir Kenny Dalgli...
26692,It seemed like Arsenal had it figured out and ...
26693,Manager Sean Dyche enjoys a winning return to ...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ABAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ABAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ABAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
df['tokenized_text'] = df['description'].apply(lambda x: word_tokenize(str(x).lower()))
#This line tokenizes the text in the 'text' column, converts each word to lowercase, and creates a new column 'tokenized_text' containing lists of tokens for each article.

In [25]:
stop_words = set(stopwords.words('english'))
df['filtered_text'] = df['tokenized_text'].apply(lambda x: [word for word in x if word.lower() not in stop_words])
#Stopwords (common words like 'and', 'the', etc.) are removed from the tokenized text, and the result is stored in a new column 'filtered_text'.

In [26]:
lemmatizer = WordNetLemmatizer()
df['lemmatized_text'] = df['filtered_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
#Lemmatization reduces words to their base or root form, helping to standardize variations. The lemmatized text is stored in a new column 'lemmatized_text'.

In [27]:
df['preprocessed_text'] = df['lemmatized_text'].apply(' '.join)
#The lemmatized text is converted back to a single string per article and stored in the 'preprocessed_text' column.

In [28]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['preprocessed_text'])
#TF-IDF vectorization converts the preprocessed text into numerical vectors, representing the importance of each word in the context of the entire corpus.

In [29]:
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
#Cosine similarity is calculated between each pair of articles based on their TF-IDF vectors. 

In [31]:
df

Unnamed: 0,description,tokenized_text,filtered_text,lemmatized_text,preprocessed_text
0,The Ukrainian president says the country will ...,"[the, ukrainian, president, says, the, country...","[ukrainian, president, says, country, forgive,...","[ukrainian, president, say, country, forgive, ...",ukrainian president say country forgive forget...
1,"Jeremy Bowen was on the frontline in Irpin, as...","[jeremy, bowen, was, on, the, frontline, in, i...","[jeremy, bowen, frontline, irpin, ,, residents...","[jeremy, bowen, frontline, irpin, ,, resident,...","jeremy bowen frontline irpin , resident came r..."
2,One of the world's biggest fertiliser firms sa...,"[one, of, the, world, 's, biggest, fertiliser,...","[one, world, 's, biggest, fertiliser, firms, s...","[one, world, 's, biggest, fertiliser, firm, sa...",one world 's biggest fertiliser firm say confl...
3,The parents of the Manchester Arena bombing's ...,"[the, parents, of, the, manchester, arena, bom...","[parents, manchester, arena, bombing, 's, youn...","[parent, manchester, arena, bombing, 's, young...",parent manchester arena bombing 's youngest vi...
4,Consumers are feeling the impact of higher ene...,"[consumers, are, feeling, the, impact, of, hig...","[consumers, feeling, impact, higher, energy, c...","[consumer, feeling, impact, higher, energy, co...",consumer feeling impact higher energy cost fue...
...,...,...,...,...,...
26690,Pep Guardiola says his Manchester City side ca...,"[pep, guardiola, says, his, manchester, city, ...","[pep, guardiola, says, manchester, city, side,...","[pep, guardiola, say, manchester, city, side, ...",pep guardiola say manchester city side blame b...
26691,Liverpool and Scotland legend Sir Kenny Dalgli...,"[liverpool, and, scotland, legend, sir, kenny,...","[liverpool, scotland, legend, sir, kenny, dalg...","[liverpool, scotland, legend, sir, kenny, dalg...",liverpool scotland legend sir kenny dalglish h...
26692,It seemed like Arsenal had it figured out and ...,"[it, seemed, like, arsenal, had, it, figured, ...","[seemed, like, arsenal, figured, top, wsl, wai...","[seemed, like, arsenal, figured, top, wsl, wai...","seemed like arsenal figured top wsl waiting , ..."
26693,Manager Sean Dyche enjoys a winning return to ...,"[manager, sean, dyche, enjoys, a, winning, ret...","[manager, sean, dyche, enjoys, winning, return...","[manager, sean, dyche, enjoys, winning, return...",manager sean dyche enjoys winning return burnl...
