In [20]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from nltk.metrics import distance

# Step 1: Load and preprocess the dataset
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)



In [21]:
# Step 2: Data preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Remove unwanted characters and convert to lowercase
    text = text.lower().replace("[^a-zA-Z0-9]", " ")
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and perform stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # Join tokens back into a string
    processed_text = " ".join(stemmed_tokens)
    return processed_text

df['processed_text'] = df['headline'] + " " + df['short_description']
df['processed_text'] = df['processed_text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dpras\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Step 3: Feature extraction
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(df['processed_text'])

In [23]:
# Step 4: Similarity algorithms
def calculate_cosine_similarity(query_vector, features):
    similarity_scores = cosine_similarity(query_vector, features).flatten()
    return similarity_scores

def calculate_jaccard_similarity(query_vector, features):
    query_set = set(query_vector.indices)
    similarity_scores = []
    for feature_vector in features:
        feature_set = set(feature_vector.indices)
        similarity_scores.append(len(query_set.intersection(feature_set)) / len(query_set.union(feature_set)))
    return similarity_scores


def calculate_euclidean_distance(query_vector, features):
    # Convert the query vector to a dense array
    query_vector = query_vector.toarray()[0]

    # Calculate the Euclidean distance between the query vector and each feature vector
    distances = np.linalg.norm(query_vector - features, axis=1)

    return distances.tolist()


def calculate_levenshtein_distance(query_text, features):
    similarity_scores = []
    for feature_text in features:
        similarity_scores.append(distance.edit_distance(query_text, feature_text))
    return similarity_scores

In [24]:
# Step 5: Model building
def find_most_similar_data(query_text, df, top_k=5, batch_size=1000):
    query_text = preprocess_text(query_text)
    query_vector = vectorizer.transform([query_text])

    num_batches = len(df) // batch_size
    if len(df) % batch_size != 0:
        num_batches += 1

    cosine_similarities = []
    jaccard_similarities = []
    euclidean_distances = []
    levenshtein_distances = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(df))
        batch_features = features[start_idx:end_idx]

        cosine_similarities.extend(calculate_cosine_similarity(query_vector, batch_features))
        jaccard_similarities.extend(calculate_jaccard_similarity(query_vector, batch_features))
        euclidean_distances.extend(calculate_euclidean_distance(query_vector, batch_features))
        levenshtein_distances.extend(calculate_levenshtein_distance(query_text, df['processed_text'][start_idx:end_idx]))

    df['cosine_similarity'] = cosine_similarities
    df['jaccard_similarity'] = jaccard_similarities
    df['euclidean_distance'] = euclidean_distances
    df['levenshtein_distance'] = levenshtein_distances

    # Rank the data points based on similarity scores
    df = df.sort_values(by=['cosine_similarity', 'jaccard_similarity', 'euclidean_distance', 'levenshtein_distance'], ascending=False)

    # Return the top-k most similar data points
    most_similar_data = df.head(top_k)
    return most_similar_data

In [30]:
# Step 6: Model
query_text = "Latest technology advancements in AI"
similar_data = find_most_similar_data(query_text, df)
print(similar_data[['headline', 'short_description']])

                                                 headline  \
144273  AI Day Will Replace Christmas as the Most Impo...   
63721   Four Incredible New Advances in Health Technology   
123313      Because You Ain't Never Had A Friend Like Him   
88965   Trump Supporters Harass Immigration Protesters...   
45603   This Is Why Your Coffee Is About To Get More E...   

                                        short_description  
144273  Some religious people, anti-transhumanists, an...  
63721                                                      
123313                                                     
88965   Trump supporters shouted things like "If it ai...  
45603                                    Say it ain't so!  
