## Lightweight Recommendation System with tags.

Import Libraries

In [33]:
import pandas as pd
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Read Dataset

In [34]:
data = pd.read_csv('ted_talks_en.csv')

xtraining_column = 'talk_id'
useful_data = data[[xtraining_column, 'topics']]
useful_data.head()

Unnamed: 0,talk_id,topics
0,1,"['alternative energy', 'cars', 'climate change..."
1,92,"['Africa', 'Asia', 'Google', 'demo', 'economic..."
2,7,"['computers', 'entertainment', 'interface desi..."
3,53,"['MacArthur grant', 'activism', 'business', 'c..."
4,66,"['children', 'creativity', 'culture', 'dance',..."


In [35]:
# Load the CSV dataset
data = useful_data

# converting the text to 
all_topics = []
for topics,i in zip(useful_data['topics'],range(len(useful_data['topics']))):
    all_topics+=eval(topics)
    useful_data.loc[i]['topics'] = eval(topics)

unique_topics = sorted(list(set(all_topics)))
print(unique_topics)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  useful_data.loc[i]['topics'] = eval(topics)


['3D printing', 'AI', 'AIDS', 'Africa', "Alzheimer's", 'Antarctica', 'Anthropocene', 'Asia', 'Audacious Project', 'Autism spectrum disorder', 'Best of the Web', 'Brand', 'Brazil', 'Buddhism', 'CRISPR', 'Christianity', 'DNA', 'Debate', 'Egypt', 'Europe', 'Foreign Policy', 'Gender spectrum', 'God', 'Google', 'HIV', 'Humanities', 'Internet', 'Iran', 'Islam', 'LGBT', 'Latin America', 'MacArthur grant', 'Mars', 'Middle East', 'Moon', 'NASA', 'New York', 'Nobel Prize', 'PTSD', 'Planets', 'Science (hard)', 'Senses', 'Slavery', 'Social Science', 'South America', 'String theory', 'Sun', 'Surgery', 'Syria', 'TED Books', 'TED Connects', 'TED Fellows', 'TED Prize', 'TED Residency', 'TED en Español', 'TED-Ed', 'TEDMED', 'TEDNYC', 'TEDYouth', 'TEDx', 'Transgender', 'United States', 'Vaccines', 'activism', 'addiction', 'adventure', 'advertising', 'aging', 'agriculture', 'aircraft', 'algorithm', 'alternative energy', 'ancient world', 'animals', 'animation', 'anthropology', 'ants', 'apes', 'archaeology

In [36]:
data

Unnamed: 0,talk_id,topics
0,1,"['alternative energy', 'cars', 'climate change..."
1,92,"['Africa', 'Asia', 'Google', 'demo', 'economic..."
2,7,"['computers', 'entertainment', 'interface desi..."
3,53,"['MacArthur grant', 'activism', 'business', 'c..."
4,66,"['children', 'creativity', 'culture', 'dance',..."
...,...,...
4000,62678,"['activism', 'data', 'technology', 'mental hea..."
4001,62782,"['TED-Ed', 'education', 'history', 'animation'..."
4002,62263,"['society', 'law', 'policy', 'justice system',..."
4003,62784,"['TED-Ed', 'education', 'animation', 'United S..."


Getting the Word2Vec Model

In [37]:
info = api.info()  # show info about available models/datasets
word2vec_model = api.load("word2vec-google-news-300")  # download the model and return as object ready for use
word2vec_model.most_similar("cat")

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]

Function For Converting tags to Embeddings

In [38]:
def column_to_word2vec_embeddings(dataframe, column_name, word2vec_model):

    # Get the text from the specified column
    tags = dataframe[column_name]
    
    # Initialize a list to store Word2Vec embeddings
    embeddings = []

    # Iterate through the texts and convert to Word2Vec embeddings
    for tag_list in tags:
        tokens = tag_list  # Tokenize the text
        word_embeddings = [word2vec_model[token] for token in tokens if token in word2vec_model]
        if word_embeddings:
            # Calculate the paragraph embedding (average of word embeddings)
            paragraph_embedding = np.mean(word_embeddings, axis=0)
        else:
            # Handle the case where there are no word embeddings
            paragraph_embedding = np.zeros(word2vec_model.vector_size)
        embeddings.append(paragraph_embedding)

    # Create a new DataFrame with the embeddings
    modified_dataframe = dataframe.copy()
    modified_dataframe['tag_embeddings'] = embeddings

    return modified_dataframe

In [39]:
vectorized_data = column_to_word2vec_embeddings(data, 'topics', word2vec_model)
vectorized_data.head()

Unnamed: 0,talk_id,topics,tag_embeddings
0,1,"['alternative energy', 'cars', 'climate change...","[-0.17534839, 0.10282389, -0.0007562747, 0.129..."
1,92,"['Africa', 'Asia', 'Google', 'demo', 'economic...","[-0.17896906, 0.120670915, 0.010108304, 0.1194..."
2,7,"['computers', 'entertainment', 'interface desi...","[-0.18247779, 0.11157528, -0.01706121, 0.15448..."
3,53,"['MacArthur grant', 'activism', 'business', 'c...","[-0.179375, 0.09608236, -0.0026692708, 0.14162..."
4,66,"['children', 'creativity', 'culture', 'dance',...","[-0.17675063, 0.101213045, -0.00961962, 0.1466..."


In [40]:
# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(vectorized_data['tag_embeddings'].tolist())

# Function to get top similar talks for a given talk_id
def get_top_similar_talks(talk_id, n=5):
    # Get the index of the talk_id
    index = data[data['talk_id'] == talk_id].index[0]

    # Get similarity scores for the talk
    sim_scores = list(enumerate(similarity_matrix[index]))

    # Sort talks by similarity score (descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top n similar talks (excluding itself)
    top_similar_talks = sim_scores[1:n+1]

    # Create a dictionary with talk_id and similarity score
    result_dict = {data.iloc[talk[0]]['talk_id']: talk[1] for talk in top_similar_talks}

    return result_dict

# Create a new column 'recommended_talks'
vectorized_data['recommended_talks'] = data['talk_id'].apply(lambda x: get_top_similar_talks(x))

# Display the DataFrame with the new column
print(vectorized_data[['talk_id', 'recommended_talks']])

      talk_id                                  recommended_talks
0           1  {1374: 0.9984736, 22628: 0.9984503, 19756: 0.9...
1          92  {140: 0.99801713, 620: 0.9971254, 2348: 0.9959...
2           7  {400: 0.9983315, 1276: 0.99734735, 2310: 0.997...
3          53  {2407: 0.99768436, 2855: 0.9970897, 2511: 0.99...
4          66  {1734: 0.9962766, 815: 0.99602616, 36415: 0.99...
...       ...                                                ...
4000    62678  {2786: 0.9967842, 52190: 0.99572235, 41455: 0....
4001    62782  {24271: 0.9979672, 58212: 0.9977772, 2734: 0.9...
4002    62263  {2406: 0.9924404, 1145: 0.9917114, 50990: 0.99...
4003    62784  {2697: 0.99713606, 2681: 0.9968118, 2521: 0.99...
4004    62794  {59148: 0.99782175, 31779: 0.9976266, 23958: 0...

[4005 rows x 2 columns]


In [59]:
vectorized_data['recommended_talks'][0]

{1374: 0.9984736,
 22628: 0.9984503,
 19756: 0.997666,
 60080: 0.99761516,
 2633: 0.99745804}

In [43]:
data = pd.read_csv('ted_talks_en.csv')

In [61]:
query_talk_id = 1
print(f"Input Talk : {data['title'][data['talk_id'] == query_talk_id].values[0]}")
print(f"Recommended Talks :")
for i in vectorized_data['recommended_talks'][vectorized_data['talk_id'] == query_talk_id].values[0]:
    print(f"{data['title'][data['talk_id'] == i].values[0]}")

Input Talk : Averting the climate crisis
Recommended Talks :
The Earth is full
How will we survive when the population hits 10 billion?
A new way to remove CO2 from the atmosphere
How we could change the planet's climate future
We need nuclear power to solve climate change
