In [1]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import LsiModel
from nltk.corpus import stopwords
import nltk
import re
import string


# Load data
train_data = pd.read_csv('.../train_data.txt', delimiter=' ::: ', header=None, names=["ID", "TITLE", "GENRE", "DESCRIPTION"], engine='python') #use engine = pyhton since the delimiter ":::" is not so common and required it in order to be managed properly.

test_data = pd.read_csv('.../test_data.txt', delimiter=' ::: ', header=None, names=["ID", "TITLE", "DESCRIPTION"], engine='python')

test_solution = pd.read_csv('.../test_data_solution.txt', delimiter=' ::: ', header=None, names=["ID", "TITLE", "GENRE", "DESCRIPTION"], engine='python')

# find unique genres
unique_genres_in_train = train_data['GENRE'].unique()
print(f"There are {len(unique_genres_in_train)} unique GENRES present in train_data:", unique_genres_in_train,)

# download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# join TITLE and DESCRIPTION
train_data['TEXT'] = train_data['TITLE'] + " " + train_data['DESCRIPTION']
test_data['TEXT'] = test_data['TITLE'] + " " + test_data['DESCRIPTION']


# text preprocessing function
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()

    # remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # remove stopwords
    words = text.split()  # tokenize by splitting on spaces
    filtered_words = [word for word in words if word not in stop_words]

    return filtered_words


train_data['TEXT'] = train_data['TEXT'].apply(preprocess_text)
test_data['TEXT'] = test_data['TEXT'].apply(preprocess_text)

There are 27 unique GENRES present in train_data: ['drama' 'thriller' 'adult' 'documentary' 'comedy' 'crime' 'reality-tv'
 'horror' 'sport' 'animation' 'action' 'fantasy' 'short' 'sci-fi' 'music'
 'adventure' 'talk-show' 'western' 'family' 'mystery' 'history' 'news'
 'biography' 'romance' 'game-show' 'musical' 'war']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# build a dictionary from the training text
corpus = train_data['TEXT'].tolist()
dictionary = Dictionary(corpus)

# convert training text into Bag-of-Words format
train_bow = [dictionary.doc2bow(text) for text in corpus]

# train the LSI model
lsi_model = LsiModel(train_bow, id2word=dictionary, num_topics=27)  #num_topics is an optional parameter.

# print all the identified (=num_topics) topics, as a vector of descriptive words.
print(lsi_model.show_topics(num_words=4, formatted=True)) #num_words = number of words from topic that will be used.


[(0, '0.286*"life" + 0.275*"one" + 0.176*"film" + 0.169*"new"'), (1, '0.663*"maya" + -0.386*"film" + 0.153*"class" + 0.152*"sean"'), (2, '-0.502*"film" + -0.499*"maya" + 0.180*"life" + 0.157*"family"'), (3, '0.768*"life" + -0.445*"one" + 0.146*"maya" + -0.142*"two"'), (4, '-0.488*"one" + -0.414*"life" + 0.399*"film" + 0.295*"family"'), (5, '-0.699*"new" + 0.392*"one" + 0.290*"film" + -0.194*"world"'), (6, '-0.607*"de" + -0.455*"la" + -0.228*"et" + -0.192*"le"'), (7, '-0.650*"love" + 0.564*"family" + 0.186*"one" + -0.175*"young"'), (8, '0.575*"world" + -0.395*"new" + -0.318*"film" + 0.240*"family"'), (9, '-0.517*"love" + -0.431*"family" + -0.280*"one" + 0.270*"man"'), (10, '-0.612*"story" + -0.369*"young" + -0.290*"man" + 0.248*"love"'), (11, '-0.405*"two" + 0.393*"world" + -0.387*"story" + 0.281*"young"'), (12, '-0.572*"two" + -0.442*"young" + -0.266*"family" + 0.225*"story"'), (13, '-0.524*"people" + 0.451*"world" + 0.369*"two" + -0.169*"time"'), (14, '-0.521*"man" + 0.293*"school" + 

In [6]:
# Here we match topic number with genre name:

# create 2 dictionaries to store the mapping of topics to genres
topic_to_genre = {}
genre_to_topic = {}

# iterate through some document in train_bow, untill we have matched everything
for id, text_bow in enumerate(train_bow):
    # get the LSI vector for the text_bow
    lsi_vector = lsi_model[text_bow] #-> vector [(topic, weight),(topic, weight),(topic, weight),..]

    if lsi_vector:
        # get the corresponding genre from the train_data
        genre = train_data.iloc[id]['GENRE']

        if genre not in genre_to_topic:
            top_topic = max(lsi_vector, key=lambda x: x[1])[0] #-> match genre with the topic number associated with the maximum weight
            # remember that x[0] is the topic number, and x[1] is the weight of that topic w.r.t. the considered text
            topic_to_genre[top_topic] = genre
            genre_to_topic[genre] = top_topic

    # stop if all unique genres are covered !
    if len(genre_to_topic) == 27:
        break

print("Topic to Genre Mapping:")
for genre, topic in genre_to_topic.items():
    print(f"Genre {genre}: Topic {topic}")


Topic to Genre Mapping:
Genre drama: Topic 0
Genre thriller: Topic 26
Genre adult: Topic 0
Genre documentary: Topic 0
Genre comedy: Topic 0
Genre crime: Topic 0
Genre reality-tv: Topic 0
Genre horror: Topic 0
Genre sport: Topic 0
Genre animation: Topic 21
Genre action: Topic 0
Genre fantasy: Topic 0
Genre short: Topic 0
Genre sci-fi: Topic 0
Genre music: Topic 17
Genre adventure: Topic 0
Genre talk-show: Topic 23
Genre western: Topic 0
Genre family: Topic 0
Genre mystery: Topic 8
Genre history: Topic 0
Genre news: Topic 0
Genre biography: Topic 0
Genre romance: Topic 0
Genre game-show: Topic 22
Genre musical: Topic 0
Genre war: Topic 0


In [3]:
# convert test text into Bag-of-Words format
test_bow = [dictionary.doc2bow(text) for text in test_data['TEXT']]

# transform test text into the LSI space
text_test_lsi_space = [lsi_model[text_test_bow] for text_test_bow in test_bow]

test_solution['PREDICTED_GENRE'] = None #create a new column

# predict the topic
for i, doc_lsi in enumerate(text_test_lsi_space):
    print(f"Document {i + 1}:")
    print("LSI topic distribution:", doc_lsi)

    # find the dominant topic with the highest weight
    dominant_topic = max(doc_lsi, key=lambda x: x[1])
    print(f"Assigned to Topic {dominant_topic[0]} with weight {dominant_topic[1]}")

    test_solution.at[i, 'PREDICTED_GENRE'] = dominant_topic[0]
    # WARNING !!!!!!!!!!!!!! Here, instead of the topic number, we should put the genre name, in order to be able to compare it with the ground truth and compute accuracy!!!!!

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
LSI topic distribution: [(0, 1.3989773925005509), (1, -0.5006879566402791), (2, -0.16421821334449568), (3, 0.6094294824547991), (4, 0.27808769943629136), (5, 0.07278550482828286), (6, 0.1285719259202117), (7, 0.7196240737283082), (8, -0.29592510221455803), (9, -0.027232895611420647), (10, 0.6990391163843864), (11, -0.16305738201226752), (12, -0.10215957339670877), (13, -0.3028178669188296), (14, -0.721674894024422), (15, -1.1188429146873), (16, 0.46650223571295113), (17, -0.04002209817624404), (18, 0.4188499915238403), (19, -0.03509496121002547), (20, 0.03486564891928594), (21, -0.03454561390525701), (22, 0.4700955303738612), (23, 0.1409187633757345), (24, 0.23255582232645217), (25, -0.2951793729435873), (26, 0.06849405479811649)]
Assigned to Topic 0 with weight 1.3989773925005509
Document 37617:
LSI topic distribution: [(0, 0.6052678538486128), (1, 0.08653236485805028), (2, 0.22234199933548535), (3, -0.0666815984099694), 

In [4]:
# Calculate accuracy -> we'll see obv accuracy = 0 since we're not comparing string genre with string predicted_genre (because we're still using topic number instead of genre)
# SEE END OF PREVIOUS CELL
test_solution['CORRECT'] = test_solution['PREDICTED_GENRE'] == test_solution['GENRE'] #the last one the ground truth
print(test_solution[['CORRECT', 'DESCRIPTION']])
accuracy = test_solution['CORRECT'].mean()
print(f"Accuracy: {accuracy:.2%}")

#WARNING: if you try to execute this cell, you'll see that some DESCRIPTION in test_solution are empty! SCARY !
# We should check it, by looking in the test_data_solution.txt and test_data.txt files!!
# If some DESCRIPTION fields in text_data.txt are NaN then we have a problem :)


       CORRECT                                        DESCRIPTION
0        False  L.R. Brane loves his life - his car, his apart...
1        False  Spain, March 1964: Quico is a very naughty chi...
2        False  One year in the life of Albin and his family o...
3        False  His father has died, he hasn't spoken with his...
4        False  Before he was known internationally as a marti...
...        ...                                                ...
39277    False                                                NaN
39278    False                                                NaN
39279    False                                                NaN
39280    False                                                NaN
39281    False                                                NaN

[39282 rows x 2 columns]
Accuracy: 0.00%


Per usare LSI con KeyBERT, richiamare KeyBERT su train_data['TEXT']. Da qui prendere le keywords estratte per ogni documento, e con queste costruire un dictionary ad hoc, contenente meno parole.
Applicare poi naturalmente KeyBERT anche a test_data['TEXT']