In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
import plotly.express as px
import plotly.offline as offline

# Load the Clotho evaluation captions from the local data folder.
df = pd.read_csv('/Users/dilipharish/Master-Thesis/dh-freesound-crossmodal-search/data/clotho_captions_evaluation.csv')

# Fetch every NLTK resource the preprocessing step relies on. The stopword
# list alone is not enough: nltk.sent_tokenize / nltk.word_tokenize need the
# Punkt models, which are published as 'punkt' or 'punkt_tab' depending on the
# NLTK release. nltk.download() leaves already up-to-date packages untouched.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Split *text* into sentences and drop English stopwords from each one.

    Every sentence found by ``nltk.sent_tokenize`` is lowercased, tokenized
    with ``nltk.word_tokenize``, filtered against the module-level
    ``stop_words`` set, and re-joined with single spaces.

    Returns:
        A list with one cleaned string per sentence, in the original order.
    """
    def _clean(raw_sentence):
        # Lowercase before tokenizing so the stopword lookup matches the
        # lowercase entries in ``stop_words``.
        tokens = nltk.word_tokenize(raw_sentence.lower())
        return ' '.join(token for token in tokens if token not in stop_words)

    return [_clean(raw_sentence) for raw_sentence in nltk.sent_tokenize(text)]

# Collect every preprocessed sentence from the five caption columns into one
# flat list; its order defines the row order of the TF-IDF matrix built below.
text_columns = ['caption_1', 'caption_2', 'caption_3', 'caption_4', 'caption_5']
preprocessed_sentences = []
for column in text_columns:
    # Replace missing captions with empty strings so preprocess_text always
    # receives a str.
    df[column] = df[column].fillna('')

    # preprocess_text returns a list of sentences per caption. Extend row by
    # row instead of calling Series.sum(): summing lists re-copies the
    # accumulated list on every row, and it yields the int 0 for an empty
    # column, which list.extend() cannot consume.
    for caption_sentences in df[column].apply(preprocess_text):
        preprocessed_sentences.extend(caption_sentences)

# Convert each sentence into a numerical TF-IDF vector.
vectorizer = TfidfVectorizer()
sentence_vectors = vectorizer.fit_transform(preprocessed_sentences)

# Reduce the vectors to two components; X_2d is used only for plotting.
svd = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd.fit_transform(sentence_vectors)

# Cluster on the full TF-IDF vectors (not on the 2-D projection).
kmeans = KMeans(n_clusters=10, random_state=42)
labels = kmeans.fit_predict(sentence_vectors)

# NOTE: despite the name, the sentences are not deduplicated here; there is
# one row per preprocessed sentence, aligned with the rows of X_2d and labels.
unique_descriptions_df = pd.DataFrame({'Description': preprocessed_sentences, 'Cluster': labels})

# Scatter plot of the 2-D projection, coloured by cluster, with the sentence
# shown on hover.
fig = px.scatter(unique_descriptions_df, x=X_2d[:, 0], y=X_2d[:, 1], color='Cluster', hover_data=['Description'])
fig.update_traces(textposition='top center')

# Set plot title and axis labels
fig.update_layout(title='Sentence Clustering', xaxis_title='Component 1', yaxis_title='Component 2')

# Write the interactive plot to plot.html
offline.plot(fig, filename='plot.html')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dilipharish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




'plot.html'