1. Combine and Preprocess All Texts from Multiple VTT Files
We'll start by reading all VTT files, extracting the text (cleaning and tokenizing), and preparing it for topic modeling

In [49]:
import webvtt
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
import pandas as pd

# Load spaCy for NLP processing
nlp = spacy.load("en_core_web_sm")

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to preprocess and clean text
def preprocess_text(text):
    # Tokenize and remove punctuation, stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text.lower() not in stop_words and token.text not in string.punctuation]
    return " ".join(tokens)  # Return preprocessed text as a single string

# Function to extract texts from multiple VTT files
def extract_texts_from_vtt_files(file_paths):
    conversation_texts = []

    for file_path in file_paths:
        # Parse each VTT file
        for caption in webvtt.read(file_path):
            speaker_text = caption.text.strip()  # Assuming speaker's text is stored here
            if speaker_text:
                # Remove speaker label (e.g., "Speaker 1:", "Speaker 2:")
                cleaned_text = re.sub(r'^[A-Za-z0-9\s]+?:\s*', '', speaker_text)  # Regex to remove speaker labels
                if cleaned_text:
                    conversation_texts.append(cleaned_text)
    
    return conversation_texts

# Example: Extracting texts from multiple VTT files
file_paths = ["HAKA3_copy.vtt", "meeting1_copy.vtt", "meeting2_copy.vtt"]  # Replace with actual paths to your VTT files
conversation_texts = extract_texts_from_vtt_files(file_paths)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chithraanand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2. Apply LDA to the Entire Corpus
Now, we can apply LDA to the combined corpus of texts from all the VTT files.

In [50]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Function to extract topics using LDA with scikit-learn
def extract_topics_from_text(texts, num_topics=5, num_words=5):
    # Preprocess texts: clean and tokenize
    processed_texts = [preprocess_text(text) for text in texts]
    
    # Create a document-term matrix (DTM) using CountVectorizer
    vectorizer = CountVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(processed_texts)
    
    # Apply LDA (Latent Dirichlet Allocation) using scikit-learn
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(dtm)
    
    # Get the top words for each topic
    feature_names = vectorizer.get_feature_names_out()
    topics = []

    for topic_idx, topic in enumerate(lda_model.components_):
        top_words_idx = topic.argsort()[:-num_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(top_words)
    
    # Return the LDA model, document-term matrix, and the topics
    return lda_model, dtm, topics

# Example: Extract LDA topics from the entire corpus of conversation texts
num_topics = 5  # Choose the number of topics you want to extract
lda_model, dtm, topics = extract_topics_from_text(conversation_texts, num_topics=num_topics)

# Display the top words for each topic
for idx, topic in enumerate(topics):
    print(f"Topic {idx}: {', '.join(topic)}")


Topic 0: biology, minor, university, thank, student
Topic 1: test, thank, student, good, biology
Topic 2: biology, want, hope, really, yeah
Topic 3: work, master, yeah, try, major
Topic 4: illinois, good, right, okay, delhi


3. Get Topic Distribution for Each Document (VTT File Segment)
Once the LDA model is trained on all the documents, we can get the topic distribution for each document (i.e., each conversation segment).

In [51]:
import numpy as np

# Function to get the topic distribution for each document
def get_topic_distribution_for_documents(lda_model, dtm, num_topics):
    # Use the transform method of the LDA model to get topic distributions for each document
    topic_distributions = lda_model.transform(dtm)
    return topic_distributions

# Example: Get topic distributions for all conversation segments
topic_distributions = get_topic_distribution_for_documents(lda_model, dtm, num_topics)

# Print the topic distributions for the first 5 documents (just as an example)
print(topic_distributions[:5])



[[0.72975016 0.06669885 0.06667957 0.06667588 0.07019554]
 [0.02507588 0.02500599 0.0250024  0.02500172 0.89991402]
 [0.05073794 0.05002668 0.05001069 0.79921222 0.05001247]
 [0.83963979 0.04001525 0.04000612 0.04019613 0.04014272]
 [0.7327856  0.06668956 0.06667585 0.06667323 0.06717577]]


3. Sentiment Analysis (TextBlob)
Next, we perform sentiment analysis and get the polarity, subjectivity, and sentiment label (positive/negative/neutral) for each text segment:

Polarity: A continuous score indicating sentiment (positive/negative).
Subjectivity: A continuous score indicating how subjective (opinion-based) or objective the text is.
Sentiment_Label: A categorical sentiment label (e.g., Positive, Negative, Neutral).

Polarity > 0: Positive sentiment.
Polarity < 0: Negative sentiment.
Polarity = 0: Neutral sentiment

In [52]:
from textblob import TextBlob

# Function to get sentiment features (polarity and subjectivity) for each text
def get_sentiment_features(texts):
    sentiments = []
    
    for text in texts:
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity   # Sentiment polarity (between -1 and 1)
        subjectivity = blob.sentiment.subjectivity  # Sentiment subjectivity (between 0 and 1)
        
        sentiments.append([polarity, subjectivity])
    
    return sentiments

# Function to categorize sentiment polarity into Positive, Neutral, Negative
def categorize_sentiment(polarity):
    if polarity > 0:
        return "Positive"
    elif polarity < 0:
        return "Negative"
    else:
        return "Neutral"

# Get sentiment features for all conversation segments
sentiment_features = get_sentiment_features(conversation_texts)

# Create a DataFrame for sentiment features
sentiment_df = pd.DataFrame(sentiment_features, columns=["Polarity", "Subjectivity"])

# Apply categorization to the Polarity column
sentiment_df["Sentiment_Label"] = sentiment_df["Polarity"].apply(categorize_sentiment)

# Display the first few rows of sentiment features
print(sentiment_df.head())


   Polarity  Subjectivity Sentiment_Label
0  0.910000      0.780000        Positive
1  0.512121      0.551515        Positive
2  0.500000      0.500000        Positive
3  0.000000      0.000000         Neutral
4 -0.050000      0.050000        Negative


4. Combine Topic Features with Sentiment Features
Now that you have both the topic features and sentiment features, we can combine them into a single DataFrame.

In [53]:
import numpy as np
import pandas as pd

# Function to generate descriptive column names for topics in scikit-learn LDA
def generate_topic_column_names(lda_model, vectorizer, num_top_words=5):
    topic_column_names = []
    feature_names = vectorizer.get_feature_names_out()
    
    for topic_id in range(lda_model.n_components):
        # Get the top words for each topic based on the topic-word distribution
        top_words_idx = lda_model.components_[topic_id].argsort()[:-num_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        
        # Create a descriptive name like "Topic X: [word1, word2, ...]"
        topic_name = f"Topic {topic_id}: [" + ", ".join(top_words) + "]"
        topic_column_names.append(topic_name)
    
    return topic_column_names

# Generate descriptive column names for topics
topic_column_names = generate_topic_column_names(lda_model, vectorizer, num_top_words=5)

# Convert topic distributions to a DataFrame for easier viewing
topic_sentiment_df = pd.DataFrame(topic_distributions, columns=topic_column_names)

# Append sentiment features to the topic DataFrame
topic_sentiment_df["Polarity"] = sentiment_df["Polarity"]
topic_sentiment_df["Subjectivity"] = sentiment_df["Subjectivity"]
topic_sentiment_df["Sentiment_Label"] = sentiment_df["Sentiment_Label"]

# Display the final DataFrame
topic_sentiment_df



Unnamed: 0,"Topic 0: [be, learn, see, say, psychology]","Topic 1: [right, say, psychology, first, be]","Topic 2: [be, still, go, minor, student]","Topic 3: [structure, kind, student, science, interested]","Topic 4: [good, first, move, lot, datum]",Polarity,Subjectivity,Sentiment_Label
0,0.72975,0.066699,0.06668,0.066676,0.070196,0.91,0.78,Positive
1,0.025076,0.025006,0.025002,0.025002,0.899914,0.512121,0.551515,Positive
2,0.050738,0.050027,0.050011,0.799212,0.050012,0.5,0.5,Positive
3,0.83964,0.040015,0.040006,0.040196,0.040143,0.0,0.0,Neutral
4,0.732786,0.06669,0.066676,0.066673,0.067176,-0.05,0.05,Negative
5,0.033724,0.033344,0.033493,0.033336,0.866102,0.5,0.5,Positive
6,0.100009,0.100021,0.100008,0.100006,0.599956,0.285714,0.535714,Positive
7,0.839255,0.040013,0.040588,0.040004,0.040141,-0.05,0.2,Negative
8,0.910858,0.02223,0.022374,0.022275,0.022262,-0.05,0.2,Negative
9,0.050008,0.050019,0.799153,0.050572,0.050248,-0.7,0.666667,Negative


In [54]:
topic_sentiment_df.drop('Sentiment_Label', axis=1, inplace=True)

In [55]:
topic_sentiment_df

Unnamed: 0,"Topic 0: [be, learn, see, say, psychology]","Topic 1: [right, say, psychology, first, be]","Topic 2: [be, still, go, minor, student]","Topic 3: [structure, kind, student, science, interested]","Topic 4: [good, first, move, lot, datum]",Polarity,Subjectivity
0,0.72975,0.066699,0.06668,0.066676,0.070196,0.91,0.78
1,0.025076,0.025006,0.025002,0.025002,0.899914,0.512121,0.551515
2,0.050738,0.050027,0.050011,0.799212,0.050012,0.5,0.5
3,0.83964,0.040015,0.040006,0.040196,0.040143,0.0,0.0
4,0.732786,0.06669,0.066676,0.066673,0.067176,-0.05,0.05
5,0.033724,0.033344,0.033493,0.033336,0.866102,0.5,0.5
6,0.100009,0.100021,0.100008,0.100006,0.599956,0.285714,0.535714
7,0.839255,0.040013,0.040588,0.040004,0.040141,-0.05,0.2
8,0.910858,0.02223,0.022374,0.022275,0.022262,-0.05,0.2
9,0.050008,0.050019,0.799153,0.050572,0.050248,-0.7,0.666667


In [57]:
topic_sentiment_df.to_csv('topic_sentiment_df.csv', index=False)

Stored 'topic_sentiment_df' (DataFrame)
