In [8]:
import pandas as pd
import pickle
from gensim import matutils, models
import scipy.sparse
from nltk import word_tokenize, pos_tag, download
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Ensure NLTK resources are downloaded.
# Recent NLTK releases (3.9 and later) load the tokenizer and the POS tagger
# from 'punkt_tab' and 'averaged_perceptron_tagger_eng', while older releases
# use the original 'punkt' / 'averaged_perceptron_tagger' packages. Request
# both sets so word_tokenize() and pos_tag() work on either side of the
# rename; download() skips packages that are already up to date.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    download(_nltk_resource)

# Sample data
transcripts = [
    # Each entry is a "Speaker: Title." line, a newline, then one sentence
    # about the talk. Adjacent string literals are joined by the parser, so
    # every pair of lines below forms a single transcript.
    "Brené Brown: The power of vulnerability.\n"
    "Vulnerability is not weakness; it’s our greatest measure of courage.",
    "Simon Sinek: How Great Leaders Inspire Action.\n"
    "People don’t buy what you do; they buy why you do it.",
    "Ken Robinson: Do Schools Kill Creativity?\n"
    "Creativity is as important in education as literacy.",
    "Amy Cuddy: Your Body Language Shapes Who You Are.\n"
    "Fake it till you become it.",
    "Shonda Rhimes: My Year of Saying Yes to Everything.\n"
    "The hum is gone, and I wanted it back.",
    "Jill Bolte Taylor: My Stroke of Insight.\n"
    "Her vivid, poetic description of the stroke and recovery.",
    "Dan Gilbert: The Surprising Science of Happiness.\n"
    "Synthetic happiness shows how people adapt.",
    "Elizabeth Gilbert: Your Elusive Creative Genius.\n"
    "Creativity as a partnership with an external source.",
    "Tony Robbins: Why We Do What We Do.\n"
    "Six core human needs shape our actions.",
    "James Clear: Atomic Habits: The Surprising Power of Small Changes.\n"
    "You fall to the level of your systems.",
]

# One row per transcript, in list order, under a single 'transcript' column.
data = pd.DataFrame(transcripts, columns=["transcript"])

# Function to extract nouns
def nouns(text):
    """Return the nouns found in *text* as one space-separated string.

    The text is split with NLTK's ``word_tokenize`` and tagged with
    ``pos_tag``; a token is kept when its tag starts with ``'NN'``.
    Tokens keep their original order and casing.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        token for token, tag in tagged_tokens if tag.startswith('NN')
    )

# Function to extract nouns and adjectives
def nouns_adj(text):
    """Return the nouns and adjectives in *text* as one space-separated string.

    The text is split with NLTK's ``word_tokenize`` and tagged with
    ``pos_tag``; a token is kept when its tag starts with ``'NN'`` or
    ``'JJ'``. Tokens keep their original order and casing.
    """
    wanted_prefixes = ('NN', 'JJ')
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        token for token, tag in tagged_tokens if tag.startswith(wanted_prefixes)
    )

# Apply noun extraction: one row per transcript, holding only its nouns.
data_nouns = pd.DataFrame(data.transcript.apply(nouns), columns=['transcript'])

# Apply noun and adjective extraction (same layout as data_nouns).
data_nouns_adj = pd.DataFrame(data.transcript.apply(nouns_adj), columns=['transcript'])

# Stop words: scikit-learn's built-in English list plus extra filler words.
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = list(text.ENGLISH_STOP_WORDS.union(add_stop_words))

# CountVectorizer for nouns: document-term counts, sparse and as a DataFrame.
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names_out())

# CountVectorizer for nouns and adjectives. max_df=.8 additionally drops
# terms that appear in more than 80% of the documents.
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names_out())

# Create gensim corpus for nouns and adjectives.
# Sparse2Corpus expects terms as rows and documents as columns, so the
# (documents x terms) count matrix is transposed. The sparse matrix from the
# vectorizer is used directly rather than re-sparsifying the dense DataFrame.
corpusna = matutils.Sparse2Corpus(data_cvna.transpose())
# gensim needs id -> word; the vectorizer stores word -> column index.
id2wordna = {index: term for term, index in cvna.vocabulary_.items()}

# LDA Model for nouns and adjectives.
# A fixed seed makes the fitted topics and the assignments below
# reproducible from run to run; without it every run gives different output.
LDA_RANDOM_STATE = 42
ldana = models.LdaModel(
    corpus=corpusna,
    num_topics=4,
    id2word=id2wordna,
    passes=80,
    random_state=LDA_RANDOM_STATE,
)

# Print topics
print(ldana.print_topics())

# Assign topics to documents: for each document keep the id of its most
# probable topic, paired with the document's index in `data`.
corpus_transformed = ldana[corpusna]
results = [
    (max(doc_topics, key=lambda topic_prob: topic_prob[1])[0], doc_index)
    for doc_topics, doc_index in zip(corpus_transformed, data.index)
]
print(results)


[(0, '0.055*"great" + 0.055*"action" + 0.055*"sinek" + 0.055*"simon" + 0.055*"leaders" + 0.055*"inspire" + 0.011*"shonda" + 0.011*"rhimes" + 0.011*"hum" + 0.011*"year"'), (1, '0.057*"stroke" + 0.031*"bolte" + 0.031*"jill" + 0.031*"description" + 0.031*"insight" + 0.031*"recovery" + 0.031*"vivid" + 0.031*"taylor" + 0.031*"poetic" + 0.031*"genius"'), (2, '0.067*"surprising" + 0.067*"happiness" + 0.037*"power" + 0.037*"gilbert" + 0.037*"level" + 0.037*"systems" + 0.037*"james" + 0.037*"habits" + 0.037*"small" + 0.037*"clear"'), (3, '0.049*"creativity" + 0.049*"vulnerability" + 0.027*"robinson" + 0.027*"ken" + 0.027*"kill" + 0.027*"schools" + 0.027*"education" + 0.027*"literacy" + 0.027*"important" + 0.027*"brené"')]
[(3, 0), (0, 1), (3, 2), (3, 3), (1, 4), (1, 5), (2, 6), (1, 7), (3, 8), (2, 9)]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
