In [None]:
import string

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [None]:
# Load the edited report export; the file is decoded as Latin-1.
df = pd.read_csv(
    'report_edited.csv',
    encoding='latin1',
)


In [None]:
excluded_terms = ['verizon', 'att', 'comcast', 'centurylink', 'telco']
def preprocess_text(text, excluded_terms):
    """Return ``text`` with every excluded term removed.

    The text is split on whitespace and each token is compared
    case-insensitively against ``excluded_terms``. A token is also dropped
    when it matches after leading/trailing punctuation is stripped
    (e.g. ``"Comcast,"``), because the downstream vectorizer discards that
    punctuation and would otherwise still count the excluded term.

    Parameters
    ----------
    text : object
        The document. ``None`` and float NaN (missing cells) yield ``''``;
        any other non-string value is converted with ``str()``.
    excluded_terms : iterable of str
        Terms to remove; matching ignores case.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Missing values must not be stringified: str(float('nan')) is 'nan' and
    # str(None) is 'None', both of which would become spurious tokens.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Normalise once per call; a set gives O(1) membership checks per token.
    excluded = {term.lower() for term in excluded_terms}

    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered in excluded:
            continue
        # Catch tokens such as "verizon." or "(att)".
        if lowered.strip(string.punctuation) in excluded:
            continue
        kept.append(word)

    # Continue with any further preprocessing like lemmatization, etc.
    return ' '.join(kept)

# Run every document through preprocess_text, forwarding the exclusion list
# as the extra positional argument.
df['processed_text'] = df['processed_text'].apply(
    preprocess_text,
    args=(excluded_terms,),
)

In [None]:
# Bag-of-words counts over the processed documents, configured with English
# stop words, min_df=2 and max_df=0.95.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(df['processed_text'])


In [None]:
# How many topics the model is asked to find.
n_topics = 10

# Online-learning LDA with a fixed seed, fitted on the document-term matrix X.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    max_iter=10,
    random_state=0,
)
lda.fit(X)


In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line with the ``no_top_words`` entries of
    ``feature_names`` that carry the largest weights in that row, heaviest
    first and separated by single spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading slice.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = (feature_names[idx] for idx in ranked)
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# Number of terms listed per topic.
no_top_words = 25
display_topics(
    model=lda,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)


In [None]:
# Transform the document-term matrix with the fitted LDA model.
topic_results = lda.transform(X)
# Label every document with the column index of its largest value.
dominant_topic = topic_results.argmax(axis=1)
df['Topic'] = dominant_topic
