In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the report from 'report_edited.csv', decoding the file as Latin-1.
df = pd.read_csv('report_edited.csv', encoding='latin1')


In [None]:
# Terms that are stripped out of every document before topic modelling.
excluded_terms = [
    'verizon',
    'att',
    'comcast',
    'centurylink',
    'telco',
]
def preprocess_text(text, excluded_terms):
    """Remove excluded terms from a whitespace-tokenised document.

    The text is split on whitespace, every token whose lower-cased form
    matches an excluded term is dropped, and the remaining tokens are joined
    back together with single spaces. No other normalisation (lowercasing of
    the output, punctuation removal, lemmatisation) is performed here.

    Parameters
    ----------
    text : object
        The document to clean. Missing values (None, NaN, pd.NA) are returned
        unchanged so that a later ``dropna`` can still detect them; any other
        non-string value is converted with ``str()``.
    excluded_terms : iterable of str
        Terms to drop. Matching is case-insensitive and applies to whole
        tokens only.

    Returns
    -------
    str or missing value
        The cleaned text, or the original missing value.
    """
    # Keep missing values missing: str(NaN) would yield the literal token
    # 'nan', which survives dropna() and pollutes the vocabulary.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return text

    if not isinstance(text, str):
        text = str(text)

    # Normalise the exclusion list once per call: a set gives O(1) lookups and
    # lower-casing makes the match case-insensitive on both sides.
    excluded = {str(term).lower() for term in excluded_terms}

    kept_tokens = [word for word in text.split() if word.lower() not in excluded]
    return ' '.join(kept_tokens)

# Run every entry of 'processed_text' through preprocess_text, handing the
# exclusion list over as the function's second positional argument, and
# store the result back in the same column.
df['processed_text'] = df['processed_text'].apply(
    preprocess_text,
    args=(excluded_terms,),
)


In [None]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
# Discard rows that have no 'processed_text' value before vectorizing.
df = df.dropna(subset=['processed_text'])

# Turn the remaining documents into a TF-IDF matrix.
# max_features caps the vocabulary size; tune as needed.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
)
X = tfidf_vectorizer.fit_transform(df['processed_text'])

# Build and fit the NMF topic model on the TF-IDF matrix in one expression
# (fit() hands back the estimator). n_components is the number of topics;
# random_state is fixed so repeated runs use the same seed.
nmf = NMF(
    n_components=10,
    random_state=0,
).fit(X)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names of every topic.

    For each row of ``model.components_`` a header line ``Topic <index>:`` is
    printed, followed by one line holding the ``no_top_words`` feature names
    with the largest values in that row, ordered from largest to smallest and
    separated by single spaces.

    Parameters
    ----------
    model : object
        Any object exposing a ``components_`` attribute whose rows support
        ``argsort()``.
    feature_names : sequence
        Maps a column position in ``components_`` to its feature name.
    no_top_words : int
        How many feature names to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it to rank positions by
        # decreasing weight and keep only the leading entries.
        descending = weights.argsort()[::-1]
        top_terms = [feature_names[pos] for pos in descending[:no_top_words]]
        print(" ".join(top_terms))



In [None]:
# How many of the highest-weighted words to print for each topic.
no_top_words = 25
display_topics(
    model=nmf,
    feature_names=tfidf_vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)

In [None]:
import numpy as np

# Project every document onto the fitted topics. `nmf` is the fitted NMF model
# and `X` is the TF-IDF matrix; each row of the result holds one document's
# weight for every topic.
nmf_topic_values = nmf.transform(X)

# The dominant topic of a document is the topic with the largest weight.
dominant_topics = np.argmax(nmf_topic_values, axis=1)

# np.argmax returns index 0 when every entry in a row is equal, so a document
# whose weights are all zero (e.g. its text was empty or had no in-vocabulary
# terms) would silently be filed under Topic 0. Flag those documents with -1
# so they are not mistaken for genuine Topic 0 members.
dominant_topics[~nmf_topic_values.any(axis=1)] = -1


In [None]:
# Store each document's dominant topic index in a new 'Dominant_Topic' column.
# NOTE(review): assumes dominant_topics has exactly one entry per row of df,
# in the same row order as when the TF-IDF matrix was built - confirm df has
# not been filtered or reordered in between.
df['Dominant_Topic'] = dominant_topics
# Preview the first rows to check the new column.
df.head()

In [None]:
# Write the frame to 'report_with_topics.csv'; index=False leaves the
# DataFrame index out of the file.
df.to_csv('report_with_topics.csv', index=False)
