In [1]:
import os
from pathlib import Path

import pandas as pd

# Location of the raw student-feedback workbook.
# The default is the original author's local path; set the
# STUDENT_FEEDBACK_XLSX environment variable to point at the file on another
# machine instead of editing this cell.
FEEDBACK_PATH = Path(
    os.environ.get(
        "STUDENT_FEEDBACK_XLSX",
        r"C:\Users\disha\OneDrive\Desktop\Student feedback.xlsx",
    )
)

# Load the workbook into the DataFrame that the rest of the notebook works on.
df = pd.read_excel(FEEDBACK_PATH)

# Quick sanity check of the first rows.
print(df.head())

  student_id  age  gender       locality                course_name  rating  \
0      S0001   23    Male    South Delhi      AI Product Management       4   
1      S0002   27    Male    Vasant Kunj      NLP with Transformers       4   
2      S0003   21    Male   Lajpat Nagar   Applied Machine Learning       5   
3      S0004   26    Male  Central Delhi  Deep Learning Foundations       5   
4      S0005   18  Female         Dwarka     Computer Vision Basics       5   

   satisfaction_score                                      feedback_text  \
0                 3.5  Loved the real-world case studies and industry...   
1                 4.4  Loved the real-world case studies and industry...   
2                 5.0  Great value for money. The mentor support was ...   
3                 4.2  Loved the real-world case studies and industry...   
4                 4.7  Great value for money. The mentor support was ...   

  enrollment_date completion_status  hours_spent would_recommend   d

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# One-time setup: make sure the NLTK data used below is installed locally.
# Each resource is only downloaded when looking it up raises LookupError.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# word_tokenize() relies on the Punkt sentence tokenizer. NLTK 3.9+ loads it
# from the 'punkt_tab' resource while older releases use 'punkt', so check
# both: finding only 'punkt' would let this cell pass and word_tokenize()
# fail later with a LookupError for 'punkt_tab'.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{tokenizer_resource}')
    except LookupError:
        nltk.download(tokenizer_resource)

# Assuming 'df' is your DataFrame that is already loaded.
# If not, uncomment the line below and replace with your file path
# df = pd.read_excel('your_file_name.xlsx')

# Step 1: drop rows that have no suggestion text (modifies df in place).
df.dropna(subset=['suggestion'], inplace=True)

# Step 2: lower-case the suggestion text.
df['lowercase_suggestion'] = df['suggestion'].astype(str).str.lower()

# Step 3: delete every character that is not an ASCII letter or whitespace.
df['no_special_chars'] = df['lowercase_suggestion'].str.replace(
    r'[^a-zA-Z\s]', '', regex=True
)

# Step 4: split the cleaned text into word tokens.
df['tokenized_suggestion'] = df['no_special_chars'].map(word_tokenize)

# Step 5: filter out English stopwords.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in the English stopword set."""
    return [token for token in tokens if token not in stop_words]


df['suggestion_no_stopwords'] = df['tokenized_suggestion'].map(_without_stopwords)

print(df.head())

   student_id  age  gender       locality                course_name  rating  \
2       S0003   21    Male   Lajpat Nagar   Applied Machine Learning       5   
3       S0004   26    Male  Central Delhi  Deep Learning Foundations       5   
7       S0008   21    Male     West Delhi            AI for Business       3   
9       S0010   28    Male     Karol Bagh     Computer Vision Basics       4   
11      S0012   20  Female   Punjabi Bagh  Deep Learning Foundations       5   

    satisfaction_score                                      feedback_text  \
2                  5.0  Great value for money. The mentor support was ...   
3                  4.2  Loved the real-world case studies and industry...   
7                  3.7  Good for beginners, but intermediates might fi...   
9                  3.9  Loved the real-world case studies and industry...   
11                 4.9  Loved the real-world case studies and industry...   

   enrollment_date completion_status  ...  would_recomme

In [5]:
# Keep the output file name in one place so the confirmation message always
# names the file that was actually written (the message previously reported
# 'updated_reviews.xlsx' while the data went to 'updated_reviews 2.xlsx').
output_path = 'updated_reviews 2.xlsx'
df.to_excel(output_path, index=False)

print(f"Successfully saved the updated DataFrame to '{output_path}'")

Successfully saved the updated DataFrame to 'updated_reviews.xlsx'


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Assuming 'df' is your DataFrame with the preprocessed text.
# If you are in a new session, load the data first:
# df = pd.read_excel('reviews_with_sentiment.xlsx')

# 'suggestion_no_stopwords' holds a list of tokens per row, but the vectorizer
# expects plain strings, so join each list with spaces.
# The joined column is only built when it is not already present in df.
if 'processed_text_string' not in df:
    df['processed_text_string'] = df['suggestion_no_stopwords'].map(' '.join)

# 1. Turn the text into a TF-IDF matrix.
#    max_df=0.95 ignores terms found in more than 95% of the documents,
#    min_df=2 ignores terms found in fewer than two documents.
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)
tfidf = vectorizer.fit_transform(df['processed_text_string'])

# 2. Factorise the TF-IDF matrix into topics with NMF.
#    random_state is fixed so repeated runs give the same topics.
num_topics = 5
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf)

# 3. Display the topics with their top words
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            term weights per topic.
        feature_names: Sequence mapping a term index to its string.
        num_top_words: How many of the strongest terms to print per topic.

    Returns:
        None. For each topic a "Topic #k:" header (k starts at 1) is printed,
        followed by the space-separated terms and two blank lines.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort() is ascending, so reverse it to rank the heaviest terms first.
        ranked_indices = weights.argsort()[::-1][:num_top_words]
        heading = f"Topic #{topic_number}:"
        terms = " ".join(feature_names[index] for index in ranked_indices)
        print(heading)
        print(terms)
        print("\n")

# Print the ten strongest terms of each topic, using the vocabulary learned by
# the TF-IDF vectorizer to turn term indices back into words.
num_top_words = 10
feature_names = vectorizer.get_feature_names_out()
display_topics(
    model=nmf_model,
    feature_names=feature_names,
    num_top_words=num_top_words,
)

Topic #1:
sessions qa add live projects provide timings offline performance mobile


Topic #2:
offline performance mobile access app improve timings provide projects sessions


Topic #3:
provide downloadable notebooks datasets qa sessions timings offline performance projects


Topic #4:
projects introduce mentors capstone industry sessions timings offline performance provide


Topic #5:
timings flexible offer classes live sessions qa offline performance projects


