In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

#Clone the repository
!git clone https://github.com/dregmi08/Milestone-2-Data-Exploration-Initial-Preprocessing.git

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

Cloning into 'Milestone-2-Data-Exploration-Initial-Preprocessing'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 57 (delta 24), reused 19 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (57/57), 4.20 MiB | 4.40 MiB/s, done.
Resolving deltas: 100% (24/24), done.


In [6]:
data = pd.read_csv('Milestone-2-Data-Exploration-Initial-Preprocessing/reviews.csv')
data = data.drop(columns=['Time_submitted', 'Total_thumbsup', 'Reply'])
data

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",5
1,Please ignore previous negative rating. This a...,5
2,"This pop-up ""Get the best Spotify experience o...",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't ...,1
...,...,...
61589,Even though it was communicated that lyrics fe...,1
61590,"Use to be sooo good back when I had it, and wh...",1
61591,This app would be good if not for it taking ov...,2
61592,The app is good hard to navigate and won't jus...,2


In [8]:
def preprocess_reviews(text_series):
    # Basic preprocessing - modify as needed
    text_series = text_series.str.lower()  # Lowercase all text
    text_series = text_series.str.replace(r'[^\w\s]', '')  # Remove punctuation
    text_series = text_series.str.replace(r'\d+', '')  # Remove numbers
    return text_series

data['Review_clean'] = preprocess_reviews(data['Review'])

In [9]:
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(data['Review_clean'])

In [10]:
lda = LatentDirichletAllocation(n_components=20, random_state=0)
lda.fit(X)
topics = lda.transform(X)


In [12]:
def get_sentiment_score(rating):
    if rating >= 4:
        return 1  # Positive
    elif rating <= 2:
        return -1  # Negative
    else:
        return 0  # Neutral

data['sentiment_score'] = data['Rating'].apply(get_sentiment_score)

In [14]:
# Number of top terms to display per topic
n_top_terms = 10

# Get top terms in each topic
terms = vectorizer.get_feature_names_out()
topic_features = {}
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [terms[i] for i in topic.argsort()[-n_top_terms:]]
    topic_features[f"Topic {topic_idx + 1}"] = top_terms

# Convert to DataFrame to view
topic_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in topic_features.items()]))

# Display the top words for each topic
print("Top words per topic:")
print(topic_df)


Top words per topic:
      Topic 1      Topic 2  Topic 3  Topic 4       Topic 5       Topic 6  \
0    stopping      app use   logged   update       pandora         music   
1        time  use spotify  spotify   button   spotify app         worth   
2    multiple        apple      try     play          just         money   
3  constantly    recommend    tried    pause  love spotify        paying   
4     crashes      use app      won    close          adds  subscription   
5       times        music      let   screen           app       version   
6    crashing          app      log     open        better       spotify   
7     podcast     easy use      app  playing          love           pay   
8       keeps         easy  premium     song          like          free   
9         app          use  account      app       spotify       premium   

     Topic 7    Topic 8          Topic 9       Topic 10    Topic 11  \
0  listening     random            great  using spotify        mode   


In [16]:
# Aggregate sentiment scores by topics and normalize
topic_sentiment_scores = np.dot(topics.T, data['sentiment_score'])
topic_frequencies = topics.sum(axis=0)
normalized_topic_scores = topic_sentiment_scores / topic_frequencies

# Create a DataFrame for topic scores
topic_score_df = pd.DataFrame({
    'topic': [f'Topic {i+1}' for i in range(len(topic_sentiment_scores))],
    'score': topic_sentiment_scores,
    'normalized_score': normalized_topic_scores
})

# Sort topics by their normalized scores
topic_score_df = topic_score_df.sort_values(by='normalized_score', ascending=False)

print("Top Topics Users Like:")
print(topic_score_df.head(10))

print("\nTop Topics Users Dislike:")
print(topic_score_df.tail(10))


Top Topics Users Like:
       topic        score  normalized_score
15  Topic 16  3020.806763          0.745585
8    Topic 9  2340.063542          0.737194
19  Topic 20  3242.680488          0.705284
1    Topic 2  1146.795218          0.545380
9   Topic 10  1307.820248          0.429498
4    Topic 5   774.158178          0.284225
11  Topic 12   758.022529          0.245205
16  Topic 17   615.677959          0.156919
13  Topic 14    55.142448          0.022183
14  Topic 15  -169.354441         -0.055577

Top Topics Users Dislike:
       topic        score  normalized_score
5    Topic 6  -227.016084         -0.066016
6    Topic 7  -274.489721         -0.083906
10  Topic 11  -236.561898         -0.092828
17  Topic 18  -703.882683         -0.279378
7    Topic 8 -1062.574623         -0.290089
18  Topic 19  -833.140003         -0.303686
0    Topic 1  -830.862538         -0.341637
3    Topic 4  -891.818240         -0.347985
2    Topic 3 -1067.583096         -0.409248
12  Topic 13 -1797.884046 