In [None]:
# Extracting aspects (e.g., price, performance) from text using LDA or keyword mapping. 
# Assigning sentiments to each aspect using Naive Bayes or another sentiment model.

#Gensim = “Generate Similar” is a popular open source 
#natural language processing (NLP) library used for unsupervised topic modeling

In [None]:
!python -m spacy download en_core_web_lg 

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from gensim import corpora
from gensim.models import LdaModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt

# Download necessary libraries
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Step 1: Load Dataset
# Replace with the path to your dataset
data = pd.read_csv('data_sentiment_pp_balanced.csv')  

# Display dataset summary
print("Dataset Summary:")
print(data.head())



In [None]:
# Step 1: Handle Missing or Invalid Data
data["review_cleaned"] = data["review_cleaned"].fillna("").astype(str)

# Step 2: Preprocess the Reviews
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Preprocess text: tokenize, lemmatize, and remove stopwords."""
    if not isinstance(text, str):
        return []  # Return an empty list for invalid entries
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]
    return tokens

# Apply preprocessing to the reviews
data["processedReview"] = data["review_cleaned"].apply(preprocess_text)



In [None]:
# Step 3: Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary(data["review_cleaned"])
corpus = [dictionary.doc2bow(text) for text in data["review_cleaned"]]

# Step 4: Train LDA Model to Identify Aspects
num_topics = 6  # Adjust based on the number of expected aspects
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42)

# Display the extracted topics
print("\nExtracted Topics and Keywords:")
for idx, topic in lda_model.print_topics(num_words=5):
    print(f"Topic {idx + 1}: {topic}")

# Step 5: Assign Topics (Aspects) to Reviews
def get_dominant_topic(review):
    """Get the dominant topic for a review."""
    bow = dictionary.doc2bow(review)
    topic_distribution = lda_model[bow]
    dominant_topic = max(topic_distribution, key=lambda x: x[1])[0] if topic_distribution else None
    return dominant_topic

data["dominant_topic"] = data["review_cleaned"].apply(get_dominant_topic)


In [None]:
# Step 6: Sentiment Analysis Using VADER
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Get sentiment scores using VADER."""
    scores = analyzer.polarity_scores(text)
    if scores["compound"] >= 0.05:
        return "positive"
    elif scores["compound"] <= -0.05:
        return "negative"
    else:
        return "neutral"

data["sentiment"] = data["review"].apply(get_sentiment)

# Step 7: Combine Aspect and Sentiment
aspect_sentiment = data.groupby("dominant_topic")["sentiment"].value_counts().unstack().fillna(0)


In [None]:

print("\nAspect Sentiment Distribution:")
print(aspect_sentiment)

# Step 8: Visualize Results
# (a) Topic Distribution
plt.figure(figsize=(10, 6))
data["dominant_topic"].value_counts().plot(kind="bar", color="skyblue")
plt.title("Topic Distribution (Aspects)")
plt.xlabel("Topic")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=0)
plt.show()



In [None]:
# (b) Sentiment Distribution for Each Aspect
aspect_sentiment.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="viridis")
plt.title("Sentiment Distribution by Aspect (Topic)")
plt.xlabel("Aspect (Topic)")
plt.ylabel("Count")
plt.legend(title="Sentiment")
plt.tight_layout()
plt.show()

# Step 9: Save Results
#data.to_csv("aspect_sentiment_analysis_results.csv", index=False)
#print("Results saved to 'aspect_sentiment_analysis_results.csv'.")


## Topic

In [5]:
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
import pandas as pd
import nltk
import pickle
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Download necessary NLTK data
nltk.download('all')

# Load dataset
df_car = pd.read_csv('car_5_brands.csv')

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define functions for sentiment scoring and processing
def sentimenter(x):
    return sia.polarity_scores(x)

def sent_score_string(x):
    if x['compound'] >= 0.05:
        return 'positive'
    elif -0.05 < x['compound'] < 0.05:
        return 'neutral'
    elif x['compound'] <= -0.05:
        return 'negative'

df_car['sentiment score'] = df_car['review'].apply(sentimenter)
df_car['relative sentiments'] = df_car['sentiment score'].apply(sent_score_string)

# Stop words and lemmatizer setup
stop_words = set(stopwords.words('english'))
stop_words.update(['porsche','mercede','comfortsport', 'mercedes','mercedes-benz', 'honda','toyota','audi', 'benz','bentley','lexus',
                  'nissan','volvo','drive','nt','like','vehicle','infiniti','good','miles','corvette','come','edmund','lotus','diego','snake',
                 'porsche', 'cayman','bought','year','minute','chicago','car','home', 'work','think','suv','people','edmunds',
                  'cabriolet','lexuss','japan','husband','baby','range', 'rover','cadillac','cadillacs','michelin','texas','second',
                   'awsome','one','now', 'take', 'give', 'new','levinson','road','love','sedan','wife','sport','bang','tank',
                   'truck','lemon','imho','pathfinder','infinity','convertible','allroad','conv','bike','ski','grocery','mclass'
                  ,'hardtop','club','hubby','child','zoom','test','etc','brain','ashamed','carmax','alpina','rocketship','great','germany',
                  'autobahn','mercedez','bmw'])
lemmatizer = WordNetLemmatizer()

def tokenisation_pos_stopword_lemmatize(x):
    tokens = nltk.word_tokenize(x)
    tags = nltk.pos_tag(tokens)
    pos_tags_words = [t for t in tags if t[1] in["JJ","JJR","JJS","NN","NNP","NNS","NNPS","VB", "VBD" ,"VBG" ,"VBN" ,"VBP", "VBZ"] ]
    filtered_words = [t[0] for t in pos_tags_words if t[0].lower() not in stop_words]
    return [lemmatizer.lemmatize(w).lower() for w in filtered_words]

# Preprocessing and saving progress
try:
    with open("preprocessed_docs.pkl", "rb") as f:
        docs = pickle.load(f)
    print("Loaded preprocessed documents from checkpoint.")
except FileNotFoundError:
    print("Preprocessed documents not found. Starting preprocessing.")
    df_car['review_cleaned'] = df_car['review'].apply(tokenisation_pos_stopword_lemmatize)
    
    # Debugging: Print some cleaned reviews
    print(f"Cleaned reviews sample: {df_car['review_cleaned'].head()}")
    
    # Remove empty reviews
    df_car = df_car[df_car['review_cleaned'].apply(lambda x: len(x) > 0)]
    print(f"Reviews after cleaning and removing empty ones: {df_car['review_cleaned'].head()}")

    # Concatenate all the words into a single document
    big_array = []
    for i in range(len(df_car['review_cleaned'])):
        big_array.extend(df_car['review_cleaned'][i])
    docs1 = [' '.join(big_array)]
    lng_sent1 = [i for i in docs1]
    docs = lng_sent1

    # Save progress
    with open("preprocessed_docs.pkl", "wb") as f:
        pickle.dump(docs, f)
    print("Saved preprocessed documents.")

# Tokenizing the documents
tokenized_docs = [doc.split() for doc in docs]

# Bigram collocation for frequent bigrams
biagram_collocation = BigramCollocationFinder.from_words(big_array)
biagram_collocation.apply_freq_filter(3)
bigram_list = biagram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 15)

# Display the frequent bigrams
print("Frequent bigrams:", bigram_list)

# Dictionary and corpus creation
try:
    dictionary = Dictionary.load("dictionary.gensim")
    with open("corpus.pkl", "rb") as f:
        corpus = pickle.load(f)
    print("Loaded dictionary and corpus from checkpoint.")
except FileNotFoundError:
    print("Dictionary and corpus not found. Creating dictionary and corpus.")
    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=1, no_above=0.9)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    # Save progress
    dictionary.save("dictionary.gensim")
    with open("corpus.pkl", "wb") as f:
        pickle.dump(corpus, f)
    print("Saved dictionary and corpus.")

# Check if corpus is empty
if len(corpus) == 0:
    print("Error: The corpus is empty. LDA cannot be run on an empty corpus.")
else:
    print(f"Corpus size: {len(corpus)}")

# LDA Model Training
try:
    model = LdaModel.load("lda_model.gensim")
    print("Loaded LDA model from checkpoint.")
except FileNotFoundError:
    print("LDA model not found. Training LDA model.")
    model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=6,
        passes=20,
        iterations=400,
        random_state=123,
        alpha=0.1,
        eta=0.001
    )
    model.save("lda_model.gensim")
    print("Saved LDA model.")

# Display topics
for topic_id, topic in model.show_topics(num_topics=6, num_words=10, formatted=True):
    print(f"({topic_id},")
    print(f"  '{topic}')")


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\farah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\farah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\farah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\farah\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\farah\AppData\Roaming\nltk_data...
[

Loaded preprocessed documents from checkpoint.
Frequent bigrams: [('gas', 'mileage'), ('sound', 'system'), ('fuel', 'economy'), ('fit', 'finish'), ('build', 'quality'), ('brand', 'new'), ('back', 'seat'), ('water', 'pump'), ('cruise', 'control'), ('navigation', 'system'), ('leg', 'room'), ('head', 'turner'), ('extended', 'warranty'), ('sport', 'package'), ('center', 'console')]
Loaded dictionary and corpus from checkpoint.
Corpus size: 1
LDA model not found. Training LDA model.


ValueError: cannot compute LDA over an empty collection (no terms)

In [6]:
# Check tokenized documents and the first document's corpus representation
print("Tokenized documents sample:", tokenized_docs[:2])  # Print the first 2 documents' tokens
print("First document bag-of-words:", corpus[0])  # Print the first document's bag-of-words representation

# Check if the corpus has enough data
if len(corpus) == 0 or all(len(doc) == 0 for doc in corpus):
    print("Error: Corpus is empty or has no meaningful terms. LDA cannot be trained.")
else:
    print(f"Corpus size after processing: {len(corpus)}")

# Ensure the corpus has terms before proceeding with LDA model
if len(corpus) > 0 and any(len(doc) > 0 for doc in corpus):
    print("Training LDA model...")
    # LDA Model Training
    try:
        model = LdaModel.load("lda_model.gensim")
        print("Loaded LDA model from checkpoint.")
    except FileNotFoundError:
        print("LDA model not found. Training LDA model.")
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=6,
            passes=20,
            iterations=400,
            random_state=123,
            alpha=0.1,
            eta=0.001
        )
        model.save("lda_model.gensim")
        print("Saved LDA model.")

    # Display topics
    for topic_id, topic in model.show_topics(num_topics=6, num_words=10, formatted=True):
        print(f"({topic_id}, '{topic}')")
else:
    print("Skipping LDA model training due to empty corpus.")


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

