#### review topic modelling

In [1]:
import pandas as pd

import os
import spacy 
from spacy import displacy

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [2]:
# Trying to read with more flexible options
try:
    # Attempt with error handling options
    df = pd.read_csv("./car_review_datasets/Scraped_Car_Review_ford.csv", 
                     error_bad_lines=False,  # Skip bad lines
                     warn_bad_lines=True,    # Warn about them
                     encoding='utf-8')       # Explicit encoding
except:
    # If that doesn't work, try a different approach
    df = pd.read_csv("./car_review_datasets/Scraped_Car_Review_ford.csv", 
                     engine='python',        # Python engine is more flexible
                     encoding='utf-8')       # Explicit encoding

In [3]:
df = df.drop('Unnamed: 0', axis=1)
df_fiesta = df[df["Vehicle_Title"].str.contains("Fiesta", na=False)]
df_fiesta["vehicle_age"] = df_fiesta["Vehicle_Title"].str.extract(r'(\d{4})')[0]
df_fiesta["vehicle_age"] = pd.to_numeric(df_fiesta["vehicle_age"])
df_fiesta = df_fiesta[df_fiesta["vehicle_age"] < 2017]
df_fiesta = df_fiesta.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fiesta["vehicle_age"] = df_fiesta["Vehicle_Title"].str.extract(r'(\d{4})')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fiesta["vehicle_age"] = pd.to_numeric(df_fiesta["vehicle_age"])


In [14]:
df_fiesta_reviews = df_fiesta[["Review", "Rating", "Review_Title", "vehicle_age"]].copy()


In [31]:
import re

def clean_text(text):
    """More thorough text cleaning before tokenization"""
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    return ""

stop_words = set(stopwords.words('english'))
auto_stopwords = ['car', 'vehicle', 'drive', 'ford', 'fiesta', 'model', 'year']
custom_stopwords = stop_words.union(auto_stopwords)


def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in custom_stopwords and len(word) > 2]
    return tokens


nlp = spacy.load('en_core_web_sm')
def lemmatize(tokens):
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc if token.pos_ in ['NOUN', 'ADJ', 'VERB']]


def create_bigram_model(tokens):
    bigrams = gensim.models.Phrases([tokens], min_count=5, threshold=100)
    return bigrams[tokens]


df_fiesta_reviews["pre_processed_review"] = df_fiesta_reviews["Review"].apply(clean_text)
df_fiesta_reviews["pre_processed_review"] = df_fiesta_reviews["pre_processed_review"].apply(preprocess)
df_fiesta_reviews["pre_processed_review"] = df_fiesta_reviews["pre_processed_review"].apply(lemmatize)
df_fiesta_reviews["pre_processed_review"] = df_fiesta_reviews["pre_processed_review"].apply(create_bigram_model)

In [32]:
df_fiesta_reviews

Unnamed: 0,Review,Rating,Review_Title,vehicle_age,pre_processed_review
0,We bought a Ford Fiesta 2014 in March. The So...,1.000,Ford Fiesta 2014. Bad idea,2014,"[buy, sound, pop, long, time, take, dealership..."
1,"OK, I will be honest, at first I didn't like ...",4.000,A relationship to build over time,2014,"[honest, like, quirky, transmission, serious, ..."
2,I rented this car. The transmission problems ...,2.000,Transmission troubles are real,2014,"[rent, transmission, problem, people, speak, p..."
3,Okay... does this car stutter when you drive ...,5.000,You have to learn how to drive this car,2014,"[stutter, learn, take, time, learn, learn, wif..."
4,This car is a death trap and an embarrassment...,1.000,RIP. OFF.,2014,"[death, trap, embarrassment, transmission, jer..."
...,...,...,...,...,...
335,"I love the little car! It's a blast to drive,...",4.250,Buena Fiesta!,2011,"[love, little, blast, park, mini, handle, goka..."
336,Overall a very nice car but it could have jus...,3.750,Very nice but needs some obvious changes,2011,"[nice, amazing, simple, small, thing, change, ..."
337,This is a wonderful car with great build qual...,4.875,Great Car!,2011,"[wonderful, great, build, quality, great, ride..."
338,We have now had our new Fiesta SES hatchback ...,5.000,"Fun, Fit and Finish",2011,"[month, put, mile, powershift, spd, automatic,..."


In [33]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(df_fiesta_reviews["pre_processed_review"])

# Create Corpus
texts = df_fiesta_reviews["pre_processed_review"].tolist()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]


In [35]:
from pprint import pprint
num_topics = 5

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.015*"transmission" + 0.012*"great" + 0.010*"good" + 0.010*"get" + '
  '0.009*"buy" + 0.009*"problem" + 0.008*"time" + 0.008*"gas" + 0.007*"mile" + '
  '0.007*"shift"'),
 (1,
  '0.015*"get" + 0.010*"transmission" + 0.009*"new" + 0.009*"drive" + '
  '0.009*"great" + 0.008*"seat" + 0.007*"buy" + 0.006*"mile" + 0.006*"use" + '
  '0.006*"go"'),
 (2,
  '0.016*"transmission" + 0.014*"get" + 0.013*"mile" + 0.012*"buy" + '
  '0.011*"drive" + 0.008*"take" + 0.008*"time" + 0.008*"problem" + '
  '0.007*"seat" + 0.006*"feel"'),
 (3,
  '0.016*"transmission" + 0.012*"buy" + 0.011*"issue" + 0.010*"time" + '
  '0.008*"mile" + 0.008*"great" + 0.007*"get" + 0.007*"drive" + 0.006*"clutch" '
  '+ 0.006*"engine"'),
 (4,
  '0.015*"get" + 0.012*"transmission" + 0.012*"buy" + 0.011*"mile" + '
  '0.011*"problem" + 0.008*"go" + 0.007*"make" + 0.007*"good" + 0.006*"mpg" + '
  '0.006*"drive"')]


# to do set of positive and set of negative review topics

In [36]:

# Separate positive and negative reviews
positive_reviews = df_fiesta_reviews[df_fiesta_reviews['Rating'] > 3]
negative_reviews = df_fiesta_reviews[df_fiesta_reviews['Rating'] < 3]

# Create separate topic models for positive and negative reviews
# Generate corpus for positive reviews
positive_texts = positive_reviews["pre_processed_review"].tolist()
positive_id2word = corpora.Dictionary(positive_texts)
positive_corpus = [positive_id2word.doc2bow(text) for text in positive_texts]

# Generate corpus for negative reviews
negative_texts = negative_reviews["pre_processed_review"].tolist()
negative_id2word = corpora.Dictionary(negative_texts)
negative_corpus = [negative_id2word.doc2bow(text) for text in negative_texts]

# Create LDA models
positive_lda = gensim.models.LdaMulticore(corpus=positive_corpus, id2word=positive_id2word, num_topics=5)
negative_lda = gensim.models.LdaMulticore(corpus=negative_corpus, id2word=negative_id2word, num_topics=5)

# Print topics
print("Positive Review Topics:")
pprint(positive_lda.print_topics())
print("\nNegative Review Topics:")
pprint(negative_lda.print_topics())

Positive Review Topics:
[(0,
  '0.013*"get" + 0.011*"drive" + 0.009*"great" + 0.009*"little" + 0.009*"mile" '
  '+ 0.008*"mpg" + 0.007*"look" + 0.007*"seat" + 0.007*"gas" + '
  '0.007*"transmission"'),
 (1,
  '0.017*"get" + 0.011*"mile" + 0.011*"seat" + 0.008*"drive" + 0.008*"great" + '
  '0.007*"transmission" + 0.007*"little" + 0.007*"new" + 0.007*"trip" + '
  '0.007*"good"'),
 (2,
  '0.014*"great" + 0.011*"mpg" + 0.010*"mile" + 0.009*"get" + 0.008*"seat" + '
  '0.007*"drive" + 0.007*"use" + 0.007*"buy" + 0.007*"make" + 0.007*"feel"'),
 (3,
  '0.015*"get" + 0.013*"good" + 0.010*"transmission" + 0.007*"feel" + '
  '0.007*"mile" + 0.007*"problem" + 0.007*"automatic" + 0.006*"little" + '
  '0.006*"buy" + 0.006*"speed"'),
 (4,
  '0.016*"great" + 0.015*"drive" + 0.014*"get" + 0.013*"transmission" + '
  '0.010*"mile" + 0.009*"mpg" + 0.008*"love" + 0.008*"gas" + 0.008*"little" + '
  '0.008*"good"')]

Negative Review Topics:
[(0,
  '0.020*"transmission" + 0.019*"buy" + 0.014*"problem" + 0.012

pydavis visualise