In [1]:
import pandas as pd
import re
import string
import spacy
import torch
import nltk
import emoji
from transformers import BertTokenizer, BertModel
from torch import nn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from autocorrect import Speller
from sklearn.model_selection import train_test_split

In [2]:
# Load Spacy model for dependency parsing and POS tagging
nlp = spacy.load("en_core_web_sm")

In [3]:
# Initialize tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased') 

In [4]:
# Initialize lemmatizer, stopwords and spell checker
lemmatizer = WordNetLemmatizer()
spell = Speller(lang='en')
stop_words = set(stopwords.words('english'))

In [5]:
from nltk.tokenize import word_tokenize
# Assuming spell is already defined or imported from a spell-correction library
# If not, you can implement a placeholder function
def spell(token):
    return token  # For now, it returns the same token

import string
from nltk.tokenize import word_tokenize

def clean_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    tokens = word_tokenize(text)  # Ensure punkt is downloaded
    return tokens


In [6]:
# Define the preprocessing function
def advanced_preprocess_text(text):
    # Convert emojis and emoticons to words
    text = convert_emojis_to_words(text)
    text = convert_emoticons_to_words(text)
    
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(f"[{string.punctuation}0-9]", "", text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization and Stemming
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    stemmed = [stemmer.stem(token) for token in lemmatized]
    
    # Join the tokens back to form a string
    processed_text = ' '.join(stemmed)
    
    return processed_text

In [7]:
# 2. Apply Dependency Parsing and POS tagging
def dependency_parsing_and_pos_tagging(text):
    doc = nlp(text)
    
    pos_tags = defaultdict(int)
    dependency_tags = defaultdict(int)
    
    for token in doc:
        # Count Part of Speech tags
        pos_tags[token.pos_] += 1
        
        # Count Dependency tags
        dependency_tags[token.dep_] += 1
    
    return {
        'pos_tags': dict(pos_tags),          # POS tags distribution in the text
        'dependency_tags': dict(dependency_tags)  # Dependency tags distribution
    }


In [8]:
# 3. Extract Review-Related Features
def extract_review_related_features(text):
    # Example features: word count, sentiment analysis, etc.
    word_count = len(text.split())
    sentiment_polarity = TextBlob(text).sentiment.polarity
    positive_words = len([word for word in text.split() if TextBlob(word).sentiment.polarity > 0])
    negative_words = len([word for word in text.split() if TextBlob(word).sentiment.polarity < 0])
    exclamations = text.count('!')
    questions = text.count('?')

    return {
        'word_count': word_count,
        'sentiment_polarity': sentiment_polarity,
        'pos_word_count': positive_words,
        'neg_word_count': negative_words,
        'exclamations': exclamations,
        'questions': questions
    }

In [9]:
# 4. Extract Aspect-Related Features (Dummy Example)
def extract_aspect_related_features(text):
    # Dummy extraction for aspects; implement as needed
    aspects = ['service', 'food', 'ambiance']  # Example aspects
    aspect_sentiment = {aspect: TextBlob(text).sentiment.polarity for aspect in aspects}
    
    return {
        'num_aspects': len(aspects),
        'avg_aspect_sentiment': np.mean(list(aspect_sentiment.values()))  # Simplified average sentiment
    }

In [10]:
# 5. Combine Hybrid Feature Vector with POS and Dependency Parsing Features
def create_full_hybrid_feature_vector(text):
    # Extract Review-Related and Aspect-Related Features
    rrf = extract_review_related_features(text)
    arf = extract_aspect_related_features(text)
    
    # Extract POS and Dependency features
    parsing_features = dependency_parsing_and_pos_tagging(text)
    
    # Combine all features into one dictionary
    hybrid_features = {
        'review_length': rrf['word_count'],
        'sentiment_polarity': rrf['sentiment_polarity'],
        'pos_word_count': rrf['pos_word_count'],
        'neg_word_count': rrf['neg_word_count'],
        'exclamations': rrf['exclamations'],
        'questions': rrf['questions'],
        'num_aspects': arf['num_aspects'],
        'avg_aspect_sentiment': arf['avg_aspect_sentiment'],
        'pos_tags': parsing_features['pos_tags'],  # POS tags feature
        'dependency_tags': parsing_features['dependency_tags']  # Dependency tags feature
    }
    
    return hybrid_features

In [11]:
# Load dataset
df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [12]:
# Replace dots with underscores in all column names
df.columns = df.columns.str.replace('.', '_', regex=False)

# Confirm the changes
print(df.columns)


Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews_date', 'reviews_dateSeen',
       'reviews_didPurchase', 'reviews_doRecommend', 'reviews_id',
       'reviews_numHelpful', 'reviews_rating', 'reviews_sourceURLs',
       'reviews_text', 'reviews_title', 'reviews_username', 'sourceURLs'],
      dtype='object')


In [13]:
nltk.download('punkt')
# Preprocess and extract features
df['cleaned_text'] = df['reviews_text'].apply(clean_text)
df['features'] = df['cleaned_text'].apply(extract_features)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91940\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\91940/nltk_data'
    - 'C:\\Users\\91940\\.conda\\envs\\newenv\\nltk_data'
    - 'C:\\Users\\91940\\.conda\\envs\\newenv\\share\\nltk_data'
    - 'C:\\Users\\91940\\.conda\\envs\\newenv\\lib\\nltk_data'
    - 'C:\\Users\\91940\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [15]:
import nltk
print(nltk.data.path)


['C:\\Users\\91940/nltk_data', 'C:\\Users\\91940\\.conda\\envs\\newenv\\nltk_data', 'C:\\Users\\91940\\.conda\\envs\\newenv\\share\\nltk_data', 'C:\\Users\\91940\\.conda\\envs\\newenv\\lib\\nltk_data', 'C:\\Users\\91940\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [17]:
conda info --envs


error: incomplete escape \U at position 28

In [None]:
import nltk
nltk.download('punkt')


In [None]:
import nltk
nltk.data.path.append(r'C:\Users\91940\AppData\Roaming\nltk_data')


In [None]:
from nltk.tokenize import word_tokenize
text = "This is a test sentence."
tokens = word_tokenize(text)
print(tokens)


In [14]:
# Ensure that the column name is updated
df.columns = df.columns.str.replace('.', '_', regex=False)

# Preprocess and extract features
df['cleaned_text'] = df['reviews_text'].apply(clean_text)
df['features'] = df['cleaned_text'].apply(extract_features)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\91940/nltk_data'
    - 'C:\\Users\\91940\\.conda\\envs\\newenv\\nltk_data'
    - 'C:\\Users\\91940\\.conda\\envs\\newenv\\share\\nltk_data'
    - 'C:\\Users\\91940\\.conda\\envs\\newenv\\lib\\nltk_data'
    - 'C:\\Users\\91940\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [19]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Print the current working directory
print("Current Working Directory:", current_directory)



Current Working Directory: C:\Users\91940\Aspect Based Sentiment Analysis
