# Feature Engineering for Simulated User Feedback Data

In [1]:
# import libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
import textstat
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on.
# Provides the word list read later via stopwords.words('english').
nltk.download('stopwords')
# NOTE(review): wordnet is not referenced directly in the visible cells —
# presumably needed by a dependency such as TextBlob; confirm before removing.
nltk.download('wordnet')
# Tokenizer data for word_tokenize, which is used for the stopword count.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Faryad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Faryad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Faryad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Feature Extraction: Text Length and Sentiment Polarity

In [2]:
# Load the feedback dataset from the current working directory
user_feedback = pd.read_csv("user_feedback.csv")

# TEXT_LENGTH: number of whitespace-separated tokens in each review
user_feedback['TEXT_LENGTH'] = user_feedback['REVIEW_TEXT'].apply(
    lambda review: len(review.split())
)

# SENTIMENT_POLARITY: TextBlob polarity score computed from each review's text
user_feedback['SENTIMENT_POLARITY'] = user_feedback['REVIEW_TEXT'].apply(
    lambda review: TextBlob(review).sentiment.polarity
)

### Feature Engineering: Important Words and Repeating Words Ratio

In [3]:
def ratio_repeating_words(text):
    """Return the share of distinct repeated words among a review's words.

    The text is split on whitespace and only purely alphanumeric tokens are
    kept, so a token with punctuation attached (e.g. ``"set,"``) is ignored.
    Words are compared case-insensitively.

    Parameters
    ----------
    text : str
        The review text.

    Returns
    -------
    int or float
        ``0`` when fewer than two alphanumeric tokens remain (this also
        avoids dividing by zero); otherwise the number of distinct words
        that occur more than once divided by the number of alphanumeric
        tokens.
    """
    tokens = [token for token in text.split() if token.isalnum()]
    if len(tokens) < 2:
        return 0

    # Tally occurrences per lowercased word
    frequencies = {}
    for token in tokens:
        key = token.lower()
        frequencies[key] = frequencies.get(key, 0) + 1

    # Each word seen more than once contributes exactly one to the numerator
    repeated = [key for key, seen in frequencies.items() if seen > 1]
    return len(repeated) / len(tokens)

# Compute ratio_repeating_words for every review and store the result
# in the new RATIO_REPEATING_WORDS column
user_feedback['RATIO_REPEATING_WORDS'] = user_feedback['REVIEW_TEXT'].apply(ratio_repeating_words)

# Words whose occurrences are summed into IMPORTANT_WORDS_COUNT below
# (the rationale for this particular selection is not shown in this notebook)
important_words = ["great", "love", "like", "really", "quality", "good"]

def add_combined_important_words_count(df, words):
    """Add an ``IMPORTANT_WORDS_COUNT`` column counting target-word occurrences.

    Each review in ``df['REVIEW_TEXT']`` is lowercased and split on
    whitespace; the column holds the total number of tokens equal to any of
    the target words. Matching is case-insensitive on both sides. Tokens with
    punctuation attached (e.g. ``"great!"``) are not counted, because the
    split is on whitespace only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``REVIEW_TEXT`` column. It is modified in place.
    words : iterable of str
        Target words. A word listed twice is counted twice.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # Lowercase the targets once: reviews are lowercased before matching, so a
    # target such as "Great" would otherwise never match anything.
    targets = [word.lower() for word in words]

    def count_hits(text):
        # Tokenize once per review instead of once per target word
        tokens = text.lower().split()
        return sum(tokens.count(target) for target in targets)

    df['IMPORTANT_WORDS_COUNT'] = df['REVIEW_TEXT'].apply(count_hits)
    return df

# Add the IMPORTANT_WORDS_COUNT column; the function writes the column into
# the DataFrame it is given and returns that same DataFrame
user_feedback = add_combined_important_words_count(user_feedback, important_words)

### Calculating Additional Text-Based Features

In [4]:
# STOPWORDS: how many tokens of the lowercased review are English stopwords.
# The stopword list is turned into a set once, up front, for membership checks.
stop = set(stopwords.words('english'))
user_feedback['STOPWORDS'] = user_feedback['REVIEW_TEXT'].apply(
    lambda review: sum(1 for token in word_tokenize(review.lower()) if token in stop)
)

# FK_SCORE: Flesch-Kincaid grade level of each review, as computed by textstat
user_feedback['FK_SCORE'] = user_feedback['REVIEW_TEXT'].apply(textstat.flesch_kincaid_grade)

# CAPITALIZATION_COUNT: number of uppercase characters in each review
user_feedback['CAPITALIZATION_COUNT'] = user_feedback['REVIEW_TEXT'].apply(
    lambda review: len([ch for ch in review if ch.isupper()])
)

# PUNCTUATION_COUNT: number of characters found in string.punctuation
user_feedback['PUNCTUATION_COUNT'] = user_feedback['REVIEW_TEXT'].apply(
    lambda review: len([ch for ch in review if ch in string.punctuation])
)

# Show the first rows so the newly added columns can be checked by eye
print(user_feedback.head())

# Persist the enriched dataset (without the index column) as a CSV file
user_feedback.to_csv('modified_feedback.csv', index=False)


        PRODUCT_CATEGORY  RATING LABEL  \
0       Toys_and_Games_5     5.0     F   
1        Movies_and_TV_5     5.0     F   
2         Pet_Supplies_5     5.0     F   
3       Toys_and_Games_5     5.0     F   
4  Sports_and_Outdoors_5     5.0     F   

                                         REVIEW_TEXT  SOURCE  TEXT_LENGTH  \
0  My children have really enjoyed this set and t...  AMAZON          120   
1  This is one of the better ones I've seen in a ...  AMAZON           32   
2  Y dogs could eat these, but I thought they wou...  AMAZON           65   
3  Omg... this Is a super cute set, the materials...  AMAZON           16   
4  I bought a couple of these for my son and they...  AMAZON           38   

   SENTIMENT_POLARITY  RATIO_REPEATING_WORDS  IMPORTANT_WORDS_COUNT  \
0            0.235577               0.140187                      3   
1            0.364286               0.111111                      0   
2           -0.145833               0.186441                      0   
