# Women Clothing E-Commerce NLP Case Study

## Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

import warnings
warnings.filterwarnings('ignore')

## Importing Dataset:

In [None]:
# Load the reviews workbook into `data`.
# The commented-out call below is an earlier Google Drive location of the same
# file; the active source is a commit-pinned GitHub raw URL.
# data = pd.read_excel('/content/drive/MyDrive/Data Science/Machine Learning/15. Capstone Case Study - NLP- Woman Clothing E-Commerce Platform_/Womens Clothing Reviews Data.xlsx')
# openpyxl is the engine pandas uses to read .xlsx files.
!pip install openpyxl
git_link = 'https://github.com/devan-b46/ml-case-studies-temp/raw/929dcb433e6586be8d78e03abc1cb6ff432a05fa/15.%20Capstone%20Case%20Study%20-%20NLP-%20Woman%20Clothing%20E-Commerce%20Platform_/Womens%20Clothing%20Reviews%20Data.xlsx'

# read_excel accepts a URL, so the workbook is downloaded on every run.
data = pd.read_excel(git_link)
data.head()

In [None]:
data.info()

## Exploratory Data Analysis (EDA):



### Data Cleaning:

In [None]:
# 1. Checking and handling missing data, can be done using Imputation or Removal.

# # Here, we will fill missing values in the 'Customer Age' column with the mean age.
# data['Customer Age'].fillna(data['Customer Age'].mean(), inplace=True)



# # 2. Check for duplicates and remove them if necessary
# # Use the .duplicated() and .drop_duplicates() functions to identify and remove duplicates.
# data.duplicated(subset=['Product ID', 'Review Text'], keep='first')
# data.drop_duplicates(subset=['Product ID', 'Review Text'], keep='first', inplace=True)



# # 3. Ensure data types are consistent
# # Check and convert data types as needed.
# # For example, you might want to convert 'Customer Age' to integer if it's currently a float.
# data['Customer Age'] = data['Customer Age'].astype(int)



# # 4. Converting the provided text data into lowercase, for consistent performance:
# # Use the .str.lower() function to convert text to lowercase.
# data['Review Title'] = data['Review Title'].str.lower()
# data['Review Text'] = data['Review Text'].str.lower()

# # Save the preprocessed data to a new file if necessary
# # data.to_csv('preprocessed_data.csv', index=False)


In [None]:

# Check for missing values
data.isnull().sum()


In [None]:
# Impute missing values in the categorical columns:
#   * product hierarchy columns get their most frequent value (the mode);
#   * free-text columns get the placeholder "Not Available" in its place.

categorical_column_mode = ['Category', 'Subcategory1', 'SubCategory2']
for col in categorical_column_mode:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection data[col]: the in-place form operates on an intermediate
    # object, is deprecated, and leaves `data` unchanged under copy-on-write.
    data[col] = data[col].fillna(data[col].mode()[0])

categorical_column_na = ['Review Title', 'Review Text']
for col in categorical_column_na:
    data[col] = data[col].fillna('Not Available')

# Remaining missing values per column, to confirm the imputation
data.isnull().sum()

In [None]:
# Checking for duplicates in our dataset:

data.duplicated(subset=['Product ID','Category','Review Text']).sum()


In [None]:
# Drop repeated reviews (same product, category and text) and renumber the rows
# 0..n-1. Without the reset, the surviving rows keep their old labels with
# gaps, and any later pd.concat(axis=1) with a freshly built frame (which is
# numbered 0..n-1) pairs the wrong rows and introduces all-NaN rows.
df = data.drop_duplicates(subset=['Product ID','Category','Review Text']).reset_index(drop=True)


In [None]:
df

In [None]:
df.info()

In [None]:
# Check data types
print(df.dtypes)


In [None]:
# # Convert 'Review Title' and 'Review Text' to lowercase
# df['Review Title'] = df['Review Title'].str.lower()
# df['Review Text'] = df['Review Text'].str.lower()
# df.info()

### Data Visualization:

In [None]:


# Categorical features whose value distribution is charted below
categorical_columns = ['Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel']

# One bar chart per feature: each level's share of all rows, in percent
for column in categorical_columns:
    # Rows per level, most frequent first
    category_counts = df[column].value_counts()

    # Share of the whole frame (the denominator is every row of df)
    category_proportions = category_counts / len(df) * 100

    fig, ax = plt.subplots(figsize=(7, 4))
    sns.barplot(x=category_proportions.index, y=category_proportions.values, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Percentage')
    # Vertical tick labels so long level names do not overlap
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.show()


In [None]:
# Histograms of the numerical columns

numerical_columns = ['Customer Age', 'Rating']

# One 20-bin histogram per numerical variable
for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(df[column], bins=20, edgecolor='k', alpha=0.7)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()






In [None]:
# Box plots of the numerical columns, to spot outliers

for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(5, 8))
    ax.boxplot(df[column])
    ax.set_title(f'Box Plot for {column}')
    ax.set_ylabel(column)
    plt.show()

In [None]:
# Relationship between 'Customer Age' and 'Rating'

plt.figure(figsize=(8, 6))
plt.scatter(df['Customer Age'], df['Rating'], alpha=0.8)
plt.title('Scatter Plot: Customer Age vs. Rating')
plt.xlabel('Customer Age')
plt.ylabel('Rating')
plt.show()


# Bucket customers into age bands. pd.cut builds right-closed intervals, so the
# bands are (0, 20], (20, 45], (45, 65] and (65, 100].
bins = [0,20,45,65,100]
labels = ['0-20','21-45','46-65','66-100']
df['Age Group']=pd.cut(df['Customer Age'],bins=bins,labels=labels)


# Average rating per age band.
# observed=False is spelled out so that empty bands stay on the chart and the
# result does not depend on the changing pandas default for categorical keys.
mean_ratings = df.groupby('Age Group', observed=False)['Rating'].mean()
plt.figure(figsize=(8, 6))
# mean_ratings is a Series: its index is the x axis and its values the heights,
# so no x=/y= arguments are needed (they are ignored for a Series).
ax = mean_ratings.plot.bar()
# Two decimals keep the bar labels readable
ax.bar_label(ax.containers[0], fmt='%.2f')
plt.title('Average Product Rating by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Rating')
plt.show()

1. From the chart above, customers in the 0-20 age group gave the highest average rating.

In [None]:
correlation=df['Customer Age'].corr(df['Rating'])

print(f'Correlation coefficient: {correlation}')


In [None]:
df['Recommend Flag'].value_counts()

In [None]:
# Pie chart of the distribution of 'Recommend Flag'

# value_counts() orders its result by frequency, not by flag value, so a fixed
# label list can end up attached to the wrong wedge. Sort by flag value and
# derive each label from the flag itself.
# Assumes the flag is coded 0 = not recommended, 1 = recommended - confirm
# against the data dictionary. Any other value is shown as-is.
flag_names = {0: 'Not Recommended', 1: 'Recommended'}
recommend_counts = df['Recommend Flag'].value_counts().sort_index()
labels = [flag_names.get(flag, str(flag)) for flag in recommend_counts.index]
plt.figure(figsize=(8, 8))
plt.pie(recommend_counts, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Recommend Flag')
plt.show()



In [None]:
# Relationship between 'Recommend Flag' and 'Category'

plt.figure(figsize=(10, 6))
# stat='percent' makes each bar's height its share of all rows (in percent)
# rather than a raw count.
ax = sns.countplot(data=df, x='Category', hue='Recommend Flag', stat='percent')
# One container per hue level; label all of them rather than assuming two.
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f')

plt.title('Recommend Flag by Category')
plt.xlabel('Category')
plt.ylabel('Percent')  # matches stat='percent'
plt.xticks(rotation=90)
plt.show()


## Text Mining:

   - Combine Review Title and Review Text into a single text field.

In [None]:
# Build one text field per review: "<title> <text>".
df['Combined Review'] = df['Review Title'].fillna('') + ' ' + df['Review Text'].fillna('')

# fillna('') guards against missing titles/texts so the concatenation never
# produces NaN.
# NOTE(review): the imputation cell earlier in this notebook fills both columns
# with 'Not Available', so that placeholder text (not an empty string) is what
# ends up in the combined review - confirm this is intended for the text analysis.

# Show the two source columns next to the new combined column
display(df[['Review Title', 'Review Text', 'Combined Review']])


#### Text Preprocessing
1. Lower case
2. Remove stop words, punctuations, special characters
3. Remove unique data (such as ID, orderID, Date, etc)
4. TOKENIZATION
5. Lemmatization

In [None]:
df

- Tokenization: Split text into words or phrases.

In [None]:
# Text-cleaning helper applied to every combined review.
#
# It performs:
# 1. stop-word removal
# 2. special-character removal
# 3. punctuation removal
# 4. LEMMATIZATION (lower-cased)

def text_cleaner(t):
    """Return `t` reduced to its lower-cased lemmas, separated by single spaces.

    The text is parsed with the notebook-level spaCy pipeline `nlp`. A token is
    dropped when it is a stop word, punctuation, whitespace, or made up only of
    the characters in ``!@#$%^&*()_+`~-=``.

    Parameters
    ----------
    t : str
        Raw review text.

    Returns
    -------
    str
        The kept lemmas joined by one space, with no leading or trailing
        space; an empty string when nothing is kept.
    """
    special_char = set('!@#$%^&*()_+`~-=')
    lemmas = [
        token.lemma_.lower()
        for token in nlp(t)
        if not (
            token.is_stop
            or token.is_punct
            # Runs of spaces/newlines are tokens too; they carry no content.
            or token.is_space
            # Character-wise test: a substring test against the special-character
            # string would only catch tokens that happen to appear in it verbatim.
            or all(ch in special_char for ch in token.text)
        )
    ]
    # Joining once avoids rebuilding the string for every token and avoids the
    # leading space that step-by-step concatenation leaves behind.
    return ' '.join(lemmas)


In [None]:
!pip install pandarallel
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True,nb_workers=8)


df['clean_text'] = df['Combined Review'].parallel_apply(text_cleaner)

df

In [None]:
df.clean_text

In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt



In [None]:
# pd.DataFrame(token_freq.items(), columns=['Token', 'Frequency'])

In [None]:
# Split the reviews by star rating: 4 and above is treated as positive,
# below 4 as negative (Recommend Flag == 1 would be an alternative criterion).

positive_reviews = df.loc[df['Rating'].ge(4)]
negative_reviews = df.loc[df['Rating'].lt(4)]



In [None]:
# Word frequency in positive and negative reviews


def _tokenize_reviews(texts):
    """Return the flat list of token strings found in `texts`.

    `texts` is an iterable of already-cleaned review strings. Only spaCy's
    tokenizer is run (`nlp.make_doc`): the token texts are the same as with the
    full pipeline, without paying for tagging and parsing on every review.
    Whitespace-only tokens are skipped so they cannot show up as "words".
    """
    tokens = []
    for text in texts:
        tokens.extend(token.text for token in nlp.make_doc(text) if not token.is_space)
    return tokens


positive_reviews_tokens = _tokenize_reviews(positive_reviews['clean_text'])
positive_reviews_tokens_freq = Counter(positive_reviews_tokens)

negative_reviews_tokens = _tokenize_reviews(negative_reviews['clean_text'])
negative_reviews_tokens_freq = Counter(negative_reviews_tokens)


In [None]:
# Turn the token counters into two-column tables (Token, Frequency).
# Sorting and top-N selection happen in the following cells.

top_positive_words = pd.DataFrame(
    positive_reviews_tokens_freq.items(),
    columns=['Token', 'Frequency'],
)
top_negative_words = pd.DataFrame(
    negative_reviews_tokens_freq.items(),
    columns=['Token', 'Frequency'],
)



In [None]:
# Most frequent tokens first, renumbered from 0; show the ten most frequent
top_positive_words = (
    top_positive_words
    .sort_values(by='Frequency', ascending=False)
    .reset_index(drop=True)
)
top_positive_words.head(10)

In [None]:
# Most frequent tokens first, renumbered from 0; show the ten most frequent
top_negative_words = (
    top_negative_words
    .sort_values(by='Frequency', ascending=False)
    .reset_index(drop=True)
)
top_negative_words.head(10)

In [None]:
# Bar chart of the 20 most frequent words in positive reviews

words = top_positive_words['Token'].head(20)
frequencies = top_positive_words['Frequency'].head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(words, frequencies, color='skyblue')
ax.set_title('Top Words in Positive Reviews')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()




In [None]:
# Bar chart of the 20 most frequent words in negative reviews

words = top_negative_words['Token'].head(20)
frequencies = top_negative_words['Frequency'].head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(words, frequencies, color='red')
ax.set_title('Top Words in Negative Reviews')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## Word Cloud

In [None]:
!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# positive_reviews = df[df['Rating'] >= 4]
# negative_reviews = df[df['Rating'] < 4]

# # Join tokens into a single string for positive and negative reviews
# positive_text = ' '.join(' '.join(tokens) for tokens in positive_reviews['Lemmatized Tokens'])
# negative_text = ' '.join(' '.join(tokens) for tokens in negative_reviews['Lemmatized Tokens'])

# Create word clouds for positive and negative reviews
positive_frequencies = dict(zip(top_positive_words['Token'], top_positive_words['Frequency']))
negative_frequencies = dict(zip(top_negative_words['Token'], top_negative_words['Frequency']))

positive_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(positive_frequencies)

negative_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(negative_frequencies)


# Plot the word clouds
plt.figure(figsize=(18, 10))

plt.subplot(121)  # Subplot for positive word cloud
plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.title('Positive Reviews Word Cloud')
plt.axis('off')

plt.subplot(122)  # Subplot for negative word cloud
plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.title('Negative Reviews Word Cloud')
plt.axis('off')

plt.show()


## Sentiment Analysis:
   


### Conduct sentiment analysis using pre-trained models or libraries like VADER.

In [None]:
!pip install textblob
from textblob import TextBlob
import nltk
nltk.download('vader_lexicon')  # Download the VADER lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [None]:

# Sentiment labelling with TextBlob

def analyze_sentiment_textblob(text):
    """Label `text` as "Positive", "Negative" or "Neutral".

    The label is the sign of TextBlob's polarity score for the text: above
    zero is positive, below zero is negative, exactly zero is neutral.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"


In [None]:
# Two-column view (title and text) of the reviews.
# NOTE(review): this variable is not referenced by any later cell in this
# notebook - confirm whether it is still needed.
sentiment_analysis = df[['Review Title','Review Text']]

In [None]:

# No nb_workers argument: pandarallel then uses its own default worker count
# instead of a hard-coded 16, which may not match the machine running this.
pandarallel.initialize(progress_bar=True)

# TextBlob sentiment label for every combined review
df['Sentiment_TextBlob'] = df['Combined Review'].parallel_apply(analyze_sentiment_textblob)


In [None]:
# One VADER analyzer instance, created once and reused for every review
analyzer = SentimentIntensityAnalyzer()


def analyze_sentiment_vader(text):
    """Label `text` as "Positive", "Negative" or "Neutral" using VADER.

    The label comes from the 'compound' entry of the analyzer's polarity
    scores: 0.05 or more is positive, -0.05 or less is negative, anything in
    between is neutral.
    """
    compound_score = analyzer.polarity_scores(text)['compound']
    if compound_score >= 0.05:
        return "Positive"
    if compound_score <= -0.05:
        return "Negative"
    return "Neutral"


In [None]:
# Apply in parallel so all available cores are used.
# No nb_workers argument: pandarallel then uses its own default worker count
# instead of a hard-coded 16, which may not match the machine running this.
pandarallel.initialize(progress_bar=True)

# VADER sentiment label for every combined review
df['Sentiment_VADER'] = df['Combined Review'].parallel_apply(analyze_sentiment_vader)


In [None]:
df.info()

- Compute sentiment scores for each review.

In [None]:
df['Sentiment_TextBlob'].value_counts()

In [None]:
df['Sentiment_VADER'].value_counts()

Sentiment analysis, also known as opinion mining, is the process of determining the sentiment or emotional tone expressed in a piece of text, such as a review, tweet, or article. Sentiment analysis can help identify whether the sentiment in the text is positive, negative, or neutral, and sometimes even quantify the sentiment on a numerical scale.

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool designed for text written in English. It is widely used for sentiment analysis in social media, customer reviews, and other text data. VADER analyzes text to determine the emotional tone conveyed by words and phrases, and it assigns a polarity score to each piece of text.

Here's how VADER sentiment analysis works:

Lexicon-Based Analysis: VADER relies on a predefined lexicon (dictionary) of words and phrases. Each word in the lexicon is assigned a polarity score, indicating how positive or negative the word is. The lexicon also contains words and phrases that convey neutrality, intensity, and other sentiment-related features.

Rule-Based Scoring: VADER uses rules to interpret the sentiment of text, taking into account the context and positioning of words in a sentence. It looks for patterns, intensifiers, negations, and other linguistic features to refine the sentiment analysis.

Polarity Score: For each piece of text, VADER calculates a compound polarity score that summarizes the overall sentiment. The score typically ranges from -1 (most negative) to 1 (most positive), with 0 indicating a neutral sentiment. This compound score provides a quantitative measure of sentiment.

Sentiment Classification: VADER classifies text into categories like positive, negative, or neutral based on the polarity score. The thresholds for classification can be adjusted to suit the specific analysis.

Sentiment Intensity: VADER can also provide an indication of sentiment intensity, helping differentiate between weak and strong sentiments.

VADER is especially useful for sentiment analysis in short text data, such as tweets or customer reviews, as it considers the context and semantics of text. It's a valuable tool for businesses, researchers, and organizations looking to gauge public sentiment in various forms of textual data.

To use VADER for sentiment analysis in Python, you can employ the NLTK library (`nltk.sentiment.vader`, as done in this notebook) or the standalone `vaderSentiment` package. TextBlob, also used above, is a separate library with its own sentiment analyzer and does not provide VADER.

In [None]:
import matplotlib.pyplot as plt

# Encode the VADER labels as numbers so they can be averaged
sentiment_mapping = {'Positive': 1, 'Negative': -1, 'Neutral': 0}
df['Sentiment_VADER_numeric'] = df['Sentiment_VADER'].map(sentiment_mapping)

# Mean encoded sentiment per product category
category_sentiment = df.groupby('Category')['Sentiment_VADER_numeric'].mean()


# Bar chart of the category means, lowest first, with the value on each bar
fig, ax = plt.subplots(figsize=(12, 6))
category_sentiment.sort_values().plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Sentiment Scores by Category')
ax.bar_label(container=ax.containers[0])
ax.set_xlabel('Category')
ax.set_ylabel('Mean Sentiment Score')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:

# Box plot of the sentiment score distribution per category

plt.figure(figsize=(12, 6))
# The y axis must be the numeric encoding (-1 / 0 / 1): 'Sentiment_VADER' holds
# the text labels, for which a box plot has no quartiles to draw.
sns.boxplot(data=df, x='Category', y='Sentiment_VADER_numeric', palette='Set2')
plt.title('Distribution of Sentiment Scores by Category')
plt.xlabel('Category')
plt.ylabel('Sentiment Score')
plt.xticks(rotation=45)
plt.show()


In [None]:

# 1. Encode the TextBlob labels as numbers and average them per category
sentiment_mapping = {'Positive': 1, 'Negative': -1, 'Neutral': 0}
df['Sentiment_TextBlob_numeric'] = df['Sentiment_TextBlob'].map(sentiment_mapping)

category_sentiment = (
    df.groupby('Category')['Sentiment_TextBlob_numeric'].mean().reset_index()
)

# 2. Mean per (category, first-level subcategory) pair
subcategory_sentiment = (
    df.groupby(['Category', 'Subcategory1'])['Sentiment_TextBlob_numeric'].mean().reset_index()
)

# 3. Mean per location
location_sentiment = (
    df.groupby('Location')['Sentiment_TextBlob_numeric'].mean().reset_index()
)

# 4. Mean per age group.
# This overwrites the 'Age Group' column with finer, ten-year bands.
age_bins = [0, 20, 30, 40, 50, 60, 100]
age_labels = ['0-20', '21-30', '31-40', '41-50', '51-60', '61+']
df['Age Group'] = pd.cut(df['Customer Age'], bins=age_bins, labels=age_labels)

age_sentiment = (
    df.groupby('Age Group')['Sentiment_TextBlob_numeric'].mean().reset_index()
)

# 5. Draw the four breakdowns on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 6))

ax1 = sns.barplot(x='Category', y='Sentiment_TextBlob_numeric', data=category_sentiment, ax=axes[0, 0])
ax1.bar_label(ax1.containers[0])
ax1.set_title('Sentiment by Category')

sns.barplot(x='Category', y='Sentiment_TextBlob_numeric', hue='Subcategory1', data=subcategory_sentiment, ax=axes[0, 1])
axes[0, 1].set_title('Sentiment by Subcategory')

ax2 = sns.barplot(x='Location', y='Sentiment_TextBlob_numeric', data=location_sentiment, ax=axes[1, 0])
ax2.bar_label(ax2.containers[0])
ax2.set_title('Sentiment by Location')

ax3 = sns.barplot(x='Age Group', y='Sentiment_TextBlob_numeric', data=age_sentiment, ax=axes[1, 1])
ax3.bar_label(ax3.containers[0])
ax3.set_title('Sentiment by Age Group')

fig.tight_layout()
plt.show()


In [None]:
def _mean_vader_by(column):
    """Mean numeric VADER sentiment of `df`, grouped by `column`."""
    return df.groupby(column)['Sentiment_VADER_numeric'].mean()


# Mean sentiment per product category and per each subcategory level
category_sentiment = _mean_vader_by('Category')
subcategory1_sentiment = _mean_vader_by('Subcategory1')
subcategory2_sentiment = _mean_vader_by('SubCategory2')

# Mean sentiment per location
location_sentiment = _mean_vader_by('Location')

# Mean sentiment per individual customer age (one group per age value)
age_group_sentiment = _mean_vader_by('Customer Age')


In [None]:

# Bar chart of the mean sentiment per category
fig, ax = plt.subplots(figsize=(10, 6))
category_sentiment.plot(kind='bar', ax=ax)
ax.set_title('Sentiment Scores by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Mean Sentiment Score')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Average VADER sentiment per product category
category_sentiment = df.groupby('Category')['Sentiment_VADER_numeric'].mean()

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x=category_sentiment.index, y=category_sentiment.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Average Sentiment by Product Category')
ax.set_xlabel('Category')
ax.set_ylabel('Average Sentiment Score')
plt.show()


In [None]:
# Average VADER sentiment per first-level subcategory
subcategory_sentiment = df.groupby('Subcategory1')['Sentiment_VADER_numeric'].mean()

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x=subcategory_sentiment.index, y=subcategory_sentiment.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Average Sentiment by Subcategory')
ax.set_xlabel('Subcategory')
ax.set_ylabel('Average Sentiment Score')
plt.show()

In [None]:
# Average VADER sentiment per location, highest first
location_sentiment = (
    df.groupby('Location')['Sentiment_VADER_numeric']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x=location_sentiment.index, y=location_sentiment.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Average Sentiment by Location')
ax.set_xlabel('Location')
ax.set_ylabel('Average Sentiment Score')
plt.show()

In [None]:
# Average VADER sentiment per age group
age_group_sentiment = df.groupby('Age Group')['Sentiment_VADER_numeric'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=age_group_sentiment.index, y=age_group_sentiment.values, ax=ax)
ax.set_title('Average Sentiment by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Average Sentiment Score')
plt.show()

Correlations Between Factors:

You can analyze correlations between numerical factors, for example between 'Customer Age' and the numeric VADER sentiment label ('Sentiment_VADER_numeric'):

In [None]:
correlation = df[['Customer Age', 'Sentiment_VADER_numeric']].corr()
print(correlation)


Outlier Analysis:

To identify data points with extreme sentiment scores, you can set a threshold and filter the DataFrame based on sentiment scores. For example, to find rows with extremely positive sentiment:

In [None]:
# 'Sentiment_VADER_numeric' only holds -1, 0 or 1, so thresholding it at 0.8
# selects every positive review rather than the extreme ones. Score each review
# with VADER's continuous compound value (range -1..1) and threshold that.
if 'Sentiment_VADER_compound' not in df.columns:
    df['Sentiment_VADER_compound'] = df['Combined Review'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
extreme_positive = df[df['Sentiment_VADER_compound'] > 0.8]


In [None]:
extreme_positive.head()

Reading Customer Reviews from Extreme Sentiment Groups:

Extreme positive reviews were filtered above; the same is done here for extreme negative reviews:

In [None]:
# 'Sentiment_VADER_numeric' only holds -1, 0 or 1, so thresholding it at -0.8
# selects every negative review rather than the extreme ones. Score each review
# with VADER's continuous compound value (range -1..1) and threshold that.
if 'Sentiment_VADER_compound' not in df.columns:
    df['Sentiment_VADER_compound'] = df['Combined Review'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
extreme_negative = df[df['Sentiment_VADER_compound'] < -0.8]


In [None]:
extreme_negative.head()

Comparative Analysis:

This notebook scores sentiment with two methods (TextBlob and VADER). Their results are stored in separate columns, listed below, and can be compared:

In [None]:
df.columns

#### Create visualizations to illustrate the sentiment analysis results.

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the average VADER sentiment per category
category_sentiment = df.groupby('Category')['Sentiment_VADER_numeric'].mean()
category_axes = category_sentiment.plot(kind='bar')
category_axes.set_title('Average Sentiment by Category')
category_axes.set_xlabel('Category')
category_axes.set_ylabel('Average Sentiment Score')
plt.show()


In [None]:
# Box plot of the numeric VADER sentiment within each age group
import seaborn as sns

age_box_axes = sns.boxplot(x='Age Group', y='Sentiment_VADER_numeric', data=df)
age_box_axes.set_title('Sentiment Distribution by Age Group')
age_box_axes.set_xlabel('Age Group')
age_box_axes.set_ylabel('Sentiment Score')
plt.setp(age_box_axes.get_xticklabels(), rotation=45)
plt.show()


## Predictive Analytics:
   - Feature engineering: Prepare the text data using techniques like TF-IDF.


In [None]:
df.columns

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer capped at 2000 features (max_features), using
# scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
# Bare expression: displays the configured vectorizer as the cell output
tfidf_vectorizer

In [None]:
# Combine 'Review Title' and 'Review Text' columns for text data
# df['Text'] = df['Review Title'] + ' ' + df['Review Text']

In [None]:
# Fit and transform your text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

In [None]:
tfidf_matrix

In [None]:
# Convert the sparse TF-IDF matrix to a dense DataFrame, one column per term.
# The frame gets a default 0..n-1 row index, and .toarray() materialises every
# cell, so memory grows with rows x features.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df

In [None]:

# Append the TF-IDF columns to the review frame.
# pd.concat(axis=1) pairs rows by index label, not by position. tfidf_df is
# numbered 0..n-1 while df may carry gaps in its index (rows removed as
# duplicates), which would attach features to the wrong reviews and create
# all-NaN rows. Both frames have one row per review in the same order, so give
# the TF-IDF frame df's index before joining.
df = pd.concat([df, tfidf_df.set_axis(df.index, axis=0)], axis=1)

In [None]:
df

- Data split: Split the dataset into training and testing sets.

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.columns

In [None]:
from sklearn.model_selection import train_test_split

# Features: product id plus the categorical descriptors; target: 'Recommend Flag'
X = df[['Product ID', 'Category', 'Subcategory1', 'SubCategory2', 'Location']]
y = df['Recommend Flag']

# 80% training / 20% testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every piece of the split
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_part.shape)


In [None]:
y_train.info()

In [None]:
# One-hot encode category, location and age group; this replaces the X built
# for the earlier split.
X = pd.get_dummies(df[['Category', 'Location', 'Age Group']])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing combined reviews with an empty string so the vectorizer
# receives only strings
df['Combined Review'] = df['Combined Review'].fillna('')

# New vectorizer (rebinds tfidf_vectorizer), capped at 1000 features and fitted
# on the raw combined review text rather than on 'clean_text'
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features as needed
X_text = tfidf_vectorizer.fit_transform(df['Combined Review'])


In [None]:
# Append the TF-IDF columns to the one-hot features.
# pd.concat(axis=1) pairs rows by index label; a DataFrame built from the dense
# array would be numbered 0..n-1, which need not match X's index. Both have one
# row per review in the same order, so reuse X's index for the text features.
X = pd.concat([X, pd.DataFrame(X_text.toarray(), index=X.index)], axis=1)


In [None]:
# Remove rows with missing values in the target variable 'Recommend Flag'.
# This rebinds df, so every later cell works on the reduced frame.
df = df.dropna(subset=['Recommend Flag'])

# Rebuild X and y from the reduced frame so features and target stay row-aligned.
# X holds the raw (not yet encoded) categorical columns.
X = df[['Product ID', 'Category', 'Subcategory1', 'SubCategory2', 'Location']]
y = df['Recommend Flag']


In [None]:
# One-hot encode the categorical feature columns
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Dense output; drop='first' removes one level per column to avoid redundant
# (perfectly collinear) indicator columns.
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit the encoder and transform the categorical columns in one step
X_encoded = encoder.fit_transform(df[['Product ID', 'Category', 'Subcategory1', 'SubCategory2', 'Location']])

# The feature matrix is the encoded array alone. The previous X held the same
# five columns as raw values, so concatenating it back would mix strings into
# the numeric matrix and duplicate every feature in un-encoded form.
X = X_encoded


In [None]:
# One-hot encode the categorical variables with pandas; this replaces the X
# built in the previous cell.
X = pd.get_dummies(df[['Category', 'Subcategory1', 'SubCategory2', 'Location', 'Channel', 'Age Group']])


In [None]:


# TF-IDF vectorizer with default settings (rebinds tfidf_vectorizer; no
# feature cap, no stop-word list)
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary from every combined review and vectorise them all.
# NOTE(review): this fit sees all rows, including those that a later
# train/test split assigns to the test set.
X_tfidf = tfidf_vectorizer.fit_transform(df['Combined Review'])


In [None]:


# Split the raw review text first, then fit TF-IDF on the training part only.
# Fitting the vectorizer on all reviews before splitting lets the test reviews
# shape the vocabulary and IDF weights (data leakage), which inflates the test
# metrics. The row split is the same as before: same sizes and random_state.
review_train, review_test, y_train, y_test = train_test_split(
    df['Combined Review'], df['Recommend Flag'], test_size=0.2, random_state=42
)
X_train = tfidf_vectorizer.fit_transform(review_train)
X_test = tfidf_vectorizer.transform(review_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier.
# max_iter is raised from the default of 100: on a wide TF-IDF matrix the
# solver can stop before converging, and the resulting warning would not be
# seen because warnings are silenced at the top of this notebook.
clf = LogisticRegression(max_iter=1000)

# Train the model on the training data
clf.fit(X_train, y_train)

# Predicted class labels for the test data
y_pred = clf.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# Confusion matrix from the predicted class labels
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

# ROC curve.
# A ROC curve sweeps a decision threshold over continuous scores; feeding it
# the hard 0/1 predictions gives a single operating point and a misleading AUC.
# Use the predicted probability of the second class in clf.classes_ instead.
# Assumes a binary target whose positive class is the larger label (e.g. 1).
y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print("AUC:", roc_auc)

# Plot the ROC curve against the chance diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
