In [1]:
import pandas as pd
import numpy as np

In [None]:
from transformers import pipeline
# Initialize the sentiment analysis pipeline with our chosen model
# This will download the model if you haven't used it before (can take a few minutes and some disk space)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

print("Hugging Face sentiment analysis pipeline initialized successfully!")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Xet Storage is enabled f

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


Hugging Face sentiment analysis pipeline initialized successfully!


In [3]:
clean_file_path = r"C:\Users\comat\GitProjects\customer-churn-ai\data\cleaned_telco_churn.csv"

try:
    df_clean = pd.read_csv(clean_file_path)
    print("Data Loaded Successfully! Sensational!")
except FileNotFoundError:
    print(f"Error: File not found at {clean_file_path}. Please check the path and try again.")
except Exception as e:
    print(f"An error occurred: {e}")

Data Loaded Successfully! Sensational!


In [4]:
print("Preparing review texts...")
# Convert the CustomerReview column to a list, replacing NaN with empty strings
# as the pipeline expects string inputs.
review_texts = df_clean['CustomerReview'].fillna('').tolist()

print(f"Starting sentiment prediction for {len(review_texts)} reviews... (This may take a few minutes)")
# Get predictions for all texts
# This might take some time depending on the number of reviews and your hardware
hf_predictions = sentiment_pipeline(review_texts)

print(f"\nProcessed {len(hf_predictions)} reviews with Hugging Face model.")


Preparing review texts...
Starting sentiment prediction for 7043 reviews... (This may take a few minutes)

Processed 7043 reviews with Hugging Face model.


In [5]:
# You can inspect the first few predictions to see their structure if you like:
print("\nSample Hugging Face predictions (first 3):")
for i in range(min(3, len(hf_predictions))):
    print(f"Review snippet: {review_texts[i][:100]}...")
    print(f"Prediction: {hf_predictions[i]}")


Sample Hugging Face predictions (first 3):
Review snippet: Really happy with my plan... no unexpected fees. Made the right choice here....
Prediction: {'label': 'positive', 'score': 0.9732906222343445}
Review snippet: Really happy with my plan... phone support was helpful and polite....
Prediction: {'label': 'positive', 'score': 0.9825600981712341}
Review snippet: Unfortunately, my service was... frustrated by the slow data....
Prediction: {'label': 'negative', 'score': 0.9061986207962036}


In [10]:
print("\nProcessing predictions and adding to DataFrame...")
# Extract labels and scores
hf_labels_raw = [pred['label'] for pred in hf_predictions]
hf_scores = [pred['score'] for pred in hf_predictions]

# Define the mapping from the model's output labels to our desired text labels
# For cardiffnlp/twitter-roberta-base-sentiment-latest:
# 0 -> Negative (LABEL_0)
# 1 -> Neutral  (LABEL_1)
# 2 -> Positive (LABEL_2)
label_mapping = {
    "negative": "Negative",
    "neutral": "Neutral",
    "positive": "Positive"
}

# Map the raw labels to text labels
hf_sentiment_labels = [label_mapping.get(label, "Unknown") for label in hf_labels_raw] # Added "Unknown" for safety

# Add as new columns to your DataFrame
df_clean['HF_Sentiment_Label'] = hf_sentiment_labels
df_clean['HF_Sentiment_Score'] = hf_scores

print("Hugging Face sentiment labels and scores added to DataFrame.")



Processing predictions and adding to DataFrame...
Hugging Face sentiment labels and scores added to DataFrame.


In [None]:
# # --- Insert this cell for debugging ---
# # Assuming hf_labels_raw has been created in a previous cell:
# # hf_labels_raw = [pred['label'] for pred in hf_predictions]

# if 'hf_labels_raw' in locals() and hf_labels_raw: # Check if it exists and is not empty
#     unique_raw_labels = set(hf_labels_raw)
#     print(f"Unique raw labels produced by the Hugging Face model: {unique_raw_labels}")
#     print(f"First few raw labels: {hf_labels_raw[:5]}")
# else:
#     print("Error: hf_labels_raw is not defined or is empty. Please ensure Step C ran correctly and hf_predictions has data.")
# # --- End of debugging cell ---

Unique raw labels produced by the Hugging Face model: {'neutral', 'negative', 'positive'}
First few raw labels: ['positive', 'positive', 'negative', 'positive', 'negative']


In [15]:
print("\nDistribution of Sentiment Labels:")
print(df_clean["HF_Sentiment_Label"].value_counts())


Distribution of Sentiment Labels:
HF_Sentiment_Label
Positive    5177
Negative    1818
Neutral       48
Name: count, dtype: int64


In [16]:
print("\nDistribution of Sentiment as Percentages:")
print(df_clean["HF_Sentiment_Label"].value_counts(normalize=True))


Distribution of Sentiment as Percentages:
HF_Sentiment_Label
Positive    0.735056
Negative    0.258129
Neutral     0.006815
Name: proportion, dtype: float64


In [18]:
# Display the first few rows with the new Hugging Face sentiment columns
print("\nDataFrame with Hugging Face Sentiment (first 5 rows):")
print(df_clean[['CustomerReview', 'HF_Sentiment_Label', 'HF_Sentiment_Score']].head()) # Comparing with VADER


DataFrame with Hugging Face Sentiment (first 5 rows):
                                      CustomerReview HF_Sentiment_Label  \
0  Really happy with my plan... no unexpected fee...           Positive   
1  Really happy with my plan... phone support was...           Positive   
2  Unfortunately, my service was... frustrated by...           Negative   
3  Consistently good signal... data plan is a gre...           Positive   
4  Not satisfied with the billing... billing erro...           Negative   

   HF_Sentiment_Score  
0            0.973291  
1            0.982560  
2            0.906199  
3            0.960772  
4            0.930403  


In [None]:
# Display the counts of each new sentiment label
print("\nDistribution of Hugging Face Sentiment Labels:")
print(df_clean['HF_Sentiment_Label'].value_counts())

# Display the percentage distribution (capped at 3 decimals)
print("\nDistribution of Hugging Face Sentiment as Percentages:")
hf_percentages = df_clean["HF_Sentiment_Label"].value_counts(normalize=True) * 100
print(hf_percentages.round(3))
