In [1]:
# Import necessary libraries
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/emad/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Load the dataset
reviews_df = pd.read_csv('data/reviews.csv')

# Print basic information about the datasets
print("Training Data Info:")
print(reviews_df.info())


Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70317 entries, 0 to 70316
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      70317 non-null  uint64
 1   TEXT    70311 non-null  object
 2   LABEL   70317 non-null  int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 1.6+ MB
None


In [3]:
# Drop any rows with missing TEXT in the dataset
reviews_df.dropna(subset=['TEXT'], inplace=True)

# Display a sample of the training dataset
pd.set_option('display.max_colwidth', None)
print("\nExample Data Points from Training Data:")
print(reviews_df.head())

# Dataset size per class
class_distribution = reviews_df['LABEL'].value_counts()
print("\nDataset Size per Class in Training Data:")
print(class_distribution)

# Measure of text length diversity
reviews_df['text_length'] = reviews_df['TEXT'].apply(len)
print("\nStatistics of Text Lengths in Training Data:")
print(reviews_df['text_length'].describe())

# Analyzing the diversity of vocabulary
words = pd.Series(' '.join(reviews_df['TEXT']).split()).value_counts()
print("\nVocabulary Diversity in Training Data:")
print("Unique Words:", words.size)
print("Top 20 Most Frequent Words:")
print(words.head(20))

# Initialize the sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()
reviews_df['sentiment_score'] = reviews_df['TEXT'].apply(lambda x: sia.polarity_scores(x)['compound'])
# Descriptive statistics of the sentiment scores
print("\nSentiment Scores:")
print(reviews_df['sentiment_score'].describe())

# Display average sentiment scores by class
avg_sentiment_by_class = reviews_df.groupby('LABEL')['sentiment_score'].mean()
print("\nAverage Sentiment Score by Class:")
print(avg_sentiment_by_class)


Example Data Points from Training Data:
                     ID  \
0   7850790573542594519   
1   9392069522632994700   
2   5083704536542443514   
3  12418349755186772171   
4  12144957944004619479   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [4]:
# Extracting sentiment features from text
class SentimentFeature(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.sia = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.Series):
            X = X.to_frame()
        # Applying sentiment analysis to extract positive, negative, and neutral scores
        sentiments = X.iloc[:, 0].apply(lambda x: self.sia.polarity_scores(x))
        df = pd.DataFrame(list(sentiments))
        return df[['pos', 'neg', 'neu']]

# Setup the preprocessing and model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(min_df=5, max_df=0.45, ngram_range=(1, 3)), 'TEXT'),
        ('sentiments', Pipeline([
            ('sentiment_feat', SentimentFeature()),
            ('scaler', StandardScaler())
        ]), ['TEXT'])],
    remainder='drop')

# Define the full pipeline including preprocessing and the classifier
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(solver='saga', C=10, penalty='l2', max_iter=5000, class_weight='balanced'))])

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reviews_df[['TEXT']], reviews_df['LABEL'], test_size=0.2, random_state=42)

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the testing data
y_pred = pipeline.predict(X_test)

# Evaluate the model performance
f1 = f1_score(y_test, y_pred, average='macro')
report = classification_report(y_test, y_pred)

print("F1 Score:", f1)
print("Classification Report:\n", report)

F1 Score: 0.9286996851578322
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      6430
           1       0.91      0.89      0.90      3861
           2       0.91      0.91      0.91      3772

    accuracy                           0.94     14063
   macro avg       0.93      0.93      0.93     14063
weighted avg       0.94      0.94      0.94     14063



In [6]:
# Define F1 score as the scoring function
f1_macro = make_scorer(f1_score, average='macro')

# Perform cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, reviews_df[['TEXT']], reviews_df['LABEL'], cv=cv, scoring=f1_macro)

print("Cross-validated F1 scores:", scores)
print("Mean F1 score:", scores.mean())
print("Standard deviation of F1 scores:", scores.std())

Cross-validated F1 scores: [0.92467492 0.92359163 0.92291842 0.92720691 0.92555991]
Mean F1 score: 0.9247903608925515
Standard deviation of F1 scores: 0.0015090648925655927


In [7]:
# Create a DataFrame for error analysis
errors = pd.DataFrame({
    'Text': X_test['TEXT'],  
    'True Labels': y_test,
    'Predicted Labels': y_pred,
    'Probabilities': pipeline.predict_proba(X_test).max(axis=1)})
errors['Correct'] = errors['True Labels'] == errors['Predicted Labels']

In [8]:
# Analyze error distribution
print("Error rate by class:")
print(errors[~errors['Correct']].groupby('True Labels').size() / errors.groupby('True Labels').size())

Error rate by class:
True Labels
0    0.017729
1    0.108003
2    0.091994
dtype: float64


In [9]:
# Review errors with high confidence:
high_confidence_errors = errors[(~errors['Correct']) & (errors['Probabilities'] > 0.9)]
print("High confidence errors examples:")
print(high_confidence_errors.sample(min(10, len(high_confidence_errors))))  

High confidence errors examples:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [10]:
# Identify instances where negative reviews were incorrectly predicted as positive
misclassified_positive = errors[(errors['True Labels'] == 2) & (errors['Predicted Labels'] == 1)]
print("Misclassified as positive (sample):")
print(misclassified_positive.sample(min(10, len(misclassified_positive)), random_state=42))

# Identify instances where positive reviews were incorrectly predicted as negative
misclassified_negative = errors[(errors['True Labels'] == 1) & (errors['Predicted Labels'] == 2)]
print("Misclassified as negative (sample):")
print(misclassified_negative.sample(min(10, len(misclassified_negative)), random_state=42))


Misclassified as positive (sample):
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    