In [1]:
# %pip install imbalanced-learn

## Data Preparation

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv("DataScraping_and_processing/karthik_cleaned_data.csv", index_col=[0])
df.head()

Unnamed: 0,overall_rating,review_date,review_text,airline_name,NPS_category,NPS,language_info,clean_text,text_length,word_count,unique_word_count,word_density,uppercase_words,comma_count,exclamation_count,question_mark_count,avg_sentence_length,flesch_reading_score,gunning_fog_index
0,Rated 1 out of 5 stars,2023-09-18 03:29:15,EasyJet sent text at 4.00 am day of flight hom...,easyjet,Detractor,-1,en,text rush organise transport effort implicatio...,194,32,29,5.878788,0,1,0,0,6.6,81.9,5.06
1,9,2023-12-01 00:00:00,Its been a few years when I flew a lot in A...,Vistara,Promoter,1,en,asia vistara surprise swift immaculate steward...,608,109,74,5.527273,3,7,0,0,27.25,61.19,14.08
3,Rated 1 out of 5 stars,2018-09-09 20:37:11,one of the worst experiences with Air France e...,air france,Detractor,-1,en,ever bore carry duty inbound,204,30,30,6.580645,0,2,0,0,10.0,61.33,8.0
4,Rated 1 out of 5 stars,2019-12-23 14:00:19,Not a single star this airlines deserves .I lo...,egyptair,Detractor,-1,en,deserves last block respond mail operator harr...,464,87,64,5.272727,4,2,0,0,14.833333,74.39,7.45
5,Rated 1 out of 5 stars,2023-12-29 06:42:56,I was forced to pay 150-euro worth penalty for...,ryanair,Detractor,-1,en,penalty fail earth would dare season arrogant ...,240,38,35,6.153846,1,1,0,1,12.666667,58.58,10.34


In [4]:
df.isna().sum()

overall_rating            0
review_date               0
review_text               0
airline_name              0
NPS_category              0
NPS                       0
language_info             0
clean_text              719
text_length               0
word_count                0
unique_word_count         0
word_density              0
uppercase_words           0
comma_count               0
exclamation_count         0
question_mark_count       0
avg_sentence_length       0
flesch_reading_score      0
gunning_fog_index         0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
# creating bag of words representation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df["clean_text"])

X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_matrix, df["NPS_category"], test_size=0.25, random_state=42)

clf_bow = MultinomialNB()
clf_bow.fit(X_train_bow, y_train_bow)
y_pred_bow = clf_bow.predict(X_test_bow)
print("BoW Classification Report:\n", classification_report(y_test_bow, y_pred_bow))

BoW Classification Report:
               precision    recall  f1-score   support

   Detractor       0.90      0.98      0.94     11894
     Neutral       0.07      0.01      0.01       338
    Promoter       0.78      0.47      0.58      1920

    accuracy                           0.89     14152
   macro avg       0.58      0.48      0.51     14152
weighted avg       0.86      0.89      0.87     14152



In [7]:
# creating a TF-IDF text representation

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df["clean_text"])

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    tfidf_matrix, df["NPS_category"], test_size=0.25, random_state=42)

clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
print("TF-IDF Classification Report:\n", classification_report(y_test_tfidf, y_pred_tfidf))

TF-IDF Classification Report:
               precision    recall  f1-score   support

   Detractor       0.85      1.00      0.92     11894
     Neutral       0.00      0.00      0.00       338
    Promoter       0.91      0.08      0.14      1920

    accuracy                           0.85     14152
   macro avg       0.59      0.36      0.35     14152
weighted avg       0.84      0.85      0.79     14152



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import numpy as np
import pandas as pd
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['NPS_category'], test_size=0.25, random_state=42)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

class_idfs = []
unique_classes = np.unique(y_train)
for class_idx in unique_classes:
    class_mask = (y_train == class_idx)
    class_docs = X_train[class_mask]
    class_vectorizer = TfidfVectorizer(vocabulary=tfidf_vectorizer.vocabulary_)
    class_tfidf = class_vectorizer.fit_transform(class_docs)
    class_idf = class_vectorizer.idf_
    class_idfs.append(class_idf)

mean_idf = np.mean(class_idfs, axis=0)

delta_idfs = [class_idf - mean_idf for class_idf in class_idfs]

class_to_index = {label: index for index, label in enumerate(unique_classes)}

X_train_delta_tfidf = []
for i, doc in enumerate(X_train_tfidf):
    class_idx = class_to_index[y_train.iloc[i]]
    delta_idf = delta_idfs[class_idx]
    X_train_delta_tfidf.append(doc.multiply(delta_idf))

X_train_delta_tfidf = vstack(X_train_delta_tfidf)

X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_delta_tfidf = []
for i, doc in enumerate(X_test_tfidf):
    class_idx = class_to_index[y_test.iloc[i]]
    delta_idf = delta_idfs[class_idx]
    X_test_delta_tfidf.append(doc.multiply(delta_idf))

X_test_delta_tfidf = vstack(X_test_delta_tfidf)

clf = LogisticRegression()
clf.fit(X_train_delta_tfidf, y_train)

y_pred = clf.predict(X_test_delta_tfidf)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

   Detractor       0.96      1.00      0.98     11894
     Neutral       1.00      0.12      0.22       338
    Promoter       0.98      0.88      0.93      1920

    accuracy                           0.96     14152
   macro avg       0.98      0.67      0.71     14152
weighted avg       0.96      0.96      0.95     14152



#### Data balancing techniques

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB


count_vectorizer = CountVectorizer()

# Fit and transform the clean_text to create BoW representation
bow_matrix = count_vectorizer.fit_transform(df["clean_text"])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_matrix, df["NPS_category"], test_size=0.25, random_state=42, stratify=df["NPS_category"])

# Train the pipeline on the training data

# Initialize the classifier
clf_bow_res = ComplementNB()

# Fit the classifier on the resampled training data
clf_bow_res.fit(X_train_delta_tfidf, y_train)

# Predict on the test set using the trained pipeline
y_pred = clf_bow_res.predict(X_test_delta_tfidf)

# Print the classification report
print(classification_report(y_test, y_pred))

ValueError: Negative values in data passed to ComplementNB (input X)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Assuming df["clean_text"] and df["NPS_category"] are already defined

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the clean_text to create BoW representation
bow_matrix = count_vectorizer.fit_transform(df["clean_text"])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_matrix, df["NPS_category"], test_size=0.25, random_state=42)

# Initialize RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Apply RandomOverSampler to training data only
X_train_bow_res, y_train_bow_res = ros.fit_resample(X_train_bow, y_train_bow)

# Check the class distribution after resampling
print("Class distribution after Random OverSampling:", Counter(y_train_bow_res))

# Initialize the classifier
clf_bow_res = ComplementNB()

# Fit the classifier on the resampled training data
clf_bow_res.fit(X_train_bow_res, y_train_bow_res)

# Predict on the original (non-resampled) testing data
y_pred_bow_res = clf_bow_res.predict(X_test_bow)

# Print the classification report
print("BoW Classification Report after Random OverSampling:\n", classification_report(y_test_bow, y_pred_bow_res))

Class distribution after Random OverSampling: Counter({'Detractor': 35651, 'Promoter': 35651, 'Neutral': 35651})
BoW Classification Report after Random OverSampling:
               precision    recall  f1-score   support

   Detractor       0.95      0.89      0.91     11894
     Neutral       0.09      0.33      0.14       338
    Promoter       0.65      0.61      0.63      1920

    accuracy                           0.84     14152
   macro avg       0.56      0.61      0.56     14152
weighted avg       0.89      0.84      0.86     14152



In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Assuming df["clean_text"] and df["NPS_category"] are already defined

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the clean_text to create BoW representation
bow_matrix = count_vectorizer.fit_transform(df["clean_text"])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_matrix, df["NPS_category"], test_size=0.25, random_state=42)

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to training data only
X_train_bow_smote, y_train_bow_smote = smote.fit_resample(X_train_bow, y_train_bow)

# Initialize the classifier
clf_bow_smote = ComplementNB()

# Fit the classifier on the resampled training data
clf_bow_smote.fit(X_train_bow_smote, y_train_bow_smote)

# Predict on the original (non-resampled) testing data
y_pred_bow_smote = clf_bow_smote.predict(X_test_bow)

# Print the classification report
print("BoW Classification Report after SMOTE:\n", classification_report(y_test_bow, y_pred_bow_smote))


BoW Classification Report after SMOTE:
               precision    recall  f1-score   support

   Detractor       0.93      0.93      0.93     11894
     Neutral       0.08      0.16      0.11       338
    Promoter       0.68      0.61      0.64      1920

    accuracy                           0.86     14152
   macro avg       0.57      0.56      0.56     14152
weighted avg       0.88      0.86      0.87     14152



In [12]:
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Assuming df["clean_text"] and df["NPS_category"] are already defined

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the clean_text to create BoW representation
bow_matrix = count_vectorizer.fit_transform(df["clean_text"])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_matrix, df["NPS_category"], test_size=0.25, random_state=42)

# Initialize ADASYN
adasyn = ADASYN(random_state=42)

# Apply ADASYN to training data only
X_train_bow_adasyn, y_train_bow_adasyn = adasyn.fit_resample(X_train_bow, y_train_bow)

# Initialize the classifier
clf_bow_adasyn = ComplementNB()

# Fit the classifier on the resampled training data
clf_bow_adasyn.fit(X_train_bow_adasyn, y_train_bow_adasyn)

# Predict on the original (non-resampled) testing data
y_pred_bow_adasyn = clf_bow_adasyn.predict(X_test_bow)

# Print the classification report
print("BoW Classification Report after ADASYN:\n", classification_report(y_test_bow, y_pred_bow_adasyn))


BoW Classification Report after ADASYN:
               precision    recall  f1-score   support

   Detractor       0.93      0.92      0.93     11894
     Neutral       0.08      0.16      0.10       338
    Promoter       0.66      0.61      0.64      1920

    accuracy                           0.86     14152
   macro avg       0.56      0.56      0.56     14152
weighted avg       0.88      0.86      0.87     14152



## Modelling

## Evaluation