In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report



In [4]:
df = pd.read_csv('using_reviews (Autosaved).csv')
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,sentiment
0,0,British Airways has confirmed itself as the...,,,,,,,,,...,,,,,,,,,,0.0
1,1,✅ Worst BA experience. I was supposed to fl...,,,,,,,,,...,,,,,,,,,,0.0
2,2,✅ My daughter and I were denied boarding o...,,,,,,,,,...,,,,,,,,,,0.0
3,3,✅ Despite boarding being the usual free for...,,,,,,,,,...,,,,,,,,,,0.0
4,4,"Flight cancelled, no crew! 9th September 20...",,,,,,,,,...,,,,,,,,,,0.0


In [5]:
df = df[['reviews', 'sentiment']]
df

Unnamed: 0,reviews,sentiment
0,British Airways has confirmed itself as the...,0.0
1,✅ Worst BA experience. I was supposed to fl...,0.0
2,✅ My daughter and I were denied boarding o...,0.0
3,✅ Despite boarding being the usual free for...,0.0
4,"Flight cancelled, no crew! 9th September 20...",0.0
...,...,...
995,✅ Boston to Nairobi via London. The only g...,
996,✅ Basel to San Fransisco via Heathrow. I'm...,
997,✅ London to Lyon. The flight has 1h 30 del...,
998,✅ London to Boston. I was seated next to a...,


In [6]:
df.dropna()
df

Unnamed: 0,reviews,sentiment
0,British Airways has confirmed itself as the...,0.0
1,✅ Worst BA experience. I was supposed to fl...,0.0
2,✅ My daughter and I were denied boarding o...,0.0
3,✅ Despite boarding being the usual free for...,0.0
4,"Flight cancelled, no crew! 9th September 20...",0.0
...,...,...
995,✅ Boston to Nairobi via London. The only g...,
996,✅ Basel to San Fransisco via Heathrow. I'm...,
997,✅ London to Lyon. The flight has 1h 30 del...,
998,✅ London to Boston. I was seated next to a...,


In [7]:
df_cleaned = df.dropna(subset=['sentiment'])
df_cleaned 

Unnamed: 0,reviews,sentiment
0,British Airways has confirmed itself as the...,0.0
1,✅ Worst BA experience. I was supposed to fl...,0.0
2,✅ My daughter and I were denied boarding o...,0.0
3,✅ Despite boarding being the usual free for...,0.0
4,"Flight cancelled, no crew! 9th September 20...",0.0
...,...,...
95,✅ Busy day at LHR and flight full. Lounge ...,1.0
96,✅ Worst seats I have ever encountered in ec...,0.0
97,Top Ten REASONS to not use British Airways T...,0.0
98,Easy check in on the way to Heathrow. The f...,1.0


In [8]:

df_cleaned['reviews'] = df_cleaned['reviews'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)


print(df_cleaned)


                                              reviews  sentiment
0      British Airways has confirmed itself as the...        0.0
1       Worst BA experience I was supposed to fly ...        0.0
2        My daughter and I were denied boarding on...        0.0
3       Despite boarding being the usual free for ...        0.0
4      Flight cancelled no crew 9th September 2023...        0.0
..                                                ...        ...
95       Busy day at LHR and flight full Lounge cr...        1.0
96      Worst seats I have ever encountered in eco...        0.0
97    Top Ten REASONS to not use British Airways T...        0.0
98     Easy check in on the way to Heathrow The fl...        1.0
99       Online check in worked fine Quick securit...        1.0

[100 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['reviews'] = df_cleaned['reviews'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)


In [9]:
df_cleaned.head()

Unnamed: 0,reviews,sentiment
0,British Airways has confirmed itself as the...,0.0
1,Worst BA experience I was supposed to fly ...,0.0
2,My daughter and I were denied boarding on...,0.0
3,Despite boarding being the usual free for ...,0.0
4,Flight cancelled no crew 9th September 2023...,0.0


In [10]:
df_cleaned['sentiment'] = df_cleaned['sentiment'].astype(int)
df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['sentiment'] = df_cleaned['sentiment'].astype(int)


Unnamed: 0,reviews,sentiment
0,British Airways has confirmed itself as the...,0
1,Worst BA experience I was supposed to fly ...,0
2,My daughter and I were denied boarding on...,0
3,Despite boarding being the usual free for ...,0
4,Flight cancelled no crew 9th September 2023...,0


In [25]:
reviews = df['reviews'].astype(str)
sentiment = df['sentiment']

# Tokenization and Padding for Text Data
max_words = 100
max_len = 25

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)

text_sequences = tokenizer.texts_to_sequences(reviews)
text_padded = pad_sequences(text_sequences, maxlen=max_len, padding='post')

# Additional preprocessing for numerical features (if any)
numerical_features = df.drop(['sentiment', 'reviews'], axis=1)  # Assuming 'text' and 'label' are column names
# ... additional preprocessing for numerical features if needed

# Combine processed text data and numerical features
X_combined = text_padded  # Combine with numerical features if applicable

# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, sentiment, test_size=0.2, random_state=42
)

# Build Bidirectional LSTM Model
embedding_dim = 50

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(100)))#recurrent_dropout=0.2
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Training
model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.2)

# Making Predictions
y_pred_prob = model.predict(X_test)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.02
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.02      1.00      0.04         4
           2       0.00      0.00      0.00       180

    accuracy                           0.02       200
   macro avg       0.01      0.33      0.01       200
weighted avg       0.00      0.02      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report




In [11]:
X = df_cleaned['reviews']
y = df_cleaned['sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Adjust max_features based on your dataset
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [13]:
svm_model = SVC(kernel='linear')  # 'linear' kernel is often used for text classification
svm_model.fit(X_train_tfidf, y_train)


SVC(kernel='linear')

In [14]:

y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)


Accuracy: 0.90
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95        18
           1       0.00      0.00      0.00         2

    accuracy                           0.90        20
   macro avg       0.45      0.50      0.47        20
weighted avg       0.81      0.90      0.85        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
new_example = ['good crew']


new_example_tfidf = tfidf_vectorizer.transform(new_example)


y_pred = svm_model.predict(new_example_tfidf)

print(f"Predicted Label: {y_pred}")

Predicted Label: [1]


In [16]:
new_example = ['unresonable flight attendants']


new_example_tfidf = tfidf_vectorizer.transform(new_example)


y_pred = svm_model.predict(new_example_tfidf)

print(f"Predicted Label: {y_pred}")

Predicted Label: [0]


In [17]:
new_example = ['they lost my luggage']


new_example_tfidf = tfidf_vectorizer.transform(new_example)


y_pred = svm_model.predict(new_example_tfidf)

print(f"Predicted Label: {y_pred}")

Predicted Label: [0]


# the data was just too small for using deep Neural network

In [18]:
new_example = ['good crew, but they lost my luggage']


new_example_tfidf = tfidf_vectorizer.transform(new_example)


y_pred = svm_model.predict(new_example_tfidf)

print(f"Predicted Label: {y_pred}")

Predicted Label: [0]
