In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk

In [6]:
df = pd.read_csv("british-airways-reviews.csv")
df.head()

Unnamed: 0,Review
0,Not Verified| We are flying Business class fo...
1,✅Trip Verified| I am in Australia and on Frid...
2,✅Trip Verified| At 7.54 am on the day of trav...
3,✅Trip Verified| Would happily fly them again....
4,"Not Verified| Flew premium, only worth the ex..."


In [7]:
# remove "Trip Verified" and "Trip not verified" from the "review" column using regex
df['Review'] = df['Review'].str.replace(r'Trip (not )?Verified\| ', '', regex=True)

In [8]:
df.head()

Unnamed: 0,Review
0,Not Verified| We are flying Business class fo...
1,"✅ I am in Australia and on Friday night, went ..."
2,✅ At 7.54 am on the day of travel whilst drivi...
3,✅ Would happily fly them again. I had a person...
4,"Not Verified| Flew premium, only worth the ex..."


In [9]:
df["Review"]

0       Not Verified|  We are flying Business class fo...
1       ✅ I am in Australia and on Friday night, went ...
2       ✅ At 7.54 am on the day of travel whilst drivi...
3       ✅ Would happily fly them again. I had a person...
4       Not Verified|  Flew premium, only worth the ex...
                              ...                        
3482    Flew LHR - VIE return operated by bmi but BA a...
3483    LHR to HAM. Purser addresses all club passenge...
3484    My son who had worked for British Airways urge...
3485    London City-New York JFK via Shannon on A318 b...
3486    SIN-LHR BA12 B747-436 First Class. Old aircraf...
Name: Review, Length: 3487, dtype: object

In [11]:
df.shape

(3487, 1)

In [20]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [22]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [24]:
analyzer = SentimentIntensityAnalyzer()

# Calculate the sentiment scores for each row in the 'Review' column
scores = df['Review'].apply(analyzer.polarity_scores)

# Combine the scores into a DataFrame
scores_df = pd.DataFrame(list(scores))

# Concatenate the scores DataFrame with the original DataFrame
result_df = pd.concat([df, scores_df], axis=1)


In [25]:
print(result_df)

                                                 Review    neg    neu    pos  \
0     Not Verified|  We are flying Business class fo...  0.034  0.943  0.023   
1     ✅ I am in Australia and on Friday night, went ...  0.060  0.932  0.008   
2     ✅ At 7.54 am on the day of travel whilst drivi...  0.146  0.754  0.100   
3     ✅ Would happily fly them again. I had a person...  0.051  0.686  0.263   
4     Not Verified|  Flew premium, only worth the ex...  0.103  0.777  0.120   
...                                                 ...    ...    ...    ...   
3482  Flew LHR - VIE return operated by bmi but BA a...  0.027  0.667  0.306   
3483  LHR to HAM. Purser addresses all club passenge...  0.000  0.712  0.288   
3484  My son who had worked for British Airways urge...  0.056  0.875  0.069   
3485  London City-New York JFK via Shannon on A318 b...  0.000  0.711  0.289   
3486  SIN-LHR BA12 B747-436 First Class. Old aircraf...  0.078  0.795  0.126   

      compound  
0      -0.4932  
1    

In [33]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()


# Calculate the sentiment scores for each row in the 'Review' column
scores = df['Review'].apply(analyzer.polarity_scores)

# Combine the scores into a DataFrame
scores_df = pd.DataFrame(list(scores))

# Add a new column to the original DataFrame with the sentiment labels
conditions = [
    (scores_df['compound'] > 0.05),
    (scores_df['compound'] < -0.05),
    (scores_df['compound'] >= -0.05) & (scores_df['compound'] <= 0.05)
]

values = ['positive', 'negative', 'neutral']

df['sentiment'] = np.select(conditions, values)

# Concatenate the scores DataFrame with the original DataFrame
result_df = pd.concat([df, scores_df], axis=1)

# Print the resulting DataFrame with the sentiment scores and labels
print(result_df)

                                                 Review sentiment    neg  \
0     Not Verified|  We are flying Business class fo...  negative  0.034   
1     ✅ I am in Australia and on Friday night, went ...  negative  0.060   
2     ✅ At 7.54 am on the day of travel whilst drivi...  negative  0.146   
3     ✅ Would happily fly them again. I had a person...  positive  0.051   
4     Not Verified|  Flew premium, only worth the ex...  positive  0.103   
...                                                 ...       ...    ...   
3482  Flew LHR - VIE return operated by bmi but BA a...  positive  0.027   
3483  LHR to HAM. Purser addresses all club passenge...  positive  0.000   
3484  My son who had worked for British Airways urge...  positive  0.056   
3485  London City-New York JFK via Shannon on A318 b...  positive  0.000   
3486  SIN-LHR BA12 B747-436 First Class. Old aircraf...  positive  0.078   

        neu    pos  compound  
0     0.943  0.023   -0.4932  
1     0.932  0.008   -0.7

In [40]:
result_df = pd.read_csv("result.csv")

In [43]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Plot a histogram of the compound scores
plt.hist(result_df['compound'], bins=20)
plt.xlabel('Compound Sentiment Score')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Scores')
plt.show()

# Generate a word cloud of the most common words in the reviews
text = ' '.join(result_df['clean_review'].tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Reviews')
plt.show()

ModuleNotFoundError: No module named 'wordcloud'