In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re

from wordcloud import WordCloud
from nltk import bigrams
from collections import Counter
from nltk.corpus import stopwords

In [None]:
# Clone the repository that holds the cleaned review CSVs; the loading cell
# reads them from the airline-ucg/cleaning/ directory created by this command.
!git clone https://github.com/dorianb04/airline-ucg

In [3]:
# Airline identifiers. Each one is also the filename prefix of that airline's
# cleaned review CSV and the value stored in the 'company' column.
companies = [
    'air-france',
    'asiana-airlines',
    'british-airways',
    'iberia',
    'japan-airlines-jal',
    'lufthansa',
    'or520-emirates',
    'or1030-korean-air',
    'qatar-airways',
    'ryanair',
    'singapore_airlines',
    'swiss-international-air-lines-swiss',
    'turkish-airlines',
]

In [4]:
# Read every airline's cleaned review file and tag its rows with the airline identifier.
dataframes = [
    pd.read_csv(f'airline-ucg/cleaning/{company}_reviews_cleaned.csv').assign(company=company)
    for company in companies
]

# Stack all airlines into one frame with a fresh 0..n-1 index.
data = pd.concat(dataframes, ignore_index=True)

In [None]:
# Highest rating in the dataset. Series.max() skips missing values, whereas the
# built-in max() gives an order-dependent result as soon as a NaN is present.
data['rating'].max()

In [None]:
# Lowest rating in the dataset. Series.min() skips missing values, whereas the
# built-in min() gives an order-dependent result as soon as a NaN is present.
data['rating'].min()

In [None]:
# Threshold that separates satisfied from not-satisfied customers:
# a rating strictly greater than this value is labelled 'Satisfied'.
lim_rating = 2.5

lim_rating

In [8]:
# Label every review: a rating strictly above lim_rating is 'Satisfied'; everything
# else (including a missing rating, whose comparison is False) is 'Not Satisfied'.
data['satisfaction'] = (
    data['rating']
    .gt(lim_rating)
    .map({True: 'Satisfied', False: 'Not Satisfied'})
)

In [None]:
# Size of the combined review table as (rows, columns).
data.shape

In [None]:
# Preview the first rows of the combined review table.
data.head()

In [None]:
# Overall share of each satisfaction label across all airlines
# (normalize=True returns fractions instead of raw counts).
proportions_total = data['satisfaction'].value_counts(normalize=True)

proportions_total

In [None]:
# Per-airline share of each satisfaction label: one row per company, one column
# per label, with every row summing to 1.
proportions = pd.crosstab(data['company'], data['satisfaction'], normalize='index')

# Airlines with the largest satisfied share come first.
proportions_sorted = proportions.sort_values(by='Satisfied', ascending=False)

proportions_sorted

In [None]:
# Stacked bar chart of the satisfaction split per airline.
# The axes are created explicitly and handed to pandas: DataFrame.plot() called
# without `ax` opens a figure of its own, which would leave the 12x8 figure
# empty and draw the chart at the default size instead.
fig, ax = plt.subplots(figsize=(12, 8))
proportions_sorted.plot(kind='bar', stacked=True, color=['#FF9999', '#66B3FF'], alpha=0.85, ax=ax)

ax.set_title('Proportion of Satisfied vs Not Satisfied Customers by Airline', fontsize=14)
ax.set_ylabel('Proportion', fontsize=12)
ax.set_xlabel('Airline', fontsize=12)
# Slant the airline names so the long identifiers do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Satisfaction', loc='upper right')
fig.tight_layout()

plt.show()

In [None]:
# Download NLTK's stopword corpus and keep the English list as a set;
# preprocess_text tests every word of a review for membership in it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [15]:
# Preprocess: delete stopwords, lower, remove punctuation
def preprocess_text(text):
    """Normalise a review: lowercase it, strip punctuation and drop stopwords.

    A token is dropped when it is a stopword either before or after its
    punctuation is removed. Checking before matters for contractions: NLTK's
    English list stores them with an apostrophe (e.g. "don't"), so stripping
    punctuation first would turn them into "dont" and let them through.

    Args:
        text: Raw review text. Anything that is not a string (for example NaN
            for a missing review) yields an empty string.

    Returns:
        The remaining words, lowercased and free of ASCII punctuation, joined
        by single spaces.
    """
    if not isinstance(text, str):
        return ''

    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept_words = []
    for token in text.lower().split():
        # Only the surrounding punctuation is removed for this check, so the
        # inner apostrophe of a contraction is still there to match the list.
        if token.strip(string.punctuation) in stop_words:
            continue
        word = token.translate(strip_punctuation)
        # Tokens made only of punctuation collapse to '' and are discarded.
        if word and word not in stop_words:
            kept_words.append(word)
    return ' '.join(kept_words)

In [16]:
def _bigram_review_stats(cleaned_reviews, ratings):
    """Count bigrams review by review and remember the ratings they appear with.

    Bigrams are built inside each review only, so the last word of one review
    is never paired with the first word of the next one.

    Returns:
        (occurrences, ratings_by_bigram): a Counter of total occurrences keyed
        by "w1 w2", and a dict mapping the same keys to the list of ratings of
        the reviews that contain the bigram (one entry per review).
    """
    occurrences = Counter()
    ratings_by_bigram = {}
    for text, rating in zip(cleaned_reviews, ratings):
        tokens = text.split()
        pairs = [f"{first} {second}" for first, second in zip(tokens, tokens[1:])]
        occurrences.update(pairs)
        # set(): a review counts once per bigram, however often it repeats it.
        for pair in set(pairs):
            ratings_by_bigram.setdefault(pair, []).append(rating)
    return occurrences, ratings_by_bigram


def _make_rating_color_func(bigram_ratings):
    """Build the WordCloud colour callback that shades a bigram by its mean rating."""
    # Palettes are built once here instead of on every word.
    reds = sns.color_palette("Reds", 7)
    greys = sns.color_palette("Greys", 7)
    greens = sns.color_palette("Greens", 7)
    # (inclusive upper bound, colour), checked in order; above the last bound is dark green.
    bands = [
        (1.5, reds[6]),    # dark red
        (2.2, reds[4]),    # red
        (2.8, greys[4]),   # grey
        (3.5, greens[2]),  # green
    ]

    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        rating, _ = bigram_ratings[word]
        if pd.isna(rating):
            # No usable rating: stay neutral rather than falling through to dark green.
            color = greys[4]
        else:
            color = next((shade for bound, shade in bands if rating <= bound), greens[6])
        # seaborn colours are 0-1 floats; convert to 0-255 integer RGB.
        return tuple(int(channel * 255) for channel in color[:3])

    return color_func


def generate_bigram_wordcloud(data, company_name, min_bigrams):
    """Draw a bigram word cloud for one airline, coloured by average rating.

    Word size follows how often the bigram occurs in the airline's reviews;
    colour follows the mean rating of the reviews containing it (red = low,
    grey = middle, green = high). The mean rating and the number of reviews
    are also printed for every bigram shown.

    Args:
        data: Review table with 'company', 'review_text' and 'rating' columns.
            It is not modified.
        company_name: Value of the 'company' column to analyse.
        min_bigrams: A bigram is kept only if it occurs strictly more than
            this many times.

    Returns:
        None. Prints a message and draws nothing when no bigram is frequent
        enough.
    """
    # Rows without review text cannot contribute bigrams.
    reviews = data.loc[data['company'] == company_name, ['review_text', 'rating']]
    reviews = reviews.dropna(subset=['review_text'])
    cleaned_reviews = reviews['review_text'].apply(preprocess_text)

    occurrences, ratings_by_bigram = _bigram_review_stats(cleaned_reviews, reviews['rating'])

    # Filter bigrams appearing more than `min_bigrams` times
    frequent_bigrams = {pair: count for pair, count in occurrences.items() if count > min_bigrams}
    if not frequent_bigrams:
        # WordCloud raises on an empty frequency table, so stop here.
        print(f"No bigram appears more than {min_bigrams} times for {company_name}; nothing to plot.")
        return

    # (mean rating, number of reviews) per bigram; Series.mean() skips missing ratings.
    bigram_ratings = {
        pair: (pd.Series(ratings_by_bigram[pair]).mean(), len(ratings_by_bigram[pair]))
        for pair in frequent_bigrams
    }

    # Generate word cloud
    color_func = _make_rating_color_func(bigram_ratings)
    wc = WordCloud(width=800, height=400, background_color='white', color_func=color_func).generate_from_frequencies(frequent_bigrams)

    # Display word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Bigram WordCloud for {company_name}", fontsize=16)
    plt.show()

    # Print bigram ratings and frequencies
    for bigram, (rating, count) in bigram_ratings.items():
        print(f"Bigram: {bigram}, Avg. Rating: {rating:.2f}, Count: {count}")


In [None]:
# One bigram word cloud per airline, driven by a single table instead of one
# copy-pasted cell per airline. The value is the number of occurrences a bigram
# must exceed to be drawn; air-france and asiana-airlines use lower cut-offs
# than the other airlines.
MIN_BIGRAMS_BY_COMPANY = {
    'air-france': 100,
    'asiana-airlines': 50,
    'british-airways': 150,
    'iberia': 150,
    'japan-airlines-jal': 150,
    'lufthansa': 150,
    'or520-emirates': 150,
    'or1030-korean-air': 150,
    'qatar-airways': 150,
    'ryanair': 150,
    'singapore_airlines': 150,
    'swiss-international-air-lines-swiss': 150,
    'turkish-airlines': 150,
}

for airline, bigram_cutoff in MIN_BIGRAMS_BY_COMPANY.items():
    generate_bigram_wordcloud(data, airline, bigram_cutoff)

In [None]:
# Preview the first rows of the review table again before the route-level analysis.
data.head()

In [31]:
# Compute the number of times a route is mentioned in a negative review
def bad_travels(data, company, min_mentions=50, top_n=10):
    """Plot the routes of one airline with the highest share of negative reviews.

    For every (departure, destination) pair of the airline, the share of
    reviews labelled 'Not Satisfied' among its 'Satisfied' + 'Not Satisfied'
    reviews is computed; the routes with the highest share are drawn as a bar
    chart.

    Args:
        data: Review table with 'company', 'departure', 'destination' and
            'satisfaction' columns. It is not modified.
        company: Value of the 'company' column to analyse.
        min_mentions: A route is kept only if it has strictly more than this
            many reviews.
        top_n: Maximum number of routes shown.

    Returns:
        None. Prints a message and draws nothing when no route has enough
        reviews.
    """
    reviews = data.loc[data['company'] == company, ['departure', 'destination', 'satisfaction']]

    # Negative and positive review counts per route. The 0/1 flags are summed
    # per group; rows with a missing departure or destination are left out by groupby.
    route_counts = (
        reviews
        .assign(
            neg_count=reviews['satisfaction'].eq('Not Satisfied').astype(int),
            pos_count=reviews['satisfaction'].eq('Satisfied').astype(int),
        )
        .groupby(['departure', 'destination'])[['neg_count', 'pos_count']]
        .sum()
        .reset_index()
    )

    # Calculate the ratio of negative mentions to total mentions
    route_counts['total_count'] = route_counts['neg_count'] + route_counts['pos_count']
    route_counts['neg_ratio'] = route_counts['neg_count'] / route_counts['total_count']

    # Keep well-covered routes, worst first, and cap the number shown.
    worst_routes = (
        route_counts[route_counts['total_count'] > min_mentions]
        .sort_values(by='neg_ratio', ascending=False)
        .head(top_n)
    )

    if worst_routes.empty:
        # plt.bar cannot be fed an empty selection built with apply(axis=1), so stop here.
        print(f"No route of {company} is mentioned more than {min_mentions} times; nothing to plot.")
        return

    route_labels = worst_routes['departure'].astype(str) + ' -> ' + worst_routes['destination'].astype(str)

    # Plot the top routes
    plt.figure(figsize=(12, 8))
    plt.bar(
        x=route_labels,
        height=worst_routes['neg_ratio'],
        color='#FF9999',
        alpha=0.85
    )

    # Add titles and labels
    plt.title(f'Ratio a route is mentioned in a negative review (min {min_mentions} times) ({company})', fontsize=14)
    plt.ylabel('Ratio of Negative Mentions/Total Mentions', fontsize=12)
    plt.xlabel('Route', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Show the plot
    plt.show()



In [None]:
# Draw the negative-review route chart for every airline.
for company in companies:
    bad_travels(data, company)

In [33]:
# Compute the number of times a route is mentioned in a positive review
def good_travels(data, company, min_mentions=50, top_n=10):
    """Plot the routes of one airline with the highest share of positive reviews.

    For every (departure, destination) pair of the airline, the share of
    reviews labelled 'Satisfied' among its 'Satisfied' + 'Not Satisfied'
    reviews is computed; the routes with the highest share are drawn as a bar
    chart.

    Args:
        data: Review table with 'company', 'departure', 'destination' and
            'satisfaction' columns. It is not modified.
        company: Value of the 'company' column to analyse.
        min_mentions: A route is kept only if it has strictly more than this
            many reviews.
        top_n: Maximum number of routes shown.

    Returns:
        None. Prints a message and draws nothing when no route has enough
        reviews.
    """
    reviews = data.loc[data['company'] == company, ['departure', 'destination', 'satisfaction']]

    # Negative and positive review counts per route. The 0/1 flags are summed
    # per group; rows with a missing departure or destination are left out by groupby.
    route_counts = (
        reviews
        .assign(
            neg_count=reviews['satisfaction'].eq('Not Satisfied').astype(int),
            pos_count=reviews['satisfaction'].eq('Satisfied').astype(int),
        )
        .groupby(['departure', 'destination'])[['neg_count', 'pos_count']]
        .sum()
        .reset_index()
    )

    # Calculate the ratio of positive mentions to total mentions
    route_counts['total_count'] = route_counts['neg_count'] + route_counts['pos_count']
    route_counts['pos_ratio'] = route_counts['pos_count'] / route_counts['total_count']

    # Keep well-covered routes, best first, and cap the number shown.
    best_routes = (
        route_counts[route_counts['total_count'] > min_mentions]
        .sort_values(by='pos_ratio', ascending=False)
        .head(top_n)
    )

    if best_routes.empty:
        # plt.bar cannot be fed an empty selection built with apply(axis=1), so stop here.
        print(f"No route of {company} is mentioned more than {min_mentions} times; nothing to plot.")
        return

    route_labels = best_routes['departure'].astype(str) + ' -> ' + best_routes['destination'].astype(str)

    # Plot the top routes
    plt.figure(figsize=(12, 8))
    plt.bar(
        x=route_labels,
        height=best_routes['pos_ratio'],
        color='#66B3FF',
        alpha=0.85
    )

    # Add titles and labels
    plt.title(f'Ratio a route is mentioned in a positive review (min {min_mentions} times) ({company})', fontsize=14)
    plt.ylabel('Ratio of Positive Mentions/Total Mentions', fontsize=12)
    plt.xlabel('Route', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Show the plot
    plt.show()

In [None]:
# Draw the positive-review route chart for every airline.
for company in companies:
    good_travels(data, company)

In [40]:
def _prompt_choice(prompt, options):
    """Print a numbered menu of `options` and return the one the user picks.

    Keeps asking until the answer is a whole number between 1 and
    len(options). This rejects 0 and negative numbers, which would otherwise
    silently index from the end of the list, as well as non-numeric input.
    `options` must not be empty.
    """
    for number, option in enumerate(options, 1):
        print(f"{number}. {option}")
    while True:
        answer = input(prompt)
        try:
            choice = int(answer)
        except ValueError:
            choice = 0  # falls outside the valid range below
        if 1 <= choice <= len(options):
            return options[choice - 1]
        print(f"Please enter a number between 1 and {len(options)}.")


def interactive_bigram_wordcloud(data, min_bigrams=2):
    """Interactively pick an airline, a date range and a route, then draw its bigram word cloud.

    The user is prompted (via input()) for the company, the travel date range,
    the departure city and the destination city. The number of reviews and the
    mean rating of the selected route are printed, followed by a word cloud of
    the route's bigrams coloured by mean rating (red = low, grey = middle,
    green = high) and the per-bigram mean rating and review count.

    Args:
        data: Review table with 'company', 'travel_date', 'departure',
            'destination', 'rating' and 'review_text' columns. It is not
            modified.
        min_bigrams: A bigram is kept only if it occurs strictly more than
            this many times in the route's reviews.

    Returns:
        None. The function prints a message and returns early when the dates
        cannot be parsed, when a selection step has nothing to offer, or when
        no bigram is frequent enough.
    """
    # Missing values are dropped before sorting: NaN cannot be ordered against strings.
    companies = sorted(data['company'].dropna().unique())
    if not companies:
        print("\nNo companies available in the data.")
        return

    # User selects the company
    print("Available Companies:")
    company_name = _prompt_choice("Select a company (enter the number): ", companies)

    # Copy so that the datetime conversion below never writes into the caller's frame.
    company_data = data[data['company'] == company_name].copy()

    # User specifies the travel date range
    print("\nSpecify the travel date range (YYYY-MM-DD):")
    start_date1 = input("Start date: ")
    end_date1 = input("End date: ")
    try:
        start_date = pd.to_datetime(start_date1)
        end_date = pd.to_datetime(end_date1)
    except ValueError:
        print("\nCould not parse the dates; please use the YYYY-MM-DD format.")
        return

    # Convert to datetime and keep the reviews inside the (inclusive) range
    company_data['travel_date'] = pd.to_datetime(company_data['travel_date'])
    in_range = (company_data['travel_date'] >= start_date) & (company_data['travel_date'] <= end_date)
    company_data = company_data[in_range]

    if company_data.empty:
        print("\nNo data available for the selected date range.")
        return

    # User selects the departure city
    departure_cities = sorted(company_data['departure'].dropna().unique())
    if not departure_cities:
        print("\nNo departure city available for the selected date range.")
        return
    print("\nAvailable Departure Cities:")
    departure_city = _prompt_choice("Select a departure city (enter the number): ", departure_cities)
    departure_data = company_data[company_data['departure'] == departure_city]

    # User selects the destination city
    destination_cities = sorted(departure_data['destination'].dropna().unique())
    if not destination_cities:
        print("\nNo destination city available for the selected departure.")
        return
    print("\nAvailable Destination Cities:")
    destination_city = _prompt_choice("Select a destination city (enter the number): ", destination_cities)
    route_data = departure_data[departure_data['destination'] == destination_city]

    # Number of comments and mean rating of the route
    total_comments = len(route_data)
    avg_satisfaction = route_data['rating'].mean() if total_comments > 0 else 0

    print(f"\nNumber of Comments: {total_comments}")
    print(f"Average Satisfaction Ratio: {avg_satisfaction:.2f}")

    # Rows without review text cannot contribute bigrams.
    reviews = route_data.dropna(subset=['review_text'])
    cleaned_reviews = reviews['review_text'].apply(preprocess_text)

    # Bigrams are built inside each review, so words of two different reviews
    # are never paired; each review contributes its rating once per bigram.
    occurrences = Counter()
    ratings_by_bigram = {}
    for text, rating in zip(cleaned_reviews, reviews['rating']):
        tokens = text.split()
        pairs = [f"{first} {second}" for first, second in zip(tokens, tokens[1:])]
        occurrences.update(pairs)
        for pair in set(pairs):
            ratings_by_bigram.setdefault(pair, []).append(rating)

    # Filter bigrams appearing more than `min_bigrams` times
    frequent_bigrams = {pair: count for pair, count in occurrences.items() if count > min_bigrams}
    if not frequent_bigrams:
        # WordCloud raises on an empty frequency table, so stop here.
        print(f"\nNo bigram appears more than {min_bigrams} times for this selection; nothing to plot.")
        return

    # (mean rating, number of reviews) per bigram; Series.mean() skips missing ratings.
    bigram_ratings = {
        pair: (pd.Series(ratings_by_bigram[pair]).mean(), len(ratings_by_bigram[pair]))
        for pair in frequent_bigrams
    }

    # Palettes are built once instead of on every word.
    reds = sns.color_palette("Reds", 7)
    greys = sns.color_palette("Greys", 7)
    greens = sns.color_palette("Greens", 7)
    # (inclusive upper bound, colour), checked in order; above the last bound is dark green.
    bands = [
        (1.5, reds[6]),    # Dark red
        (2.2, reds[4]),    # Red
        (2.8, greys[4]),   # Grey
        (3.5, greens[2]),  # Light green
    ]

    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        rating, _ = bigram_ratings[word]
        if pd.isna(rating):
            # No usable rating: stay neutral rather than falling through to dark green.
            color = greys[4]
        else:
            color = next((shade for bound, shade in bands if rating <= bound), greens[6])
        # seaborn colours are 0-1 floats; convert to 0-255 integer RGB.
        return tuple(int(channel * 255) for channel in color[:3])

    # Generate word cloud
    wc = WordCloud(width=800, height=400, background_color='white', color_func=color_func).generate_from_frequencies(frequent_bigrams)

    # Display word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Bigram WordCloud for {company_name} ({departure_city} -> {destination_city}) ({start_date1} -> {end_date1})", fontsize=16)
    plt.show()

    # Print bigram ratings and frequencies
    for bigram, (rating, count) in bigram_ratings.items():
        print(f"Bigram: {bigram}, Avg. Rating: {rating:.2f}, Count: {count}")


In [None]:
# Interactive run: the function waits on input() prompts (company, date range,
# departure, destination), so this cell cannot be executed unattended.
# Only bigrams occurring more than 2 times are drawn.
interactive_bigram_wordcloud(data, 2)