# Task 1: Web scraping to gain company insights

In [None]:
#Import all libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import spacy
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

### Section 1 - Web scraping

In [None]:
# First page of the British Airways review listing that is scraped
url = (
    'https://www.airlinequality.com/airline-reviews/'
    'british-airways/page/1/'
)

In [None]:
# Accumulator for the extracted reviews; starts out empty
reviewlist = list()

In [None]:
# Function to get the BeautifulSoup object from a URL
def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze a long scraping run.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [None]:
# Function to extract reviews from a BeautifulSoup object
def get_reviews(soup):
    """Append every complete review found in *soup* to the global ``reviewlist``.

    Each ``<article itemprop="review">`` element is turned into a dict with
    the keys ``'title'``, ``'rating'`` and ``'body'``.

    Args:
        soup: Parsed page exposing ``find_all``; each returned element must
            expose ``find``.

    Returns:
        None. Results are appended to the module-level ``reviewlist``.
    """
    for item in soup.find_all('article', {'itemprop': 'review'}):
        title = item.find('h2', {'class': 'text_header'})
        rating = item.find('div', {'itemprop': 'reviewRating'})
        body = item.find('div', {'class': 'text_content'})

        # ``find`` returns None for a missing element. Skip only the
        # incomplete review instead of abandoning the rest of the page
        # (a single try/except around the whole loop would do the latter).
        if title is None or rating is None or body is None:
            continue

        reviewlist.append({
            'title': title.text,
            'rating': rating.text.strip(),
            'body': body.text.strip(),
        })

In [None]:
# Walk the paginated review listing and collect the reviews of every page.
# 361 is a hard-coded upper bound on the number of pages requested.
for x in range(1, 362):
    soup = get_soup(f'https://www.airlinequality.com/airline-reviews/british-airways/page/{x}/')
    print(f'Getting page: {x}')
    get_reviews(soup)
    print(len(reviewlist))
    # Stop as soon as a page contains no review <article> elements, i.e. the
    # listing has been exhausted. The earlier check searched for an <li> with
    # an HTML attribute literally named "text" equal to "$0", which is not a
    # usable last-page marker, so the loop presumably always ran to the bound.
    if soup.find('article', {'itemprop': 'review'}) is None:
        break

In [None]:
# Create a DataFrame from the extracted reviews. Naming the columns keeps the
# expected schema even when nothing was scraped, so later cells that index
# df['rating'] / df['body'] / df['title'] do not hit a KeyError on an empty frame.
df = pd.DataFrame(reviewlist, columns=['title', 'rating', 'body'])

In [None]:
# Write the scraped reviews to a CSV file, leaving out the row index
df.to_csv(path_or_buf='BA-reviews.csv', index=False)

### Section 2 - Data cleaning

In [None]:
# Drop the '/10' part of the rating text and convert what is left to float
df['rating'] = df['rating'].replace('/10', '', regex=True).astype(float)
# Flag reviews carrying the 'Trip Verified' marker (must happen before
# lowercasing), then remove the marker text from the body
df['verified'] = df['body'].str.contains('Trip Verified')
df['body'] = df['body'].replace('✅ Trip Verified ', '', regex=True)

# Convert to lowercase
df['body'] = df['body'].str.lower()
df['title'] = df['title'].str.lower()

# Remove punctuation: everything that is neither a word character nor
# whitespace. Raw strings are used so that \w, \s and \d reach the regex
# engine as written instead of being invalid string escape sequences.
df['body'] = df['body'].str.replace(r'[^\w\s]', '', regex=True)
df['title'] = df['title'].str.replace(r'[^\w\s]', '', regex=True)

# Remove digits (body only; titles keep theirs)
df['body'] = df['body'].str.replace(r'\d', '', regex=True)
df.head()

In [None]:
# Fetch the Punkt tokenizer data that word_tokenize relies on. Newer NLTK
# releases load the 'punkt_tab' resource rather than 'punkt', so both are
# requested to keep the cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# One list of word tokens per review body
df['tokens'] = df['body'].apply(word_tokenize)

In [None]:
# spaCy's small English pipeline; the stop-word and lemma helpers both call it
nlp = spacy.load(name='en_core_web_sm')

# Function to remove stopwords using spaCy
def remove_stopwords(tokens):
    """Return the words of *tokens* that spaCy does not flag as stop words.

    The tokens are joined with single spaces and passed through the
    module-level ``nlp`` pipeline, so the result follows spaCy's own
    tokenisation of that text.
    """
    parsed = nlp(' '.join(tokens))
    kept = []
    for word in parsed:
        if word.is_stop:
            continue
        kept.append(word.text)
    return kept

# Strip stop words from every review's token list
df['tokens'] = df['tokens'].map(remove_stopwords)

In [None]:
#Lemmatize

# Function to lemmatize tokens
def lemmatize(tokens):
    """Return the spaCy lemma of each token in the space-joined *tokens*.

    The text is processed by the module-level ``nlp`` pipeline; one lemma is
    produced per token that spaCy finds in the joined string.
    """
    return [word.lemma_ for word in nlp(' '.join(tokens))]

# Replace every review's tokens with their lemmas
df['tokens'] = df['tokens'].map(lemmatize)

### Section 3 - Topic modelling

In [None]:
# Vocabulary built from all review token lists
dictionary = corpora.Dictionary(df['tokens'])

# Document-term matrix: one bag-of-words entry per review
doc_term_matrix = list(map(dictionary.doc2bow, df['tokens']))

In [None]:
lda = gensim.models.ldamodel.LdaModel
num_topics=8
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

In [None]:
# Show the word distribution of each of the num_topics topics
ldamodel.print_topics(num_topics)

In [None]:
# Prepare the interactive topic visualisation, keeping the model's own topic order.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
lda_display = pyLDAvis.gensim.prepare(
    topic_model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=dictionary,
    mds='mmds',
    sort_topics=False,
)

pyLDAvis.display(lda_display)

### Section 4 - Sentiment Analysis

In [None]:
# Fetch the VADER lexicon resource before building the analyzer
nltk.download(info_or_id='vader_lexicon')

# Analyzer instance used by the sentiment-scoring helper
sia = SentimentIntensityAnalyzer()

# Function to get sentiment scores for each list of tokens and return as separate columns
def get_sentiment_scores(tokens):
    """Score the space-joined *tokens* with the module-level ``sia`` analyzer.

    Returns:
        A 4-tuple ``(neg, neu, pos, compound)`` taken from the dict returned
        by ``sia.polarity_scores``.
    """
    polarity = sia.polarity_scores(' '.join(tokens))
    return tuple(polarity[key] for key in ('neg', 'neu', 'pos', 'compound'))

# Expand the (neg, neu, pos, compound) tuples into four columns.
# Building a single DataFrame from the list of tuples avoids constructing one
# Series per row (which is what .apply(pd.Series) does), and naming the
# columns keeps the assignment valid when the frame has no rows.
sentiment_columns = ['negative', 'neutral', 'positive', 'compound']
df[sentiment_columns] = pd.DataFrame(
    df['tokens'].apply(get_sentiment_scores).tolist(),
    index=df.index,  # keep row alignment with df
    columns=sentiment_columns,
)

df.head()

In [None]:
def vader_analysis(compound):
    """Map a compound sentiment score to a label.

    Returns ``'Negative'`` for scores below 0, ``'Positive'`` for scores of
    0.5 or more, and ``'Neutral'`` for everything else.
    """
    # NOTE(review): VADER's own guidance commonly uses +/-0.05 cut-offs; the
    # 0.5 / 0 thresholds are kept exactly as found — confirm they are intended.
    if compound < 0:
        return 'Negative'
    return 'Positive' if compound >= 0.5 else 'Neutral'

# Label every review from its compound score
df['sentiment'] = df['compound'].map(vader_analysis)

print(df)

In [None]:
# Number of reviews per sentiment label
vader_counts = df.loc[:, 'sentiment'].value_counts()
vader_counts

In [None]:
%matplotlib inline
plt.figure(figsize=(12,6))

plt.subplot(1,3,2)
plt.pie(vader_counts.values, labels = vader_counts.index, autopct='%1.1f%%', shadow=False)
plt.show()

### Section 5 - Wordclouds

In [None]:
# Combine all tokenized words into a single space-separated string.
# Joining one flat generator is linear in the number of tokens; summing the
# per-review lists copies the growing list for every row (quadratic) and
# yields 0 instead of a list when the column is empty.
all_tokens = ' '.join(token for tokens in df['tokens'] for token in tokens)

# The word cloud package's built-in stop-word list
stopwords = set(STOPWORDS)

wc = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=70,
    max_font_size=30,
    scale=3,
    random_state=1,  # fixed seed so the layout is reproducible
)

wc.generate(all_tokens)

plt.figure(figsize=(10, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # Turn off axis
plt.show()