In [None]:
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from wordcloud import WordCloud
from string import digits
import requests
import pandas as pd
import nltk
import string
import seaborn as sns
import re
# Fetch every NLTK resource the notebook relies on, so it runs on a fresh kernel:
#   - "stopwords": the English stop-word list used in the stop-word removal step
#   - "punkt" / "punkt_tab": tokenizer data required by word_tokenize
#     (older NLTK releases look for "punkt", newer ones for "punkt_tab")
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

# English stop words as a set for O(1) membership checks
stop_words = set(stopwords.words('english'))

Saving the reviews in a data frame

#### TASK 1
#### Data Loading for a different method

In [None]:
# Location of the raw airline review data inside the Kaggle input directory
raw_reviews_path = "/kaggle/input/airways-customer-data/airline_data.csv"

# Load the raw data into a DataFrame
df = pd.read_csv(raw_reviews_path)

# Preview the first five rows as the cell output
df.head()

Saving the DataFrame to a CSV file

In [None]:
df.to_csv("BA_reviews.csv") # Write 'df' to "BA_reviews.csv" in the current working directory
# NOTE(review): no `index` argument is passed, so pandas also writes the row index as an
# unnamed first column; reading the file back will surface it as an extra leading column.

#### TASK 2
#### Data cleaning

In [None]:
import pandas as pd

# Path of the CSV file containing the saved reviews data
csv_path = "BA_reviews.csv"

# Load the saved reviews and give the frame a fresh 0..n-1 index.
# drop=True discards the old index instead of adding it as a column.
df1 = pd.read_csv(csv_path).reset_index(drop=True)

# Show the 'reviews' column
print(df1['reviews'])


In [None]:
# Structural overview of df1: info() prints column names, dtypes, non-null counts
# and memory usage directly to the cell's output.
df1.info()
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# This is the last expression in the cell, so the resulting table is rendered as the cell output.
df1.describe()

There are no null or missing values in our dataset.
The dataset contains 1000 unique entries.
Removing (✅ Trip Verified | and Not Verified | ) to clean the data.
Removing any leading or trailing spaces.
Turning the review string to all lower case.


In [None]:
# Verification banner that precedes each review, e.g. "✅ Trip Verified | " or "Not Verified | ".
# It is matched as an anchored prefix. str.lstrip() is NOT suitable here: its argument is a
# *set of characters*, so it would also eat leading letters of the review itself
# (for example "The flight..." would become "he flight...").
verification_prefix = r'^\s*(?:✅\s*)?(?:Trip Verified|Not Verified)\s*\|\s*'

df1['reviews'] = (
    df1['reviews']
    .str.strip()                                        # drop leading/trailing whitespace
    .str.replace(verification_prefix, '', regex=True)   # drop the banner, if present
    .str.lower()                                        # normalise case
)

# Print the modified DataFrame
print(df1)

## TASK 3
#### Sentiment analysis of reviews using nltk 

Remove punctuation

In [None]:
# Remove punctuation.
# The pattern [^\w\s] matches every character that is neither a word character
# (letter, digit, underscore) nor whitespace; each match is replaced by an empty string.
# regex=True is required: recent pandas versions treat the pattern as a literal string
# by default, in which case nothing would be removed. The raw string keeps \w and \s
# from being interpreted as (invalid) Python string escapes.
df1['reviews'] = df1['reviews'].str.replace(r'[^\w\s]', '', regex=True)

# Print the modified 'reviews' column in the DataFrame df1
print(df1['reviews'])

Tokenize the review column

In [None]:
# Tokenize the reviews.
# The same row is shown before and after tokenizing so the effect is directly comparable.
# The column is selected by name rather than by position, so the preview does not depend
# on how many columns precede 'reviews' in the loaded CSV.
sample_position = 1

# Review text before tokenization
print(df1['reviews'].iloc[sample_position])

# Split every review into a list of word tokens with NLTK's word_tokenize
df1['reviews'] = df1['reviews'].apply(word_tokenize)

# The same review after tokenization (now a list of tokens)
print(df1['reviews'].iloc[sample_position])

Removing stopwords

In [None]:
def _without_stop_words(tokens):
    """Join the tokens that are not in ``stop_words`` into one space-separated string."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)


# Replace each token list in 'reviews' with its stop-word-free, space-joined text
df1['reviews'] = df1['reviews'].apply(_without_stop_words)

# Show the first 20 rows of the updated DataFrame
print(df1.head(20))

Calculate polarity to gather sentiment tags.

In [None]:
# Define a function to calculate the polarity of a given text using TextBlob
def polarity_calc(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        The review text to score.

    Returns
    -------
    float or None
        The polarity reported by ``TextBlob(text).sentiment.polarity``, or
        ``None`` when ``text`` is not a string or scoring fails.
    """
    # Missing values (None / NaN) and other non-strings cannot be scored.
    if not isinstance(text, str):
        return None
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best effort: an unscorable review yields None instead of aborting the whole
        # apply(). Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
    
# Define a function to assign a sentiment tag based on the calculated polarity
def tag_cal(num):
    """Map a polarity score to a sentiment tag.

    Parameters
    ----------
    num : float or None
        Polarity score; negative means negative sentiment, positive means
        positive sentiment. ``None`` or NaN means the polarity could not be computed.

    Returns
    -------
    str or None
        ``'Negative'`` if ``num < 0``, ``'Positive'`` if ``num > 0``,
        ``'Neutral'`` if ``num == 0``, and ``None`` when the score is missing,
        so that unscored reviews are not miscounted as neutral.
    """
    # `num != num` is true only for NaN, which is how pandas stores a missing polarity.
    if num is None or num != num:
        return None
    if num < 0:
        return 'Negative'
    if num > 0:
        return 'Positive'
    return 'Neutral'
        
# Score every review, then derive its sentiment tag from that score
polarity_scores = df1['reviews'].map(polarity_calc)
df1['polarity'] = polarity_scores
df1['tag'] = polarity_scores.map(tag_cal)

# Show the DataFrame including the new 'polarity' and 'tag' columns
print(df1)

## TASK 4 
#### Analyze 

The percentage of various types of tags.


In [None]:
# Percentage share of each sentiment tag:
# number of reviews per tag, divided by the number of tagged (non-null) reviews, times 100
reviews_per_tag = df1.groupby('tag').size()
tagged_total = df1['tag'].count()
(reviews_per_tag / tagged_total) * 100

Number of each kind of review

## TASK 5
#### Visualizing the results

In [None]:
# Select the text of every review tagged "Positive"
positive_reviews = df1.loc[df1['tag'] == "Positive", 'reviews']

# Join the reviews with a space between them; plain concatenation would glue the last
# word of one review onto the first word of the next and distort the word frequencies.
text = " ".join(positive_reviews)

if text.strip():
    # Generate a WordCloud from the combined positive reviews
    wordcloud_positive = WordCloud().generate(text)

    # Display the generated image
    plt.imshow(wordcloud_positive, interpolation='bilinear')
    plt.axis("off")
    plt.show()
else:
    # WordCloud.generate() raises when it is given no words at all
    print("No positive reviews to plot.")


In [None]:
# Select the text of every review tagged "Negative"
negative_reviews = df1.loc[df1['tag'] == "Negative", 'reviews']

# Join the reviews with a space between them; plain concatenation would glue the last
# word of one review onto the first word of the next and distort the word frequencies.
text2 = " ".join(negative_reviews)

if text2.strip():
    # Generate a WordCloud from the combined negative reviews
    wordcloud_negative = WordCloud().generate(text2)

    # Display the generated image
    plt.imshow(wordcloud_negative, interpolation='bilinear')
    plt.axis("off")
    plt.show()
else:
    # WordCloud.generate() raises when it is given no words at all
    print("No negative reviews to plot.")

In [None]:
# Bar chart of the number of reviews per sentiment tag.
# The font scale is set before any drawing so it applies to the whole figure.
sns.set(font_scale=1.4)

# value_counts() counts the occurrences of each sentiment tag
tag_counts = df1['tag'].value_counts()

# Draw the chart once, on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(7, 6))
tag_counts.plot(kind='bar', rot=0, ax=ax)
ax.set_xlabel("Sentiment", labelpad=14)
ax.set_ylabel("No of reviews", labelpad=14)
# Trailing semicolon suppresses the text repr of the returned Text object
ax.set_title("Sentiment counts", y=1.02);

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# All reviews merged into one space-separated string
all_reviews = ' '.join(df1['reviews'])

# Bag-of-words model: the 1000 most frequent terms, English stop words excluded
vectorizer = CountVectorizer(max_features=1000, stop_words='english')

# Document-term matrix: one row per review, one column per retained term
dtm = vectorizer.fit_transform(df1['reviews'])

# Five-topic LDA model with a fixed seed so results are reproducible
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Report the ten highest-weighted terms of each topic.
# argsort() is ascending, so each printed list ends with the strongest term.
feature_names = vectorizer.get_feature_names_out()
for topic_number, term_weights in enumerate(lda.components_, start=1):
    top_term_ids = term_weights.argsort()[-10:]
    print(f"\nTop words for Topic #{topic_number}:")
    print([feature_names[term_id] for term_id in top_term_ids])

# Per-review topic distribution: one row per review, one column per topic
topic_results = lda.transform(dtm)

# Store each topic's probability as its own DataFrame column
for topic_number in range(1, lda.n_components + 1):
    df1[f'Topic_{topic_number}_Prob'] = topic_results[:, topic_number - 1]

# Preview the DataFrame with the new topic probability columns
print(df1.head())
