In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the raw airline-tweets CSV into a DataFrame
tweets_csv = "Tweets.csv"
df = pd.read_csv(tweets_csv)

In [3]:
# Step 1 – Keep only the sentiment label and the raw tweet text
required_columns = ["airline_sentiment", "text"]
df = df[required_columns]

In [25]:
# Display head, info and shape of the DataFrame after column selection
print("DataFrame head after selecting columns:")
print(df.head())
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line after the report.
df.info()
print("\nDataFrame Shape:")
print(df.shape)

DataFrame head after selecting columns:
  airline_sentiment                                               text  \
0           neutral                @VirginAmerica What @dhepburn said.   
1          positive  @VirginAmerica plus you've added commercials t...   
2           neutral  @VirginAmerica I didn't today... Must mean I n...   
3          negative  @VirginAmerica it's really aggressive to blast...   
4          negative  @VirginAmerica and it's a really big bad thing...   

                                        text_cleaned  
0                  @ virginamerica @ dhepburn said .  
1  @ virginamerica plu 've ad commerci experi ......  
2  @ virginamerica n't today ... must mean need t...  
3  @ virginamerica 's realli aggress blast obnoxi...  
4            @ virginamerica 's realli big bad thing  

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------  

In [6]:
# Step 2 – Preprocess Text
# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Module-level Porter stemmer instance; clean_text() uses ps.stem to stem tokens.
ps = PorterStemmer()

In [8]:
# Filled on first use so the stop-word corpus is read from disk once,
# instead of once per token.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return NLTK's stop words for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, *, stop_words=None, tokenize=None, stem=None):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps: lowercase -> strip http/https URLs -> tokenize -> drop stop words
    -> stem each remaining token.

    Parameters
    ----------
    text : object
        Raw tweet. Non-string values are converted with ``str()``; ``None``
        and float NaN are treated as missing and yield ``""`` (rather than
        the literal token ``"none"``/``"nan"``).
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list,
        loaded once and cached.
    tokenize : callable(str) -> iterable of str, optional
        Tokenizer. Defaults to ``nltk.word_tokenize``.
    stem : callable(str) -> str, optional
        Stemmer applied to each kept token. Defaults to ``ps.stem``.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    # Missing values carry no text; x != x is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Remove URLs: "https?" matches exactly http:// or https://, plus one
    # trailing whitespace character so no double space is left behind.
    text = re.sub(r'https?://\S+\s?', '', text)

    # Defaults are resolved at call time so the function can be defined
    # before the NLTK data has been downloaded.
    if tokenize is None:
        tokenize = nltk.word_tokenize
    if stem is None:
        stem = ps.stem
    if stop_words is None:
        stop_words = _stopword_set()

    return " ".join(
        stem(token) for token in tokenize(text) if token not in stop_words
    )

In [10]:
# Fetch the "punkt_tab" tokenizer data in addition to "punkt".
# NOTE(review): presumably needed by nltk.word_tokenize on the installed NLTK
# version — confirm, and consider moving this next to the other downloads.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
# Run every raw tweet through clean_text and store the result in "text_cleaned"
df["text_cleaned"] = df["text"].map(clean_text)

In [27]:
# Preview the first three rows (bare expression so the notebook renders the table)
df.iloc[:3]

Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,@ virginamerica @ dhepburn said .
1,positive,@VirginAmerica plus you've added commercials t...,@ virginamerica plu 've ad commerci experi ......
2,neutral,@VirginAmerica I didn't today... Must mean I n...,@ virginamerica n't today ... must mean need t...


In [12]:
# Show the raw and cleaned tweet text side by side
comparison_columns = ["text", "text_cleaned"]
print("\nDataFrame head after text cleaning:")
print(df[comparison_columns].head())


DataFrame head after text cleaning:
                                                text  \
0                @VirginAmerica What @dhepburn said.   
1  @VirginAmerica plus you've added commercials t...   
2  @VirginAmerica I didn't today... Must mean I n...   
3  @VirginAmerica it's really aggressive to blast...   
4  @VirginAmerica and it's a really big bad thing...   

                                        text_cleaned  
0                  @ virginamerica @ dhepburn said .  
1  @ virginamerica plu 've ad commerci experi ......  
2  @ virginamerica n't today ... must mean need t...  
3  @ virginamerica 's realli aggress blast obnoxi...  
4            @ virginamerica 's realli big bad thing  


In [13]:
# Step 3 – Feature Extraction
# Create TfidfVectorizer
tfidf = TfidfVectorizer(max_features=3000)

In [14]:
# Fit the vectorizer on the cleaned tweets, then convert the result to a dense array
tfidf_matrix = tfidf.fit_transform(df["text_cleaned"])
X = tfidf_matrix.toarray()

In [26]:
# (rows, columns) of the dense feature matrix
np.shape(X)

(14640, 3000)

In [15]:
# Pull the sentiment labels out of the DataFrame as an array
sentiment_labels = df["airline_sentiment"]
Y = sentiment_labels.values

In [16]:
# Report the dimensions of the feature matrix and the label array
print(
    f"\nShape of feature matrix X: {X.shape}",
    f"Shape of target array Y: {Y.shape}",
    sep="\n",
)


Shape of feature matrix X: (14640, 3000)
Shape of target array Y: (14640,)


In [17]:
# Step 4 – Train Model
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [18]:
# Report the dimensions of the train and test feature matrices
print(
    f"\nShape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    sep="\n",
)


Shape of X_train: (11712, 3000)
Shape of X_test: (2928, 3000)


In [19]:
# Fit Multinomial Naïve Bayes on the training split and score it on the test split
print("\n--- Multinomial Naïve Bayes Classifier ---")
nb_model = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print(f"Accuracy (Multinomial Naïve Bayes): {accuracy_nb:.4f}")


--- Multinomial Naïve Bayes Classifier ---
Accuracy (Multinomial Naïve Bayes): 0.7213


In [20]:
# Fit a Random Forest on the same split (fixed random_state for reproducibility) and score it
print("\n--- Random Forest Classifier ---")
rf_model = RandomForestClassifier(random_state=2).fit(X_train, y_train)  # fit() returns the estimator
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Accuracy (Random Forest): {accuracy_rf:.4f}")


--- Random Forest Classifier ---
Accuracy (Random Forest): 0.7544


In [22]:
# Predict again with the fitted Random Forest and print the unrounded test accuracy
y_pred = rf_model.predict(X_test)
rf_accuracy_unrounded = accuracy_score(y_test, y_pred)
print(rf_accuracy_unrounded)

0.7544398907103825


In [28]:
# Keep only the rows labelled "neutral"
is_neutral = df['airline_sentiment'] == 'neutral'
neutral_tweets = df[is_neutral]

# Number of distinct tweet texts among those rows
unique_neutral_count = neutral_tweets['text'].nunique()

print(
    f"Total rows with 'neutral' sentiment: {len(neutral_tweets)}",
    f"Number of unique tweets with 'neutral' sentiment: {unique_neutral_count}",
    sep="\n",
)

Total rows with 'neutral' sentiment: 3099
Number of unique tweets with 'neutral' sentiment: 3067


In [29]:
# Tally how many rows fall into each sentiment class
sentiment_counts = df['airline_sentiment'].value_counts()

# Show the full tally first for context
print("Sentiment Class Counts:", sentiment_counts, sep="\n")

# Then single out the "negative" class
negative_count = sentiment_counts.loc['negative']
print(f"\nNumber of 'negative' sentiment instances: {negative_count}")

Sentiment Class Counts:
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

Number of 'negative' sentiment instances: 9178


In [30]:
def _fit_and_score(model):
    """Fit ``model`` on the training split; return (model, test predictions, test accuracy)."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, predictions, accuracy_score(y_test, predictions)


# Multinomial Naïve Bayes classifier
nb_model, y_pred_nb, accuracy_nb = _fit_and_score(MultinomialNB())

# Random Forest classifier (random_state=2, built with n_jobs=-1)
rf_model, y_pred_rf, accuracy_rf = _fit_and_score(
    RandomForestClassifier(random_state=2, n_jobs=-1)
)

In [31]:
# Unrounded test accuracies: Naïve Bayes first, then Random Forest
print(accuracy_nb, accuracy_rf, sep="\n")

0.7213114754098361
0.7544398907103825
