In [1]:
# !pip install pandas nltk scikit-learn

import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print("Downloading NLTK resources...")
nltk.download('stopwords')
nltk.download('wordnet')
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [5]:
url = "Tweets.csv"
df = pd.read_csv(url)

df = df[['airline_sentiment', 'text']]

df.columns = ['sentiment', 'text']

print("--- First 5 rows of the dataset ---")
print(df.head())

print("\n--- Distribution of Sentiments ---")
print(df['sentiment'].value_counts())

--- First 5 rows of the dataset ---
  sentiment                                               text
0   neutral                @VirginAmerica What @dhepburn said.
1  positive  @VirginAmerica plus you've added commercials t...
2   neutral  @VirginAmerica I didn't today... Must mean I n...
3  negative  @VirginAmerica it's really aggressive to blast...
4  negative  @VirginAmerica and it's a really big bad thing...

--- Distribution of Sentiments ---
negative    9178
neutral     3099
positive    2363
Name: sentiment, dtype: int64


In [6]:
def preprocess_tweet(text):
    text = re.sub(r'(@\w+|https://\S+)', '', text)
    
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    
    tokens = text.split()
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return " ".join(clean_tokens)

df['processed_text'] = df['text'].apply(preprocess_tweet)

print("--- Dataset after Preprocessing ---")
df[['sentiment', 'processed_text']].head()

--- Dataset after Preprocessing ---


Unnamed: 0,sentiment,processed_text
0,neutral,said
1,positive,plus added commercial experience tacky
2,neutral,today must mean need take another trip
3,negative,really aggressive blast obnoxious entertainmen...
4,negative,really big bad thing


In [7]:
X = df['processed_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Testing data shape: {X_test_tfidf.shape}")

Training data shape: (11712, 5000)
Testing data shape: (2928, 5000)


In [8]:
classifier = LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

print("Model training complete.")

Model training complete.




In [9]:
y_pred = classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"--- Model Accuracy ---\n{accuracy:.4f}\n")

labels = ['negative', 'neutral', 'positive']
cm = confusion_matrix(y_test, y_pred, labels=labels)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)
print("--- Confusion Matrix ---")
print(df_cm)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, labels=labels))

--- Model Accuracy ---
0.7725

--- Confusion Matrix ---
          negative  neutral  positive
negative      1741       69        25
neutral        324      263        33
positive       150       65       258

--- Classification Report ---
              precision    recall  f1-score   support

    negative       0.79      0.95      0.86      1835
     neutral       0.66      0.42      0.52       620
    positive       0.82      0.55      0.65       473

    accuracy                           0.77      2928
   macro avg       0.75      0.64      0.68      2928
weighted avg       0.76      0.77      0.75      2928

