In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [19]:
df = pd.read_csv("data\\crowdflower\\airline-sentiment\\train.tsv", sep="\t", encoding="latin1")
print(df.head())

                                                text     label
0                @VirginAmerica What @dhepburn said.   neutral
1  @VirginAmerica plus you've added commercials t...  positive
2  @VirginAmerica I didn't today... Must mean I n...   neutral
3  @VirginAmerica it's really aggressive to blast...  negative
4  @VirginAmerica and it's a really big bad thing...  negative


In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

df = df.drop(columns=['text'])
df.dropna(subset=['cleaned_text'], inplace=True)
df = df[df['cleaned_text'] != '']

print("--- Data Preprocessing Complete ---")
print("Data Head:")
print(df.head())
print("\nSentiment Distribution:")
print(df['label'].value_counts())

--- Data Preprocessing Complete ---
Data Head:
      label                                       cleaned_text
0   neutral                                          what said
1  positive  plus youve added commercials to the experience...
2   neutral  i didnt today must mean i need to take another...
3  negative  its really aggressive to blast obnoxious enter...
4  negative            and its a really big bad thing about it

Sentiment Distribution:
label
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


In [None]:
X = df['cleaned_text']
y = df['label']

vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

X_tfidf = vectorizer.fit_transform(X)

print("\n--- Feature Extraction Complete ---")
print("Shape of TF-IDF matrix:", X_tfidf.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=2000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\n--- Model Evaluation ---")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Feature Extraction Complete ---
Shape of TF-IDF matrix: (14640, 5000)

--- Model Evaluation ---
Accuracy: 0.7954

Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.94      0.88      1835
     neutral       0.67      0.56      0.61       620
    positive       0.84      0.54      0.66       473

    accuracy                           0.80      2928
   macro avg       0.78      0.68      0.71      2928
weighted avg       0.79      0.80      0.78      2928



In [30]:
print("\n--- Testing with New Sentences ---")
new_tweets = [
    "I love this airline, the service was amazing!",
    "They lost my luggage and the flight was delayed for hours.",
    "What time does the flight to Dallas leave?",
    "The operational failure are a testament to bad customer service.",
    "The crew is polite and helpful",
    "The board displays the flight information"
]

cleaned_new_tweets = [clean_text(tweet) for tweet in new_tweets]

new_tweets_tfidf = vectorizer.transform(cleaned_new_tweets)

new_predictions = model.predict(new_tweets_tfidf)

for tweet, sentiment in zip(new_tweets, new_predictions):
    print(f"Tweet: '{tweet}'\nPredicted Sentiment: {sentiment}\n")


--- Testing with New Sentences ---
Tweet: 'I love this airline, the service was amazing!'
Predicted Sentiment: positive

Tweet: 'They lost my luggage and the flight was delayed for hours.'
Predicted Sentiment: negative

Tweet: 'What time does the flight to Dallas leave?'
Predicted Sentiment: neutral

Tweet: 'The operational failure are a testament to bad customer service.'
Predicted Sentiment: negative

Tweet: 'The crew is polite and helpful'
Predicted Sentiment: positive

Tweet: 'The board displays the flight information'
Predicted Sentiment: negative

