# Twitter US Airline Sentiment

## ML Pipeline Preparation

### Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with read_sql_table
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import nltk
import pickle
import sys

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the tokenizer models, the stop-word
# lists and the WordNet data used by the lemmatizer.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/student/nltk_data...


True

In [2]:
# Read the 'tweets' table from the local SQLite database into a DataFrame
engine = create_engine('sqlite:///AirlineSentiment.db')
df = pd.read_sql_table('tweets', con=engine)

# Feature: the tweet text; target: the sentiment label
X, Y = df['text'], df['airline_sentiment']

### Tokenization Process
- Normalize text and remove punctuation
- Tokenize text
- Remove stop words
- Lemmatize tokens
- Return the list of clean tokens

In [3]:
def tokenize(text):
    """Convert raw text into a list of normalized, lemmatized tokens.

    The text is lowercased, every character that is not a letter or digit is
    replaced by a space, the result is split into word tokens, English stop
    words are dropped, and each remaining token is lemmatized.

    Args:
        text (str): Raw text of a single document (here, one tweet).

    Returns:
        list of str: Clean tokens, in the order they appear in the text.
    """
    # Normalize text and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stop words. The stop-word list is loaded once per call and kept
    # in a set, instead of being re-read and scanned for every single token.
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize each token. No `pos` argument is passed, so the lemmatizer's
    # default part of speech applies (noun in NLTK, not verb).
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]

### Build machine learning pipeline for Random Forest Classifier
- Convert text to word count vectors
- Convert word count vectors to TF-IDF representation
- Instantiate the RandomForest Classifier

In [4]:
# Text-classification pipeline: word counts -> TF-IDF weights -> random forest.
rf_pipeline = Pipeline([
    # Convert text to word count vectors. token_pattern=None states explicitly
    # that the custom tokenizer replaces the default regex, which avoids
    # scikit-learn's "token_pattern will not be used" warning on every fit.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),  # Convert word count vectors to TF-IDF representation
    # Fixed seed so the forest (and the reported scores) are reproducible
    ('clf', RandomForestClassifier(random_state=42))
])

### Train Model
- Split data into train and test sets
- Train pipeline

In [5]:
# Hold out 20% of the data for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
)

# Fit every pipeline step (vectorizer, TF-IDF, classifier) on the training split
rf_pipeline.fit(X_train, y=Y_train)



### Test model
Report the accuracy, f1 score, precision and recall

In [6]:
# Predict sentiment labels for the held-out tweets
Y_pred = rf_pipeline.predict(X_test)

# Per-class precision, recall and F1, plus overall accuracy
class_names = ['negative', 'neutral', 'positive']
print(classification_report(Y_test, Y_pred, target_names=class_names))

              precision    recall  f1-score   support

    negative       0.80      0.92      0.86      1880
     neutral       0.64      0.46      0.53       582
    positive       0.74      0.57      0.64       459

    accuracy                           0.77      2921
   macro avg       0.73      0.65      0.68      2921
weighted avg       0.76      0.77      0.76      2921



### Improve model
Use grid search to find better parameters.

In [7]:
# Candidate hyperparameters; the "clf__" prefix routes each one to the 'clf' step
rf_parameters = dict(
    clf__n_estimators=[50, 100],
    clf__min_samples_split=[2, 5],
)

# Cross-validated search over every combination, logging each fold
rf_cv = GridSearchCV(estimator=rf_pipeline, param_grid=rf_parameters, verbose=3)
rf_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[CV 1/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.756 total time=  28.3s




[CV 2/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.745 total time=  28.3s




[CV 3/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.749 total time=  28.1s




[CV 4/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.758 total time=  28.4s




[CV 5/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.738 total time=  28.3s




[CV 1/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.752 total time=  33.0s




[CV 2/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.754 total time=  33.0s




[CV 3/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.753 total time=  32.7s




[CV 4/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.763 total time=  32.7s




[CV 5/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.732 total time=  32.7s




[CV 1/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.764 total time=  26.6s




[CV 2/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.753 total time=  26.8s




[CV 3/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.756 total time=  26.7s




[CV 4/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.765 total time=  26.7s




[CV 5/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.744 total time=  26.8s




[CV 1/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.756 total time=  30.1s




[CV 2/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.754 total time=  30.2s




[CV 3/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.756 total time=  30.0s




[CV 4/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.769 total time=  29.9s




[CV 5/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.740 total time=  30.2s




### Test model
Show the accuracy, precision, and recall of the tuned model.

In [8]:
# Predict on the held-out data using the fitted grid search
Y_pred = rf_cv.predict(X_test)

# Per-class precision, recall and F1
class_names = ['negative', 'neutral', 'positive']
print(classification_report(Y_test, Y_pred, target_names=class_names))

# Overall accuracy: share of test samples whose prediction equals the true label
accuracy = (Y_pred == Y_test).mean()
print("Overall Accuracy:", accuracy)

              precision    recall  f1-score   support

    negative       0.80      0.94      0.86      1880
     neutral       0.66      0.42      0.51       582
    positive       0.74      0.56      0.64       459

    accuracy                           0.77      2921
   macro avg       0.73      0.64      0.67      2921
weighted avg       0.76      0.77      0.76      2921

Overall Accuracy: 0.7737076343717905


### Build machine learning pipeline for Gradient Boosting Classifier

In [9]:
# Text-classification pipeline: word counts -> TF-IDF weights -> gradient boosting.
gb_pipeline = Pipeline([
    # Convert text to word count vectors. token_pattern=None states explicitly
    # that the custom tokenizer replaces the default regex, which avoids
    # scikit-learn's "token_pattern will not be used" warning on every fit.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),  # Convert word count vectors to TF-IDF representation
    # Fixed seed so the boosted model (and the reported scores) are reproducible
    ('clf', GradientBoostingClassifier(random_state=42))
])

### Train model

In [10]:
# Fit every step of the gradient boosting pipeline on the training split
gb_pipeline.fit(X_train, y=Y_train)



### Test model

In [11]:
# Predict sentiment labels for the held-out tweets
Y_pred = gb_pipeline.predict(X_test)

# Per-class precision, recall and F1, plus overall accuracy
class_names = ['negative', 'neutral', 'positive']
print(classification_report(Y_test, Y_pred, target_names=class_names))

              precision    recall  f1-score   support

    negative       0.74      0.96      0.84      1880
     neutral       0.75      0.15      0.25       582
    positive       0.74      0.57      0.64       459

    accuracy                           0.74      2921
   macro avg       0.74      0.56      0.58      2921
weighted avg       0.74      0.74      0.69      2921



### Improve Gradient Boosting Model
Use grid search to find better parameters.

In [12]:
# Candidate hyperparameters; the "clf__" prefix routes each one to the 'clf' step
gb_parameters = dict(
    clf__n_estimators=[50, 100],
    clf__min_samples_split=[2, 5],
)

# Cross-validated search over every combination, logging each fold
gb_cv = GridSearchCV(estimator=gb_pipeline, param_grid=gb_parameters, verbose=3)
gb_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[CV 1/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.711 total time=  28.1s




[CV 2/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.705 total time=  27.8s




[CV 3/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.703 total time=  27.8s




[CV 4/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.702 total time=  27.7s




[CV 5/5] END clf__min_samples_split=2, clf__n_estimators=50;, score=0.690 total time=  28.0s




[CV 1/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.723 total time=  32.4s




[CV 2/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.716 total time=  33.0s




[CV 3/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.717 total time=  32.5s




[CV 4/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.711 total time=  32.5s




[CV 5/5] END clf__min_samples_split=2, clf__n_estimators=100;, score=0.703 total time=  32.7s




[CV 1/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.710 total time=  28.4s




[CV 2/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.705 total time=  28.3s




[CV 3/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.703 total time=  28.7s




[CV 4/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.703 total time=  28.2s




[CV 5/5] END clf__min_samples_split=5, clf__n_estimators=50;, score=0.688 total time=  28.0s




[CV 1/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.725 total time=  32.5s




[CV 2/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.715 total time=  32.9s




[CV 3/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.718 total time=  32.8s




[CV 4/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.709 total time=  32.8s




[CV 5/5] END clf__min_samples_split=5, clf__n_estimators=100;, score=0.703 total time=  33.0s




### Test model
Show the accuracy, precision, and recall of the tuned model.

In [13]:
# Predict on the held-out data using the fitted grid search
Y_pred = gb_cv.predict(X_test)

# Per-class precision, recall and F1
class_names = ['negative', 'neutral', 'positive']
print(classification_report(Y_test, Y_pred, target_names=class_names))

# Overall accuracy: share of test samples whose prediction equals the true label
accuracy = (Y_pred == Y_test).mean()
print("Overall Accuracy:", accuracy)

              precision    recall  f1-score   support

    negative       0.74      0.97      0.84      1880
     neutral       0.77      0.15      0.25       582
    positive       0.73      0.58      0.65       459

    accuracy                           0.74      2921
   macro avg       0.75      0.56      0.58      2921
weighted avg       0.75      0.74      0.69      2921

Overall Accuracy: 0.7422115713796645


### Build machine learning pipeline for Naive Bayes Classifier with Grid Search

In [14]:
# Define pipeline: word counts -> TF-IDF weights -> multinomial Naive Bayes
nb_pipeline = Pipeline([
    # Convert text to word count vectors. token_pattern=None states explicitly
    # that the custom tokenizer replaces the default regex, which avoids
    # scikit-learn's "token_pattern will not be used" warning on every fit.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),  # Convert word count vectors to TF-IDF representation
    ('clf', MultinomialNB())  # Classifier using Naive Bayes
])

### Train model using grid search

In [15]:
# Candidate smoothing values (alpha) for the Naive Bayes step
nb_parameters = dict(clf__alpha=[0.1, 0.5, 1.0])

# Cross-validated search over the candidates, logging each fold
nb_cv = GridSearchCV(estimator=nb_pipeline, param_grid=nb_parameters, verbose=3)
nb_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




[CV 1/5] END ....................clf__alpha=0.1;, score=0.751 total time=  24.0s




[CV 2/5] END ....................clf__alpha=0.1;, score=0.739 total time=  25.3s




[CV 3/5] END ....................clf__alpha=0.1;, score=0.734 total time=  25.7s




[CV 4/5] END ....................clf__alpha=0.1;, score=0.731 total time=  24.5s




[CV 5/5] END ....................clf__alpha=0.1;, score=0.739 total time=  24.7s




[CV 1/5] END ....................clf__alpha=0.5;, score=0.714 total time=  25.8s




[CV 2/5] END ....................clf__alpha=0.5;, score=0.717 total time=  25.2s




[CV 3/5] END ....................clf__alpha=0.5;, score=0.709 total time=  24.2s




[CV 4/5] END ....................clf__alpha=0.5;, score=0.704 total time=  24.9s




[CV 5/5] END ....................clf__alpha=0.5;, score=0.704 total time=  25.4s




[CV 1/5] END ....................clf__alpha=1.0;, score=0.686 total time=  24.6s




[CV 2/5] END ....................clf__alpha=1.0;, score=0.686 total time=  24.3s




[CV 3/5] END ....................clf__alpha=1.0;, score=0.681 total time=  25.2s




[CV 4/5] END ....................clf__alpha=1.0;, score=0.677 total time=  25.5s




[CV 5/5] END ....................clf__alpha=1.0;, score=0.676 total time=  24.7s




### Test Model

In [16]:
# Predict on the held-out data using the fitted grid search
Y_pred = nb_cv.predict(X_test)

# Per-class precision, recall and F1
class_names = ['negative', 'neutral', 'positive']
print(classification_report(Y_test, Y_pred, target_names=class_names))

# Overall accuracy: share of test samples whose prediction equals the true label
accuracy = (Y_pred == Y_test).mean()
print("Overall Accuracy:", accuracy)

              precision    recall  f1-score   support

    negative       0.77      0.96      0.85      1880
     neutral       0.65      0.34      0.44       582
    positive       0.79      0.46      0.58       459

    accuracy                           0.76      2921
   macro avg       0.74      0.58      0.62      2921
weighted avg       0.75      0.76      0.73      2921

Overall Accuracy: 0.7569325573433756


- It's clear that the random forest model performed better than the models built with Gradient Boosting and Naive Bayes. Therefore, the random forest model will be saved for deployment.

### Build machine learning pipeline for Decision Tree Classifier with Grid Search

In [17]:
# Define pipeline: word counts -> TF-IDF weights -> decision tree
dt_pipeline = Pipeline([
    # Convert text to word count vectors. token_pattern=None states explicitly
    # that the custom tokenizer replaces the default regex, which avoids
    # scikit-learn's "token_pattern will not be used" warning on every fit.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),  # Convert word count vectors to TF-IDF representation
    # Fixed seed so the tree (and the reported scores) are reproducible
    ('clf', DecisionTreeClassifier(random_state=42))
])

### Train model using grid search

In [18]:
# Candidate hyperparameters for the decision tree step
dt_parameters = dict(
    clf__min_samples_split=[2, 5],  # minimum samples required to split an internal node
    clf__min_samples_leaf=[1, 2],   # minimum samples required at a leaf node
)

# Cross-validated search over every combination, logging each fold
dt_cv = GridSearchCV(estimator=dt_pipeline, param_grid=dt_parameters, verbose=3)
dt_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




[CV 1/5] END clf__min_samples_leaf=1, clf__min_samples_split=2;, score=0.709 total time=  26.4s




[CV 2/5] END clf__min_samples_leaf=1, clf__min_samples_split=2;, score=0.695 total time=  26.3s




[CV 3/5] END clf__min_samples_leaf=1, clf__min_samples_split=2;, score=0.691 total time=  26.0s




[CV 4/5] END clf__min_samples_leaf=1, clf__min_samples_split=2;, score=0.701 total time=  26.3s




[CV 5/5] END clf__min_samples_leaf=1, clf__min_samples_split=2;, score=0.697 total time=  26.4s




[CV 1/5] END clf__min_samples_leaf=1, clf__min_samples_split=5;, score=0.698 total time=  25.6s




[CV 2/5] END clf__min_samples_leaf=1, clf__min_samples_split=5;, score=0.693 total time=  25.8s




[CV 3/5] END clf__min_samples_leaf=1, clf__min_samples_split=5;, score=0.694 total time=  26.3s




[CV 4/5] END clf__min_samples_leaf=1, clf__min_samples_split=5;, score=0.702 total time=  25.9s




[CV 5/5] END clf__min_samples_leaf=1, clf__min_samples_split=5;, score=0.687 total time=  25.3s




[CV 1/5] END clf__min_samples_leaf=2, clf__min_samples_split=2;, score=0.693 total time=  24.9s




[CV 2/5] END clf__min_samples_leaf=2, clf__min_samples_split=2;, score=0.694 total time=  24.9s




[CV 3/5] END clf__min_samples_leaf=2, clf__min_samples_split=2;, score=0.685 total time=  24.8s




[CV 4/5] END clf__min_samples_leaf=2, clf__min_samples_split=2;, score=0.690 total time=  24.8s




[CV 5/5] END clf__min_samples_leaf=2, clf__min_samples_split=2;, score=0.691 total time=  24.7s




[CV 1/5] END clf__min_samples_leaf=2, clf__min_samples_split=5;, score=0.699 total time=  24.8s




[CV 2/5] END clf__min_samples_leaf=2, clf__min_samples_split=5;, score=0.694 total time=  24.8s




[CV 3/5] END clf__min_samples_leaf=2, clf__min_samples_split=5;, score=0.690 total time=  24.8s




[CV 4/5] END clf__min_samples_leaf=2, clf__min_samples_split=5;, score=0.692 total time=  24.9s




[CV 5/5] END clf__min_samples_leaf=2, clf__min_samples_split=5;, score=0.682 total time=  24.8s




### Test Model

In [19]:
# Predict on the held-out data using the fitted grid search
Y_pred = dt_cv.predict(X_test)

# Per-class precision, recall and F1
class_names = ['negative', 'neutral', 'positive']
print(classification_report(Y_test, Y_pred, target_names=class_names))

# Overall accuracy: share of test samples whose prediction equals the true label
accuracy = (Y_pred == Y_test).mean()
print("Overall Accuracy:", accuracy)

              precision    recall  f1-score   support

    negative       0.82      0.81      0.81      1880
     neutral       0.48      0.49      0.48       582
    positive       0.59      0.61      0.60       459

    accuracy                           0.71      2921
   macro avg       0.63      0.63      0.63      2921
weighted avg       0.71      0.71      0.71      2921

Overall Accuracy: 0.7110578568983225


### Save the best model
Since the Random Forest model performed best, it will be saved and used for deployment.

In [20]:
# Save the tuned random forest model to a file: the best estimator found by
# the grid search, rather than the default-hyperparameter `rf_pipeline`.
# NOTE: pickle stores the pipeline's `tokenize` function by reference, so a
# function with that name must be defined/importable wherever this file is loaded.
with open('best_model.pkl', 'wb') as f:
    pickle.dump(rf_cv.best_estimator_, f)