<a href="https://colab.research.google.com/github/dookda/cmu_py499/blob/main/proj_knot_sentiment/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy scikit-learn
!pip install torch torchvision         # for PyTorch
!pip install tensorflow                # for TF/Keras (if preferred)
!pip install transformers              # for BERT
!pip install nltk regex emoji          # for preprocessing

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# Fetch the NLTK resources used by this notebook. Each call returns a bool;
# the last one is echoed as the cell output.
import nltk
# NOTE(review): no tokenizer call that needs 'punkt' is visible in this notebook — confirm it is still required.
nltk.download('punkt')
# Stop-word lists, read later through nltk.corpus.stopwords when cleaning text.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

!cp /content/drive/MyDrive/_DATASET/sentiment/tweets.csv -d /content/tweets.csv
!cp /content/drive/MyDrive/_DATASET/sentiment/glove.6B.100d.txt -d /content/glove.6B.100d.txt

Mounted at /content/drive/


In [4]:
import pandas as pd

# Quick look at how many tweets fall into each sentiment class.
df = pd.read_csv('tweets.csv')
sentiment_counts = df['airline_sentiment'].value_counts()
print(sentiment_counts)


airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


In [5]:
import os
import re
import io
import emoji
import zipfile
import urllib.request

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

# 1. Download NLTK data (run once)
# Same two resources as the earlier download cell; already-present packages
# are reported as up to date rather than fetched again.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# 2. Load dataset
# Only the tweet body and its sentiment label are kept; rows missing either
# value are discarded.
KEEP_COLUMNS = ['text', 'airline_sentiment']
df = pd.read_csv('tweets.csv')[KEEP_COLUMNS].dropna()

# 3. Clean text
STOP = set(stopwords.words('english'))
# NOTE(review): the NLTK English list presumably contains negations such as
# "not"/"no"; dropping them can invert a tweet's sentiment — confirm intended.

# URLs starting with "http", @mentions, and the bare '#' character.
_NOISE_RE = re.compile(r'http\S+|@\w+|#')
# Anything that is not a lowercase letter, digit, '_', ':' or space.
_DISALLOWED_RE = re.compile(r'[^a-z0-9_: ]')


def clean_text(s):
    """Normalise one raw tweet into a space-joined string of kept tokens.

    Processing order:
      1. Emoji are rewritten to textual ``:name:`` codes.
      2. The text is lower-cased.
      3. ``http...`` URLs and ``@mentions`` are deleted; for hashtags only the
         ``#`` character is removed, the tag word itself stays.
      4. Every remaining character outside ``a-z``, ``0-9``, ``_``, ``:`` and
         space is replaced by a space (this keeps the emoji codes intact).
      5. Whitespace-separated tokens that appear in ``STOP`` are dropped.

    Args:
        s: The raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    lowered = emoji.demojize(s).lower()
    without_noise = _NOISE_RE.sub('', lowered)
    spaced = _DISALLOWED_RE.sub(' ', without_noise)
    return ' '.join(tok for tok in spaced.split() if tok not in STOP)


df['clean_text'] = df['text'].apply(clean_text)

# 4. Encode labels
# Fit the encoder on the sentiment strings, then store the integer ids in a
# new 'label' column; `le` is kept for mapping predictions back to strings.
le = LabelEncoder()
le.fit(df['airline_sentiment'])
df['label'] = le.transform(df['airline_sentiment'])

# 5. Train/validation split
# 80/20 split, stratified on the label so both parts keep the same class
# proportions; the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(df[['clean_text', 'label']], **split_options)

# 6. Tokenize & pad
MAX_LEN = 50
tokenizer = Tokenizer()
# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(train_df['clean_text'])


def to_padded(seqs):
    """Convert texts to integer id sequences of fixed length ``MAX_LEN``.

    Args:
        seqs: An iterable of cleaned text strings.

    Returns:
        The result of ``pad_sequences``: one row per input text, padded and
        truncated at the end ('post') to ``MAX_LEN`` entries.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(seqs),
        maxlen=MAX_LEN,
        padding='post',
        truncating='post',
    )


X_train = to_padded(train_df['clean_text'])
X_val = to_padded(val_df['clean_text'])
y_train = train_df['label'].values
y_val = val_df['label'].values

# Width of each word vector; matches the 100d GloVe file loaded below.
EMB_DIM    = 100
# One more row than there are known words — presumably because tokenizer ids
# start at 1 and 0 is the padding value (TODO confirm).
vocab_size = len(tokenizer.word_index) + 1

# 7. Ensure GloVe file exists (download & unzip if needed)
GLOVE_FILE = 'glove.6B.100d.txt'
if not os.path.isfile(GLOVE_FILE):
    print("Downloading GloVe embeddings (862 MB zip)...")
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    try:
        # Stream the archive to a temporary file on disk instead of reading
        # the whole zip into memory (resp.read() + BytesIO), which could
        # exhaust RAM on a small runtime and never closed the response.
        zip_path, _headers = urllib.request.urlretrieve(url)
        with zipfile.ZipFile(zip_path) as z:
            print("Extracting GloVe 100d vectors...")
            # Only the 100-dimensional file is needed; it is written to the
            # current working directory under GLOVE_FILE.
            z.extract(GLOVE_FILE)
    finally:
        # Remove the temporary archive left behind by urlretrieve, even if
        # the download or extraction failed part-way.
        urllib.request.urlcleanup()
    print("Done.")

# 8. Load GloVe into a dict
# Each line of the file is "<word> <v1> <v2> ...": the first whitespace-
# separated field becomes the key, the rest become a float32 vector.
with open(GLOVE_FILE, 'r', encoding='utf8') as glove_fh:
    embeddings_index = {
        fields[0]: np.asarray(fields[1:], dtype='float32')
        for fields in map(str.split, glove_fh)
    }

# 9. Build embedding matrix
# Row i holds the GloVe vector of the word whose tokenizer id is i; words
# without a pre-trained vector keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, EMB_DIM))
for token, row in tokenizer.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 10. Define the Bi-LSTM model
# Architecture: frozen pre-trained embeddings -> bidirectional LSTM (64 units
# per direction) -> dropout -> softmax over the sentiment classes.
model = Sequential([
    # An explicit Input layer replaces the deprecated `input_length` argument
    # of Embedding and builds the model up front, so summary() can report
    # real output shapes and parameter counts.
    tf.keras.Input(shape=(MAX_LEN,), dtype='int32'),
    Embedding(
        input_dim=vocab_size,
        output_dim=EMB_DIM,
        # Initialise from the GloVe matrix via an initializer rather than the
        # legacy `weights=` constructor kwarg, which is not accepted by every
        # Keras release.
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        # Keep the pre-trained vectors fixed during training.
        trainable=False,
    ),
    # NOTE(review): inputs are post-padded and no masking is enabled, so the
    # LSTM also reads the padding steps — consider `mask_zero=True` above.
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax'),
])

model.compile(
    # Labels are integer class ids (from the label encoder), hence the
    # sparse variant of categorical cross-entropy.
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# 11. Train
# Ten passes over the training set in mini-batches of 64; the held-out split
# is scored after every epoch.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
)
model.fit(X_train, y_train, **fit_options)

# 12. Evaluate
# Take the highest-scoring class per validation tweet, then print the
# per-class report with the original label names.
val_scores = model.predict(X_val)
y_pred = val_scores.argmax(axis=-1)
val_report = classification_report(
    y_val,
    y_pred,
    target_names=le.classes_,
    digits=4,
)
print(val_report)


In [22]:
# 13. Test on a new tweet
def predict_sentiment(tweet):
    """Return the predicted sentiment label for one raw tweet.

    The tweet goes through the same pipeline as the training data: it is
    cleaned with ``clean_text``, converted to a padded id sequence with
    ``to_padded``, scored by ``model``, and the winning class id is mapped
    back to its label string with ``le``.

    Args:
        tweet: The raw tweet text.

    Returns:
        The label string of the highest-scoring class.
    """
    model_input = to_padded([clean_text(tweet)])
    class_ids = model.predict(model_input).argmax(axis=-1)
    return le.inverse_transform(class_ids)[0]


# Example usage
tweet = "I hate this airline!"
print(f"Sentiment: {predict_sentiment(tweet)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
Sentiment: negative
