In [10]:
import urllib.request
import tarfile
import os

# URL of the dataset
url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz"
dataset_folder = "rt-polaritydata"

# Function to download the dataset
def download_dataset(url, download_path):
    if not os.path.exists(download_path):
        print(f"Downloading dataset from {url}...")
        urllib.request.urlretrieve(url, download_path)
        print("Download complete.")
    else:
        print("Dataset already downloaded.")

# Function to extract the tar.gz file
def extract_dataset(tar_path, extract_to):
    if not os.path.exists(extract_to):
        print(f"Extracting {tar_path}...")
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extractall(path=extract_to)
        print(f"Extraction complete. Files extracted to {extract_to}")
    else:
        print("Dataset already extracted.")

# Download and extract dataset
download_path = "rt-polaritydata.tar.gz"
download_dataset(url, download_path)
extract_dataset(download_path, dataset_folder)

# List the extracted files
print(f"Extracted files: {os.listdir(dataset_folder)}")


Downloading dataset from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz...
Download complete.
Extracting rt-polaritydata.tar.gz...
Extraction complete. Files extracted to rt-polaritydata
Extracted files: ['rt-polaritydata.README.1.0.txt', 'rt-polaritydata']


In [11]:
%cd rt-polaritydata/rt-polaritydata

/content/rt-polaritydata/rt-polaritydata/rt-polaritydata/rt-polaritydata


In [12]:
%ls

rt-polarity.neg  rt-polarity.pos


In [13]:
# In Colab, install TensorFlow Hub and TensorFlow
!pip install tensorflow tensorflow-hub




In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle

# 1. Load the data from files
def load_data(positive_file, negative_file):
    # Reading positive and negative data
    with open(positive_file, 'r', encoding='latin-1') as pos_file:
        positive_data = pos_file.readlines()

    with open(negative_file, 'r', encoding='latin-1') as neg_file:
        negative_data = neg_file.readlines()

    # Strip leading/trailing whitespaces
    positive_data = [line.strip() for line in positive_data]
    negative_data = [line.strip() for line in negative_data]

    return positive_data, negative_data

# 2. Label the data
def prepare_dataframe(positive_data, negative_data):
    # Create dataframes for positive and negative data
    pos_df = pd.DataFrame({'text': positive_data, 'label': 1})  # 1 for positive
    neg_df = pd.DataFrame({'text': negative_data, 'label': 0})  # 0 for negative

    # Concatenate both dataframes
    data = pd.concat([pos_df, neg_df], ignore_index=True)

    # Shuffle the dataset
    data = shuffle(data).reset_index(drop=True)

    return data

# 3. Split data into training, validation, and test sets
def split_data(data):
    # Split into training (4000 each), validation (500 each), and test sets (831 each)
    train_pos = data[data['label'] == 1][:4000]
    train_neg = data[data['label'] == 0][:4000]
    val_pos = data[data['label'] == 1][4000:4500]
    val_neg = data[data['label'] == 0][4000:4500]
    test_pos = data[data['label'] == 1][4500:]
    test_neg = data[data['label'] == 0][4500:]

    # Combine positive and negative for each set
    train_data = pd.concat([train_pos, train_neg], ignore_index=True)
    val_data = pd.concat([val_pos, val_neg], ignore_index=True)
    test_data = pd.concat([test_pos, test_neg], ignore_index=True)

    # Shuffle each set to mix positives and negatives
    train_data = shuffle(train_data).reset_index(drop=True)
    val_data = shuffle(val_data).reset_index(drop=True)
    test_data = shuffle(test_data).reset_index(drop=True)

    return train_data, val_data, test_data

# 4. Vectorize the text using TF-IDF
def vectorize_data(train_data, val_data, test_data):
    # Use TF-IDF vectorizer to convert text to numerical features
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

    # Fit on training data and transform
    X_train = vectorizer.fit_transform(train_data['text'])
    X_val = vectorizer.transform(val_data['text'])
    X_test = vectorizer.transform(test_data['text'])

    # Extract labels
    y_train = train_data['label']
    y_val = val_data['label']
    y_test = test_data['label']

    return X_train, X_val, X_test, y_train, y_val, y_test


# Main function to run preprocessing
def preprocess_data(positive_file, negative_file):
    # Load the data
    positive_data, negative_data = load_data(positive_file, negative_file)

    # Prepare and shuffle the dataframe
    data = prepare_dataframe(positive_data, negative_data)

    # Split the data into training, validation, and test sets
    train_data, val_data, test_data = split_data(data)

    # Vectorize the data using TF-IDF
    # X_train, X_val, X_test, y_train, y_val, y_test = vectorize_data(train_data, val_data, test_data)

    # return X_train, X_val, X_test, y_train, y_val, y_test
    return train_data, val_data, test_data

# Paths to the data files
positive_file = 'rt-polarity.pos'
negative_file = 'rt-polarity.neg'

# Preprocess the data
train_data, val_data, test_data = preprocess_data(positive_file, negative_file)

# You can now use X_train, X_val, X_test, y_train, y_val, y_test for model training and evaluation.


In [14]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional


In [19]:
# Load the ELMo model from TensorFlow Hub
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Function to get ELMo embeddings
# def elmo_embedding(text):
#     embeddings = elmo.signatures["default"](tf.constant([text]))["elmo"]
#     return tf.reduce_mean(embeddings, axis=1).numpy()

# # Test embedding for one sentence
# print(elmo_embedding("This movie was absolutely amazing!"))

def elmo_embedding(sentences):
    # ELMo expects a list of sentences
    embeddings = elmo.signatures['default'](tf.convert_to_tensor(sentences))['elmo']
    # Averaging the embeddings for each sentence
    return np.array([np.mean(embedding, axis=0) for embedding in embeddings])

sample_sentences = ["This is a great movie.", "I didn't enjoy this film."]
elmo_embeddings = elmo_embedding(sample_sentences)
print(elmo_embeddings.shape)

(2, 1024)


In [20]:
# def convert_to_elmo_embeddings(data):
#     embeddings = [elmo_embedding(sentence) for sentence in data['text']]
#     return np.vstack(embeddings)

def convert_to_elmo_embeddings(data):
    embeddings = elmo_embedding(data['text'].tolist())
    return embeddings



# Convert the datasets
X_train = convert_to_elmo_embeddings(train_data)
X_val = convert_to_elmo_embeddings(val_data)
X_test = convert_to_elmo_embeddings(test_data)

# Extract labels
y_train = train_data['label'].values
y_val = val_data['label'].values
y_test = test_data['label'].values

# Check shape of embeddings
print(f'X_train shape: {X_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'X_test shape: {X_test.shape}')


X_train shape: (8000, 1024)
X_val shape: (1000, 1024)
X_test shape: (1662, 1024)


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Model training
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Validation
val_preds = clf.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, val_preds)}")

# Testing
test_preds = clf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, test_preds)}")
print(classification_report(y_test, test_preds))


Validation Accuracy: 0.791
Test Accuracy: 0.8074608904933814
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       831
           1       0.81      0.80      0.81       831

    accuracy                           0.81      1662
   macro avg       0.81      0.81      0.81      1662
weighted avg       0.81      0.81      0.81      1662

