In [13]:
import numpy as np
import pickle

### Embedding method 1: GloVe

In [23]:
# Run the two vocabulary shell scripts through IPython's `!` shell escape.
# The paths are relative, so the notebook must be started from the project root.
# NOTE(review): presumably build_vocab.sh builds the raw vocabulary and
# cut_vocab.sh prunes it (inferred from the script names only) -- confirm
# against the scripts themselves.
!sh src/build_vocab.sh
!sh src/cut_vocab.sh

In [None]:
# Run the three Python pipeline scripts in sequence via IPython's `!` shell escape.
# NOTE(review): presumably these pickle the vocabulary, build the co-occurrence
# data and train the GloVe embeddings (inferred from the script names) -- confirm
# which of them write src/vocab.pkl and src/embeddings.npy, which are loaded later.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# artifacts loaded afterwards may come from an earlier run.
!python src/pickle_vocab.py
!python src/cooc.py
!python src/glove_solution.py

In [14]:
# Read the saved embedding matrix from disk
embedding_matrix = np.load('src/embeddings.npy')

# Read the pickled vocabulary
with open('src/vocab.pkl', 'rb') as vocab_file:
    vocabulary = pickle.load(vocab_file)

# Build the word -> embedding lookup: the i-th item obtained by iterating
# over `vocabulary` is paired with row i of `embedding_matrix`
embeddings_dict = {
    token: embedding_matrix[row]
    for row, token in enumerate(vocabulary)
}

### After embedding: Load the tweets into vectors


In [15]:
### Helpers used after generating the embeddings, regardless of the embedding method

def preprocess(tweet):
    """Lowercase a tweet and split it into whitespace-separated tokens.

    The tweet is assumed to be tokenized already, so no punctuation handling
    is performed here.

    Args:
        tweet: The tweet text as a string.

    Returns:
        A list of lowercase tokens (empty if the tweet has no non-whitespace
        characters).
    """
    lowered = tweet.lower()
    tokens = lowered.split()
    return tokens

def tweet_to_vector(tweet, embeddings_dict):
    """Represent a tweet as the element-wise mean of its word embeddings.

    Args:
        tweet: Tweet text; it is tokenized with ``preprocess``.
        embeddings_dict: Mapping from token to its embedding array.
            All embeddings are assumed to share one shape -- TODO confirm.

    Returns:
        The mean (over tokens) of the embeddings of every token found in
        ``embeddings_dict``. If no token is found, an all-zero vector with
        the same shape and dtype that an averaged vector would have.

    Raises:
        ValueError: If no token is found and ``embeddings_dict`` is empty, so
            the size of the zero vector cannot be determined.
    """
    words = preprocess(tweet)
    word_vectors = [embeddings_dict[word] for word in words if word in embeddings_dict]

    # Handle the case where the tweet has no valid words found in the embeddings
    if not word_vectors:
        # Use a default so an empty dictionary does not leak a bare
        # StopIteration, which could silently end an enclosing iteration.
        sample = next(iter(embeddings_dict.values()), None)
        if sample is None:
            raise ValueError(
                "embeddings_dict is empty; cannot determine the embedding dimension"
            )
        # Averaging a single embedding yields exactly the shape and dtype that
        # np.mean produces for real tweets, so the fallback stays consistent
        # with the normal return value.
        return np.zeros_like(np.mean([sample], axis=0))

    # Compute the average vector
    return np.mean(word_vectors, axis=0)

def process_tweets(tweet_path, embeddings_dict):
    """Turn a text file of tweets (one per line) into an array of tweet vectors.

    Args:
        tweet_path: Path of the text file to read; every line is one tweet.
        embeddings_dict: Token-to-embedding mapping handed to
            ``tweet_to_vector``.

    Returns:
        A NumPy array built from the per-line results of ``tweet_to_vector``,
        in file order.
    """
    with open(tweet_path, 'r') as handle:
        raw_lines = handle.readlines()

    vectors = []
    for line in raw_lines:
        # Drop only the trailing newline; other whitespace is left to the tokenizer.
        text = line.rstrip('\n')
        vectors.append(tweet_to_vector(text, embeddings_dict))

    return np.array(vectors)

In [18]:
# Vectorize the tweets in train_pos.txt (presumably the positive-sentiment
# training set, judging by the file name -- confirm).
pos_vectors = process_tweets('data/twitter-datasets/train_pos.txt', embeddings_dict)
# Sanity check: one row per line of the file, one column per embedding dimension.
print(pos_vectors.shape)

# Same for train_neg.txt (presumably the negative-sentiment training set -- confirm).
neg_vectors = process_tweets('data/twitter-datasets/train_neg.txt', embeddings_dict)
print(neg_vectors.shape)

(100000, 20)
(100000, 20)
