In [1]:
import numpy as np
import pickle

In [2]:
pos_path = 'data/twitter-datasets/train_pos.txt'
neg_path = 'data/twitter-datasets/train_neg.txt'

with open(pos_path, 'r') as f:
    pos_tweets = f.readlines()
with open(neg_path, 'r') as f:
    neg_tweets = f.readlines()

### Method 1: GloVe Embedding 

In [1]:
# !sh src/glove/build_vocab.sh
# !sh src/glove/cut_vocab.sh

In [None]:
# !python src/glove/pickle_vocab.py
# !python src/glove/cooc.py
# !python src/glove/glove_solution.py

In [13]:
# Load the word embeddings
embedding_matrix = np.load('manipulated/glove_embeddings.npy')
# Load the vocabulary
with open('manipulated/vocab.pkl', 'rb') as f:
    vocabulary = pickle.load(f)
# Create a dictionary to map words to their embeddings
embeddings_dict = {word: embedding_matrix[index] for index, word in enumerate(vocabulary)}

In [14]:
### After generating the embeddings using different method

def preprocess(tweet):
    # Assuming tweets are already tokenized and separated by spaces
    return tweet.lower().split()

def tweet_to_vector(tweet, embeddings_dict):
    words = preprocess(tweet)
    word_vectors = [embeddings_dict[word] for word in words if word in embeddings_dict]
    
    # Handle the case where tweet has no valid words found in embeddings
    if not word_vectors:
        return np.zeros(next(iter(embeddings_dict.values())).shape)
    
    # Compute the average vector
    tweet_vector = np.mean(word_vectors, axis=0)
    return tweet_vector

def glove(tweets, embeddings_dict):
    tweets = [tweet.rstrip('\n') for tweet in tweets]
    tweet_vectors = np.array([tweet_to_vector(tweet, embeddings_dict) for tweet in tweets])
    return tweet_vectors

In [16]:

pos_vectors_glove = glove(pos_tweets, embeddings_dict)
print(pos_vectors_glove.shape)

neg_vectors_glove = glove(neg_tweets, embeddings_dict)
print(neg_vectors_glove.shape)

(100000, 20)
(100000, 20)


### Method 2: CountVectorizer

In [None]:
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
def tokenize_tweets(tweet):
    return tweet_tokenizer.tokenize(tweet)

In [20]:

from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(tokenizer=tokenize_tweets)

pos_vectors_count = count_vectorizer.fit_transform(pos_tweets)
pos_vectors_count = pos_vectors_count.toarray() # if lacking memory, delete
print(pos_vectors_count.shape)

neg_vectors_count = count_vectorizer.fit_transform(neg_tweets)
neg_vectors_count = neg_vectors_count.toarray() # if lacking memory, delete
print(neg_vectors_count.shape)

(100000, 56704)
(100000, 82191)


### Method 3: TfidfVectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_tweets)

pos_vectors_tfidf = tfidf_vectorizer.fit_transform(pos_tweets)
pos_vectors_tfidf = pos_vectors_tfidf.toarray() # if lacking memory, delete
print(pos_vectors_tfidf.shape)

neg_vectors_tfidf = tfidf_vectorizer.fit_transform(neg_tweets)
neg_vectors_tfidf = neg_vectors_tfidf.toarray() # if lacking memory, delete
print(neg_vectors_tfidf.shape)



(100000, 56704)
(100000, 82191)


### Method 4: Pre-trained Word2Vec

In [7]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

model_w2v = Word2Vec(common_texts, vector_size=100, window=5, min_count=1, workers=4)
model_w2v.train(pos_tweets, total_examples=len(pos_tweets), epochs=10)
model_w2v.train(neg_tweets, total_examples=len(neg_tweets), epochs=10)


vector_size = 100  # This should match the size of your word vectors
default_vector = np.zeros(vector_size)

pos_vectors_w2v = np.array([
    np.mean(
        [model_w2v.wv[word] for word in tweet.split() if word in model_w2v.wv] or [default_vector],
        axis=0
    )
    for tweet in pos_tweets
])

neg_vectors_w2v = np.array([
    np.mean(
        [model_w2v.wv[word] for word in tweet.split() if word in model_w2v.wv] or [default_vector],
        axis=0
    )
    for tweet in neg_tweets
])

