# phase 2 - convert raw text into numeric representations

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

## Bag of Words

In [2]:
def apply_bow_to_file(input_filename: str, output_filename: str) -> None:
    # read data from input file
    data = pd.read_csv(input_filename)

    # remove rows where text is NaN
    data = data.dropna(subset=["text"]) 

    # extract corpus and indexes
    corpus = data["text"].values
    indexes = data["index"].values
    
    # create BoW model
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)

    bow_data = []
    for i, doc_index in enumerate(indexes):
        # get indices of nonzero word counts
        nonzero_indices = X[i].nonzero()[1]  
        # get the actual nonzero word counts
        nonzero_values = X[i].data  
        # create a sparse dictionary of word index -> count for the document
        sparse_representation = {int(idx): int(val) for idx, val in zip(nonzero_indices, nonzero_values)}
        # append the representation along with the document's original index
        bow_data.append({"index": doc_index, "text": sparse_representation})

    # save the processed BoW data to the output CSV file
    pd.DataFrame(bow_data).to_csv(output_filename, index=False)

In [4]:
apply_bow_to_file("../data/80_20/test_texts.csv", "../data/80_20/test_texts_bow.csv")

In [5]:
apply_bow_to_file("../data/80_20/train_texts.csv", "../data/80_20/train_texts_bow.csv")

In [6]:
apply_bow_to_file("../data/70_30/test_texts.csv", "../data/70_30/test_texts_bow.csv")

In [7]:
apply_bow_to_file("../data/70_30/train_texts.csv", "../data/70_30/train_texts_bow.csv")

## Word2Vec

In [8]:
def apply_word2vec_to_file(input_filename: str, output_filename: str) -> None:
    # read data from input file
    data = pd.read_csv(input_filename)
    
    # remove rows where text is NaN
    data = data.dropna(subset=["text"])

    # extract corpus and indexes
    corpus = data["text"].values
    indexes = data["index"].values

    # convert corpus into a list of lists of tokens
    corpus_tokenized = [word_tokenize(sentence) for sentence in corpus]

    # train the Word2Vec model and save it
    model = Word2Vec(sentences=corpus_tokenized, vector_size=50, window=5, min_count=5, workers=4, epochs=5)
    model.save("../models/word2vec_model")
    
    word2vec_data = []
    for i, doc_index in enumerate(indexes):
        review_as_word2vec = []
        for token in corpus_tokenized[i]:
            if token in model.wv:
                # if the word is in the vocabulary, get its vector
                review_as_word2vec.append(model.wv[token].tolist())
            else:
                # if not in vocabulary, represent it as a zero vector
                review_as_word2vec.append([0.0] * model.vector_size)
        # append the index and vectorized review to the output list
        word2vec_data.append({"index": doc_index, "text": review_as_word2vec})

    # save the processed w2v data to the output CSV file
    pd.DataFrame(word2vec_data).to_csv(output_filename, index=False)

In [9]:
apply_word2vec_to_file("../data/80_20/test_texts.csv", "../data/80_20/test_texts_w2v.csv")

In [10]:
apply_word2vec_to_file("../data/80_20/train_texts.csv", "../data/80_20/train_texts_w2v.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../models/word2vec_model'

In [None]:
apply_word2vec_to_file("../data/70_30/test_texts.csv", "../data/70_30/test_texts_w2v.csv")

In [None]:
apply_word2vec_to_file("../data/70_30/train_texts.csv", "../data/70_30/train_texts_w2v.csv")