In [1]:
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
def apply_bow_to_file(filename: str, vectorizer: "CountVectorizer | None" = None) -> None:
    """Encode the ``text`` column of a CSV file as sparse bag-of-words counts.

    Reads ``filename``, drops rows whose ``text`` is NaN, vectorizes the
    remaining texts and writes the result to ``<filename minus extension>_bow.csv``.
    The output has two columns: ``index`` (copied from the input) and ``text``
    (a ``{feature_index: count}`` dict holding only the non-zero counts).

    Args:
        filename: Path to a CSV file with ``index`` and ``text`` columns.
        vectorizer: Optional vectorizer to use instead of a fresh
            ``CountVectorizer``. If it already has a ``vocabulary_`` attribute
            it is applied with ``transform`` only; otherwise it is fitted on
            this file with ``fit_transform`` (and stays fitted for the caller).
            Passing the same object for several files keeps the feature
            indices consistent between them. When omitted, the vocabulary is
            fitted on this file alone, so feature indices of different output
            files are NOT comparable.

    Returns:
        None. The result is written to disk.
    """
    data = pd.read_csv(filename).dropna(subset=["text"])  # remove rows where text is NaN

    corpus = data["text"].values
    indexes = data["index"].values

    if vectorizer is None:
        vectorizer = CountVectorizer()
    if hasattr(vectorizer, "vocabulary_"):
        # Already fitted: reuse the existing vocabulary.
        X = vectorizer.transform(corpus)
    else:
        X = vectorizer.fit_transform(corpus)
    X = X.tocsr()

    # Read each row straight from the CSR arrays: column indices and values
    # come from the same slice, so they are aligned by construction and the
    # matrix is not re-sliced twice per document.
    bow_data = []
    for i, doc_index in enumerate(indexes):
        start, end = X.indptr[i], X.indptr[i + 1]
        sparse_representation = {
            int(idx): int(val)
            for idx, val in zip(X.indices[start:end], X.data[start:end])
            if val != 0  # skip explicitly stored zeros
        }
        bow_data.append({"index": doc_index, "text": sparse_representation})

    # splitext strips only the final extension; splitting on the first "."
    # would truncate paths such as "./data/x.csv" or "data/v1.0/x.csv".
    output_filename = os.path.splitext(filename)[0] + "_bow.csv"
    pd.DataFrame(bow_data).to_csv(output_filename, index=False)

In [10]:
# Bag-of-words export for the 80/20 test split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_bow_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/80_20/test_texts.csv"
)

In [11]:
# Bag-of-words export for the 80/20 training split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_bow_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/80_20/train_texts.csv"
)

In [12]:
# Bag-of-words export for the 70/30 test split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_bow_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/70_30/test_texts.csv"
)

In [13]:
# Bag-of-words export for the 70/30 training split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_bow_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/70_30/train_texts.csv"
)

In [13]:
def apply_word2vec_to_file(
    filename: str,
    model: "Word2Vec | None" = None,
    model_path: str = "../models/word2vec_model",
) -> None:
    """Encode the ``text`` column of a CSV file as per-token Word2Vec vectors.

    Reads ``filename``, drops rows whose ``text`` is NaN, tokenizes each text
    with ``word_tokenize`` and writes the result to
    ``<filename minus extension>_w2v.csv``. The output has two columns:
    ``index`` (copied from the input) and ``text`` (a list with one vector per
    token; tokens missing from the model's vocabulary become all-zero vectors
    of length ``model.vector_size``).

    Args:
        filename: Path to a CSV file with ``index`` and ``text`` columns.
        model: Optional already-trained model exposing ``wv`` and
            ``vector_size``. When given, nothing is trained or saved, so texts
            from several files can be embedded with the same model. When
            omitted, a new ``Word2Vec`` model is trained on this file alone
            and saved to ``model_path``.
        model_path: Where a newly trained model is saved. Each call that
            trains a model overwrites whatever was stored at this path.

    Returns:
        None. The result is written to disk.
    """
    data = pd.read_csv(filename).dropna(subset=["text"])

    corpus = data["text"].values
    indexes = data["index"].values

    corpus_tokenized = [word_tokenize(sentence) for sentence in corpus]

    if model is None:
        model = Word2Vec(sentences=corpus_tokenized, vector_size=50, window=5, min_count=5, workers=4, epochs=5)
        # Create the target directory up front so a missing folder does not
        # throw away the freshly trained model.
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        model.save(model_path)

    keyed_vectors = model.wv
    vector_size = model.vector_size

    word2vec_data = []
    for doc_index, tokens in zip(indexes, corpus_tokenized):
        review_as_word2vec = [
            keyed_vectors[token].tolist() if token in keyed_vectors else [0.0] * vector_size
            for token in tokens
        ]
        word2vec_data.append({"index": doc_index, "text": review_as_word2vec})

    # splitext strips only the final extension; splitting on the first "."
    # would truncate paths such as "./data/x.csv" or "data/v1.0/x.csv".
    output_filename = os.path.splitext(filename)[0] + "_w2v.csv"
    pd.DataFrame(word2vec_data).to_csv(output_filename, index=False)

In [14]:
# Word2Vec export for the 80/20 test split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_word2vec_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/80_20/test_texts.csv"
)

In [15]:
# Word2Vec export for the 80/20 training split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_word2vec_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/80_20/train_texts.csv"
)

In [16]:
# Word2Vec export for the 70/30 test split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_word2vec_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/70_30/test_texts.csv"
)

In [17]:
# Word2Vec export for the 70/30 training split.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
apply_word2vec_to_file(
    filename="C:/Users/Lenovo/myFiles/nauka/studia/sem6/uczenie maszynowe/laby/sentiment_analysis/data/70_30/train_texts.csv"
)