In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
# For GloVe, additional setup is required.

def bow_embeddings(texts):
    """Generates Bag of Words embeddings."""
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(texts)

def word2vec_embeddings(texts):
    """Generates Word2Vec embeddings."""
    tokenized_texts = [text.split() for text in texts]
    model = Word2Vec(tokenized_texts, min_count=1)
    return model.wv

def glove_embeddings(texts):
    """Generates GloVe embeddings. Implementation required."""
    pass

# Example usage
texts = ["türkiye acil durum yönetim", "veri madencilik yapay zeka"]  # Preprocessed Turkish text data
bow_emb = bow_embeddings(texts)
w2v_emb = word2vec_embeddings(texts)
# glove_emb = glove_embeddings(texts)




In [37]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from TurkishStemmer import TurkishStemmer


def stem_text(tokens):
    stemmer = TurkishStemmer()
    return [stemmer.stem(word) for word in tokens]
# You may need to download specific resources from NLTK
# nltk.download('punkt')
# nltk.download('stopwords')

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
    #text = re.sub(r'\d+\.\d+', '', text)
    return text

def to_lowercase(text):
    return text.lower()
def tokenize(text):
    return word_tokenize(text)
def remove_stopwords(tokens):
    turkish_stopwords = set(stopwords.words('turkish'))
    return [word for word in tokens if word not in turkish_stopwords]
def preprocess_text(text):
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = to_lowercase(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stem_text(tokens)
    # Apply additional steps like stemming or lemmatization if needed
    return ' '.join(tokens)
def preprocess_dataset(dataset):
    return [preprocess_text(text) for text in dataset]


In [39]:
# Example dataset
dataset = df_labels.Şikayeti_agg.values.tolist()

# Preprocess the dataset
preprocessed_dataset = preprocess_dataset(dataset)

In [42]:
# Example usage
bow_emb = bow_embeddings(preprocessed_dataset)

In [43]:
w2v_emb = word2vec_embeddings(preprocessed_dataset)


In [None]:
bow_emb.to

<359297x36621 sparse matrix of type '<class 'numpy.int64'>'
	with 1350479 stored elements in Compressed Sparse Row format>

In [65]:
import numpy as np
# Getting all the words in the model
words = list(w2v_emb.key_to_index.keys())

# Creating a matrix where each row is a vector representation of a word
word_vectors_matrix = np.array([w2v_emb[word] for word in words])


In [102]:
import gensim


# Training the Word2Vec model
model = gensim.models.Word2Vec(sentences=preprocessed_dataset, vector_size=768, window=5, min_count=1, workers=4)

# Now, you can get the vector for each word in your vocabulary
word_vectors = model.wv

In [103]:
# To get a vector representation for a whole text element, you can average the vectors of its words.
def text_to_vector(text, model):
    tokens = word_tokenize(text.lower())
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        # Averaging the vectors
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector if there are no words in the model's vocabulary
        return np.zeros(model.vector_size)


# Example: Convert all text elements to vectors
text_vectors = [text_to_vector(text, model) for text in preprocessed_dataset]
text_vectors = np.array(text_vectors)

In [105]:
save_results(text_vectors, 'w2vec_768dim_emb')