In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/codeReview/5_models/Mockery/preprocessed.csv')
df.head()

Unnamed: 0,description,Mockery
0,must black belt copypasting bravo,1
1,invent programing language understand word,1
2,fan obfuscation code masterclas confusion,1
3,mispel bug feature comit mesage,1
4,se fan what the fuck per minute coding style,1


**define function for save embedding representation**

In [3]:
def save_representation(df, file_path):
  # Write the DataFrame to a CSV file
  df.to_csv(file_path, index=False)

Vectorizing code review comments involves converting textual data into numerical vectors so that machine learning algorithms can process and analyze them. Here are several common approaches that we implement on our data to convert them from text to nimeric.

**1. Bag-of-Words (BoW):**

 BoW represents a document as an unordered set of words, disregarding grammar and word order but considering word frequency. we use  the CountVectorizer from scikit-learn to implement BoW.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def create_bow(comments):
    """
    Returns:
    - pd.DataFrame
        A new DataFrame with the Bag-of-Words representation.
    """
    # Create an instance of CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the comments to obtain the Bag-of-Words matrix
    bow_matrix = vectorizer.fit_transform(comments)

    # Convert the Bag-of-Words matrix to a DataFrame
    bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    return bow_df

# Create Bag-of-Words representation for the 'comments' column
bow_representation = create_bow(df['description'])

In [None]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/4_wordVectorization/bow_representation.csv'
save_representation(bow_representation,csv_file_path )

**2. Term Frequency-Inverse Document Frequency (TF-IDF):**

Similar to BoW, but it also considers the importance of words by giving higher weights to terms that are rare across all documents.
The TfidfVectorizer from scikit-learn is commonly used for TF-IDF vectorization.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_tfidf(comments):
    """
    Returns:
    - pd.DataFrame
        A new DataFrame with the TF-IDF representation.
    """

    # Create an instance of TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the comments to obtain the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(comments)

    # Convert the TF-IDF matrix to a DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    return tfidf_df

# Create TF-IDF representation for the 'comments' column
tfidf_representation = create_tfidf(df['description'])

In [None]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/4_wordVectorization/tfidf_representation.csv'

# Save TF-IDF representation to a CSV file
save_representation(tfidf_representation,csv_file_path )

**Word Embeddings (Word2Vec, GloVe, FastText):**

Word embeddings capture semantic relationships between words by representing them as dense vectors in a continuous vector space.
Gensim provides implementations for Word2Vec, and we can find pre-trained models for GloVe and FastText.



**3. Word2Vec**

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Tokenize the comments
tokenized_comments = df['description'].apply(lambda x: word_tokenize(x.lower()))  # Assuming comments are preprocessed and lowercased

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_comments, vector_size=100, window=5, min_count=1, workers=4)

def get_comment_vector(comment):
    tokens = word_tokenize(comment.lower())
    vector = sum(model.wv[word] for word in tokens if word in model.wv) / len(tokens)
    return vector

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Apply the function to the entire 'comments' column
df['comment_vectors'] = df['description'].apply(get_comment_vector)

In [None]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/4_wordVectorization/word2vec_representation.csv'
df1[['vector_dim_' + str(i) for i in range(df['comment_vectors'].iloc[0].shape[0])]] = pd.DataFrame(df['comment_vectors'].tolist(), index=df.index)

# Save Word2Vec representation to a CSV file
df1.to_csv(csv_file_path, index=False)

NameError: ignored

**4. GloVe**

It stands for Global Vectors. This is created by Stanford University. Glove has pre-defined dense vectors for around every 6 billion words of English literature along with many other general use characters like comma, braces, and semicolons.

In [None]:
import spacy

# Load spaCy model with GloVe embeddings
nlp = spacy.load("en_core_web_sm")


def create_glove(comments):
    """
    Returns:
    - pd.DataFrame
        A new DataFrame with the GloVe representation.
    """
    # Process comments with spaCy to get GloVe vectors
    glove_vectors = [nlp(comment).vector for comment in comments]

    # Convert the GloVe vectors to a DataFrame
    glove_df = pd.DataFrame(glove_vectors)

    return glove_df


# Create GloVe representation for the 'comments' column
glove_representation = create_glove(df['description'])

In [None]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/4_wordVectorization/glove_representation.csv'

# Save GloVe representation to a CSV file
save_representation(glove_representation, csv_file_path)

**5. FastText**

fastText is an open-source library, developed by the Facebook AI Research lab. Its main focus is on achieving scalable solutions for the tasks of text classification and representation while processing large datasets quickly and accurately. FastText is a modified version of word2vec.

In [4]:
from gensim.models import FastText

def create_fasttext_embedding_model(comments):

    # Tokenize comments into sentences
    sentences = [comment.split() for comment in comments]

    # Train FastText model
    model = FastText(sentences, vector_size=128, window=5, min_count=3, workers=4)

    return model

# Create FastText representation for the 'comments' column
fasttext_representation = create_fasttext_embedding_model(df['description'])


In [5]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/Mockery/fasttext_representation.bin'

# Save the trained FastText model
fasttext_representation.save(csv_file_path)

In [6]:
def create_fasttext(df, column_name, model, output_csv):
    """
    Save FastText embedding vectors for each comment in a pandas DataFrame to a CSV file.

    Parameters:
    - df: pandas DataFrame
        The DataFrame containing the comments.
    - column_name: str
        The name of the column containing the comments.
    - model: gensim.models.fasttext.FastText
        Trained FastText model.
    - output_csv: str
        Path to the output CSV file.
    """

    # Extract comments from the specified column
    comments = df[column_name]

    # Tokenize comments into sentences
    sentences = [comment.split() for comment in comments]

    # Get FastText embeddings for each comment
    embeddings = [model.wv[words].mean(axis=0) for words in sentences]

    # Create a DataFrame with comment vectors
    vectors_df = pd.DataFrame(embeddings, columns=[f'feature_{i}' for i in range(model.vector_size)])

    # Save the result to a CSV file
    vectors_df.to_csv(output_csv, index=False)



# Load the saved FastText model
loaded_fasttext_model = FastText.load("/content/drive/MyDrive/codeReview/5_models/Mockery/fasttext_representation.bin")

# Specify the path to the output CSV file
output_csv_path = "/content/drive/MyDrive/codeReview/5_models/Mockery/fasttext_representation.csv"

# Save FastText embeddings for each comment to a CSV file
create_fasttext(df, 'description', loaded_fasttext_model, output_csv_path)

**6. Universal Sentence Encoder (USE):**

Developed by Google, USE generates fixed-size vectors for input sentences. It captures semantic information and can be useful for various natural language processing tasks.
TensorFlow provides a pre-trained Universal Sentence Encoder.

In [7]:
import tensorflow as tf
import tensorflow_hub as hub

def create_use(comments):
    """
    Returns:
    - pd.DataFrame
        A new DataFrame with the Universal Sentence Encoder embeddings.
    """
    # Load the Universal Sentence Encoder module
    use_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    embed = hub.load(use_url)

    # Get embeddings for each comment
    embeddings = embed(comments)

    # Create a DataFrame with comment vectors
    vectors_df = pd.DataFrame(embeddings.numpy(), columns=[f'feature_{i}' for i in range(embeddings.shape[1])])

    return vectors_df

In [8]:
# Create USE representation for the 'comments' column
use_representation = create_use(df['description'])

In [9]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/Mockery/use_representation.csv'

# Save GloVe representation to a CSV file
use_representation.to_csv(csv_file_path, index=False)

**7. BERT Embeddings:**

BERT (Bidirectional Encoder Representations from Transformers) provides context-aware word embeddings, capturing the meaning of words in the context of the entire sentence.
The transformers library in Python provides access to pre-trained BERT models.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

def create_bert(df, column_name, model_name="bert-base-uncased"):
    """re-trained BERT model.

    Returns:
    - pd.DataFrame
        A new DataFrame with the BERT embeddings.
    """

    # Load BERT tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Extract comments from the specified column
    comments = df[column_name].tolist()

    # Tokenize and encode comments
    encoded_comments = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")

    # Forward pass to get BERT embeddings
    with torch.no_grad():
        outputs = model(**encoded_comments)

    # Extract the embeddings from the last layer
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Create a DataFrame with comment vectors
    vectors_df = pd.DataFrame(embeddings, columns=[f'feature_{i}' for i in range(embeddings.shape[1])])

    return vectors_df


In [None]:
# Create BERT representation for the 'comments' column
bert_representation = create_bert(df, 'description')

In [None]:
# Specify the path for the CSV file
csv_file_path = '/content/drive/MyDrive/codeReview/4_wordVectorization/bert_representation.csv'

# Save BERT representation to a CSV file
bert_representation.to_csv(csv_file_path, index=False)