<a href="https://colab.research.google.com/github/dataskeptic/relatorioPIBIC/blob/main/nilc_wordembeddings_distances_lemma_poo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pot

Collecting pot
  Downloading POT-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (789 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m790.0/790.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pot
Successfully installed pot-0.9.1


In [2]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
def to_lowercase(data):
    return data.str.lower()

def remove_punct(data):
    return data.str.replace(r'[^\w\s]', ' ', regex=True)

def remove_stopwords(text):
    stop_words = set(stopwords.words('portuguese'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]

    return ' '.join(filtered_text)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text, language='portuguese')
    return ' '.join([lemmatizer.lemmatize(token) for token in tokens])

def stem(text):
    stemmer = SnowballStemmer('portuguese')
    tokens = word_tokenize(text, language='portuguese')
    return ' '.join([stemmer.stem(token) for token in tokens])

def preprocess_text(data):
    data = remove_punct(data)
    data = to_lowercase(data)
    data = data.apply(remove_stopwords)
    data = data.apply(lemmatize)
    #data = data.apply(stem)
    return data

In [4]:
path_reference_answers = "drive/MyDrive/PIBIC/data/poo/reference_answers_extended.xlsx"
path_student_answers = "drive/MyDrive/PIBIC/data/poo/student_answers.xlsx"

In [6]:
reference_answers = pd.read_excel(path_reference_answers)
student_answers = pd.read_excel(path_student_answers)

print(reference_answers)
print("---------------------------------------")
print(student_answers)

    question_id                                        refans_text  \
0             1  Polimorfismo é uma característica das linguage...   
1             1  Polimorfismo é a capacidade de uma classe se c...   
2             1  Polimorfismo refere-se à capacidade de um méto...   
3             1  Em programação orientada a objetos, polimorfis...   
4             1  O polimorfismo é um conceito em programação or...   
5             2  Classe são modelos descritivos para a criação ...   
6             2  Uma classe é uma descrição abstrata de um tipo...   
7             2  Classes são estruturas fundamentais em program...   
8             2  Em linguagens orientadas a objetos, uma classe...   
9             2  Classes em programação orientada a objetos ser...   
10            3  Herança é um conceito que permite que uma clas...   
11            3  Herança na programação orientada a objetos é u...   
12            3  Herança é um mecanismo que permite que uma nov...   
13            3  Her

In [8]:
reference_answers['refans_preprocess'] = preprocess_text(reference_answers['refans_text'])

In [9]:
student_answers['answer_text'] = student_answers['answer_text'].fillna('')

In [10]:
student_answers['answer_preprocess'] = preprocess_text(student_answers['answer_text'])

In [11]:
def remove_empty(student_answers):
    student_answers = student_answers.apply(lambda x: 'vazio' if pd.isna(x) or x.strip() == '' else x)
    return student_answers

In [12]:
student_answers['answer_preprocess'] = remove_empty(student_answers['answer_preprocess'])

In [13]:
from ot import emd2

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
  register_backend(TensorflowBackend())


In [14]:
from gensim.models import KeyedVectors
model_cbow = KeyedVectors.load_word2vec_format("drive/MyDrive/PIBIC/wordembeddings/cbow_s300.txt")

In [15]:
model_cbow.init_sims(replace=True)

  model_cbow.init_sims(replace=True)


In [16]:
model_glove = KeyedVectors.load_word2vec_format("drive/MyDrive/PIBIC/wordembeddings/glove_s300.txt")

In [17]:
model_glove.init_sims(replace=True)

  model_glove.init_sims(replace=True)


In [18]:
model_skip = KeyedVectors.load_word2vec_format("drive/MyDrive/PIBIC/wordembeddings/skip_s300.txt")

In [19]:
model_skip.init_sims(replace=True)

  model_skip.init_sims(replace=True)


In [20]:
#for index, row in missing_words_df.iterrows():
#    print(f"Index: {index}, Missing Words: {row['missing_words']}")

In [21]:
from gensim.similarities import WmdSimilarity


In [22]:
def compute_wmd_similarities(student_df, reference_df, model, prefix):
    wmd_similarities = pd.DataFrame()

    for questionId in range(1, 16):
        temp_student_answers = student_df[student_df['question_id'] == questionId].copy()
        refans_answers = reference_df['refans_preprocess'][reference_df['question_id'] == questionId].copy()
        refans_answers = [ref.split() for ref in refans_answers]  # Split just once for each questionId
        wmd_similarity = WmdSimilarity(refans_answers, model)

        for index, row in temp_student_answers.iterrows():
            temp_answer = row['answer_preprocess']
            similarities = wmd_similarity[temp_answer.split()]

            similarity = max(similarities), np.mean(similarities), np.median(similarities), min(similarities)
            columns = [f"max_{prefix}", f"mean_{prefix}", f"median_{prefix}", f"min_{prefix}"]
            temp_student_answers.loc[index, columns] = similarity

        wmd_similarities = pd.concat([wmd_similarities, temp_student_answers])

    return wmd_similarities[columns]


final_df = student_answers.copy()

columns_cbow = compute_wmd_similarities(student_answers, reference_answers, model_cbow, "cbow")
final_df = pd.concat([final_df, columns_cbow], axis=1)

columns_glove = compute_wmd_similarities(student_answers, reference_answers, model_glove, "glove")
final_df = pd.concat([final_df, columns_glove], axis=1)

columns_skip = compute_wmd_similarities(student_answers, reference_answers, model_skip, "skip")
final_df = pd.concat([final_df, columns_skip], axis=1)
wmd_similarities = final_df.copy()
print(wmd_similarities)


     question_id                                        answer_text  notas  \
0              1  Polimorfismo é, como o nome sugere (múltiplas ...   1.11   
1              2  Classes são modelos/"estruturas" de coisas tra...   1.11   
2              3  Herança, em POO, é bem fácil de ser entendida ...   1.00   
3              4  Os possíveis modificadores de acesso são 4: pu...   1.11   
4              1  O polimorfismo estático ele permite que nós cr...   0.40   
..           ...                                                ...    ...   
119            4                                            Nao sei   0.00   
120            1  Polimorfismo permite que a partir de um contra...   0.70   
121            2  Classes são estruturas de dados, que reunem um...   0.60   
122            3  Herança permite que atributos ou métodos de um...   1.11   
123            4  public - Permite que qualquer método ou atribu...   0.75   

                                     answer_preprocess  max_cbo

In [23]:
def compute_cosine_similarities(student_df, reference_df, model, prefix):
    cosine_similarities = pd.DataFrame()

    for questionId in range(1, 5):
        temp_student_answers = student_df[student_df['question_id'] == questionId].copy()
        refans_answers = reference_df['refans_preprocess'][reference_df['question_id'] == questionId].copy()
        refans_answers = [ref.split() for ref in refans_answers]  # Split just once for each questionId

        for index, row in temp_student_answers.iterrows():
            #print(f'resposta: {row["answer_preprocess"]}')
            temp_answer = row['answer_preprocess'].split()
            similarities = ([model.n_similarity(temp_answer, ref) for ref in refans_answers])

            similarity = max(similarities), np.mean(similarities), np.median(similarities), min(similarities)
            columns = [f"max_{prefix}", f"mean_{prefix}", f"median_{prefix}", f"min_{prefix}"]
            temp_student_answers.loc[index, columns] = similarity

        cosine_similarities = pd.concat([cosine_similarities, temp_student_answers])

    return cosine_similarities[columns]

# Assuming student_answers and reference_answers are your dataframes and model_cbow, model_glove, and model_skip are your models

final_df = student_answers.copy()

columns_cbow = compute_cosine_similarities(student_answers, reference_answers, model_cbow, "cbow")
final_df = pd.concat([final_df, columns_cbow], axis=1)

columns_glove = compute_cosine_similarities(student_answers, reference_answers, model_glove, "glove")
final_df = pd.concat([final_df, columns_glove], axis=1)

columns_skip = compute_cosine_similarities(student_answers, reference_answers, model_skip, "skip")
final_df = pd.concat([final_df, columns_skip], axis=1)
cosine_similarities = final_df.copy()
print(cosine_similarities)

     question_id                                        answer_text  notas  \
0              1  Polimorfismo é, como o nome sugere (múltiplas ...   1.11   
1              2  Classes são modelos/"estruturas" de coisas tra...   1.11   
2              3  Herança, em POO, é bem fácil de ser entendida ...   1.00   
3              4  Os possíveis modificadores de acesso são 4: pu...   1.11   
4              1  O polimorfismo estático ele permite que nós cr...   0.40   
..           ...                                                ...    ...   
119            4                                            Nao sei   0.00   
120            1  Polimorfismo permite que a partir de um contra...   0.70   
121            2  Classes são estruturas de dados, que reunem um...   0.60   
122            3  Herança permite que atributos ou métodos de um...   1.11   
123            4  public - Permite que qualquer método ou atribu...   0.75   

                                     answer_preprocess  max_cbo

In [24]:
wmd_similarities.describe()

Unnamed: 0,question_id,notas,max_cbow,mean_cbow,median_cbow,min_cbow,max_glove,mean_glove,median_glove,min_glove,max_skip,mean_skip,median_skip,min_skip
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,2.5,0.751774,0.531753,0.506253,0.503829,0.488024,0.550259,0.523907,0.521725,0.504713,0.5383,0.51312,0.510976,0.495525
std,1.12257,0.377058,0.077205,0.042413,0.038552,0.033041,0.076674,0.043998,0.040743,0.034988,0.078125,0.045149,0.041804,0.036341
min,1.0,0.0,0.419923,0.418533,0.418871,0.417301,0.431605,0.428518,0.430305,0.424555,0.42045,0.419582,0.419355,0.418522
25%,1.75,0.6,0.507074,0.493806,0.492196,0.473265,0.529066,0.512405,0.510948,0.494053,0.516514,0.499353,0.497315,0.478477
50%,2.5,0.82,0.531266,0.511886,0.510646,0.49267,0.549622,0.527128,0.526986,0.508685,0.537974,0.518801,0.517471,0.502222
75%,3.25,1.11,0.552544,0.527925,0.527104,0.505637,0.571125,0.546997,0.545552,0.524746,0.560826,0.538068,0.538908,0.516299
max,4.0,1.11,1.0,0.656276,0.581445,0.561551,1.0,0.675409,0.598586,0.587655,1.0,0.665444,0.59332,0.577276


In [25]:
cosine_similarities.describe()

Unnamed: 0,question_id,notas,max_cbow,mean_cbow,median_cbow,min_cbow,max_glove,mean_glove,median_glove,min_glove,max_skip,mean_skip,median_skip,min_skip
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,2.5,0.751774,0.662372,0.602221,0.603281,0.542744,0.800464,0.75425,0.760966,0.69989,0.735579,0.686927,0.690006,0.634513
std,1.12257,0.377058,0.221566,0.214208,0.216873,0.211393,0.178041,0.18427,0.181457,0.198038,0.231426,0.228144,0.23104,0.223834
min,1.0,0.0,0.044508,0.009274,-0.001814,-0.010381,0.280406,0.212367,0.248683,0.13065,0.076571,0.06118,0.066056,0.041677
25%,1.75,0.6,0.664132,0.578446,0.575647,0.495748,0.785737,0.741294,0.745432,0.685059,0.760242,0.694588,0.702944,0.607276
50%,2.5,0.82,0.736541,0.6682,0.664838,0.598035,0.860457,0.8271,0.827874,0.781116,0.817937,0.771624,0.776091,0.705418
75%,3.25,1.11,0.780456,0.730865,0.736161,0.678915,0.90583,0.865806,0.874659,0.819772,0.854207,0.807301,0.815301,0.771453
max,4.0,1.11,1.0,0.86646,0.872981,0.823113,1.0,0.915108,0.926522,0.897376,1.0,0.917473,0.914,0.885001


In [26]:
def assign_grades(column):
    median = wmd_similarities[column].median()
    std = wmd_similarities[column].std()
    lower_bound = (median - std)
    upper_bound = (median + std)

    def grade(similarity):
      if similarity < lower_bound:
        return 0
      elif similarity > upper_bound:
          return 3
      else:
          # For values between lower_bound and upper_bound
          distance_to_lower = abs(similarity - lower_bound)
          distance_to_upper = abs(similarity - upper_bound)

          # If similarity is closer to the lower_bound, return 1, otherwise return 2
          return 1 if distance_to_lower < distance_to_upper else 2

    return cosine_similarities[column].apply(grade)

graded_wmd = wmd_similarities.copy()

# Assign grades to each column in the dataframe
for column in wmd_similarities.columns[4:]:
    graded_wmd[f"{column}_grade"] = assign_grades(column)

print(graded_wmd[['max_cbow_grade', 'mean_cbow_grade', 'median_cbow_grade',
       'min_cbow_grade', 'max_glove_grade', 'mean_glove_grade',
       'median_glove_grade', 'min_glove_grade', 'max_skip_grade',
       'mean_skip_grade', 'median_skip_grade', 'min_skip_grade']])

     max_cbow_grade  mean_cbow_grade  median_cbow_grade  min_cbow_grade  \
0                 3                3                  3               3   
1                 3                3                  3               3   
2                 3                3                  3               3   
3                 3                3                  3               3   
4                 3                3                  3               3   
..              ...              ...                ...             ...   
119               0                0                  0               0   
120               3                3                  3               3   
121               3                3                  3               3   
122               3                3                  3               3   
123               3                3                  3               0   

     max_glove_grade  mean_glove_grade  median_glove_grade  min_glove_grade  \
0                  3

In [27]:
def assign_grades(column):
    median = cosine_similarities[column].median()
    std = cosine_similarities[column].std()
    lower_bound = (median - std)
    upper_bound = (median + std)

    def grade(similarity):
      if similarity < lower_bound:
        return 0
      elif similarity > upper_bound:
          return 3
      else:
          # For values between lower_bound and upper_bound
          distance_to_lower = abs(similarity - lower_bound)
          distance_to_upper = abs(similarity - upper_bound)

          # If similarity is closer to the lower_bound, return 1, otherwise return 2
          return 1 if distance_to_lower < distance_to_upper else 2

    return cosine_similarities[column].apply(grade)

graded_cosines = cosine_similarities.copy()

# Assign grades to each column in the dataframe
for column in cosine_similarities.columns[4:]:
    graded_cosines[f"{column}_grade"] = assign_grades(column)

print(graded_cosines[['max_cbow_grade', 'mean_cbow_grade', 'median_cbow_grade',
       'min_cbow_grade', 'max_glove_grade', 'mean_glove_grade',
       'median_glove_grade', 'min_glove_grade', 'max_skip_grade',
       'mean_skip_grade', 'median_skip_grade', 'min_skip_grade']])

     max_cbow_grade  mean_cbow_grade  median_cbow_grade  min_cbow_grade  \
0                 2                2                  2               2   
1                 2                2                  2               2   
2                 2                2                  2               2   
3                 2                2                  2               2   
4                 2                2                  2               2   
..              ...              ...                ...             ...   
119               0                0                  0               0   
120               2                2                  2               2   
121               1                2                  2               2   
122               1                1                  1               2   
123               1                1                  1               1   

     max_glove_grade  mean_glove_grade  median_glove_grade  min_glove_grade  \
0                  2

In [28]:
def similarity_to_score(similarity):
    if similarity <= 0.3 :
        score = 0
    elif similarity <= 0.6:
        score = 1
    elif similarity <= 0.9:
        score = 2
    else:
        score = 3
    return score

graded_cosines['grade'] = graded_cosines['notas'].apply(similarity_to_score)
graded_wmd['grade'] = graded_wmd['notas'].apply(similarity_to_score)

graded_cosines['grade'].value_counts()

3    56
2    38
0    21
1     9
Name: grade, dtype: int64

In [29]:
grade_columns = [col for col in graded_wmd.columns if col.endswith('grade')]

grade_similarities = graded_wmd[grade_columns]

# Show frequencies for each 'grade' column
for col in grade_similarities.columns:
    print(f"Frequencies for {col}:")
    print(graded_wmd[col].value_counts())
    print("\n")

Frequencies for max_cbow_grade:
3    97
0    14
2     7
1     6
Name: max_cbow_grade, dtype: int64


Frequencies for mean_cbow_grade:
3    96
0    19
2     5
1     4
Name: mean_cbow_grade, dtype: int64


Frequencies for median_cbow_grade:
3    97
0    19
2     4
1     4
Name: median_cbow_grade, dtype: int64


Frequencies for min_cbow_grade:
3    84
0    28
2     9
1     3
Name: min_cbow_grade, dtype: int64


Frequencies for max_glove_grade:
3    110
0     13
1      1
Name: max_glove_grade, dtype: int64


Frequencies for mean_glove_grade:
3    110
0     14
Name: mean_glove_grade, dtype: int64


Frequencies for median_glove_grade:
3    110
0     14
Name: median_glove_grade, dtype: int64


Frequencies for min_glove_grade:
3    102
0     15
1      5
2      2
Name: min_glove_grade, dtype: int64


Frequencies for max_skip_grade:
3    102
0     14
2      6
1      2
Name: max_skip_grade, dtype: int64


Frequencies for mean_skip_grade:
3    102
0     16
1      3
2      3
Name: mean_skip_grade, 

In [30]:
grade_columns = [col for col in graded_cosines.columns if col.endswith('grade')]

grade_similarities = graded_cosines[grade_columns]

# Show frequencies for each 'grade' column
for col in grade_similarities.columns:
    print(f"Frequencies for {col}:")
    print(graded_cosines[col].value_counts())
    print("\n")

Frequencies for max_cbow_grade:
2    60
1    44
0    18
3     2
Name: max_cbow_grade, dtype: int64


Frequencies for mean_cbow_grade:
2    62
1    46
0    16
Name: mean_cbow_grade, dtype: int64


Frequencies for median_cbow_grade:
2    62
1    46
0    16
Name: median_cbow_grade, dtype: int64


Frequencies for min_cbow_grade:
2    61
1    46
0    16
3     1
Name: min_cbow_grade, dtype: int64


Frequencies for max_glove_grade:
2    62
1    47
0    15
Name: max_glove_grade, dtype: int64


Frequencies for mean_glove_grade:
2    62
1    44
0    18
Name: mean_glove_grade, dtype: int64


Frequencies for median_glove_grade:
2    62
1    44
0    18
Name: median_glove_grade, dtype: int64


Frequencies for min_glove_grade:
2    62
1    40
0    22
Name: min_glove_grade, dtype: int64


Frequencies for max_skip_grade:
2    62
1    42
0    20
Name: max_skip_grade, dtype: int64


Frequencies for mean_skip_grade:
2    62
1    41
0    21
Name: mean_skip_grade, dtype: int64


Frequencies for median_skip_

In [31]:
from sklearn.metrics import cohen_kappa_score

In [32]:
similarities_columns = [col for col in graded_wmd.columns if col.endswith('grade') and col != 'grade']
kappa_results = []
# Calculate Cohen's Kappa for each column with the reference 'grade' column
for col in similarities_columns:
    kappa_linear = cohen_kappa_score(graded_wmd[col], graded_wmd['grade'], weights='linear')
    kappa_quadratic = cohen_kappa_score(graded_wmd[col], graded_wmd['grade'], weights='quadratic')

    kappa_results.append({
        'Column': col,
        'Kappa_Linear': kappa_linear,
        'Kappa_Quadratic': kappa_quadratic
    })

# Transformar a lista de dicionários em um DataFrame
kappa_wmd = pd.DataFrame(kappa_results)

print(kappa_wmd)

                Column  Kappa_Linear  Kappa_Quadratic
0       max_cbow_grade      0.470463         0.656674
1      mean_cbow_grade      0.467049         0.622636
2    median_cbow_grade      0.474516         0.633044
3       min_cbow_grade      0.398577         0.531770
4      max_glove_grade      0.415589         0.547995
5     mean_glove_grade      0.425926         0.555753
6   median_glove_grade      0.425926         0.555753
7      min_glove_grade      0.518645         0.671600
8       max_skip_grade      0.486558         0.647090
9      mean_skip_grade      0.519154         0.680099
10   median_skip_grade      0.524652         0.682199
11      min_skip_grade      0.521507         0.667916


In [33]:
similarities_columns = [col for col in graded_cosines.columns if col.endswith('grade') and col != 'grade']
kappa_results = []
# Calculate Cohen's Kappa for each column with the reference 'grade' column
for col in similarities_columns:
    kappa_linear = cohen_kappa_score(graded_cosines[col], graded_cosines['grade'], weights='linear')
    kappa_quadratic = cohen_kappa_score(graded_cosines[col], graded_cosines['grade'], weights='quadratic')

    kappa_results.append({
        'Column': col,
        'Kappa_Linear': kappa_linear,
        'Kappa_Quadratic': kappa_quadratic
    })

# Transformar a lista de dicionários em um DataFrame
kappa_cosine = pd.DataFrame(kappa_results)

print(kappa_cosine)

                Column  Kappa_Linear  Kappa_Quadratic
0       max_cbow_grade      0.344791         0.484065
1      mean_cbow_grade      0.324779         0.460968
2    median_cbow_grade      0.311408         0.438351
3       min_cbow_grade      0.291810         0.421509
4      max_glove_grade      0.261345         0.397429
5     mean_glove_grade      0.290936         0.443731
6   median_glove_grade      0.290936         0.443731
7      min_glove_grade      0.277101         0.418552
8       max_skip_grade      0.323374         0.477690
9      mean_skip_grade      0.319759         0.451359
10   median_skip_grade      0.310235         0.441669
11      min_skip_grade      0.293596         0.408608


In [None]:
wmd_similarities.to_excel('drive/MyDrive/PIBIC/data/results/similarities/wmd_poo_steam.xlsx', index=False)
#cosine_similarities.to_excel('drive/MyDrive/PIBIC/data/results/similarities/cosine_biology_lemma.xlsx', index=False)
kappa_wmd.to_excel('drive/MyDrive/PIBIC/data/results/kappa/wmd_poo_steam.xlsx', index=False)
#kappa_cosine.to_excel('drive/MyDrive/PIBIC/data/results/kappa/kappa_cosine_biology_lemma.xlsx', index=False)