ANALYSIS OF THE DIFFERENCES BETWEEN QUESTIONS USEFUL FOR LEARNING AND NOT USEFUL FOR LEARNING

In [3]:
import notebook_hook
import logging, sys
logging.disable(sys.maxsize)
import pathlib
import json
from tqdm import tqdm
import pandas as pd
from collections import Counter
import numpy as np
from qg.notebooks.t_tests.t_test_functions import (
    count_tokens,
    diff_number_words_per_question,
    diff_number_of_concepts_per_question,
    diff_prop_concepts_per_words,
    most_frequent_concepts,
    diff_number_of_verbs,
    diff_prop_of_verbs,
)

from qg.results_analysis.objects.POSAnalysis import POS_analysis_object

In [4]:
# Load the LearningQ classification dataset and preview its "test" split.
# NOTE: ROOT_DIR is the grandparent of the kernel's current working directory,
# so the resolved path depends on where the kernel was started.
ROOT_DIR = pathlib.Path().resolve().parents[1]
print(f"Root directory: {ROOT_DIR}")

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale (e.g. cp1252 on Windows), which can mis-decode or reject
# non-ASCII characters in the JSON file.
with open(ROOT_DIR / "LearningQ_data" / "cls_balanced_dataset.json", encoding="utf-8") as f:
    learningq = json.load(f)

# Only the "test" split is analysed in this notebook.
df = pd.DataFrame.from_dict(learningq["test"])
df.head()

Root directory: C:\Users\DeboraOrsolich\Development\question_generation_models\deep_qg\qg


Unnamed: 0,text,labels
0,fraction are very confusing to me i did n't re...,0
1,"i may be jumping the gun here , but has the te...",1
2,is it because the position function can not be...,0
3,"at 2:04 he said he uses gesso sometimes , what...",0
4,when our hypothesis is herbs do nothing then w...,1


In [5]:
# Split the questions by label: rows labelled 1 form the "useful" group,
# rows labelled 0 the "not_useful" group.
sets_name = ["useful", "not_useful"]

is_useful = df["labels"].eq(1)
is_not_useful = df["labels"].eq(0)
useful = df.loc[is_useful]
not_useful = df.loc[is_not_useful]

# `sets` is ordered to match `sets_name`; both lists are passed together to the
# comparison helpers in the following cells.
sets = [useful, not_useful]

# Group sizes, in the same order as `sets_name`.
print(*(len(group) for group in sets))

480 480


### Are there statistically significant differences in the number of words in the questions of each group?

In [6]:
# Compare question length (in tokens) between the two groups. The helper prints
# a significance verdict, the p-value and each group's average length.
# NOTE(review): the statistical test is implemented in t_test_functions —
# presumably a t-test, judging by the module name; confirm there.
diff_number_words_per_question(sets, sets_name)

Yes, there are statistically significant differences in the length of the questions between the two groups
   P-value = 1.0373387011520134e-10
   Average length of useful questions 17.016666666666666 tokens
   Average length of not_useful questions 13.022916666666667 tokens


### Are there statistically significant differences in the number of concepts in the questions of each group?

In [7]:
# Compare the number of concepts per question between the two groups. The helper
# prints a significance verdict, the p-value and each group's average.
# The return value is kept because most_frequent_concepts() consumes it later.
# NOTE(review): presumably it holds the extracted concept strings per group —
# confirm against t_test_functions.
strings_groups = diff_number_of_concepts_per_question(sets, sets_name)

100%|██████████| 480/480 [00:00<00:00, 77885.64it/s]
100%|██████████| 480/480 [00:00<00:00, 96675.43it/s]

Yes, there are statistically significant differences in the number of concepts in each question between the two groups
   P-value = 1.3678269253697558e-19
   Average number of concepts in useful questions: 3.0541666666666667
   Average number of concepts in not useful questions: 1.9333333333333333





### Are there statistically significant differences in the proportion of concepts per word in each question between the two groups?

In [8]:
# Compare the proportion of concepts per word in each question between the two
# groups. The helper prints a significance verdict, the p-value and each group's
# average proportion; nothing is captured from the call.
diff_prop_concepts_per_words(sets, sets_name)


100%|██████████| 480/480 [00:00<00:00, 73719.00it/s]
100%|██████████| 480/480 [00:00<00:00, 95997.80it/s]

Yes, there are statistically significant differences in the proportion of concepts per word in each question between the two groups
   P-value = 8.11429827881051e-14
   Average proportion of concepts per words in useful questions: 0.1833690717818408
   Average proportion of concepts per words in not_useful questions: 0.14494073167877858





### What are the most frequent concepts in each group?

In [9]:
# Rank the concepts of each group by frequency.
# `cnt_useful` / `cnt_not_useful` are read by index in the next cell, where each
# entry provides the concept at position 0 and its count at position 1.
# `all_strings_useful` / `all_strings_not_useful` are not read in the cells
# shown here.
# NOTE(review): presumably they are the flat lists of concepts — confirm.
cnt_useful, cnt_not_useful, all_strings_useful, all_strings_not_useful = most_frequent_concepts(strings_groups)

In [10]:
# Print the `n` highest-ranked concepts of each group with their frequency.
n = 10

# One loop covers both groups so the two reports cannot drift apart.
groups = (("USEFUL", cnt_useful), ("NOT USEFUL", cnt_not_useful))
for position, (label, ranking) in enumerate(groups):
    if position:
        print()  # blank line between the two reports only
    print(f"{n} MORE FREQUENT CONCEPTS IN {label} QUESTIONS")
    print("   Concept:   frequency")
    # Slicing stops at the end of the ranking, so a group with fewer than `n`
    # distinct concepts prints what it has instead of raising IndexError.
    for i, entry in enumerate(ranking[:n]):
        concept, count = entry[0], entry[1]
        print(f"  {i}.   {concept}:     {count}")

10 MORE FREQUENT CONCEPTS IN USEFUL QUESTIONS
   Concept:   frequency
  0.   number:     21
  1.   asthma:     18
  2.   gcf:     17
  3.   sal:     17
  4.   function:     15
  5.   sequence:     14
  6.   numbers:     13
  7.   difference:     12
  8.   force:     11
  9.   equation:     11

10 MORE FREQUENT CONCEPTS IN NOT USEFUL QUESTIONS
   Concept:   frequency
  0.   video:     33
  1.   number:     14
  2.   sal:     12
  3.   answer:     11
  4.   question:     9
  5.   numbers:     9
  6.   time:     9
  7.   month:     7
  8.   way:     7
  9.   problem:     6


### Are there statistically significant differences in the number of verbs in each question between both groups?

In [11]:
# Compare the number of verbs per question between the two groups. The helper
# prints a significance verdict, the p-value and each group's average.
# The return value is kept because most_frequent_concepts() consumes it later.
# NOTE(review): presumably it holds the extracted verbs per group — confirm
# against t_test_functions.
verbs_groups = diff_number_of_verbs(sets, sets_name)

100%|██████████| 480/480 [00:00<00:00, 159973.45it/s]
100%|██████████| 480/480 [00:00<00:00, 159795.69it/s]

Yes, there are statistically significant differences in the number of VERBS in each question between the two groups
   P-value = 0.009772871825290252
   Average number of verbs in useful questions: 1.2291666666666667 verbs
   Average number of verbs in not useful questions: 1.0416666666666667 verbs





### Are there statistically significant differences in the proportion of verbs per word in each question between the two groups?

In [12]:
# Compare the proportion of verbs per word in each question between the two
# groups. The helper prints a significance verdict, the p-value and each group's
# average proportion; nothing is captured from the call.
diff_prop_of_verbs(sets, sets_name)

100%|██████████| 480/480 [00:00<00:00, 96039.02it/s]
100%|██████████| 480/480 [00:00<00:00, 120080.28it/s]

Yes, there are statistically significant differences in the proportion of VERBS per words in each question between the two groups
   P-value = 0.038727064317922937
   Average proportion of verbs per words in useful questions: 0.07043259976922919 verbs
   Average proportion of verbs per words in not useful questions: 0.07963727977227195 verbs





In [13]:
# Rank the verbs of each group by frequency, reusing most_frequent_concepts().
# NOTE: this rebinds `cnt_useful` / `cnt_not_useful`, which held the concept
# rankings until now. Re-running the concept-printing cell after this one would
# therefore print verbs under the "CONCEPTS" headings.
# The lengths of `all_verbs_useful` / `all_verbs_not_useful` are the
# denominators of the proportions printed in the next cell.
cnt_useful, cnt_not_useful, all_verbs_useful, all_verbs_not_useful = most_frequent_concepts(verbs_groups)

In [14]:
# Print the `n` highest-ranked verbs of each group with their frequency and
# their share of the group's `all_verbs_*` length.
n = 10

# One loop covers both groups so the two reports cannot drift apart.
groups = (
    ("USEFUL", cnt_useful, all_verbs_useful),
    ("NOT USEFUL", cnt_not_useful, all_verbs_not_useful),
)
for position, (label, ranking, all_verbs) in enumerate(groups):
    if position:
        print()  # blank line between the two reports only
    print(f"{n} MORE FREQUENT VERBS IN {label} QUESTIONS")
    print("   Verb:   frequency      Proportion")
    total = len(all_verbs)
    # Slicing stops at the end of the ranking, so a group with fewer than `n`
    # distinct verbs prints what it has instead of raising IndexError. An empty
    # ranking also never reaches the division below.
    for i, entry in enumerate(ranking[:n]):
        verb, count = entry[0], entry[1]
        prop = count / total
        print(f"  {i}.   {verb}:     {count}      {prop}")

10 MORE FREQUENT VERBS IN USEFUL QUESTIONS
   Verb:   frequency      Proportion
  0.   know:     20      0.03389830508474576
  1.   find:     19      0.03220338983050847
  2.   use:     18      0.030508474576271188
  3.   mean:     12      0.020338983050847456
  4.   write:     11      0.01864406779661017
  5.   divide:     9      0.015254237288135594
  6.   need:     9      0.015254237288135594
  7.   says:     8      0.013559322033898305
  8.   add:     8      0.013559322033898305
  9.   multiply:     8      0.013559322033898305

10 MORE FREQUENT VERBS IN NOT USEFUL QUESTIONS
   Verb:   frequency      Proportion
  0.   know:     23      0.046
  1.   use:     21      0.042
  2.   mean:     17      0.034
  3.   help:     12      0.024
  4.   solve:     12      0.024
  5.   explain:     10      0.02
  6.   find:     10      0.02
  7.   understand:     10      0.02
  8.   come:     9      0.018
  9.   need:     8      0.016


In [15]:
# Collect, for each group, the first element of every processed question
# (reported later as the question's first word).
first_words = {}
for group_df, name in zip(sets, sets_name):
    # The loop variable is deliberately not called `set`: at notebook scope that
    # would rebind the builtin to a DataFrame for the rest of the kernel
    # session and break any later `set(...)` call.
    # NOTE(review): a fresh POS_analysis_object is built for each group; hoist
    # it out of the loop only after confirming the object keeps no per-run state.
    pos_analysis = POS_analysis_object()
    questions_pipeline = pos_analysis.nlp_pipeline(group_df["text"].values)

    # NOTE(review): `question[0]` presumably fails on a question that yields no
    # tokens — confirm the dataset cannot contain empty texts.
    first_word = [str(question[0]) for question in tqdm(questions_pipeline)]
    first_words[name] = first_word

100%|██████████| 480/480 [00:00<00:00, 480952.20it/s]
100%|██████████| 480/480 [00:00<00:00, 480952.20it/s]


In [16]:
# Rank the first words of each group from most to least frequent.
# NOTE: this rebinds `cnt_useful` / `cnt_not_useful` once more, now to lists of
# (first word, count) pairs.
cnt = Counter()  # not read in the cells shown here

# A stable ascending sort by count followed by a full reversal: the most
# frequent word comes first, and words with equal counts appear in the reverse
# of their first-seen order.
cnt_useful = sorted(Counter(first_words["useful"]).items(), key=lambda pair: pair[1])[::-1]
cnt_not_useful = sorted(Counter(first_words["not_useful"]).items(), key=lambda pair: pair[1])[::-1]

In [17]:
# Print the `n` most frequent first words of each group with their frequency
# and their share of the group's questions.
n = 20

# One loop covers both groups so the two reports cannot drift apart.
groups = (
    ("USEFUL", "useful", cnt_useful),
    ("NOT USEFUL", "not_useful", cnt_not_useful),
)
for position, (label, key, ranking) in enumerate(groups):
    if position:
        print()  # blank line between the two reports only
    print(f"{n} MORE FREQUENT FIRST WORDS IN {label} QUESTIONS")
    print("   Init word:   frequency      Proportion")
    total = len(first_words[key])
    # Slicing stops at the end of the ranking, so a group with fewer than `n`
    # distinct first words prints what it has instead of raising IndexError.
    for i, entry in enumerate(ranking[:n]):
        word, count = entry[0], entry[1]
        prop = count / total
        print(f"  {i}.   {word}:     {count}      {prop}")

20 MORE FREQUENT FIRST WORDS IN USEFUL QUESTIONS
   Init word:   frequency      Proportion
  0.   what:     69      0.14375
  1.   how:     69      0.14375
  2.   why:     37      0.07708333333333334
  3.   is:     37      0.07708333333333334
  4.   at:     27      0.05625
  5.   so:     24      0.05
  6.   if:     21      0.04375
  7.   does:     17      0.035416666666666666
  8.   can:     16      0.03333333333333333
  9.   i:     15      0.03125
  10.   are:     13      0.027083333333333334
  11.   would:     13      0.027083333333333334
  12.   in:     12      0.025
  13.   when:     11      0.022916666666666665
  14.   for:     8      0.016666666666666666
  15.   do:     7      0.014583333333333334
  16.   should:     5      0.010416666666666666
  17.   the:     4      0.008333333333333333
  18.   could:     4      0.008333333333333333
  19.   but:     4      0.008333333333333333

20 MORE FREQUENT FIRST WORDS IN NOT USEFUL QUESTIONS
   Init word:   frequency      Proportion
  0.  