# Генератор упражнений по английскому языку

Генератор по тексту генерирует 4 вида упражнений:
* Auxiliary verb - вставка вспомогательного глагола
* Words by association - подбор ассоциации к слову
* Choose the correct verb form - выбрать правильную форму глагола 
* Sentence transformation - выбрать правильное предложение 

Количество упражнений каждого вида, включаемых в датасет, задаётся параметром k функции selection_of_k_exercises.

Запись упражнений в датасет выполняет функция filling_of_datasets.

На выходе получаем датасет с набором упражнений.

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# pip install pyinflect

In [2]:
import json

import spacy
from spacy import displacy
import en_core_web_sm
import pyinflect

import gensim.downloader as api

import nltk
nltk.download("extended_omw")
from nltk.corpus import wordnet

[nltk_data] Downloading package extended_omw to
[nltk_data]     C:\Users\igor\AppData\Roaming\nltk_data...
[nltk_data]   Package extended_omw is already up-to-date!


In [3]:
# small spaCy English model, used below for sentence splitting and token tags
nlp = en_core_web_sm.load()

# small GloVe model (glove-wiki-gigaword-100)
# the download takes a long time if the model is not installed yet
model = api.load("glove-wiki-gigaword-100")

In [4]:
# Path of the source text. A forward slash is used because the previous
# backslash formed the invalid escape sequence "\L" and only worked on Windows;
# "/" is accepted by open() on every platform.
filename = 'red_hat/Little_Red_Riding_Hood_Charles_Perrault.txt'

In [5]:
# Empty dataset that accumulates the generated exercises, one row per exercise
df = pd.DataFrame(
    columns=[
        'raw',
        'type',
        'object',
        'options',
        'answer',
        'description',
    ],
)

In [6]:
# Split the source text into sentences with spaCy, skipping blank lines and
# fragments that are at most one character long
list_text = []
with open(filename, 'r', encoding='UTF-8') as file:
    for line in file:
        if not line.rstrip():
            continue
        for sent in nlp(line).sents:
            sentence = str(sent).rstrip()
            if len(sentence) > 1:
                list_text.append(sentence)

# Selection of k exercises

In [7]:
# Pick k random exercises to add to the dataset
def selection_of_k_exercises(list_exercise, k=10):
    """Return a random sample of exercises, without replacement.

    Args:
        list_exercise: sequence of generated exercises.
        k: number of exercises to include in the dataset.

    Returns:
        A new list with ``k`` exercises, or with all of them (in random
        order) when fewer than ``k`` are available.
    """
    # random.sample raises ValueError when k exceeds the population size,
    # which happens for short texts; cap the sample size instead of crashing.
    return random.sample(list_exercise, min(k, len(list_exercise)))

# Filling_of_datasets

In [8]:
# Append finished exercises to the dataset
def filling_of_datasets(list_of_exercises):
    """Append every exercise in ``list_of_exercises`` as a row of the global ``df``.

    Each exercise is a sequence whose first six items are, in order:
    raw, type, object, options, answer, description.

    Prints 'success' when done and returns None.
    """
    # Iterate over the argument; the previous version read the global
    # ``list_k_exercises`` and silently ignored what the caller passed in.
    for exercise in list_of_exercises:
        df.loc[len(df)] = {
            'raw': exercise[0],
            'type': exercise[1],
            'object': exercise[2],
            'options': exercise[3],
            'answer': exercise[4],
            'description': exercise[5],
        }
    print('success')

# Auxiliary verb

In [9]:
# Build exercises: fill in the missing auxiliary verb
def inserting_an_auxiliary_verb(list_text):
    """Create one fill-in-the-gap exercise per sentence that has an auxiliary verb.

    Args:
        list_text: list of sentence strings.

    Returns:
        A list of exercises, each of the form
        [sentence, 'missing_word', sentence_with_gap, [], answer, description].
        Sentences without any of the listed verbs produce no exercise.
    """
    helping_verbs = {
        'am', 'are', 'is', 'was', 'were', 'do', 'does', 'did',
        'have', 'has', 'had', 'doing', 'having', 'should', 'will',
    }
    exercise_auxiliary_verb = []

    for _sentences in list_text:
        for token in nlp(_sentences):
            if token.text not in helping_verbs:
                continue
            # Mask only this token using its character offset. str.replace
            # would also mask other occurrences and substrings, e.g. the
            # "is" inside "This".
            start = token.idx
            end = start + len(token.text)
            sentences_exercise = _sentences[:start] + "******" + _sentences[end:]
            exercise_auxiliary_verb.append([
                _sentences,
                'missing_word',
                sentences_exercise,
                [],
                token.text,
                'Заполните пропуск вспомогательным глаголом',
            ])
            break  # one gap per sentence
    print(f'Количество созданных упражнений {len(exercise_auxiliary_verb)}')
    return exercise_auxiliary_verb

In [10]:
# Generate the auxiliary-verb exercises from the sentence list
list_auxiliary_verb = inserting_an_auxiliary_verb(list_text)

Количество созданных упражнений 24


In [11]:
# Sample the auxiliary-verb exercises with k=10 and write the sample to df
list_k_exercises = selection_of_k_exercises(list_auxiliary_verb, 10)
filling_of_datasets(list_k_exercises)

success


# Words by association

In [12]:
# Collect candidate associations for one lower-cased word
def _collect_associations(word):
    """Return a set of candidate associations for ``word``.

    Candidates come from the embedding model (nearest neighbours plus three
    analogy queries) and from WordNet (lemma names and first antonyms).
    Raises KeyError if the embedding model does not know ``word``.
    """
    queries = (
        model.similar_by_word(word),
        model.most_similar(positive=[word, 'good'], negative=['bad']),
        model.most_similar(positive=[word, 'man'], negative=['woman']),
        model.most_similar(positive=[word, 'short'], negative=['long']),
    )
    candidates = set()
    for neighbours in queries:
        candidates.update(neighbour[0] for neighbour in neighbours)

    # additional lookup through WordNet
    for synset in wordnet.synsets(word):
        candidates.update(synset.lemma_names())
        for lemma in synset.lemmas():
            antonyms = lemma.antonyms()
            if antonyms:  # antonyms can serve as associations too
                candidates.add(antonyms[0].name())
    return candidates


# Build exercises: find an association for a word
def words_by_association(list_text):
    """Create at most one word-association exercise per sentence.

    A word qualifies when it is a noun or verb, is not title-cased and is
    longer than four characters. The first qualifying word with at least
    eight associations becomes the exercise.

    Args:
        list_text: list of sentence strings.

    Returns:
        A list of exercises, each of the form
        [sentence, 'missing_word', sentence, [], associations, description],
        where ``associations`` is a sorted list of accepted answers.
    """
    exercise_association = []

    for _sentences in list_text:
        for token in nlp(_sentences):
            is_candidate = (
                token.pos_ in ('NOUN', 'VERB')
                and not token.text.istitle()
                and len(token.text) > 4
            )
            if not is_candidate:
                continue

            word = token.text.lower()
            try:
                candidates = _collect_associations(word)
            except KeyError:
                # The word is missing from the embedding vocabulary; try the
                # next word instead of aborting the whole run.
                continue

            # Drop words that contain the target or are contained in it
            # (same-root words would give the answer away).
            association_words = sorted(
                candidate.replace('_', ' ')
                for candidate in candidates
                if word not in candidate and candidate not in word
            )
            # The answer list has to be long so the learner does not have to
            # guess from a handful of words; with too few associations skip
            # this word only and keep looking in the same sentence.
            if len(association_words) < 8:
                continue

            exercise_association.append([
                _sentences,
                'missing_word',
                _sentences,
                [],
                association_words,
                f'Подберите ассоциацию к слову {token.text}',
            ])
            break  # one exercise per sentence
    print(f'Количество созданных упражнений {len(exercise_association)}')
    return exercise_association

In [13]:
# Generate the word-association exercises from the sentence list
list_words_by_association = words_by_association(list_text)

Количество созданных упражнений 33


In [14]:
# Sample the word-association exercises with k=10 and write the sample to df
list_k_exercises = selection_of_k_exercises(list_words_by_association, 10)
filling_of_datasets(list_k_exercises)

success


# Choose the correct verb form

In [15]:
# Build exercises: choose the correct verb form
def choose_correct_verb_form(list_text):
    """Create one verb-form exercise per sentence that contains a verb.

    The first token tagged VERB is masked in the sentence; the options are
    the original form plus its VBP, VBZ, VBG and VBD inflections.

    Args:
        list_text: list of sentence strings.

    Returns:
        A list of exercises, each of the form
        [sentence, 'select_word', sentence_with_gap, options, answer,
        description], where ``options`` is a set of verb forms.
    """
    exercise_correct_verb_form = []
    for _sentences in list_text:
        for token in nlp(_sentences):
            if token.pos_ != 'VERB':
                continue

            # Mask only this token using its character offset; str.replace
            # would also mask other occurrences and substrings of the verb.
            start = token.idx
            end = start + len(token.text)
            sentences_exercise = _sentences[:start] + "******" + _sentences[end:]

            vrb = {token.text}
            for tag in ('VBP', 'VBZ', 'VBG', 'VBD'):
                form = token._.inflect(tag)
                # A failed inflection yields None, which must not be offered
                # to the learner as an option.
                if form is not None:
                    vrb.add(form)

            exercise_correct_verb_form.append([
                _sentences,
                'select_word',
                sentences_exercise,
                vrb,
                token.text,
                'Выберите слово',
            ])
            break  # only one exercise per sentence
    print(f'Количество созданных упражнений {len(exercise_correct_verb_form)}')
    return exercise_correct_verb_form

In [16]:
# Generate the verb-form exercises from the sentence list
list_choose_correct_verb_form = choose_correct_verb_form(list_text)

Количество созданных упражнений 40


In [17]:
# Sample the verb-form exercises with k=10 and write the sample to df
list_k_exercises = selection_of_k_exercises(list_choose_correct_verb_form, 10)
filling_of_datasets(list_k_exercises)

success


# Sentence transformation

In [18]:
def sentence_transformation(list_text):
    """Create "choose the correct sentence" exercises.

    For each sentence longer than 40 characters, the first noun, verb,
    adverb or adjective known to the embedding model is replaced by a
    similar word, giving two distorted variants. The options are the
    original sentence and the two variants in random order.

    Args:
        list_text: list of sentence strings.

    Returns:
        A list of exercises, each of the form
        [sentence, 'select_word', [], options, sentence, description].
    """
    exercise_sentence_transf = []
    topn = 5  # number of nearest neighbours to draw a replacement from
    for _sentences in list_text:
        # short sentences are not transformed
        if len(_sentences) <= 40:
            continue
        for token in nlp(_sentences):
            if token.pos_ not in ('NOUN', 'VERB', 'ADV', 'ADJ'):
                continue

            word = token.text.lower()
            m, n = np.random.randint(0, topn, 2)
            try:
                new_word_1 = model.most_similar(word, topn=topn)[m][0]
                new_word_2 = model.most_similar(
                    positive=[word, 'bad'], negative=['good'], topn=topn
                )[n][0]
            except KeyError:
                # The word is missing from the embedding vocabulary; try the
                # next word instead of aborting the whole run.
                continue

            if token.text.istitle():
                new_word_1 = new_word_1.title()
                new_word_2 = new_word_2.title()

            # Replace only this token using its character offset; str.replace
            # would also rewrite other occurrences and substrings.
            start = token.idx
            end = start + len(token.text)
            new_sent_1 = _sentences[:start] + new_word_1 + _sentences[end:]
            new_sent_2 = _sentences[:start] + new_word_2 + _sentences[end:]

            sentence_transf = [_sentences, new_sent_1, new_sent_2]
            random.shuffle(sentence_transf)
            exercise_sentence_transf.append([
                _sentences,
                'select_word',
                [],
                sentence_transf,
                _sentences,
                'Выберите правильное предложение',
            ])
            break  # one exercise per sentence
    print(f'Количество созданных упражнений {len(exercise_sentence_transf)}')
    return exercise_sentence_transf

In [19]:
# Generate the sentence-transformation exercises from the sentence list
list_sentence_transformation = sentence_transformation(list_text)

Количество созданных упражнений 31


In [20]:
# Sample the sentence-transformation exercises with k=10 and write the sample to df
list_k_exercises = selection_of_k_exercises(list_sentence_transformation, 10)
filling_of_datasets(list_k_exercises)

success


In [21]:
# Show the last rows of the dataset
df.tail()

Unnamed: 0,raw,type,object,options,answer,description
35,This good woman had a little red riding hood m...,select_word,[],[This good woman had a little red riding hood ...,This good woman had a little red riding hood m...,Выберите правильное предложение
36,"I say ""wolf,"" but there are various kinds of w...",select_word,[],"[I say ""wolf,"" but there are various kinds of ...","I say ""wolf,"" but there are various kinds of w...",Выберите правильное предложение
37,"""All the better to hug you with, my dear.",select_word,[],"[""All the even to hug you with, my dear., ""All...","""All the better to hug you with, my dear.",Выберите правильное предложение
38,"The wolf, seeing her come in, said to her, hid...",select_word,[],"[The monster, seeing her come in, said to her,...","The wolf, seeing her come in, said to her, hid...",Выберите правильное предложение
39,"The good grandmother, who was in bed, because ...",select_word,[],"[The worse grandmother, who was in bed, becaus...","The good grandmother, who was in bed, because ...",Выберите правильное предложение


In [22]:
# Save the dataset to project_df.csv without the index column
df.to_csv('project_df.csv', index=False)