# Предобработка данных

Импортируем все необходимые в работе библиотеки.

In [1]:
import pandas as pd
import numpy as np
import os
import re
import zipfile 

import pysrt
import PyPDF2
from PyPDF2 import PdfReader
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

## Предобработка документов со списками слов, соответствующих уровням владения английским языком

Документы хранятся в архиве *Oxford_CEFR_level.zip*. Он скачан и сохранен в папку с проектом. Извлечем их с помощью модуля zipfile.

In [2]:
archive = './Oxford_CEFR_level.zip'
with zipfile.ZipFile(archive, 'r') as zip_file:
    zip_file.extractall('.')

### American_Oxford_3000_by_CEFR_level

Прочитаем pdf-файлы со списками слов, соответствующими уровням владения английским языком. Начнем с файла *American_Oxford_3000_by_CEFR_level.pdf*

In [3]:
reader = PdfReader("American_Oxford_3000_by_CEFR_level.pdf")
number_of_pages = len(reader.pages)

Объединим текст со всех страниц выбранного документа в один файл.

In [4]:
a1_b2_words_american = ''

for i in range(number_of_pages):
    a1_b2_words_american += reader.pages[i].extract_text() + '\n'
    a1_b2_words_american = a1_b2_words_american.replace(f'{i + 1} / {len(reader.pages)}', '')

In [5]:
print(a1_b2_words_american)

© Oxford University Press The Oxford 3000™ by CEFR level  
(American English)
The Oxford 3000 is the list of the 3000 most important words to learn in English , from A 1 to B2 level.
A1
a, an indefinite article
about prep., adv.
above prep., adv.
across prep., adv.
action n.
activity n.
actor n.
actress n.
add v.
address n.
adult n.
advice n.
afraid adj.
after prep.
afternoon n.
again adv.
age n.
ago adv.
agree v.
air n.
airport n.
all det., pron.
also adv.
always adv.
amazing adj.
and conj.
angry adj.
animal n.
another det./pron.
answer n., v.
any det., pron.
anyone pron.
anything pron.
apartment n.
apple n.
April n.
area n.
arm n.
around prep., adv.
arrive v.
art n.
article n.
artist n.
as prep.
ask v.
at prep.
August n.
aunt n.
away adv.
awesome adj. 
baby n.
back n., adv.
bad adj.
bag n.
ball n.
banana n.band n.
bank (money)  n.
bar n.
baseball n.
basketball n.
bath n.
bathroom n.
be v., auxiliary v.
beach n.
beautiful adj.
because conj.
become v.
bed n.
bedroom n.
beer n.
before p

Создадим список текста, который нужно удалить из документа, чтобы в нем остались только слова и уровни английского.

In [6]:
to_delete = ['The Oxford 3000 is the list of the 3000 most important words to learn in English , from A 1 to B2 level.',
             '© Oxford University Press', 'The Oxford 3000™ by CEFR level', 
             'indefinite article', 'definite article', 'infinitive marker', 'prep., adv.',
             '/n.', 'adj.', 'prep.', 'adv.', 'det.', 'det./pron./adv.,' 'conj.', 'conj.', 'det./pron.',
             'pron.', 'n., adv.', 'exclam.', 'modal', 'auxiliary', 'v.', '/adv.', '/pron.', ' number',
             'n.', 'v,', '/number', '/', 'to  infinitive marker', 
            ]

Зададим функцию, которая будет:
1. Удалять весь текст в скобках;
2. Удалять лишний текст (который записан в список *to_delete*);
3. Заменять отдельно стоящие буквы *v* и цифры, которым не предшествуют буквы *A*, *B*, *C*, на пробелы

In [7]:
def drop_text(doc):
    doc = re.sub(r'\(.+?\)\s', '', doc)
    
    for word in to_delete:
        doc = doc.replace(word, '')
    
    doc = re.sub(r'\sv\s|,|(?<![ABC])\d', '', doc)
    
    return doc        

Применим данную функцию к нашему документу.

In [8]:
a1_b2_words_american = drop_text(a1_b2_words_american)

Посмотрим на результат.

In [9]:
print(a1_b2_words_american)

   

A1
a an 
about 
above 
across 
action 
activity 
actor 
actress 
add 
address 
adult 
advice 
afraid 
after 
afternoon 
again 
age 
ago 
agree 
air 
airport 
all  
also 
always 
amazing 
and 
angry 
animal 
another 
answer  
any  
anyone 
anything 
apartment 
apple 
April 
area 
arm 
around 
arrive 
art 
article 
artist 
as 
ask 
at 
August 
aunt 
away 
awesome  
baby 
back  
bad 
bag 
ball 
banana band 
bank  
bar 
baseball 
basketball 
bath 
bathroom 
be   
beach 
beautiful 
because 
become 
bed 
bedroom 
beer 
before 
begin 
beginning 
behind 
believe 
below  
best 
better 
between 
bicycle 
big 
bike 
bill 
bird 
birthday 
black  
blog 
blond 
blue  
boat 
body 
book 
boot 
bored 
boring 
born 
both 
bottle 
box 
boy 
boyfriend 
bread 
break  
breakfast 
bring 
brother 
brown  
build 
building 
bus 
business 
busy 
but butter 
buy 
by 
bye 
cafe 
cake 
call  
camera 
can  
cannot  
capital  
car 
card 
career 
carrot 
carry 
cat 
CD 
cent 
center 
chair 
change  
chart 
cheap 

Обработка удалась: в документе остались только слова и уровни знания английского.

Теперь разобъем слова по уровням.

In [10]:
A1_american = a1_b2_words_american.split('A1')[1].replace(' ', '\n').split('A2')[0].split('\n')
A1_american = [i.lower() for i in A1_american if i]

A2_american = a1_b2_words_american.split('A2')[1].split('B1')[0].replace(' ', '\n').split('\n')
A2_american = [i.lower() for i in A2_american if i]

B1_american = a1_b2_words_american.split('B1')[1].split('B2')[0].replace(' ', '\n').split('\n')
B1_american = [i.lower() for i in B1_american if i]

B2_american = a1_b2_words_american.split('B2')[1].replace(' ', '\n').split('\n')
B2_american = [i.lower() for i in B2_american if i]

Проверим, что преобразование осуществляется корректно.

In [11]:
print(B2_american)

['abandon', 'abroad', 'absolute', 'acceptable', 'accompany', 'account', 'accurate', 'accuse', 'acknowledge', 'acquire', 'actual', 'adapt', 'additional', 'address', 'adopt', 'advance', 'affair', 'afterward', 'agency', 'agenda', 'aggressive', 'aid', 'aircraft', 'alarm', 'alter', 'amount', 'anger', 'angle', 'anniversary', 'annual', 'anxious', 'apparent', 'apparently', 'appeal', 'approach', 'appropriate', 'approval', 'approve', 'arise', 'armed', 'arms', 'artificial', 'artistic', 'ashamed', 'aside', 'aspect', 'assess', 'assessment', 'associate', 'associated', 'association', 'assume', 'attempt', 'attorney', 'back', 'bacteria', 'bar', 'barrier', 'basically', 'battle', 'bear', 'beat', 'beg', 'being', 'bent', 'bet', 'beyond', 'bill', 'bitter', 'blame', 'blind', 'bond', 'border', 'breast', 'brief', 'broad', 'broadcast', 'budget', 'bullet', 'bunch', 'burn', 'bush', 'but', 'calculate', 'cancel', 'cancer', 'capable', 'capacity', 'capture', 'cast', 'catch', 'chain', 'chair', 'chairman', 'challenge',

Все корректно.

### American_Oxford_5000_by_CEFR_level

Аналогичным образом обработаем файл *American_Oxford_5000_by_CEFR_level.pdf*

In [12]:
reader = PdfReader("American_Oxford_5000_by_CEFR_level.pdf")
number_of_pages = len(reader.pages)

Сохраним все слова в одной переменной.

In [13]:
b2_c1_words_american = ''

for i in range(number_of_pages):
    b2_c1_words_american += reader.pages[i].extract_text() + '\n'
    b2_c1_words_american = b2_c1_words_american.replace(f'{i + 1} / {len(reader.pages)}', '')

Выведем на экран.

In [14]:
print(b2_c1_words_american)

© Oxford University Press The Oxford 5000 ™ by CEFR level 
(American English)
The Oxford 5000 is an expanded core word list for advanced learners of English. As well as the Oxford 
3000 , it includes an additional 2000 words for learners at B2-C1 level , which are listed here.
B2
absorb v.
abstract adj.
accent n.
accidentally adv.
accommodate v.
accommodation n.
accomplish v.
accountant n.
accuracy n.
accurately adv.
acid n.
acre n.
activate v.
addiction n.
additionally adv.
adequate adj.
adequately adv.
adjust v.
affordable adj.
aged adj.
agriculture n.
AIDS n.
alien n.
alongside prep.
altogether adv.
ambitious adj.
ambulance n.
amusing adj.
analyst n.
ancestor n.
animation n.
annually adv.
anticipate v.
anxiety n.
apology n.
applicant n.
appropriately adv.
arrow n.
artwork n.
asset n.
assign v.
assistance n.
assumption n.
assure v.
astonishing adj.
athletic adj.
attachment n.
audio adj.
awareness n.
awkward adj.
badge n.
balanced adj.
ballet n.
balloon n.barely adv.
bargain n.
baseme

Дополним список удаляемого текста.

In [15]:
to_delete += ('  B2', ' adj', 'adv .', 'v . ', 'The Oxford 5000 ™ by CEFR level', 'The Oxford 5000™ by CEFR level',
              'The Oxford 5000 is an expanded core word list for advanced learners of English. As well as the Oxford',
              '3000', 'it includes an additional 2000 words for learners at B2-C1 level', 'which are listed here.'
              )

Удалим лишние пробелы в словах *'residence'* и *'revenge'*.

In [16]:
b2_c1_words_american = b2_c1_words_american.replace('residen ce', 'residence')
b2_c1_words_american = b2_c1_words_american.replace('reve nge  ', 'revenge')

Удалим лишний текст с помощью ранее созданной функции *drop_text*.

In [17]:
b2_c1_words_american = drop_text(b2_c1_words_american)

In [18]:
print(b2_c1_words_american)

  
 
    
B2
absorb 
abstract 
accent 
accidentally 
accommodate 
accommodation 
accomplish 
accountant 
accuracy 
accurately 
acid 
acre 
activate 
addiction 
additionally 
adequate 
adequately 
adjust 
affordable 
aged 
agriculture 
AIDS 
alien 
alongside 
altogether 
ambitious 
ambulance 
amusing 
analyst 
ancestor 
animation 
annually 
anticipate 
anxiety 
apology 
applicant 
appropriately 
arrow 
artwork 
asset 
assign 
assistance 
assumption 
assure 
astonishing 
athletic 
attachment 
audio 
awareness 
awkward 
badge 
balanced 
ballet 
balloon barely 
bargain 
basement 
basket 
bat 
beneficial 
beside 
besides 
bias 
bid  
biological 
blanket 
blow 
bold 
bombing 
boost  
bound 
brick 
briefly 
broadcaster 
broadly 
buck 
bug 
cabin 
canal 
candle 
carbon 
castle
casual 
cave 
certainty 
certificate 
challenging 
championship 
charming 
chase  
cheek 
cheer  
chop 
circuit 
civilization 
clarify 
classify 
cliff 
clinic 
clip 
coincidence 
collector 
colony 
colorful 
comic  
com

Сохраним слова с уровнями B2 и C1 в соответствующие переменные.

In [19]:
B2_aux_american = b2_c1_words_american.split('B2')[1].split('C1')[0].replace(' ', '\n').split('\n')
B2_aux_american = [i.lower() for i in B2_aux_american if i]

C1_american = b2_c1_words_american.split('C1')[1].replace(' ', '\n').split('\n')
C1_american = [i.lower() for i in C1_american if i]

### The_Oxford_3000_by_CEFR_level

Далее сохраним слова британского английского. Начнем с документа *The_Oxford_3000_by_CEFR_level.pdf*

In [20]:
reader = PdfReader("The_Oxford_3000_by_CEFR_level.pdf")
number_of_pages = len(reader.pages)

Выгрузим содержимое документа.

In [21]:
a1_b2_words_british = ''

for i in range(number_of_pages):
    a1_b2_words_british += reader.pages[i].extract_text() + '\n'
    a1_b2_words_british = a1_b2_words_british.replace(f'{i + 1} / {len(reader.pages)}', '')

In [22]:
print(a1_b2_words_british)

© Oxford University Press The Oxford 3000™ by CEFR level
The Oxford 3000 is the list of the 3000 most important words to learn in English , from A 1 to B2 level.
A1
a, an indefinite article
about prep. , adv.
above prep. , adv.
across prep. , adv.
action n.
activity n.
actor n.
actress n.
add v.
address n.
adult n.
advice n.
afraid adj.
after prep.
afternoon n.
again adv.
age n.
ago adv.
agree v.
air n.
airport n.
all det. , pron.
also adv.
always adv.
amazing adj.
and conj.
angry adj.
animal n.
another det./pron.
answer n. , v.
any det. , pron.
anyone pron.
anything pron.
apartment n.
apple n.
April n.
area n.
arm n.
around prep. , adv.
arrive v.
art n.
article n.
artist n.
as prep.
ask v.
at prep.
August n.
aunt n.
autumn n.
away adv.
baby n.
back n. , adv.
bad adj.
bag n.
ball n.
banana n.
band n.
bank (money) n.
bath n.bathroom n.
be v. , auxiliary v.
beach n.
beautiful adj.
because conj.
become v.
bed n.
bedroom n.
beer n.
before prep.
begin v.
beginning n.
behind prep. , adv.
bel

Удалим лишний текст.

In [23]:
a1_b2_words_british = drop_text(a1_b2_words_british)

In [24]:
print(a1_b2_words_british)

 

A1
a an 
about   
above   
across   
action 
activity 
actor 
actress 
add 
address 
adult 
advice 
afraid 
after 
afternoon 
again 
age 
ago 
agree 
air 
airport 
all   
also 
always 
amazing 
and 
angry 
animal 
another 
answer   
any   
anyone 
anything 
apartment 
apple 
April 
area 
arm 
around   
arrive 
art 
article 
artist 
as 
ask 
at 
August 
aunt 
autumn 
away 
baby 
back   
bad 
bag 
ball 
banana 
band 
bank 
bath bathroom 
be    
beach 
beautiful 
because 
become 
bed 
bedroom 
beer 
before 
begin 
beginning 
behind   
believe 
below   
best 
better 
between 
bicycle 
big 
bike 
bill 
bird 
birthday 
black   
blog 
blonde 
blue   
boat 
body 
book 
boot 
bored 
boring 
born 
both 
bottle 
box 
boy 
boyfriend 
bread 
break   
breakfast 
bring 
brother 
brown   
build 
building 
bus 
business 
busy 
but 
butter 
buy 
by 
bye 
cafe 
cake 
call   
camera 
can  cannot  
capital   
car 
card 
career 
carrot 
carry 
cat 
CD 
cent 
centre 
century 
chair 
change   
chart 
cheap

Сохраним слова по уровням.

In [25]:
A1_british = a1_b2_words_british.split('A1')[1].replace(' ', '\n').split('A2')[0].replace(' ', '').split('\n')
A1_british = [i.lower() for i in A1_british if i]

A2_british = a1_b2_words_british.split('A2')[1].split('B1')[0].replace(' ', '\n').split('\n')
A2_british = [i.lower() for i in A2_british if i]

B1_british = a1_b2_words_british.split('B1')[1].split('B2')[0].replace(' ', '\n').split('\n')
B1_british = [i.lower() for i in B1_british if i]

B2_british = a1_b2_words_british.split('B2')[1].replace(' ', '\n').split('\n')
B2_british = [i.lower() for i in B2_british if i]

### The_Oxford_5000_by_CEFR_level

Наконец, удалим лишнее из документа *The_Oxford_5000_by_CEFR_level.pdf*

In [26]:
reader = PdfReader("The_Oxford_5000_by_CEFR_level.pdf")
number_of_pages = len(reader.pages)

Выгружаем содержимое документа.

In [27]:
b2_c1_words_british = ''

for i in range(number_of_pages):
    b2_c1_words_british += reader.pages[i].extract_text() + '\n'
    b2_c1_words_british = b2_c1_words_british.replace(f'{i + 1} / {len(reader.pages)}', '')

In [28]:
print(b2_c1_words_british)

© Oxford University Press The Oxford 5000 ™ by CEFR level
The Oxford 5000 is an expanded core word list for advanced learners of English. As well as the Oxford 
3000 , it includes an additional 2000 words for learners at B2-C1 level , which are listed here.
B2
absorb v.
abstract adj.
accent n.
accidentally adv.
accommodate v.
accomplish v.
accountant n.
accuracy n.
accurately adv.
acid n.
activate v.
addiction n.
additionally adv.
adequate adj.
adequately adv.
adjust v.
affordable adj.
agriculture n.
AIDS n.
alien n.
alongside prep.
altogether adv.
ambulance n.
amusing adj.
analyst n.
ancestor n.
animation n.
annually adv.
anticipate v.
anxiety n.
apology n.
applicant n.
appropriately adv.
arrow n.
artwork n.
aside adv.
asset n.
assign v.
assistance n.
assumption n.
assure v.
astonishing adj.
attachment n.
auction n.
audio adj.
automatic adj.
automatically adv.
awareness n.
awkward adj.
badge n.
balanced adj.
ballet n.
balloon n.
barely adv.
bargain n.
basement n.
basket n.
bat n.benef

Удалим лишний пробел в слове *'revenge'*.

In [29]:
b2_c1_words_british = b2_c1_words_british.replace('reve nge  ', 'revenge')

Удалим лишний текст.

In [30]:
b2_c1_words_british = drop_text(b2_c1_words_british)

In [31]:
print(b2_c1_words_british)

 
 
    
B2
absorb 
abstract 
accent 
accidentally 
accommodate 
accomplish 
accountant 
accuracy 
accurately 
acid 
activate 
addiction 
additionally 
adequate 
adequately 
adjust 
affordable 
agriculture 
AIDS 
alien 
alongside 
altogether 
ambulance 
amusing 
analyst 
ancestor 
animation 
annually 
anticipate 
anxiety 
apology 
applicant 
appropriately 
arrow 
artwork 
aside 
asset 
assign 
assistance 
assumption 
assure 
astonishing 
attachment 
auction 
audio 
automatic 
automatically 
awareness 
awkward 
badge 
balanced 
ballet 
balloon 
barely 
bargain 
basement 
basket 
bat beneficial 
beside 
besides   
bias 
bid   
biological 
blanket 
blow 
bold 
bombing 
booking 
boost   
bound 
brick 
briefly 
broadcaster 
broadly 
bug 
cabin 
canal 
candle 
carbon 
casual 
cave 
certainty 
certificate 
challenging 
championship 
charming 
chase   
cheek 
cheer   
choir 
chop 
circuit 
civilization 
clarify 
classify 
clerk 
cliff 
clinic 
clip 
coincidence 
collector 
colony 
colourful 
c

Сохраним слова по уровням.

In [32]:
B2_aux_british = b2_c1_words_british.split('B2')[1].split('C1')[0].replace(' ', '\n').split('\n')
B2_aux_british = [i.lower() for i in B2_aux_british if i]

C1_british = b2_c1_words_british.split('C1')[1].replace(' ', '\n').split('\n')
C1_british = [i.lower() for i in C1_british if i]

Объединим слова американского и британского ангилийского соответствующих уровней.

In [33]:
A1 = ' '.join(set(A1_american + A1_british))
A2 = ' '.join(set(A2_american + A2_british))
B1 = ' '.join(set(B1_american + B1_british))
B2 = ' '.join(set(B2_american + B2_british + B2_aux_american + B2_aux_british))
C1 = ' '.join(set(C1_american + C1_british))

**Выводы:**
1. Обработаны документы со списками слов, соответствующих разным уровням английского;
2. Получены соответствующие списки слов.

## Предобработка субтитров

У нас имеется архив с субтитрами *English_scores.zip*. Он скачан и сохранен в папку с проектом. Извлечем его содержимое.

In [35]:
archive_subs = './English_scores.zip'

zip_file = zipfile.ZipFile(archive_subs)

with zipfile.ZipFile(archive_subs, 'r') as zip_file:
    zip_file.extractall('.')

Сохраним все названия файлов с субтитрами в списке *sub_files*, а пути к файлам - в списке *sub_paths*.

In [36]:
subs_files = []
subs_paths = []

for dirpath, dirnames, filenames in os.walk("./Subtitles_all/"):       
    for filename in filenames:

        if '.srt' in filename:
            subs_files.append(filename)
            subs_paths.append(os.path.join(dirpath, filename))

            
print(len(subs_files))
print(len(subs_paths))

284
284


In [37]:
display(subs_files)

['The Walking Dead-S01E01-Days Gone Bye.English.srt',
 'The Walking Dead-S01E02-Guts.English.srt',
 'The Walking Dead-S01E03-Tell It To The Frogs.English.srt',
 'The Walking Dead-S01E04-Vatos.English.srt',
 'The Walking Dead-S01E05-Wildfire.English.srt',
 'The Walking Dead-S01E06-TS-19.English.srt',
 'AmericanBeauty1999.BRRip.srt',
 'Angelas.Christmas.Wish.2020.srt',
 'Indiana Jones And The Last Crusade DVDRip Xvid -IZON-.srt',
 'mechanic-resurrection_.srt',
 'Men.In.Black.1997.720p.Bluray.x264-SEPTiC.srt',
 'Rat.Race.2001.1080p.WEB-DL.DD5.1.H264-FGT.srt',
 'Seven.Worlds.One.Planet.S01E01.2160p.BluRay.Remux.eng.srt',
 'Seven.Worlds.One.Planet.S01E02.2160p.BluRay.Remux.eng.srt',
 'Seven.Worlds.One.Planet.S01E03.2160p.BluRay.Remux.eng.srt',
 'Seven.Worlds.One.Planet.S01E04.2160p.BluRay.Remux.eng.srt',
 'Seven.Worlds.One.Planet.S01E05.2160p.BluRay.Remux.eng.srt',
 'Seven.Worlds.One.Planet.S01E06.2160p.BluRay.Remux.eng.srt',
 'Seven.Worlds.One.Planet.S01E07.2160p.BluRay.Remux.eng.srt',
 'S

Сохраним все субтитры в список *subs*.

In [38]:
subs = []

for i in range(len(subs_paths)):
    try:
        subs.append(pysrt.open(f'{subs_paths[i]}').text)
    except:
        subs.append(pysrt.open(f'{subs_paths[i]}', encoding='latin-1').text)

В переменную *subs* субтитры добавляются в том же порядке, в котором идут названия фильмов в переменной *subs_paths*. Поэтому каждая пара *'Название фильма - Субтитры к фильму'* корректна.

Ознакомимся с несколькими файлами субтитров.

In [39]:
subs[0]

'( bugs chittering )\n( brakes squeak )\n- ( engine stops )\n- ( trunk clicks )\n( bird cawing )\n( birds chirping )\n( flies buzzing )\n( metal rattling )\n( shuffling footsteps )\nLittle girl?\nI\'m a policeman.\nLittle girl.\nDon\'t be afraid, okay?\nLittle girl.\n( panting )\n( growling )\nOh my God.\n( theme music playing )\n- ( police radio chatter )\n- Man: What\'s the difference\n- ( police radio chatter )\n- Man: What\'s the difference\nbetween men and women?\n- Man #2: This a joke?\n- No, I\'m serious.\nMan #2: I never met a woman\nwho knew how to turn off a light.\nThey\'re born thinking\nthe switch only goes one way--\n- on.\n- ( mutters )\nThey\'re struck blind\nthe second they leave a room.\nI mean every woman\nI ever let have a key--\nI swear to God,\nit\'s like I come home,\nhouse all lit up.\n( chuckles )\nAnd my job,\nyou see, apparently--\nbecause my chromosomes\nhappen to be different--\nis I\'ve then gotta walk\nthrough that house,\nturn off every single light\nthi

In [40]:
subs[100]

'Hey, Mike, it\'s Rachel.\nI know I shouldn\'t\nbe doing this.\nI know you\'re seeing Jenny,\nbut I can\'t stop thinking\nabout the kiss,\nand I can\'t go back.\n[Elevator dings]\nMorning, Jessica.\nMs. Pearson.\nHey, I just wanna\nthank you so much\nfor keeping me on despite the...\nI guess, I just\nwant to let you know\nthat I\'m gonna work hard,\nand I\'m gonna... you know,\nI\'m gonna make you proud.\nNot... not like a son...\nyou know...\nI mean, we don\'t really\nlook that much alike, I guess.\nThat\'s a... that\'s\na strange thing to say.\nUh, I just... what I meant\nis that I really appreciate\neverything that you... okay.\n[Construction sounds]\n- Hardman?\n- One would assume.\n- Did he ask you?\n- No, he did not.\nHe needs your signature.\nAnd yet there was a wall\nhere yesterday.\nWell, at least he didn\'t ask\nfor his office back.\nI wouldn\'t have given it back.\nI wouldn\'t have given\nyours back to you.\nUnless you asked really nicely.\nIsn\'t that...\n- Mine.\n- Shot ac

In [41]:
subs[200]

'<i>People always ask me<br/>if I know Tyler Durden.</i>\nThree minutes.\nThis is it. Ground zero.\nDo you have a speech for the occasion?\n<i>With a gun barrel between your teeth,<br/>you speak only in vowels.</i>\nI can\'t think of anything.\n<i>For a second, I forget about<br/>Tyler\'s controlled demolition thing</i>\n<i>and I wonder how clean that gun is.</i>\nIt\'s getting exciting now.\n<i>That old thing,<br/>how you always hurt the one you love.</i>\n<i>Well, it works both ways.</i>\n<i>We have front-row seats<br/>for this theater of mass destruction.</i>\n<i>The Demolitions Committee<br/>of Project Mayhem</i>\n<i>wrapped the foundations<br/>of 12 buildings with explosives.</i>\n<i>In two minutes,<br/>primary charges will blow base charges</i>\n<i>and a few blocks will be reduced<br/>to smouldering rubble.</i>\n<i>I know this because Tyler knows this.</i>\nTwo and a half.<br/>Think of everything we\'ve accomplished.\n<i>Suddenly I realize that all of this,<br/>the gun, the bombs

Видим, что субтитры содержат различные элементы, которые нужно удалить: текст в угловых скобках (*`<br>`*), знак переноса строки (*\n*), знаки препинания и иные вспомогательные символы, слова с многократным повторением гласных (*Whoooa*). Удалим эти элементы, чтобы остались только слова, и прмведем весь текст к нижнему регистру.

In [42]:
for i in range(len(subs)):
    subs[i] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ', subs[i]).lower()    

Названия фильмов и соответствующие им уровни языка хранятся в файле *movies_labels.xlsx*.

In [43]:
data = pd.read_excel('./movies_labels.xlsx')
data

Unnamed: 0,id,Movie,Level
0,0,10_Cloverfield_lane(2016),B1
1,1,10_things_I_hate_about_you(1999),B1
2,2,A_knights_tale(2001),B2
3,3,A_star_is_born(2018),B2
4,4,Aladdin(1992),A2/A2+
...,...,...,...
236,236,Matilda(2022),C1
237,237,Bullet train,B1
238,238,Thor: love and thunder,B2
239,239,Lightyear,B2


Удалим столбец *id*. Он не несет полезной информации.

In [44]:
data = data.drop('id', axis=1)

Проверим датасет на наличие дубликатов.

In [45]:
data['Movie'].duplicated().sum()

4

Удалим дубликаты.

In [46]:
data = data.drop_duplicates(subset='Movie', ignore_index=True)

Определим, какие уровни английского языка указаны в таблице.

In [47]:
data['Level'].value_counts()

B2            101
B1             52
C1             40
A2/A2+         26
B1, B2          8
A2              6
A2/A2+, B1      4
Name: Level, dtype: int64

Помимо конкретных значений *A2*, *B1*, *B2*, *C1* также встречаются "спорные" значения *A2/A2+*, *B1, B2*, *A2/A2+, B1*. Посчитаем их количество.

In [48]:
data.loc[(data['Level'] == 'A2/A2+') |
         (data['Level'] == 'B1, B2') |
         (data['Level'] == 'A2/A2+, B1'), 'Level'].count()

38

Заменим "спорные" значения уровня более высоким значением из предложенных (кроме уровней 'А2/А2+' - его заменим на 'A2', так как уровень 'A2+' отсутствует в представленных pdf-документах.

In [49]:
data.loc[data.loc[:, 'Level'] == 'A2/A2+', 'Level'] = 'A2'
data.loc[data.loc[:, 'Level'] == 'B1, B2', 'Level'] = 'B2'
data.loc[data.loc[:,'Level'] == 'A2/A2+, B1', 'Level'] = 'B1'

В нашем размеченном датасете не представлены субтитры. Чтобы добавить их, создадим датафрейм с названиями фильмов и соответствующими им субтитрами.

In [50]:
subs_df = pd.DataFrame({'Movie': subs_files,
                        'Subtitles': subs})
subs_df['Movie'] = subs_df['Movie'].str[:-4]
subs_df

Unnamed: 0,Movie,Subtitles
0,The Walking Dead-S01E01-Days Gone Bye.English,bugs chittering brakes squeak engi...
1,The Walking Dead-S01E02-Guts.English,birds chirping bugs chittering boy...
2,The Walking Dead-S01E03-Tell It To The Frogs.E...,thunder rumbling merle that's right you ...
3,The Walking Dead-S01E04-Vatos.English,birds chirping what nothing it's not...
4,The Walking Dead-S01E05-Wildfire.English,walkie talkie squawks rick morgan i ...
...,...,...
279,Warm_bodies(2013),what am i doing with my life i'm so pale...
280,Westworld_scenes_of_Dr_Robert_Ford,music no one's complained music there's th...
281,We_are_the_Millers(2013),oh my god it's full on double rainbo...
282,While_You_Were_Sleeping(1995),lucy okay there are two things that i rem...


Объединим наш исходный датафрейм с только что созданным, чтобы в одной таблице содержалась вся необъодимая информация: название фильма, уровень английского, текст субтитров к фильму.

In [51]:
data = data.merge(subs_df, on='Movie', how='left')

Выясним, к каким фильмам, для которых известен уровень, отсутствуют субтитры.

In [52]:
data[data['Subtitles'].isna()]

Unnamed: 0,Movie,Level,Subtitles
79,The Secret Life of Pets.en,B2,
102,Up (2009),A2,
231,Glass Onion,B2,
232,Matilda(2022),C1,
233,Bullet train,B1,
234,Thor: love and thunder,B2,
235,Lightyear,B2,
236,The Grinch,B1,


Субтитры к фильмам *Up (2009)* и *The Grinch* есть в нашем распоряжении, но названия файлов субтитров отличаются от названий фильмов в таблице. Добавим их в датафрейм.

In [53]:
data.loc[data['Movie'] == 'Up (2009)', 'Subtitles'] = subs[subs_files.index('Up(2009).srt')]
data.loc[data['Movie'] == 'The Grinch', 'Subtitles'] = subs[
    subs_files.index('The.Grinch.2018.REMUX.1080p.Blu-ray.AVC.TrueHD.DTS-HD.MA.7.1-LEGi0N.English.srt')
                                                           ]

Cубтитры к фильмам *The Secret Life of Pets.en*, *Glass Onion*, *Matilda(2022)*, *Bullet train*, *Thor: love and thunder* и *Lightyear* среди предоставленных нам файлов отсутствуют. Скачаем их с сайта https://subscene.com/:

[The Secret Life of Pets](https://subscene.com/subtitles/english-text/V4aR-s9NQrs5TXNTm8ZlYoeiRiHxu6ynIzaekdZ-2sdakhABTE5GiH7J4YfJ08HWJsXz2VuhOdfrx_ts8_kgHPnofNyq71ZZhnauWzV_UWk5NBJgLa7ptEDY6lLJ0Xcd0)

[Glass Onion: A Knives Out Mystery](https://subscene.com/subtitles/english-text/SQlNzEkGYQwOWxcrRVPHg33Cd0rW_WqGK_PF210S2eSJx9g0Ru-O8xTBHXh7--JU1SBnA7xeeo8GqCMCstC33OPojcaaay3VWKfTlboQKgKOGeCtngWBhS7qbSKSj9oQ0)

[Roald Dahl's Matilda the Musical](https://subscene.com/subtitles/english-text/lI_OkATsWu5U-zF5Pl89ggvW1GPPmYBvdZd4P9NHOJWO0fr4UNtLI_rsCOTfzPV0GcU_lY0lBLG2McX8MIC3x_GH6uGHOyLzWUrza4k5FxC99ewrmPQDIFi1IP1ICHyQ0)

[Bullet Train](https://subscene.com/subtitles/english-text/72weiwbOaD0NuRa1i6dqnDtbL8RXZ-cAl3ArfnxTK_D4XKEZ1a23PhebVc0362EXgeiLtmsdVQsmC-x07AbW_IzHracr5Yw_mdIgXtnrvUD6IuglS1nZDJqiSrUMAvLk0)

[Thor: Love and Thunder](https://subscene.com/subtitles/english-text/u4YtT1Ugg3zHfRvFTeJkWvGZ0Z69zHwMqHldsbbVGHlleTNfttCch7-6Z_TsYHKMx1g62AdfQ54XlPkEYoDIYRQPiVrS0pdPlqrqjqNZFxNdlP0OYv7MsICEO_dpl4I20)

[Lightyear](https://subscene.com/subtitles/english-text/CIDzbAG6n0MGDDOw-VdxyRhoXL1A4WykRUNYcjb5t_PMtXAqU0RIGDVw6W2A1tRKMByS-8yFDT42SKexxOs1Cr0qj2F3b7YlcNIeeXJYmhtUt1FLicHC8Mg5g1vQcuMH0)

Скачаем данные файлы и разархивируем их в папку './Subtitles_all/Subtitles/'. Внесем в датафрейм субтитры, соответствующими данным фильмам, попутно удалив лишние символы.

In [54]:
data.loc[data['Movie'] == 'Bullet train', 'Subtitles'] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ',
         pysrt.open('./Subtitles_all/Subtitles/Bullet.Train.2022.WEBRip.Netflix.srt', encoding='latin-1').text).lower()

data.loc[data['Movie'] == 'Glass Onion', 'Subtitles'] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ',
         pysrt.open('./Subtitles_all/Subtitles/Glass.Onion.A Knives.Out.Mystery.2022.1080p.NF.WEB-DL.DDP5.1.Atmos.x264.srt', 
                    encoding='latin-1').text).lower()

data.loc[data['Movie'] == 'Lightyear', 'Subtitles'] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ',
         pysrt.open('./Subtitles_all/Subtitles/Lightyear.2022.WEBRip.iTunes.srt', encoding='latin-1').text).lower()

data.loc[data['Movie'] == 'Matilda(2022)', 'Subtitles'] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ',
         pysrt.open("./Subtitles_all/Subtitles/Roald Dahl's Matilda the Musical (2022).srt", 
                    encoding='latin-1').text).lower()

data.loc[data['Movie'] == 'Thor: love and thunder', 'Subtitles'] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ',
         pysrt.open('./Subtitles_all/Subtitles/Thor.Love.and.Thunder.2022.WEB-DL.srt', 
                    encoding='latin-1').text).lower()

data.loc[data['Movie'] == 'The Secret Life of Pets.en', 'Subtitles'] = re.sub("<.+?>|\\n|[^A-Za-z']|\w*aaa\w*|\w*ooo\w*", ' ',
         pysrt.open('./Subtitles_all/Subtitles/The.Secret.Life.of.Pets.2016.BDRip.x264-SPARKS.SDH.srt', 
                    encoding='latin-1').text).lower()

**Выводы:**
1. Составлен датасет, содержащий название фильма, уровень английского для него, и текст субтитров;
2. Проведена предобработка субтитров: удалены лишние символы, оставлены только слова.

# Подготовка к обучению модели

Разделим наши данные на обучающиую и тестовую выборки.

In [55]:
X_train, X_test, y_train, y_test = train_test_split(data['Subtitles'], 
                                                   data['Level'], 
                                                   test_size=0.25, 
                                                   random_state=12345)

Добавим к обучающей выборке наборы слов, соответствующие уровням английского, которые мы получили в пункте 2.1.

In [56]:
X_train.loc[max(X_train.index) + 1] = A2
X_train.loc[max(X_train.index) + 1] = B1
X_train.loc[max(X_train.index) + 1] = B2
X_train.loc[max(X_train.index) + 1] = C1

y_train[max(y_train.index) + 1] = 'A2'
y_train[max(y_train.index) + 1] = 'B1'
y_train[max(y_train.index) + 1] = 'B2'
y_train[max(y_train.index) + 1] = 'C1'

Зададим список стоп-слов (местоимения, частицы и тому подобные слова, которые будут изъяты из обучающей выборки).

In [57]:
stop_words = stopwords.words('english')

Зададим несколько видов стемминга и лемматизации 

Стемминг - это способ упрощения формы слова, чтобы сходные слова, которые незначительно отличаются друг от друга, например окончанием, воспринимались моделью как одно и то же слово.

Лемматизация - это способ возврата слова в его изначальную форму.

In [58]:
snowball = SnowballStemmer(language='english')
lancaster = LancasterStemmer()
wnl = WordNetLemmatizer()

def tokenizer(text):
    return text.split()

def snowball_tokenizer(text):
    return [snowball.stem(word) for word in text.split()]

def lancaster_tokenizer(text):
    return [lancaster.stem(word) for word in text.split()]

def wnl_tokenizer(text):
    return [wnl.lemmatize(word) for word in text.split()]

Всего в проекте будем обучать 2 типа моделей:
- логистическую регрессию;
- случайный лес.

Перебирать гиперпараметры моделей будем с помощью случайного поиска с применением кросс-влидации - RandomizedSearchCV. Зададим для наших моделей наборы гиперпараметров.

**Логистическая регрессия**

In [59]:
lr_param_grid = [
    {
        'vect__stop_words': [stop_words, None],
        'vect__tokenizer': [tokenizer, snowball_tokenizer, lancaster_tokenizer, wnl_tokenizer],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
        'clf__class_weight': [None, 'balanced']
    }
                 ]

tfidf = TfidfVectorizer()

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=12345))])

rs_lr_tfidf = RandomizedSearchCV(lr_tfidf, lr_param_grid,
                                 scoring='accuracy',
                                 n_iter=50,
                                 cv=5,
                                 verbose=1,
                                 n_jobs=-1)

**Случайный лес**

In [60]:
param_rfc = [
    {
        'vect__stop_words': [stop_words, None],
        'vect__tokenizer': [tokenizer, snowball_tokenizer, lancaster_tokenizer, wnl_tokenizer],
        'rfc__n_estimators': [60, 80, 100, 120, 140],
        'rfc__criterion': ['gini', 'entropy', 'log_loss'],
        'rfc__class_weight': [None, 'balanced', 'balanced_subsample'],
        'rfc__max_depth': [3, 5, 7, 9, None]
    }
              ]

rfc_tfidf = Pipeline([('vect', tfidf),
                      ('rfc', RandomForestClassifier(random_state=12345))])

rs_rfc_tfidf = RandomizedSearchCV(rfc_tfidf, param_rfc,
                                  scoring='accuracy',
                                  n_iter=50,
                                  cv=5,
                                  verbose=1,
                                  n_jobs=-1)

**Выводы:**
1. Данные разбиты на обучающую и тестовую выборку;
2. Подготовлена выборка гиперпараметров для обучения моделей.

# Обучение моделей

Обучим модель логистической регрессии.

In [62]:
%%time
rs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


125 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
125 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\V\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\V\anaconda3\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\V\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\V\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_so

CPU times: total: 20.2 s
Wall time: 4min 48s


In [63]:
rs_lr_tfidf.best_score_

0.701051051051051

In [64]:
rs_lr_tfidf.best_params_

{'vect__tokenizer': <function __main__.lancaster_tokenizer(text)>,
 'vect__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down

Обучим модель случайного леса.

In [66]:
%%time
rs_rfc_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits




CPU times: total: 14.7 s
Wall time: 5min 44s


In [67]:
rs_rfc_tfidf.best_score_

0.6133633633633633

In [68]:
rs_rfc_tfidf.best_params_

{'vect__tokenizer': <function __main__.snowball_tokenizer(text)>,
 'vect__stop_words': None,
 'rfc__n_estimators': 140,
 'rfc__max_depth': 3,
 'rfc__criterion': 'gini',
 'rfc__class_weight': 'balanced_subsample'}

**Выводы**
1. Проведено обучение моделей логистической регрессии и случайного леса;
2. Лучшие результаты на кросс-валидации показала **модель логистической регрессии** со следущими гиперпараметрами:
- с применением стеммера Ланкастера;
- с использованием стоп-слов;
- с L2-регуляризацией;
- со сбалансированным весом классов;
- с параметром регуляризации C, равным 100.

# Тестирование модели

Проведем тестирование модели на тестовой выборке.

In [74]:
rs_lr_tfidf.score(X_test, y_test)

0.65

**Вывод:**
1. Выбранная модель показала на тестовой выборке значение метрики accuracy, равное 0.65