In [112]:
from transformers import pipeline 
from nltk import sent_tokenize
import nltk 
import torch
from glob import glob
import pandas as pd

In [4]:

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load model

In [26]:
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else 'cpu'

In [33]:
def load_model(device): 
    theme_classifier = pipeline('zero-shot-classification', 
                                model="facebook/bart-large-mnli",
                                device=0)
    return theme_classifier

In [34]:
theme_classifier = load_model(device)



In [35]:
themes = ["friendship", "hope", "sacrifice", "battle", "self development", "betrayal", "love", "dialogue"]


In [46]:
theme_classifier(
    "I gave him a right hook then a left jab",
    themes,
    multi_label=True
)


{'sequence': 'I gave him a right hook then a left jab',
 'labels': ['battle',
  'self development',
  'hope',
  'sacrifice',
  'dialogue',
  'betrayal',
  'love',
  'friendship'],
 'scores': [0.9121260046958923,
  0.47499924898147583,
  0.08781825006008148,
  0.04499973729252815,
  0.02013293467462063,
  0.012040339410305023,
  0.004292357712984085,
  0.0028172177262604237]}

# Load dataset

In [65]:
import os
import glob  # Добавляем импорт

# Проверяем текущую рабочую директорию
print("Текущая директория:", os.getcwd())

# Проверяем существование пути
path = 'data/Subtitles'
print("Путь существует:", os.path.exists(path))

# Выведем содержимое директории
if os.path.exists(path):
    print("\nСодержимое директории:")
    for item in os.listdir(path):
        print(item)

# Попробуем использовать абсолютный путь
files = glob.glob(os.path.join(os.getcwd(), 'data', 'Subtitles', '*.ass'))
print("\nНайденные файлы:", files)

# Дополнительная проверка с использованием рекурсивного поиска
files_recursive = glob.glob(os.path.join(os.getcwd(), '**', '*.ass'), recursive=True)
print("\nФайлы (рекурсивный поиск):", files_recursive)

# Проверка наличия файлов с разными регистрами расширения
files_upper = glob.glob(os.path.join(os.getcwd(), 'data', 'Subtitles', '*.ASS'))
print("\nФайлы с расширением .ASS:", files_upper)

Текущая директория: c:\Users\Gani\Desktop\my_projects\Anime_NLP
Путь существует: True

Содержимое директории:
Naruto Season 1 - 01.ass
Naruto Season 1 - 02.ass
Naruto Season 1 - 03.ass
Naruto Season 1 - 04.ass
Naruto Season 1 - 05.ass
Naruto Season 1 - 06.ass
Naruto Season 1 - 07.ass
Naruto Season 1 - 08.ass
Naruto Season 1 - 09.ass
Naruto Season 1 - 10.srt
Naruto Season 1 - 11.srt
Naruto Season 1 - 12.ass
Naruto Season 1 - 13.ass
Naruto Season 1 - 14.ass
Naruto Season 1 - 15.ass
Naruto Season 1 - 16.ass
Naruto Season 1 - 17.ass
Naruto Season 1 - 18.ass
Naruto Season 1 - 19.ass
Naruto Season 1 - 20.ass
Naruto Season 1 - 21.ass
Naruto Season 1 - 22.ass
Naruto Season 1 - 23.ass
Naruto Season 1 - 24.ass
Naruto Season 1 - 25.ass
Naruto Season 1 - 26.ass
Naruto Season 2 - 27.ass
Naruto Season 2 - 28.ass
Naruto Season 2 - 29.ass
Naruto Season 2 - 30.ass
Naruto Season 2 - 31.ass
Naruto Season 2 - 32.ass
Naruto Season 2 - 33.ass
Naruto Season 2 - 34.ass
Naruto Season 2 - 35.ass
Naruto Season 2

In [63]:
os.chdir('..')

In [111]:
files = glob('data/Subtitles/*.ass')

In [72]:
files[:5]

['data/Subtitles\\Naruto Season 1 - 01.ass',
 'data/Subtitles\\Naruto Season 1 - 02.ass',
 'data/Subtitles\\Naruto Season 1 - 03.ass',
 'data/Subtitles\\Naruto Season 1 - 04.ass',
 'data/Subtitles\\Naruto Season 1 - 05.ass']

In [76]:
with open(files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[27:]
    lines = [",".join(line.split(',')[9:]) for line in lines]
    

In [89]:
lines[:2]

['A long time ago, a powerful demon fox appeared with nine tails.\n',
 'With its powerful tails,\n']

In [86]:
lines = [line.replace('\\N', ' ') for line in lines]

In [88]:
" ".join(lines[:10])

"A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can't let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n"

In [103]:
files[0]

'data/Subtitles\\Naruto Season 1 - 01.ass'

In [110]:
int(files[0].split('-')[-1].split('.')[0].strip())

1

In [None]:
files = glob('data/Subtitles/*.ass')

In [114]:
def load_subtitles_dataset(dataset_path):
    subtitles_paths = glob(dataset_path+'/*.ass')
    
    scripts = []
    episodes_num = []
    
    for path in subtitles_paths: 
        #read lines
        with open(files[0], 'r') as file:
            lines = file.readlines()
            lines = lines[27:]
            lines = [",".join(line.split(',')[9:]) for line in lines]
        lines = [line.replace('\\N', ' ') for line in lines]
        script = " ".join(lines)
        
        episode = int(path.split('-')[-1].split('.')[0].strip())
        scripts.append(script)
        episodes_num.append(episode)
    
    df = pd.DataFrame.from_dict({"episode" : episodes_num, "script": scripts})      
    return df 

In [115]:
dataset_path = "data/Subtitles"
df = load_subtitles_dataset(dataset_path)

In [120]:
df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"A long time ago, a powerful demon fox appeared..."
2,3,"A long time ago, a powerful demon fox appeared..."
3,4,"A long time ago, a powerful demon fox appeared..."
4,5,"A long time ago, a powerful demon fox appeared..."


In [121]:
df.describe

<bound method NDFrame.describe of      episode                                             script
0          1  A long time ago, a powerful demon fox appeared...
1          2  A long time ago, a powerful demon fox appeared...
2          3  A long time ago, a powerful demon fox appeared...
3          4  A long time ago, a powerful demon fox appeared...
4          5  A long time ago, a powerful demon fox appeared...
..       ...                                                ...
213      216  A long time ago, a powerful demon fox appeared...
214      217  A long time ago, a powerful demon fox appeared...
215      218  A long time ago, a powerful demon fox appeared...
216      219  A long time ago, a powerful demon fox appeared...
217      220  A long time ago, a powerful demon fox appeared...

[218 rows x 2 columns]>