# Load Dataset

In [23]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

In [2]:
data_path = "../data/jutsus.jsonl"
df = pd.read_json(data_path, lines=True)
df.head()


Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Agonizing Thorn Technique (Simply Put a Kanchō),"Taijutsu, Ninjutsu",After punching and kicking the opponent a few ...
2,Alder,"Kekkei Genkai, Taijutsu","Kimimaro rushes at his opponent, and using the..."
3,Afterimage Clone,"Ninjutsu, Clone Techniques","Shisui uses the Body Flicker Technique, and mo..."
4,Afterglow,Kenjutsu,Sasuke dashes toward his opponent and quickly ...


- ninjutsu, genjutsu, taijutsu
- 3가지만 원함
- 나머지 제거 처리

In [3]:
def simplify_jutsu(jutsu):
    if "Kenjutsu" in jutsu:
        return "Kenjutsu"
    if "Genjutsu" in jutsu:
        return "Kenjutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Taijutsu" in jutsu:
        return "Taijutsu"

In [4]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [5]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Agonizing Thorn Technique (Simply Put a Kanchō),"Taijutsu, Ninjutsu",After punching and kicking the opponent a few ...,Ninjutsu
2,Alder,"Kekkei Genkai, Taijutsu","Kimimaro rushes at his opponent, and using the...",Taijutsu
3,Afterimage Clone,"Ninjutsu, Clone Techniques","Shisui uses the Body Flicker Technique, and mo...",Ninjutsu
4,Afterglow,Kenjutsu,Sasuke dashes toward his opponent and quickly ...,Kenjutsu


In [6]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2168
Taijutsu     367
Kenjutsu     276
Name: count, dtype: int64

In [7]:
df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
df['jutsus'] = df['jutsu_type_simplified']
df = df[['text', 'jutsus']]
df = df.dropna()

In [8]:
df.head()

Unnamed: 0,text,jutsus
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,Agonizing Thorn Technique (Simply Put a Kanchō...,Ninjutsu
2,"Alder. Kimimaro rushes at his opponent, and us...",Taijutsu
3,Afterimage Clone. Shisui uses the Body Flicker...,Ninjutsu
4,Afterglow. Sasuke dashes toward his opponent a...,Kenjutsu


In [9]:
from bs4 import BeautifulSoup


class Cleaner():
    def __init__(self):
        pass

    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")
    
    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [10]:
text_column_name = 'text'
label_column_name = "jutsus"

In [11]:
# Clean text
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

In [12]:
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
1,Agonizing Thorn Technique (Simply Put a Kanchō...,Ninjutsu,Agonizing Thorn Technique (Simply Put a Kanchō...


In [13]:
# Encode labels
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [14]:
label_dict = {index:label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}
label_dict

{0: 'Kenjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [15]:
df['label'] = le.transform(df[label_column_name].tolist())

In [16]:
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
1,Agonizing Thorn Technique (Simply Put a Kanchō...,Ninjutsu,Agonizing Thorn Technique (Simply Put a Kanchō...,1
2,"Alder. Kimimaro rushes at his opponent, and us...",Taijutsu,"Alder. Kimimaro rushes at his opponent, and us...",2
3,Afterimage Clone. Shisui uses the Body Flicker...,Ninjutsu,Afterimage Clone. Shisui uses the Body Flicker...,1
4,Afterglow. Sasuke dashes toward his opponent a...,Kenjutsu,Afterglow. Sasuke dashes toward his opponent a...,0


In [17]:
test_size = 0.2
df_train, df_test = train_test_split(df,
                                     test_size=test_size,
                                     stratify=df['label'],
                                     )

In [18]:
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1734
Taijutsu     293
Kenjutsu     221
Name: count, dtype: int64

# tokenize

In [19]:
model_name = "distilbert/distilbert-base-uncased"

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [22]:
def preprocess_function(tokenizer, examples):
    return tokenizer(examples['text_cleaned'], truncation=True)

In [24]:
# Convert Pandas to a hugging face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# tokenize the datset
tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                    batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                    batched=True)

Map: 100%|██████████| 2248/2248 [00:00<00:00, 10271.82 examples/s]
Map: 100%|██████████| 563/563 [00:00<00:00, 14911.36 examples/s]


### Now, we're ready to write our own text classifier!