Pre-process the raw pokedex descriptions to form a training corpus

In [17]:
import re
import pandas as pd
from collections import Counter
from unidecode import unidecode
import json

In [5]:
df = pd.read_csv("../data/raw_data_pokedex.csv", encoding="utf-16-le", sep="\t")

# Filter for gen 1
df = df.loc[df["gen"] == "I"]

# Lowercase everything
df["english_name"] = df["english_name"].str.lower()
df["description"] = df["description"].str.lower()
df["classification"] = df["classification"].str.lower()

# Append type and classification to description
df["description"] =  (
    df["english_name"] + " " + 
    df["primary_type"] + " " + 
    df["classification"].replace(" pokemon", "") + " " + 
    df["description"]
    )

columns_to_keep = [
    "english_name",
    "description",
]
df = df[columns_to_keep]

# Replace all pronouns by the pokemon nouns
def transform(row):
    pokemon_name = row["english_name"]
    return (
        row["description"]
        .replace("’s", ' is')
        .replace("'s", ' is')
        .replace("it", f' {pokemon_name}')
        .replace("its", pokemon_name)
        .replace("this pokémon", pokemon_name)
    )
df["description"] = df.apply(lambda row: transform(row), axis=1)

df

Unnamed: 0,english_name,description
0,bulbasaur,bulbasaur grass seed pokémon there is a plant ...
1,ivysaur,ivysaur grass seed pokémon when the bulb on i...
2,venusaur,venusaur grass seed pokémon venusaurs plant b...
3,charmander,charmander fire lizard pokémon charmander has...
4,charmeleon,charmeleon fire flame pokémon charmeleon has ...
...,...,...
146,dratini,dratini dragon dragon pokémon dratini dwells n...
147,dragonair,dragonair dragon dragon pokémon dragonair live...
148,dragonite,dragon dragonitee dragon dragon pokémon drago...
149,mewtwo,mewtwo psychic genetic pokémon mewtwos dna is...


In [7]:
# Count the number of words
def unique_words_counts(text):
    # Remove punctuation marks
    text = re.sub(r'[^\w\s]', '', text)
    # Use regular expression to find all words
    words = re.findall(r'\b\w+\b', text.lower())
    # Count the occurrences of each word
    word_counts = Counter(words)
    return word_counts

# Extract all description strings
descriptions = df["description"].values
single_string = " ".join(descriptions)
unique_word_count = unique_words_counts(single_string)
print(f"Number of unique words: {len(unique_word_count)}")

Number of unique words: 1497


In [16]:
def remove_accents(text):
    return unidecode(text)

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

df["description"] = df["description"].apply(remove_accents)
df["description"] = df["description"].apply(remove_punctuation)
df["description"] = df["description"].apply(remove_extra_spaces)
description = df["description"].values
list(description)

['bulbasaur grass seed pokemon there is a plant seed on bulbasaurs back right from the day bulbasaur is born the seed slowly grows larger',
 'ivysaur grass seed pokemon when the bulb on ivysaurs back grows large ivysaur appears to lose the abil ivysaury to stand on ivysaurs hind legs',
 'venusaur grass seed pokemon venusaurs plant blooms when venusaur is absorbing solar energy venusaur stays on the move to seek sunlight',
 'charmander fire lizard pokemon charmander has a preference for hot things when charmander rains steam is said to spout from the tip of charmanders tail',
 'charmeleon fire flame pokemon charmeleon has a barbaric nature in battle charmeleon whips charmeleons fiery tail around and slashes away w charmeleonh sharp claws',
 'charizard fire flame pokemon charizard sp charizards fire that is hot enough to melt boulders charizard may cause forest fires by blowing flames',
 'squirtle water tiny turtle pokemon when squirtle retracts squirtles long neck into squirtles shell s

In [None]:
cleansed_data = {
    "readme": "This dataset contains the cleansed data from the Pokedex.",
    "data": list(description)
}

# save as json
with open("../data/train_data_pokedex_entries.json", "w") as f:
    json.dump(cleansed_data, f, indent=4)