# PokeWhom Project

## Imports

In [341]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import nltk
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
import imblearn
from sklearn.naive_bayes import BernoulliNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from collections import Counter


## Read Data and Clean

In [404]:
# Shell commands noted for working with this notebook:
#   jupyter notebook
#   source venv/bin/activate
#   docker build -t my-jupyter .
# Candidate features: pokemon descriptions, height, weight, stats?
# Labels: type

def clean_type(type):
    """Collapse a raw type string into one lowercase label.

    Leading/trailing braces are stripped, the rest is split on commas, and
    every entry is trimmed and lowercased. A value with a single entry
    yields that entry; otherwise the first two entries are joined with a
    hyphen (entries beyond the second are ignored).

    Note: the parameter name shadows the ``type`` builtin; it is kept so
    existing keyword callers keep working.
    """
    parts = [piece.strip().lower() for piece in type.strip("{}").split(",")]
    head = parts[0]
    return head if len(parts) == 1 else f"{head}-{parts[1]}"

def type_primary(type):
    """Return the first listed type, trimmed and lowercased.

    Braces around the raw value are stripped and only the text before the
    first comma is used.
    """
    # A disabled experiment once returned the second type for
    # normal/flying pairs; the first entry is always returned now.
    leading = type.strip("{}").split(",", 1)[0]
    return leading.strip().lower()

def type_secondary(type):
    """Return the second listed type, trimmed and lowercased, or "" if absent.

    Braces around the raw value are stripped and the remainder is split on
    commas. The second entry is returned whenever more than one entry is
    present, so values with extra entries are handled the same way as in
    clean_type (which also uses the second entry and ignores the rest).
    """
    types = [t.strip().lower() for t in type.strip("{}").split(",")]
    # "> 1" rather than "== 2": a value with three or more entries still
    # has a well-defined second type.
    return types[1] if len(types) > 1 else ""

# Stop-word sets keyed by language, filled lazily on first use.
_STOP_WORDS_CACHE = {}


def _stop_words(language="english"):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def remove_stopwords(description):
    """Drop English stop words from a whitespace-separated description.

    Words are compared case-insensitively against NLTK's English stop-word
    list; the surviving words keep their original casing and order and are
    re-joined with single spaces.

    The stop-word list is loaded on the first call and reused afterwards,
    so applying this function row by row does not re-read the corpus for
    every row.
    """
    stop_words = _stop_words()
    kept = (word for word in description.split() if word.lower() not in stop_words)
    return " ".join(kept)
    

# Fetch the NLTK stop-word corpus that remove_stopwords relies on.
nltk.download('stopwords')

# Load the pokedex and derive the label / text columns in one chain:
#   type_clean      combined "primary-secondary" label
#   type_primary    first listed type
#   type_secondary  second listed type ("" when there is none)
#   desc_clean      lowercased description, letters and whitespace only,
#                   with English stop words removed
pokemon = pd.read_csv("../Data/pokedex.csv").assign(
    type_clean=lambda df: df["type"].apply(clean_type),
    type_primary=lambda df: df["type"].apply(type_primary),
    type_secondary=lambda df: df["type"].apply(type_secondary),
    desc_clean=lambda df: (
        df["info"]
        .str.lower()
        .str.replace(r"[^a-z\s]", "", regex=True)
        .apply(remove_stopwords)
    ),
)

# Class balance of the primary type (displayed as the cell output).
pokemon["type_primary"].value_counts()
# pokemon.head(30)
# pokemon["type_primary"].value_counts().plot(kind="bar")
# Is there a disproportionate amount of water and normal types?

        



[nltk_data] Downloading package stopwords to /home/db/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


type_primary
water       134
normal      118
grass       103
bug          83
fire         66
psychic      60
electric     59
rock         58
dark         45
poison       42
ground       40
fighting     40
dragon       37
steel        36
ghost        35
ice          31
fairy        29
flying        9
Name: count, dtype: int64

## Features

In [409]:
# Inputs are the cleaned descriptions; targets are the primary types.
x, y = pokemon["desc_clean"], pokemon["type_primary"]

# Hold out 20% for testing, stratified on the label; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Unigram + bigram counts. The vocabulary is learned from the training
# split only and then reused to transform the test split.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


## Label Encoding

In [410]:
label_train, label_test = y_train, y_test
print(Counter(y_train))

# Fit the encoder on every primary type in the dataset so the train and
# test labels share one string -> integer mapping.
encoder = LabelEncoder().fit(pokemon["type_primary"])

label_train_enc, label_test_enc = (
    encoder.transform(labels) for labels in (label_train, label_test)
)

# Encoded class counts for the train split, then the test split.
print(Counter(label_train_enc), Counter(label_test_enc), sep="\n")


Counter({'water': 107, 'normal': 94, 'grass': 83, 'bug': 66, 'fire': 53, 'psychic': 48, 'electric': 47, 'rock': 46, 'dark': 36, 'poison': 34, 'ground': 32, 'fighting': 32, 'dragon': 30, 'steel': 29, 'ghost': 28, 'ice': 25, 'fairy': 23, 'flying': 7})
Counter({np.int64(17): 107, np.int64(12): 94, np.int64(9): 83, np.int64(0): 66, np.int64(6): 53, np.int64(14): 48, np.int64(3): 47, np.int64(15): 46, np.int64(1): 36, np.int64(13): 34, np.int64(10): 32, np.int64(5): 32, np.int64(2): 30, np.int64(16): 29, np.int64(8): 28, np.int64(11): 25, np.int64(4): 23, np.int64(7): 7})
Counter({np.int64(17): 27, np.int64(12): 24, np.int64(9): 20, np.int64(0): 17, np.int64(6): 13, np.int64(14): 12, np.int64(15): 12, np.int64(3): 12, np.int64(1): 9, np.int64(5): 8, np.int64(10): 8, np.int64(13): 8, np.int64(2): 7, np.int64(8): 7, np.int64(16): 7, np.int64(4): 6, np.int64(11): 6, np.int64(7): 2})


## Model

In [411]:
# Lookup from class name to its encoded integer, kept for reference.
label_map = {
    name: code
    for name, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
}

# Disabled alternative to class weighting: oversample the non-majority classes.
# ros = RandomOverSampler(sampling_strategy="not majority", random_state=42)
# X_train_vec, label_train_enc = ros.fit_resample(X_train_vec, label_train_enc)

# Logistic regression with balanced class weights; fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
).fit(X_train_vec, label_train_enc)

# Predict encoded labels for the test split, then map them back to names.
type_pred_enc = model.predict(X_test_vec)
type_pred = encoder.inverse_transform(type_pred_enc)
print("Test label distribution:", Counter(label_test))

Test label distribution: Counter({'water': 27, 'normal': 24, 'grass': 20, 'bug': 17, 'fire': 13, 'psychic': 12, 'rock': 12, 'electric': 12, 'dark': 9, 'fighting': 8, 'ground': 8, 'poison': 8, 'dragon': 7, 'ghost': 7, 'steel': 7, 'fairy': 6, 'ice': 6, 'flying': 2})


## Evaluate

In [412]:
# Per-class precision / recall / F1 on the held-out split.
# zero_division=0 reports 0.0 for classes the model never predicts (the
# same value the default yields) without an UndefinedMetricWarning for each.
print(classification_report(label_test, type_pred, zero_division=0))

              precision    recall  f1-score   support

         bug       0.31      0.24      0.27        17
        dark       0.00      0.00      0.00         9
      dragon       0.00      0.00      0.00         7
    electric       0.83      0.42      0.56        12
       fairy       0.00      0.00      0.00         6
    fighting       0.00      0.00      0.00         8
        fire       0.47      0.62      0.53        13
      flying       0.00      0.00      0.00         2
       ghost       0.43      0.43      0.43         7
       grass       0.44      0.55      0.49        20
      ground       0.25      0.12      0.17         8
         ice       0.00      0.00      0.00         6
      normal       0.23      0.54      0.33        24
      poison       0.67      0.25      0.36         8
     psychic       0.31      0.42      0.36        12
        rock       0.33      0.33      0.33        12
       steel       0.00      0.00      0.00         7
       water       0.35    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
