# Animal 카테고리 분류기

## 분류기를 위한 라이브러리 임포트

In [47]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

## 학습을 위한 데이터 임포트

In [29]:
animal = pd.read_csv("animal.csv")
others = pd.read_csv("others.csv").replace(' ','', regex=True).replace(',',' ', regex=True)

## 상위 데이터만 보기

In [30]:
print(animal["labels"].head(3))
print(others["labels"].head(3))

0    Tiger Mammal Wildlife Vertebrate Bengaltiger T...
1    Cat Mammal Vertebrate Smalltomedium-sizedcats ...
2    Terrestrialanimal Mammal Vertebrate Wildlife L...
Name: labels, dtype: object
0                 Finger Skin Hand Nail Thumb Gesture 
1    Face Eyebrow Lip Hair Nose Skin Cheek Eye Beau...
2    Athlete Leg Sports Thigh Sportvenue Sprint Tra...
Name: labels, dtype: object


## 데이터 라벨링

In [31]:
animal = pd.DataFrame(animal["labels"])
animal["category"] = 1 # sports
others = pd.DataFrame(others["labels"])
others["category"] = 2 # suffled

## train, test셋으로 데이터 분류

In [35]:
data = animal.append(others)
train, test = train_test_split(data, test_size=0.33, random_state=42)
train["category"] = pd.Categorical(train["category"].values.astype('U'))
print(train.groupby("category").count())
print(test.groupby("category").count())

          labels
category        
1            902
2           1093
          labels
category        
1            467
2            517


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


## SVM으로 모델 학습

In [50]:
cv = CountVectorizer()
tdm = cv.fit_transform(train["labels"].values.astype('U'))
animal_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf-svm', SGDClassifier())])
animal_clf = animal_clf.fit(train["labels"].values.astype('U'), train["category"].values.astype('U'))

## 테스트셋으로 정확도 평가

In [51]:
predicted_svm = animal_clf.predict(test["labels"].values.astype('U'))
print(np.mean(predicted_svm == test["category"].values.astype('U')))

0.9654471544715447


## 모델 저장

In [54]:
joblib.dump(animal_clf, 'animal_clf.pkl')

['animal_clf.pkl']

## 분류 테스트

In [55]:
animal_clf = joblib.load('animal_clf.pkl')
print(animal_clf.predict(['Shoulder Clothing Streetfashion Handbag Bag Fashion Fashionaccessory Joint Totebag']))

['2']
