# 언어 분류기

## [1] 데이터 불러오기 및 전처리

In [19]:
from sklearn.model_selection import train_test_split, cross_validate

import pandas as pd
import numpy as np

In [20]:
# 경고 무시
import warnings
warnings.filterwarnings(action = "ignore")

In [21]:
from collections import defaultdict

langs = defaultdict()
lang_list = ["en", "fr", "id", "tl"]

for i in range(4):
    lang = lang_list[i]
    for j in range(1, 6):
        fname = lang + "-" + str(5*i + j) + ".txt"
        with open("../Data/train/" + fname) as f:
            langs[lang + "-" + str(j)] = f.read().splitlines()

In [22]:
langs.keys()

dict_keys(['en-1', 'en-2', 'en-3', 'en-4', 'en-5', 'fr-1', 'fr-2', 'fr-3', 'fr-4', 'fr-5', 'id-1', 'id-2', 'id-3', 'id-4', 'id-5', 'tl-1', 'tl-2', 'tl-3', 'tl-4', 'tl-5'])

In [23]:
def make_text(key, dict):
    texts = "".join(dict[key])
    return texts.lower()

In [24]:
for key in langs:
    langs[key] = make_text(key, langs)

In [25]:
def alphabet_counter(langs: dict):
    from collections import Counter

    alphabet = "abcdefghijklmnopqrstuvwxyz"

    for key in langs:
        langs[key] = Counter(langs[key])

    # data, target 만들기
    data = pd.DataFrame(langs.values()).fillna(0)
    target = []
    for lang in langs.keys():
        target.append(lang[:2])

    newcol = []
    for col in data.columns:
        if col in alphabet:
            newcol.append(col)
    data = data.loc[:, newcol]

    return data, target



data, target = alphabet_counter(langs)

data.loc[21] = data.columns
data = data.sort_values(21, axis=1)
data = data.drop(21)


X = data.to_numpy()
y = np.array(target)

## [2] 데이터 나누기

In [26]:
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler()
X_scaled = X_scaled.fit_transform(X)

In [27]:
train_X, test_X, train_y, test_y = train_test_split(
    X_scaled, y, stratify=y
)

## [3] 모델 찾기

In [28]:
def all_estimators_classifier(data, target):
    from collections import defaultdict
    from sklearn.utils import all_estimators
    # 필터 타입에 해당하는 sklearn에 존재하는 모든 모델 이름과 객체 리스트로 반환
    models = all_estimators(type_filter = "classifier")

    train_X, test_X, train_y, test_y = train_test_split(
    data, target, test_size=0.2, random_state=42
    )

    scores = defaultdict()

    for name, model in models:
        try:
            # 모델 객체 생성
            md = model()
            # 학습
            md.fit(train_X, train_y)
            # 평가
            score = md.score(test_X, test_y)
            scores[name] = score
        except:
            pass

    return dict(scores)

In [29]:
all_estimators_classifier(X_scaled, y)

{'AdaBoostClassifier': 0.25,
 'BaggingClassifier': 0.25,
 'BernoulliNB': 1.0,
 'CalibratedClassifierCV': 0.75,
 'DecisionTreeClassifier': 0.0,
 'DummyClassifier': 0.0,
 'ExtraTreeClassifier': 0.5,
 'ExtraTreesClassifier': 0.5,
 'GaussianNB': 0.25,
 'GaussianProcessClassifier': 0.5,
 'GradientBoostingClassifier': 0.25,
 'HistGradientBoostingClassifier': 0.0,
 'KNeighborsClassifier': 0.75,
 'LinearDiscriminantAnalysis': 0.75,
 'LinearSVC': 1.0,
 'LogisticRegression': 1.0,
 'LogisticRegressionCV': 0.75,
 'MLPClassifier': 1.0,
 'NearestCentroid': 1.0,
 'NuSVC': 0.75,
 'PassiveAggressiveClassifier': 0.75,
 'Perceptron': 0.75,
 'QuadraticDiscriminantAnalysis': 0.25,
 'RandomForestClassifier': 0.75,
 'RidgeClassifier': 1.0,
 'RidgeClassifierCV': 1.0,
 'SGDClassifier': 0.75,
 'SVC': 0.0}

## [4] RidgeClassifier 사용 분석

In [40]:
from sklearn.linear_model import RidgeClassifier
model = RidgeClassifier().fit(train_X, train_y)

model.score(test_X, test_y)

1.0

In [41]:
# cross validate 사용
result = cross_validate(model, X_scaled, y, return_train_score=True)
result

{'fit_time': array([0.0054574 , 0.00298524, 0.0017314 , 0.00179696, 0.00148916]),
 'score_time': array([0.000669  , 0.00030327, 0.00031233, 0.00031209, 0.00027156]),
 'test_score': array([1.  , 1.  , 0.75, 0.75, 0.75]),
 'train_score': array([1., 1., 1., 1., 1.])}

## [5] 예측하기

In [32]:
langs_pred = defaultdict()
lang_list = ["en", "fr", "id", "tl"]

for i in range(4):
    lang = lang_list[i]
    for j in range(1, 3):
        fname = lang + "-" + str(2*i + j) + ".txt"
        with open("../Data/test/" + fname) as f:
            langs_pred[lang + "-" + str(j)] = f.read().splitlines()

In [33]:
for key in langs_pred:
    langs_pred[key] = make_text(key, langs_pred)

In [42]:
data_pred, target_pred = alphabet_counter(langs_pred)

data_pred.loc[9] = data_pred.columns
data_pred = data_pred.sort_values(9, axis=1)
data_pred = data_pred.drop(9)


X_pred = data_pred.to_numpy()
y_pred = np.array(target_pred)


X_pred_scaled = StandardScaler().fit_transform(X_pred)

In [43]:
print(target_pred)

['en', 'en', 'fr', 'fr', 'id', 'id', 'tl', 'tl']


In [44]:
model.predict(X_pred_scaled)

array(['en', 'fr', 'fr', 'fr', 'en', 'id', 'tl', 'en'], dtype='<U2')