#### 자동차 회사는 새로운 전략을 수립하기 위해 4개의 시장으로 세분화했습니다. <br /> 기존 고객 분류 자료를 바탕으로 신규 고객이 어떤 분류에 속할지 예측해주세요!<br> 
#### 예측할 값(y): "Segmentation" (1,2,3,4) <br /> 평가: Macro f1-score <br /> data: train.csv, test.csv

In [81]:
import pandas as pd
import numpy as np

In [82]:
# Read the training set and the test set from the local Dataset folder.
train_df, test_df = (
    pd.read_csv('./Dataset/train.csv'),
    pd.read_csv('./Dataset/test.csv'),
)

In [83]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,4
1,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,2
2,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,2
3,461319,Male,Yes,56,No,Artist,0.0,Average,2.0,Cat_6,3
4,460156,Male,No,32,Yes,Healthcare,1.0,Low,3.0,Cat_6,3


In [84]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
3,459003,Male,Yes,47,Yes,Doctor,0.0,High,5.0,Cat_4
4,459005,Male,Yes,61,Yes,Doctor,5.0,Low,3.0,Cat_6


In [85]:
# Count missing values per column in the training set.
train_df.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
Segmentation       0
dtype: int64

In [86]:
# Count missing values per column in the test set.
test_df.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

In [87]:
# Class balance of the target: number of training rows per segment.
train_df.Segmentation.value_counts()

Segmentation
4    1757
3    1720
1    1616
2    1572
Name: count, dtype: int64

In [88]:
# List the column labels of the training frame.
train_df.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

In [89]:
# Encode categorical columns as integer codes.

from sklearn.preprocessing import LabelEncoder
# Family_Size is numeric in the raw data; it stays in this list so the feature
# set fed to the model is the same as before.
cols = ['Gender','Ever_Married', 'Graduated', 'Profession','Spending_Score','Family_Size','Var_1']

for col in cols:
    # Fit ONE encoder per column on the union of the train and test values.
    # Fitting separately on each frame would assign codes independently, so a
    # category missing from either frame would shift every later code and the
    # model would see a different encoding at predict time than at fit time.
    le = LabelEncoder()
    le.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [90]:
# Modeling: separate the target from the features.

# pop() removes 'Segmentation' from train_df in place and returns it, so
# train_df no longer contains the target column afterwards.
# NOTE(review): not idempotent - re-running this cell raises KeyError because
# the column has already been removed.
y_train = train_df.pop('Segmentation')
# Bare expression so the notebook displays the target Series.
y_train

0       4
1       2
2       2
3       3
4       3
       ..
6660    2
6661    4
6662    4
6663    2
6664    2
Name: Segmentation, Length: 6665, dtype: int64

In [91]:
# Remove the ID column from both frames in place. The test-set IDs are kept in
# cust_id; the training IDs are simply discarded.
cust_id = test_df.pop('ID')
del train_df['ID']

In [92]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, depth capped at 5, with a fixed random_state.
rf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42)
# fit() returns the estimator itself, so training and test-set prediction can
# be chained in one expression.
y_pred = rf.fit(train_df, y_train).predict(test_df)

In [93]:
# Preview the feature frame as it was passed to the model.
train_df.head(n=5)

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,1,0,22,0,5,1.0,2,3,3
1,0,1,67,1,2,1.0,2,0,5
2,1,1,67,1,7,0.0,1,1,5
3,1,1,56,0,0,0.0,0,1,5
4,1,0,32,1,5,1.0,2,2,5


In [96]:
# Cross-validation
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the forest, scored with macro-averaged F1.
scores = cross_val_score(rf, train_df, y_train, cv=5, scoring='f1_macro')
# First line: per-fold scores; second line: their mean.
print(scores, scores.mean(), sep='\n')

[0.51845156 0.50509426 0.50043692 0.52193564 0.51107816]
0.5113993072872057


In [98]:
# Print the built-in documentation for cross_val_score.
# NOTE(review): interactive lookup left in the notebook; it adds a long text
# output and is probably safe to remove before sharing - confirm with the author.
help(cross_val_score)

Help on function cross_val_score in module sklearn.model_selection._validation:

cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, params=None, pre_dispatch='2*n_jobs', error_score=nan)
    Evaluate a score by cross-validation.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to fit. Can be for example a list, or an array.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs),             default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group"

In [78]:
# Display the labels predicted for the test set.
y_pred

array([1, 3, 2, ..., 1, 2, 4])

In [79]:
# Build the submission table: the test-set IDs alongside their predicted segment.
submission = cust_id.to_frame(name='ID')
submission['Segmentation'] = y_pred
# NOTE(review): the file name contains ':' characters, which are not valid in
# Windows file names - consider a different separator if portability matters.
submission.to_csv('9_26_11:28_2.csv', index=False)