# 외향적인 사람과 내향적인 사람 예측

In [None]:
import pandas as pd

In [None]:
# Read both competition CSV files into DataFrames.
person_train_df = pd.read_csv('train (1).csv')
person_test_df = pd.read_csv('test (1).csv')

In [None]:
# Preview the first rows of the training frame.
person_train_df.head()

### 컬럼명 설명
- Time_spent_Alone : 혼자 보내는 시간
- Stage_fear : 무대 공포증 여부 (Yes/No)
- Social_event_attendance : 사회적 이벤트 참여
- Going_outside : 밖에 나가는 정도
- Drained_after_socializing : 사교 활동 후 피로감을 느끼는지 여부 (Yes/No)
- Friends_circle_size : 친구 관계 규모
- Post_frequency : 온라인에 글 업로드 빈도
- Personality : 외향적/내향적(정답 레이블)

In [None]:
# Summary statistics of the training frame.
person_train_df.describe()

In [None]:
# (rows, columns) of the training frame.
person_train_df.shape

### 데이터 전처리


In [None]:
# Count missing values per column before any imputation.
person_train_df.isna().sum()

In [None]:
# Impute missing values in the numeric columns with each column's mean.
fill_null = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]
numeric_means = person_train_df[fill_null].mean()
person_train_df[fill_null] = person_train_df[fill_null].fillna(numeric_means)

In [None]:
# Verify the result of the missing-value handling.
person_train_df.isna().sum()

In [None]:
# Names of the two Yes/No categorical columns.
cat = ['Stage_fear','Drained_after_socializing']

In [None]:
# Keep only the rows with no missing values. The explicit copy makes
# cat_train_df an independent frame, so the column assignments performed on it
# later cannot trigger pandas' SettingWithCopyWarning.
cat_train_df = person_train_df.dropna().copy()

In [None]:
# Confirm that no missing values remain after dropna().
cat_train_df.isna().sum()

In [None]:
# Integer codes for the Stage_fear answers.
stage_fear_map = dict(Yes=0, No=1)

In [None]:
# Integer codes for the Drained_after_socializing answers.
drained_map = dict(Yes=0, No=1)

In [None]:
# Encode the categorical columns as integers.
from sklearn.feature_selection import f_classif

for column_name, code_map in (
    ('Stage_fear', stage_fear_map),
    ('Drained_after_socializing', drained_map),
):
    cat_train_df[column_name] = cat_train_df[column_name].map(code_map)

In [None]:
# Check for values the mapping could not translate (they would show up as NaN).
cat_train_df.isna().sum()

In [None]:
# Inspect the column dtypes after the mapping.
cat_train_df.dtypes

In [None]:
# Remove the target column from the complete-case frame.
cat_train_df = cat_train_df.drop(columns='Personality')

In [None]:
# Frame without the target and the two categorical columns.
per_train_df = person_train_df.drop(columns=['Personality','Drained_after_socializing','Stage_fear'])

In [None]:
# Inspect the dtypes of the remaining columns.
per_train_df.dtypes

In [None]:
# Coerce the id and numeric feature columns to numeric dtypes; values that
# cannot be parsed become NaN (errors='coerce').
# NOTE(review): per_train_df is rebuilt from person_train_df by the next
# assignment to it in this notebook, so only the cat_train_df conversion persists.
col = ['id','Time_spent_Alone','Social_event_attendance','Going_outside','Friends_circle_size','Post_frequency']
per_train_df[col] = per_train_df[col].apply(pd.to_numeric,errors = 'coerce')
cat_train_df[col] = cat_train_df[col].apply(pd.to_numeric,errors = 'coerce')

In [None]:
# Rebuild the frame without the target and the two categorical columns
# (same statement as the earlier per_train_df assignment).
per_train_df = person_train_df.drop(columns=['Personality','Drained_after_socializing','Stage_fear'])

In [None]:
# Re-wrap both objects with the DataFrame constructor.
per_train_df = pd.DataFrame(per_train_df)
cat_train_df = pd.DataFrame(cat_train_df)

In [None]:
# Shape of the numeric feature columns once rows with missing values are dropped.
per_train_df[fill_null].dropna().shape

### 범주형 컬럼 f-검정
- F-score가 가장 높은(범주형 컬럼과 연관성이 가장 큰) 수치형 컬럼을 채택

In [None]:
# F-test of the numeric features against the Stage_fear column.
y = cat_train_df['Stage_fear']
X = per_train_df[fill_null]

# Restrict both sides to the row labels they share.
common_idx = X.index.intersection(y.index)
X, y = X.loc[common_idx], y.loc[common_idx]
f_vals, p_vals = f_classif(X, y)

pd.DataFrame({'feature': fill_null, 'F_score': f_vals, 'p_value': p_vals})

In [None]:
# F-test of the numeric features against the Drained_after_socializing column.
y1 = cat_train_df['Drained_after_socializing']
X1 = per_train_df[fill_null]

# Restrict both sides to the row labels they share.
common_idx1 = X1.index.intersection(y1.index)
X1, y1 = X1.loc[common_idx1], y1.loc[common_idx1]
f_vals1, p_vals1 = f_classif(X1, y1)

pd.DataFrame({'feature': fill_null, 'F_score': f_vals1, 'p_value': p_vals1})

In [None]:
# Distinct values present in the encoded Stage_fear column.
cat_train_df['Stage_fear'].unique()

In [None]:
# Missing values per column in the full training frame.
person_train_df.isna().sum()

In [None]:
# Encode Stage_fear on the full training frame; missing answers stay NaN.
stage_fear_codes = {'Yes': 0, 'No': 1}
person_train_df['Stage_fear'] = person_train_df['Stage_fear'].map(stage_fear_codes)


In [None]:
# Encode Drained_after_socializing on the full training frame; missing answers stay NaN.
drained_codes = {'Yes': 0, 'No': 1}
person_train_df['Drained_after_socializing'] = (
    person_train_df['Drained_after_socializing'].map(drained_codes)
)

In [None]:
# Fill the missing values of the two encoded categorical columns from
# Time_spent_Alone: rows at or above its mean get 0 ('Yes'), the others get 1 ('No').
threshhold = person_train_df['Time_spent_Alone'].mean()

# Snapshot of the frame taken before the imputation below.
person_train_df_filled = person_train_df.copy()

# Per-row fill value, computed once for the whole column instead of row by row.
alone_based_code = 1 - (person_train_df['Time_spent_Alone'] >= threshhold).astype(int)

for column_name in ('Stage_fear', 'Drained_after_socializing'):
    # fillna aligns on the index, so only the NaN rows take the computed code.
    person_train_df[column_name] = person_train_df[column_name].fillna(alone_based_code)

In [None]:
# Display the training frame after imputation.
person_train_df

In [None]:
# Preview the first rows after imputation.
person_train_df.head()

In [None]:
# Encode the target label as an integer.
personality_codes = {'Extrovert': 0, 'Introvert': 1}
person_train_df['Personality'] = person_train_df['Personality'].map(personality_codes)

In [None]:
# Target vector and feature matrix (the id column is not used as a feature).
y = person_train_df['Personality']
X = person_train_df.drop(columns=['id', 'Personality'])

In [None]:
# Shapes of the feature matrix and the target vector.
X.shape, y.shape

### 훈련 데이터, 검증 데이터 분할

In [None]:
# Display the feature matrix.
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The fixed seed (the same one used for
# the models in this notebook) makes the split repeatable, and stratify=y keeps
# the class proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=2020158011, stratify=y
)


In [None]:
!pip install lazypredict

### lazypredict 사용해 상위 2개 알고리즘 선정

In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit lazypredict's LazyClassifier on the training split and evaluate it on the
# validation split.
clf = LazyClassifier(verbose=0)
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Print the first object returned by fit (presumably the per-model score table).
print(models)

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Display the training split of the features.
X_train

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a seeded Perceptron on the training split;
# prints the score of each fold.
model = Perceptron(max_iter=1000, eta0=1.0,random_state=2020158011)
scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"cross_validated accuracy : {scores}")

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Baseline LightGBM classifier with default hyperparameters. Fit and evaluate
# `lgb` itself; previously the Perceptron held in `model` was trained and scored
# here, so the LightGBM model was never used.
lgb = LGBMClassifier(random_state=2020158011)
lgb.fit(X_train, y_train)
preds = lgb.predict(X_val)

print(f"accuracy : {accuracy_score(y_val,preds)}")
print(classification_report(y_val, preds))

# Confusion matrix of the validation predictions.
cm = confusion_matrix(y_val, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.show()

In [None]:
!pip install optuna

### LGBM 알고리즘 optuna 사용해 하이퍼 파라미터 조정

In [None]:
import optuna
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
# Optuna objective function
def objective(trial):
    """Train LightGBM with trial-suggested hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        Log loss on the validation split (the study minimizes this value).
    """
    # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0,
    # so this parameter may currently have no effect — confirm whether row
    # bagging was intended.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 15, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'random_state': 2020158011
    }
    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)
    # log_loss is defined on predicted probabilities; feeding it the hard labels
    # from predict() collapses it into a clipped error count.
    proba = model.predict_proba(X_val)
    return log_loss(y_val, proba, labels=model.classes_)


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Best trial:', study.best_params)

Best trial: {'n_estimators': 147, 'learning_rate': 0.22790308217928787, 'max_depth': 3, 'num_leaves': 46, 'min_child_samples': 72, 'subsample': 0.9546063780884504, 'colsample_bytree': 0.7352959423814639, 'reg_alpha': 0.03923110923582118, 'reg_lambda': 0.6929538798742644}

In [None]:
# Take the hyperparameters of the best-performing trial.
best_params = study.best_params
print(best_params)

In [None]:
# Refit on the full training data with the tuned hyperparameters.
# study.best_params only holds the values suggested by the trial, so the fixed
# seed used during tuning has to be passed again explicitly.
best_model = LGBMClassifier(**best_params, random_state=2020158011)
best_model.fit(X, y)

### 테스트 데이터 전처리

In [None]:
# Load the test CSV and count missing values per column.
test_df = pd.read_csv('test (1).csv')
test_df.isna().sum()

In [None]:
# Impute the numeric test columns with the TRAINING means, so a missing value is
# encoded exactly as it was in the data the model was fitted on (test-set
# statistics are not used for preprocessing).
test_df[fill_null] = test_df[fill_null].fillna(person_train_df[fill_null].mean())

In [None]:
# Fill the missing Yes/No answers of the test set with the same rule that was
# applied to the training data: Time_spent_Alone at or above the training mean
# -> 'Yes', otherwise 'No'.
# The columns still hold strings at this point and are encoded with
# stage_fear_map / drained_map afterwards, so the fill values must be the
# string labels; numeric fill values would be turned back into NaN by that
# mapping.
threshhold = person_train_df['Time_spent_Alone'].mean()

# Snapshot of the test frame taken before the imputation below.
test_df_filled = test_df.copy()

alone_based_label = (test_df['Time_spent_Alone'] >= threshhold).map({True: 'Yes', False: 'No'})

for column_name in ('Stage_fear', 'Drained_after_socializing'):
    # fillna aligns on the index, so only the NaN rows take the computed label.
    test_df[column_name] = test_df[column_name].fillna(alone_based_label)

In [None]:
# Confirm that the test frame has no missing values left.
test_df.isna().sum()

In [None]:
# List the test frame's column names.
test_df.columns

In [None]:
# Encode the two categorical test columns with the maps defined for training.
for column_name, code_map in (
    ('Stage_fear', stage_fear_map),
    ('Drained_after_socializing', drained_map),
):
    test_df[column_name] = test_df[column_name].map(code_map)

### 예측 수행 후 저장

In [None]:
# Feature columns passed to the model for prediction.
features = [
    'Time_spent_Alone',
    'Stage_fear',
    'Social_event_attendance',
    'Going_outside',
    'Drained_after_socializing',
    'Friends_circle_size',
    'Post_frequency',
]
X_test = test_df[features]
y_test_pred = best_model.predict(X_test)

# Write id + predicted class code to the intermediate submission file.
submission = pd.DataFrame({'id': test_df['id'], 'Personality': y_test_pred})
submission.to_csv('person_submission1.csv', index=False)

In [None]:
# Reload the numeric submission; class codes are 0 = 'Extrovert', 1 = 'Introvert'.
sub_df = pd.read_csv('person_submission1.csv')

In [None]:
# Turn the numeric class codes back into their text labels.
code_to_label = {0: 'Extrovert', 1: 'Introvert'}
sub_df['Personality'] = sub_df['Personality'].map(code_to_label)

In [None]:
# Save the labelled submission without the DataFrame index.
sub_df.to_csv('person1.csv',index=False)

In [None]:
import pandas as pd
# Read the final submission back, using its first column as the index.
result_df = pd.read_csv('person1.csv',index_col=0)

In [None]:
# Display the final submission.
result_df