In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
# Load the raw user table and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="galaxy_users.csv")
df.iloc[:2]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


### Q1.

In [None]:
# Work on an independent copy of the six add-on service columns
# (the contiguous run from OnlineSecurity through StreamingMovies in df).
df_q1 = df[
    [
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
    ]
].copy()
df_q1.iloc[:2]

In [None]:
# Distinct answers recorded for a single service column.
df_q1.loc[:, "OnlineSecurity"].unique()

In [None]:
# Print the distinct answers of every service column, one line per column.
for col_name, col_values in df_q1.items():
    print(col_name, col_values.unique())

In [11]:
# Alternatives that were considered:
# df_q1.drop_duplicates()  # removes duplicate rows
# df_q1.unique()  # DataFrame objects have no .unique() method
# df_q1.apply(lambda x: [x.unique()])  # the exam environment's version needs the result wrapped in a list
# Column-wise distinct values; this comes back as a table because every
# column has the same number of distinct values.
df_q1.apply(pd.Series.unique)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [13]:
# Keep the rows whose OnlineSecurity answer is anything other than "No internet service".
df_q1_sub = df_q1[~(df_q1["OnlineSecurity"] == "No internet service")]

In [15]:
# df_q1.loc[df_q1["OnlineSecurity"] == "No internet service", ]

In [16]:
# If "No internet service" were scattered inconsistently across the columns,
# it would have to be handled either with a loop or by going through missing values.
# NOTE: the missing-value approach is only valid when every pre-existing
# missing value has already been removed.
df_q1_sub = (
    df_q1
    .replace(to_replace="No internet service", value=np.nan)
    .dropna(axis=0, how="any")
)
df_q1_sub.shape[0]

5512

In [22]:
# For the first two rows, count how many cells hold "Yes" or "No".
df_q1_sub.head(2).isin(["Yes", "No"]).sum(axis=1)

In [23]:
# Check which distinct values remain in each column after the filtering.
df_q1_sub.apply(pd.Series.unique)

In [20]:
# Encode the answers as 1 ("Yes") / 0 ("No") and count the subscribed services per row.
#
# The service columns are selected explicitly (everything except "cnt") so that
# re-running this cell does not add an already existing "cnt" column into the
# row sum. The explicit int64 cast keeps the result numeric without relying on
# replace() silently downcasting object columns.
service_cols = df_q1_sub.columns.drop("cnt", errors="ignore")
df_q1_sub = (
    df_q1_sub[service_cols]
    .replace({"Yes": 1, "No": 0})
    .astype("int64")
)
df_q1_sub["cnt"] = df_q1_sub.sum(axis=1)

In [21]:
# Preview the encoded columns together with the per-row service count.
df_q1_sub.iloc[:2]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,cnt
0,0,1,0,0,0,0,1
1,1,0,1,0,0,0,2


In [24]:
# Number of rows for each service count, most frequent count first.
ser_cnt = df_q1_sub["cnt"].value_counts(sort=True, ascending=False)
ser_cnt

cnt
3    1117
2    1033
1     966
4     850
0     693
5     569
6     284
Name: count, dtype: int64

In [27]:
# Ratio of rows with exactly one service to rows with all six, to one decimal.
# .loc makes it explicit that 1 and 6 are index labels (service counts), not positions.
round(ser_cnt.loc[1] / ser_cnt.loc[6], 1)

3.4

### Q2.

In [28]:
# Independent copy of the three billing-related columns used in Q2.
df_q2 = df.loc[:, ["tenure", "MonthlyCharges", "TotalCharges"]].copy()

In [30]:
# Scratch check of the floor-division operator: // discards the fractional part.
7 // 2

3

In [31]:
# Whole number of monthly charges contained in the total charge (floor division).
df_q2["month"] = (
    df_q2["TotalCharges"] // df_q2["MonthlyCharges"]
)
df_q2.iloc[:2]

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,month
0,1,29.85,29.85,1.0
1,34,56.95,1889.5,33.0


In [32]:
# Pairwise correlations, rounded to three decimals; tenure vs. month comes out at 0.999.
df_q2[["tenure", "MonthlyCharges", "month"]].corr().round(3)

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.247,0.999
MonthlyCharges,0.247,1.0,0.246
month,0.999,0.246,1.0


### Q3.

In [35]:
# Modelling table for Q3: the target (Churn) first, followed by the two feature groups.
col1 = [
    "SeniorCitizen", "Partner", "Dependents",
    "tenure", "MonthlyCharges", "TotalCharges",
]
col2 = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingMovies", "PaperlessBilling",
]
df_q3 = df.loc[:, ["Churn", *col1, *col2]].copy()
df_q3.iloc[:2]

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,No,0,Yes,No,1,29.85,29.85,No,Yes,No,No,No,Yes
1,No,0,No,No,34,56.95,1889.5,Yes,No,Yes,No,No,No


In [51]:
# Encode "Yes" as 1 and "No" as 0 throughout the table; any other value is left as is.
df_q3 = df_q3.replace(to_replace=["Yes", "No"], value=[1, 0])
df_q3.iloc[:2]

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [42]:
# Column dtypes, then the names of the columns that are still object-typed.
ser_d = df_q3.dtypes
ser_d.loc[ser_d == "object"].index

Index(['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingMovies'],
      dtype='object')

In [43]:
# df_q3_obj = df_q3.select_dtypes(exclude = "number")  # unusable in the exam environment because of a bug
# Select the object-typed columns through the dtype Series instead.
df_q3_obj = df_q3.loc[:, ser_d[ser_d == "object"].index]
df_q3_obj.iloc[:2]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [52]:
# Distinct values left in the object-typed columns (shows what still needs encoding).
df_q3_obj.apply(pd.Series.unique)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service


In [53]:
# Encode the remaining "No internet service" answers as -1.
df_q3 = df_q3.replace(to_replace="No internet service", value=-1)

In [55]:
# 70 % of the rows go to training; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df_q3,
    train_size=0.7,
    random_state=123,
)
(df_train.shape[0], df_test.shape[0])

(4922, 2110)

In [57]:
# Learn the min-max ranges from the training rows only, then apply the same
# ranges to both splits.
# NOTE(review): df_train still contains the target (Churn, column 0), so the
# target is scaled together with the features; later cells slice column 0 of
# these arrays back out as the label.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (
    model_nor.transform(part) for part in (df_train, df_test)
)

In [58]:
# First scaled training row, kept two-dimensional.
arr_train_nor[:1]

array([[1.        , 0.        , 0.        , 0.        , 0.08450704,
        0.81116094, 0.07551927, 0.5       , 1.        , 0.5       ,
        0.5       , 1.        , 1.        ]])

In [61]:
# Per-column maxima the scaler recorded when it was fitted on df_train.
model_nor.data_max_

In [62]:
# Per-column minima the scaler recorded when it was fitted on df_train.
model_nor.data_min_

In [65]:
# Column 0 of the scaled arrays is the target; the remaining columns are the features.
model_lr = LogisticRegression(random_state=123).fit(
    arr_train_nor[:, 1:],
    arr_train_nor[:, 0],
)
pred = model_lr.predict(X=arr_test_nor[:, 1:])
pred[:3]

array([0., 0., 0.])

In [67]:
# F1 score of the test predictions against the test labels (column 0).
f1_score(arr_test_nor[:, 0], pred)

0.5479723046488625

In [66]:
# The same F1 score, rounded to two decimals.
round(
    f1_score(arr_test_nor[:, 0], pred),
    2,
)

0.55

#### Q. 특정 범주를 제외한 매우 많은 범주를 특정 값으로 일괄 치환해야 하는 경우

In [69]:
# Load the diamonds table from the parent directory and preview two rows.
df_d = pd.read_csv(filepath_or_buffer="../diamonds.csv")
df_d.iloc[:2]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [70]:
# df_d[["cut", "color", "clarity"]].apply(lambda x: [x.unique()])  # exam-environment version
# In the exam environment .explode() may have to be applied twice.
# Distinct categories of each categorical column.
df_d.loc[:, ["cut", "color", "clarity"]].apply(pd.Series.unique)

cut         [Ideal, Premium, Good, Very Good, Fair]
color                         [E, I, J, H, F, G, D]
clarity    [SI2, SI1, VS1, VS2, VVS2, VVS1, I1, IF]
dtype: object

> H와 F를 제외한 나머지 범주를 전부 -1로 치환하려면??

In [71]:
# Flatten the per-column category arrays into one long Series
# (one entry per category, indexed by the column it came from).
ser_u = (
    df_d[["cut", "color", "clarity"]]
    .apply(pd.Series.unique)
    .explode()
)
ser_u.iloc[:5]

cut        Ideal
cut      Premium
cut         Good
cut    Very Good
cut         Fair
dtype: object

In [74]:
# Replacement lookup keyed by the original category:
# "H" and "F" map to themselves, every other category maps to -1.
ser_repl = pd.Series(
    np.where(ser_u.isin(["H", "F"]), ser_u, -1),
    index=ser_u,
)
ser_repl.to_dict()

In [75]:
# Apply the lookup to the whole table: index entries are the values to find,
# the Series values are what they become.
df_d2 = df_d.replace(to_replace=ser_repl)
df_d2.iloc[:2]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,-1,-1,-1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,-1,-1,-1,59.8,61.0,326,3.89,3.84,2.31


In [76]:
# Confirm that only -1, "H" and "F" remain in the categorical columns.
df_d2.loc[:, ["cut", "color", "clarity"]].apply(pd.Series.unique)

cut              [-1]
color      [-1, H, F]
clarity          [-1]
dtype: object