In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
df = pd.read_csv("galaxy_users.csv")
df.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


### Q1.

In [5]:
df_q1 = df.loc[:, "OnlineSecurity":"StreamingMovies"].copy()
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [6]:
df_q1["OnlineSecurity"].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [9]:
# df_q1.unique()
# df_q1.drop_duplicates()

In [11]:
for n_col in range(6):
    print(df_q1.columns[n_col], ":", df_q1.iloc[:, n_col].unique())

OnlineSecurity : ['No' 'Yes' 'No internet service']
OnlineBackup : ['Yes' 'No' 'No internet service']
DeviceProtection : ['No' 'Yes' 'No internet service']
TechSupport : ['No' 'Yes' 'No internet service']
StreamingTV : ['No' 'Yes' 'No internet service']
StreamingMovies : ['No' 'Yes' 'No internet service']


In [12]:
df_q1.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [13]:
pd.__version__

'1.5.1'

In [15]:
df_q1.apply(lambda x: x.unique()).melt()["value"].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [16]:
df_q1_sub = df_q1.loc[df_q1["OnlineSecurity"] != "No internet service"]
len(df_q1), len(df_q1_sub)

(7032, 5512)

In [17]:
df_q1_sub.apply(lambda x: x.unique()).melt()["value"].unique()

array(['No', 'Yes'], dtype=object)

In [18]:
df_q1_sub2 = df_q1.loc[df_q1["OnlineSecurity"] == "No internet service"]
df_q1_sub2.head()

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
11,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
16,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
21,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
22,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
33,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [22]:
df_q1_sub = df_q1_sub.replace({"Yes": 1, "No": 0})
df_q1_sub["cnt"] = df_q1_sub.sum(axis = 1)
df_q1_sub.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,cnt
0,0,1,0,0,0,0,1
1,1,0,1,0,0,0,2


In [23]:
df_q1_sub["cnt"].value_counts()

3    1117
2    1033
1     966
4     850
0     693
5     569
6     284
Name: cnt, dtype: int64

In [24]:
966 / 284

3.4014084507042255

In [25]:
round(df_q1_sub["cnt"].value_counts()[1] / df_q1_sub["cnt"].value_counts()[6], 1)

3.4

In [27]:
len(df_q1), len(df_q1_sub)

(7032, 5512)

In [28]:
df_q1_sub3 = df_q1.replace({"No internet service": np.nan}).dropna()
len(df_q1_sub3)

5512

yes랑 no만 들어있는 row를 추출하고자 한다!

In [31]:
df_q1.shape[1]

6

In [30]:
df_q1_sub4 = df_q1.loc[df_q1.apply(lambda x: x.isin(["Yes", "No"]).sum(),
                                                        axis = 1) == df_q1.shape[1], ]
len(df_q1_sub4)

5512

### Q2.

In [34]:
10 // 3

3

In [36]:
df_q2 = df[["tenure", "MonthlyCharges", "TotalCharges"]].copy()
df_q2.head(2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5


In [37]:
df_q2["month"] = df_q2["TotalCharges"] // df_q2["MonthlyCharges"]
df_q2.head(2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,month
0,1,29.85,29.85,1.0
1,34,56.95,1889.5,33.0


In [38]:
df_q2.drop("TotalCharges", axis = 1).corr()

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.246862,0.998831
MonthlyCharges,0.246862,1.0,0.246164
month,0.998831,0.246164,1.0


In [39]:
df_q2.drop("TotalCharges", axis = 1).corr()["month"].round(3)[0]

0.999

In [None]:
df_q2.corr()

### Q3.

In [44]:
col1 = ["SeniorCitizen", "Partner", "Dependents", "tenure", "MonthlyCharges", "TotalCharges"]
col2 = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingMovies", "PaperlessBilling"]
df_q3 = df[["Churn"] + col1 + col2].copy()
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,No,0,Yes,No,1,29.85,29.85,No,Yes,No,No,No,Yes
1,No,0,No,No,34,56.95,1889.5,Yes,No,Yes,No,No,No


In [45]:
df_q3 = df_q3.replace({"Yes": 1, "No": 0})
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [46]:
df_q3_cat = df_q3.select_dtypes(exclude = "number")
df_q3_cat.head()

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0
2,1,1,0,0,0
3,1,0,1,1,0
4,0,0,0,0,0


In [47]:
df_q3_cat.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service


In [48]:
df_q3 = df_q3.replace({"No internet service": -1})
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [49]:
df_train, df_test = train_test_split(df_q3, train_size = 0.7, random_state = 123)
len(df_train), len(df_test)

(4922, 2110)

In [50]:
model_nor = MinMaxScaler().fit(df_train)
df_train_nor = model_nor.transform(df_train)
df_test_nor  = model_nor.transform(df_test)

In [52]:
df_train_nor = pd.DataFrame(df_train_nor, columns = df_train.columns)
df_test_nor  = pd.DataFrame(df_test_nor,  columns = df_test.columns)

In [55]:
# df_train_nor[:2, ]
df_train_nor.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,1.0,0.0,0.0,0.0,0.084507,0.811161,0.075519,0.5,1.0,0.5,0.5,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.607374,0.006987,0.5,0.5,0.5,0.5,0.5,1.0


In [60]:
model_lr = LogisticRegression(random_state = 123)
model_lr.fit(X = df_train_nor.drop(columns = "Churn"),
             y = df_train_nor["Churn"])
pred = model_lr.predict(df_test_nor.drop(columns = "Churn"))
pred[:4]

array([0., 0., 0., 0.])

In [61]:
round(f1_score(y_true = df_test_nor["Churn"], 
               y_pred = pred), 2)

0.55

In [63]:
df_test_nor.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0.0,0.0,0.0,0.0,0.478873,0.017937,0.076806,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.239437,0.572995,0.156272,0.5,0.5,0.5,1.0,0.5,1.0


In [65]:
pd.DataFrame(model_nor.inverse_transform(df_test_nor.head(2)), 
             columns = df_test.columns)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0.0,0.0,0.0,0.0,35.0,20.2,684.4,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
1,0.0,0.0,0.0,0.0,18.0,75.9,1373.05,0.0,0.0,0.0,1.0,0.0,1.0


In [66]:
df_test.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
6819,0,0,0,0,35,20.2,684.4,-1,-1,-1,-1,-1,1
364,0,0,0,0,18,75.9,1373.05,0,0,0,1,0,1


In [67]:
pd.DataFrame(model_nor.inverse_transform(df_test_nor.iloc[:2, :3]), 
             columns = df_test.columns)

ValueError: operands could not be broadcast together with shapes (2,3) (13,) (2,3) 