#### 라이브러리 불러오기

In [2]:
import os
import pandas as pd

# 정적인 Matplotlib 대신, 대화형 그래프 및 시각화 지원 라이브러리
import plotly.express as px

#### 데이터 불러오기
* 넷플릭스 사용자 데이터 샘플:
  * 구독 유형, 수익, 나이, 성별 등이 있음

In [3]:
# Absolute path of the 'data' directory, resolved against the current working directory.
BASE_PATH = os.path.abspath('data')

In [4]:
# Read the Netflix user-base CSV found under BASE_PATH, using the 'User ID'
# column as the row index, then print the per-column dtype/non-null summary.
data_df = pd.read_csv(
    filepath_or_buffer=os.path.join(BASE_PATH, 'Netflix Userbase.csv'),
    index_col='User ID',
)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2500 entries, 1 to 2500
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Subscription Type  2500 non-null   object
 1   Monthly Revenue    2500 non-null   int64 
 2   Join Date          2500 non-null   object
 3   Last Payment Date  2500 non-null   object
 4   Country            2500 non-null   object
 5   Age                2500 non-null   int64 
 6   Gender             2500 non-null   object
 7   Device             2500 non-null   object
 8   Plan Duration      2500 non-null   object
dtypes: int64(2), object(7)
memory usage: 195.3+ KB


#### 데이터 확인

##### 컬럼별 고유값 수

In [7]:
# Count the distinct values in every column.
data_df.nunique()

Subscription Type      3
Monthly Revenue        6
Join Date            300
Last Payment Date     26
Country               10
Age                   26
Gender                 2
Device                 4
Plan Duration          1
dtype: int64

##### `구독유형`별 수익 및 나이 통계

In [8]:
# Mean, standard deviation, minimum and maximum of revenue and age
# for each subscription type.
(
    data_df
    .groupby('Subscription Type')[['Monthly Revenue', 'Age']]
    .agg(['mean', 'std', 'min', 'max'])
)

Unnamed: 0_level_0,Monthly Revenue,Monthly Revenue,Monthly Revenue,Monthly Revenue,Age,Age,Age,Age
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max
Subscription Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Basic,12.481481,1.681983,10,15,38.828829,7.190629,27,51
Premium,12.590723,1.733319,10,15,38.51296,7.302325,26,51
Standard,12.464844,1.64719,10,15,39.022135,7.020237,27,51


##### `국가별 구독유형별` 수익 및 나이 통계

In [9]:
# Same four statistics of revenue and age, now broken down by
# country first and subscription type second.
(
    data_df
    .groupby(['Country', 'Subscription Type'])[['Monthly Revenue', 'Age']]
    .agg(['mean', 'std', 'min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Monthly Revenue,Monthly Revenue,Monthly Revenue,Monthly Revenue,Age,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,min,max,mean,std,min,max
Country,Subscription Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Australia,Basic,12.193548,1.759154,10,15,37.935484,7.680866,27,51
Australia,Premium,12.415842,1.716201,10,15,38.613861,7.186056,27,51
Australia,Standard,12.529412,1.591891,10,15,38.098039,7.681809,27,51
Brazil,Basic,12.493151,1.727053,10,15,38.239726,7.479909,27,51
Brazil,Premium,12.424242,1.677616,10,15,38.969697,7.234833,27,51
Brazil,Standard,12.75,1.5,11,14,36.25,8.180261,29,46
Canada,Basic,12.524138,1.671172,10,15,38.393103,6.876183,27,51
Canada,Premium,12.340909,1.63922,10,15,38.488636,7.287202,27,51
Canada,Standard,12.47619,1.631587,10,15,39.440476,6.994578,28,51
France,Basic,12.361111,1.457057,10,15,39.277778,6.967726,28,51


#### 속성 공학(Feature Engineering)

##### 구독을 중단한 사람 확인
2023-06-30 이후 결제 내역이 없는 사람(마지막 결제일이 2023-06-30 이전이거나 같은 사용자) 확인

In [12]:
# Cutoff date used to decide whether a user has churned.
base_date = pd.Timestamp('2023-06-30')

In [11]:
# Convert both date columns from 'YYYY-MM-DD' strings to datetime values.
# NOTE(review): at least one row (User ID 21 in the later filter output) ends
# up with a Join Date after its Last Payment Date - confirm that the day/month
# order in the CSV really matches this format.
for date_col in ('Join Date', 'Last Payment Date'):
    data_df[date_col] = pd.to_datetime(data_df[date_col], format='%Y-%m-%d')

In [13]:
# Flag a user as churned when their last payment falls on or before base_date.
# Comparing the whole Series at once is vectorised and already yields booleans,
# so the row-by-row `apply` with a `True if ... else False` lambda is not needed.
data_df['Churned'] = data_df['Last Payment Date'] <= base_date

In [15]:
# Tally how many users are flagged as churned (True) versus not (False).
data_df['Churned'].value_counts()

Churned
True     1674
False     826
Name: count, dtype: int64

In [16]:
# Work on an independent deep copy so the columns added below leave data_df untouched.
work_df = data_df.copy(deep=True)

##### 월평균 가입자 수 (가입 월 기준)

In [17]:
# Reduce each join date to its calendar month (monthly Period values).
work_df['Month'] = work_df['Join Date'].dt.to_period(freq='M')

In [18]:
# Preview the first rows of the working frame.
work_df.head()

Unnamed: 0_level_0,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device,Plan Duration,Churned,Month
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Basic,10,2022-01-15,2023-10-06,United States,28,Male,Smartphone,1 Month,False,2022-01
2,Premium,15,2021-05-09,2023-06-22,Canada,35,Female,Tablet,1 Month,True,2021-05
3,Standard,12,2023-02-28,2023-06-27,United Kingdom,42,Male,Smart TV,1 Month,True,2023-02
4,Standard,12,2022-10-07,2023-06-26,Australia,51,Female,Laptop,1 Month,True,2022-10
5,Basic,10,2023-01-05,2023-06-28,Germany,33,Male,Smartphone,1 Month,True,2023-01


In [34]:
# Number of users per join month. `count()` tallies every non-null 'Churned'
# entry, i.e. churned and non-churned users alike; summing the boolean column
# instead would count only the churned ones.
monthly_subscribers = work_df['Churned'].groupby(work_df['Month']).count()

In [35]:
# Display the per-month counts.
monthly_subscribers

Month
2021-05      3
2021-08      1
2021-09      4
2021-10      2
2021-11      2
2021-12      2
2022-01     81
2022-02     87
2022-03     97
2022-04    106
2022-05    129
2022-06    336
2022-07    393
2022-08    291
2022-09    301
2022-10    415
2022-11    117
2022-12     95
2023-01      7
2023-02      5
2023-03      6
2023-04      2
2023-05      3
2023-06      2
2023-07      3
2023-08      4
2023-09      1
2023-10      1
2023-11      3
2023-12      1
Freq: M, Name: Churned, dtype: int64

In [36]:
# Show the users whose join date falls in October 2023.
work_df.loc[work_df['Join Date'].dt.strftime('%Y-%m').eq('2023-10')]

Unnamed: 0_level_0,Subscription Type,Monthly Revenue,Join Date,Last Payment Date,Country,Age,Gender,Device,Plan Duration,Churned,Month,JMonth
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
21,Premium,15,2023-10-06,2023-06-22,United States,26,Female,Laptop,1 Month,True,2023-10,10


In [40]:
# Average of the per-month counts. Only months that appear in the index
# contribute; months with no rows are absent rather than counted as zero.
average_subscribers_per_month = monthly_subscribers.mean()
average_subscribers_per_month

83.33333333333333

#### 시각화

In [41]:
# Share of users in each subscription type.
px.pie(data_frame=data_df, names='Subscription Type')

In [42]:
# Share of users in each country.
px.pie(data_frame=data_df, names='Country')

##### 넷플릭스 월 수입

In [47]:
# Semi-transparent histogram of the 'Monthly Revenue' values with explicit
# axis titles, a gap between bars and no legend.
histogram = px.histogram(
    data_frame=data_df,
    x='Monthly Revenue',
    opacity=0.6,
).update_layout(
    showlegend=False,
    bargap=0.3,
    yaxis_title='Count',
    xaxis_title='Monthly Revenue',
)
histogram.show()

In [56]:
# Number of users per country.
px.histogram(data_frame=data_df, barmode='group', x='Country')

##### 구독유형별로 얼마를 벌 수 있나?

In [57]:
# Monthly revenue aggregated per subscription type (one bar per type).
px.histogram(
    data_frame=data_df,
    x='Subscription Type',
    y='Monthly Revenue',
    barmode='group',
)

In [74]:
# Same revenue-per-subscription-type chart, with nearly touching bars and the legend hidden.
fig = px.histogram(
    data_frame=data_df,
    x='Subscription Type',
    y='Monthly Revenue',
    barmode='group',
).update_layout(showlegend=False, bargap=0.01)
fig.show()

##### 나라별 유형별 수익

In [59]:
# Revenue per country, split by subscription type.
# Without a `y` column px.histogram only counts rows, so the bars showed user
# counts rather than revenue. Supplying y='Monthly Revenue' makes each bar
# aggregate that column; histfunc='sum' is spelled out so the aggregation is
# explicit instead of relying on the default that applies when `y` is given.
px.histogram(
    data_df,
    x='Country',
    y='Monthly Revenue',
    color='Subscription Type',
    barmode='group',
    histfunc='sum',
)

##### 연령대별 가장 인기있는 구독 유형

In [64]:
# Age distribution per subscription type, drawn as side-by-side bars
# with the number of bins requested as 6.
px.histogram(
    data_frame=data_df,
    nbins=6,
    barmode='group',
    color='Subscription Type',
    x='Age',
)

In [65]:
# Decade boundaries for the age axis: one bin is requested per decade and the
# x axis is limited to the outermost boundaries. The axis is relabelled 'Age Group'.
bins = [10, 20, 30, 40, 50, 60]
px.histogram(
    data_frame=data_df,
    x='Age',
    color='Subscription Type',
    barmode='group',
    labels={'Age': 'Age Group'},
    range_x=[min(bins), max(bins)],
    nbins=len(bins) - 1,
)

In [72]:
# Grouped histogram of user counts per age decade and subscription type,
# with each decade labelled as a range ('10-20', '20-30', ...).
bins = [10, 20, 30, 40, 50, 60]
fig = px.histogram(
    data_df,
    x='Age',
    color='Subscription Type',
    barmode='group',
    histfunc='count',
    range_x=[min(bins), max(bins)],
    labels={'Age': 'Age Group'}
)
# Pin the bin edges to the decade boundaries. `nbins` is only a hint for the
# automatic binning and `range_x` merely clips the axis, so without explicit
# xbins the range labels below are not guaranteed to line up with the bars.
# The edges in `bins` are evenly spaced, so a single bin size describes them.
fig.update_traces(
    xbins=dict(start=bins[0], end=bins[-1], size=bins[1] - bins[0])
)
fig.update_xaxes(
    tickmode='array',
    # Positions at which the labels are drawn: the midpoint of every bin.
    tickvals=[
        (start + end) / 2 for start, end in zip(bins[:-1], bins[1:])
    ],
    # Text of each label: the bin's 'start-end' range.
    ticktext=[
        f'{s}-{e}' for s, e in zip(bins[:-1], bins[1:])
    ]
)
fig.show()

##### 성별 인기있는 구독 방법

In [73]:
# Number of users per gender, with one bar per subscription type.
px.histogram(
    data_frame=data_df,
    x='Gender',
    color='Subscription Type',
    barmode='group',
)