A demo for running a Jupyter notebook

In [1]:
# connect to GPU
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask

import os
# Move the working directory two levels up from wherever the kernel started.
# NOTE(review): the path is relative, so re-running this cell moves up two more
# levels each time -- run it once per kernel. Presumably this lands on the
# repository root so that the `src` package below is importable; confirm.
os.chdir('../../')

# Start a local Dask CUDA cluster limited to GPUs 0 and 1, communicating over
# UCX (with the TCP-over-UCX and InfiniBand options switched on) and using RMM
# managed memory with a 24GB pool.
cluster = LocalCUDACluster(
    CUDA_VISIBLE_DEVICES="0,1",
    protocol="ucx",
    enable_tcp_over_ucx=True,
    enable_infiniband=True,
    rmm_managed_memory=True,
    rmm_pool_size='24GB'
)
# Connect a distributed client to the cluster created above.
client = Client(cluster)

# Select cudf as the dask dataframe backend.
dask.config.set({"dataframe.backend": "cudf"})

# Project-local import; it is placed after the chdir above on purpose
# (presumably `src` is resolved relative to the new working directory).
from src.process.utils import CallDistanceUtil
# Helper object used by later cells to load the user table.
driver = CallDistanceUtil()

In [None]:
# Load the user table from the project helper, requesting the 'communication'
# target summarised with the 'mean' method.
user_info = driver.get_user_info(
    target='communication',
    method='mean',
)

  jitify._init_module()
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [4]:
# Column names of user_info as a plain Python list.
user_info.columns.tolist()

['client_nbr',
 'mean_communication_distance',
 'serv_id',
 'born_area_code',
 'register_district',
 'age',
 'male_flag',
 'tenure',
 'phone_brand',
 'phone_level',
 'phone_price',
 'evdo_support_flag',
 'arpu',
 'mou_total',
 'mou_local_callout',
 'mou_dist_callout',
 'network_usage_time',
 'use_evdo_flag',
 'use_onex_flag',
 'e9_service_flag',
 'e6_service_flag',
 'e9_service_premium_flag',
 '8card_service_flag',
 'smart_phone_flag',
 'govern_worker_flag',
 'business_purpose_flag',
 'red_mark_flag',
 'govern_cluster_flag',
 'govern_industry_flag',
 'vpn_support_flag']

In [3]:
import statsmodels.formula.api as smf
import pandas as pd

In [49]:
# Lookup table: area-code prefix (code with its last two digits dropped) -> label.
_BORN_AREA_NAMES = {
    5106: '德阳市',
    5101: '成都市',
    5107: '绵阳市',
    5110: '内江市',
    5109: '遂宁市',
    5130: '已撤銷_达川地区',
    5102: '已撤銷_乐山市市辖区',
    5108: '广元市',
    5113: '南充市',
    5129: '已撤銷_南充地区',
    5103: '自贡市',
    5111: '乐山市',
    5131: '已撤銷_雅安地區',
    5105: '泸州市',
    5134: '凉山彝族自治州',
    5139: '已撤銷_眉山地区资阳地区',
    5137: '已撤銷_巴中地區',
    5002: '重慶市城口县',
    5138: '已撤銷_眉山地区',
    5132: '阿坝藏族羌族自治州',
    5125: '已撤銷_宜宾市',
    5115: '宜宾市',
    5116: '广安市',
    5122: '已撤銷_万县市',
    5104: '攀枝花市',
    5133: '甘孜藏族自治州',
    3303: '浙江省温州市',
    6328: '青海省海西蒙古族藏族自治州',
}


def encode_born_area_code(x):
    """Translate a numeric birth-area code into a readable area label.

    The code is floor-divided by 100 (dropping its last two digits) and the
    resulting prefix is looked up in a fixed table of known areas.

    Args:
        x: Numeric birth-area code (presumably a six-digit administrative
            division code -- confirm against the data source).

    Returns:
        The area label for a known prefix, or '其他' for any other value.
    """
    prefix = x // 100
    return _BORN_AREA_NAMES.get(prefix, '其他')

# Frequency table of the 26 most common birth-area labels, with each label's
# share of all users in percent.
#
# The column names are pinned explicitly with rename_axis / reset_index(name=...):
# the default names produced by `value_counts().reset_index()` changed between
# pandas 1.x ('index', <series name>) and pandas 2.x (<series name>, 'count'),
# and relying on them made the ratio computation below operate on the label
# column under pandas 2.x.
df = (
    user_info
    .born_area_code
    .apply(encode_born_area_code)
    .value_counts()
    .head(26)
    .rename_axis('index')
    .reset_index(name='count')
)
# Percentage of all users, computed on the whole column at once.
df['ratio'] = df['count'] * 100 / user_info.shape[0]
# Display the table; unlike the previous unassigned rename, `df` itself now
# carries the 'count' column name that is shown.
df

Unnamed: 0,index,count,ratio
0,德阳市,56085,84.906517
1,其他,2061,3.120127
2,绵阳市,1389,2.102793
3,成都市,1178,1.783362
4,内江市,927,1.403376
5,遂宁市,645,0.976459
6,已撤銷_乐山市市辖区,434,0.657028
7,已撤銷_雅安地區,346,0.523806
8,广元市,330,0.499584
9,已撤銷_达川地区,313,0.473848


In [50]:
# Add a readable birth-area label column next to the numeric code
# (writes a new 'born_area' column into user_info).
user_info['born_area'] = (
    user_info['born_area_code']
    .apply(encode_born_area_code)
)

In [63]:
# Columns taken directly from user_info: the regressors plus the dependent
# variable (mean_communication_distance).
model_columns = [
    'age', 'male_flag', 'tenure', 'business_purpose_flag', 'smart_phone_flag',
    'govern_worker_flag', 'govern_cluster_flag', 'govern_industry_flag',
    'register_district', 'born_area', 'mean_communication_distance',
]
# Indicator columns for phone_level; drop_first omits the first level, which
# per the original note is 中端 (the comparison baseline).
phone_level_dummies = pd.get_dummies(user_info['phone_level'], drop_first=True)
data = user_info[model_columns].join([phone_level_dummies])

# The backslashes continue the string literal, so the formula is a single line.
# register_district and born_area enter only through their interaction term.
formula = """\
    mean_communication_distance ~ age + male_flag + tenure + \
        business_purpose_flag + smart_phone_flag + govern_worker_flag + \
        govern_cluster_flag + govern_industry_flag + 低端 + 超低端 + 超高端 + \
        高端 + register_district:born_area\
"""
# OLS fit reported with the HC0 covariance type.
ols_fit = smf.ols(formula, data=data).fit(cov_type='HC0')
ols_fit.summary()



0,1,2,3
Dep. Variable:,mean_communication_distance,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,3728.0
Date:,"Mon, 29 Apr 2024",Prob (F-statistic):,0.0
Time:,06:52:01,Log-Likelihood:,-321020.0
No. Observations:,66055,AIC:,642200.0
Df Residuals:,66004,BIC:,642600.0
Df Model:,50,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,13.6661,0.917,14.900,0.000,11.868,15.464
born_area[T.凉山彝族自治州],2.4215,7.072,0.342,0.732,-11.440,16.283
born_area[T.已撤銷_眉山地区],-7.6044,1.943,-3.914,0.000,-11.412,-3.797
born_area[T.攀枝花市],45.8640,27.701,1.656,0.098,-8.429,100.157
born_area[T.浙江省温州市],1.6195,7.946,0.204,0.839,-13.955,17.194
born_area[T.甘孜藏族自治州],-6.1333,2.164,-2.834,0.005,-10.375,-1.891
born_area[T.青海省海西蒙古族藏族自治州],-8.1514,1.610,-5.062,0.000,-11.307,-4.995
register_district[T.什邡]:born_area[其他],-2.1508,0.450,-4.780,0.000,-3.033,-1.269
register_district[T.广汉]:born_area[其他],-1.0739,0.425,-2.525,0.012,-1.908,-0.240

0,1,2,3
Omnibus:,100692.134,Durbin-Watson:,1.912
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44371575.994
Skew:,9.706,Prob(JB):,0.0
Kurtosis:,128.479,Cond. No.,1.39e+16


In [64]:
def simple_encode_born_area_code(x):
    x_ = x // 100
    
    match x_:
        case 5133:
            return '甘孜藏族自治州'
        case 5134:
            return '凉山彝族自治州'
        case 5138:
            return '已撤銷_眉山地区'
        case 6328:
            return '青海省海西蒙古族藏族自治州'
        case _:
            return '其他'

# Replace the 'born_area' column in user_info with the coarse labels
# (this overwrites whatever 'born_area' held before this cell ran).
user_info['born_area'] = (
    user_info['born_area_code']
    .apply(simple_encode_born_area_code)
)

# Columns taken directly from user_info: the regressors plus the dependent
# variable (mean_communication_distance).
model_columns = [
    'age', 'male_flag', 'tenure', 'business_purpose_flag', 'smart_phone_flag',
    'govern_worker_flag', 'govern_cluster_flag', 'govern_industry_flag',
    'register_district', 'born_area', 'mean_communication_distance',
]
# Indicator columns for phone_level; drop_first omits the first level, which
# per the original note is 中端 (the comparison baseline).
phone_level_dummies = pd.get_dummies(user_info['phone_level'], drop_first=True)
data = user_info[model_columns].join([phone_level_dummies])

# The backslashes continue the string literal, so the formula is a single line.
# born_area and register_district enter only through their interaction,
# both marked as categorical with C(...).
formula = """\
    mean_communication_distance ~ age + male_flag + tenure + \
        business_purpose_flag + smart_phone_flag + govern_worker_flag + \
        govern_cluster_flag + govern_industry_flag + 低端 + 超低端 + 超高端 + \
        高端 + C(born_area):C(register_district)\
"""
# OLS fit reported with the HC0 covariance type.
ols_fit = smf.ols(formula, data=data).fit(cov_type='HC0')
ols_fit.summary()



0,1,2,3
Dep. Variable:,mean_communication_distance,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,3862.0
Date:,"Mon, 29 Apr 2024",Prob (F-statistic):,0.0
Time:,06:56:41,Log-Likelihood:,-321100.0
No. Observations:,66055,AIC:,642300.0
Df Residuals:,66015,BIC:,642600.0
Df Model:,39,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,13.7527,0.930,14.793,0.000,11.931,15.575
C(register_district)[T.什邡],-2.1565,0.450,-4.797,0.000,-3.038,-1.275
C(register_district)[T.广汉],-1.0528,0.427,-2.468,0.014,-1.889,-0.217
C(register_district)[T.德阳现业],-1.2302,0.381,-3.230,0.001,-1.977,-0.484
C(register_district)[T.绵竹],-1.3468,0.460,-2.931,0.003,-2.247,-0.446
C(register_district)[T.罗江],-1.4862,0.580,-2.564,0.010,-2.622,-0.350
C(born_area)[T.凉山彝族自治州]:C(register_district)[中江],2.4108,7.071,0.341,0.733,-11.447,16.269
C(born_area)[T.已撤銷_眉山地区]:C(register_district)[中江],-7.6026,1.949,-3.901,0.000,-11.423,-3.783
C(born_area)[T.甘孜藏族自治州]:C(register_district)[中江],-6.1280,2.162,-2.834,0.005,-10.365,-1.891

0,1,2,3
Omnibus:,100865.575,Durbin-Watson:,1.913
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44653999.84
Skew:,9.739,Prob(JB):,0.0
Kurtosis:,128.877,Cond. No.,1.39e+16


In [69]:
# Columns taken directly from user_info: the regressors plus the dependent
# variable (mean_communication_distance).
model_columns = [
    'age', 'male_flag', 'tenure', 'business_purpose_flag', 'smart_phone_flag',
    'govern_worker_flag', 'govern_cluster_flag', 'govern_industry_flag',
    'register_district', 'born_area', 'mean_communication_distance',
]
# Indicator columns appended to the frame. phone_level omits its first level
# (per the original note, 中端 is the baseline); register_district and
# born_area keep every level.
phone_level_dummies = pd.get_dummies(user_info['phone_level'], drop_first=True)
register_district_dummies = pd.get_dummies(user_info['register_district'])
born_area_dummies = pd.get_dummies(user_info['born_area'])
data = user_info[model_columns].join([
    phone_level_dummies,
    register_district_dummies,
    born_area_dummies,
])

# The backslashes continue the string literal, so the formula is a single line.
# This variant adds the register_district main effect alongside the
# born_area x register_district interaction.
formula = """\
    mean_communication_distance ~ age + male_flag + tenure + \
        business_purpose_flag + smart_phone_flag + govern_worker_flag + \
        govern_cluster_flag + govern_industry_flag + 低端 + 超低端 + 超高端 + \
        高端 + C(register_district) + C(born_area):C(register_district)\
"""
# NOTE(review): the fit for this specification is left disabled, exactly as in
# the original cell, which only sketched it in comments:
#     smf.ols(formula, data=data, drop_cols=['']).fit(cov_type='HC0').summary()
# The column list to pass as drop_cols was never filled in.

['age',
 'male_flag',
 'tenure',
 'business_purpose_flag',
 'smart_phone_flag',
 'govern_worker_flag',
 'govern_cluster_flag',
 'govern_industry_flag',
 'register_district',
 'born_area',
 'mean_communication_distance',
 '低端',
 '超低端',
 '超高端',
 '高端',
 '中江',
 '什邡',
 '广汉',
 '德阳现业',
 '绵竹',
 '罗江',
 '其他',
 '凉山彝族自治州',
 '已撤銷_眉山地区',
 '甘孜藏族自治州',
 '青海省海西蒙古族藏族自治州']