# Setup

In [1]:
import os
os.chdir('..')
import cudf
import hvplot.cudf

In [2]:
# Load the two processed 201308 inputs with cudf. The relative paths resolve
# against the directory selected by the os.chdir('..') call in the first cell.
userinfo = cudf.read_csv('data/processed/201308/clean_user_info.csv')
clean_cdr = cudf.read_parquet('data/processed/201308/clean_cdr')

In [3]:
# Keep only the records whose calling and called numbers are both listed in
# userinfo.client_nbr.
_known = userinfo.client_nbr
target = clean_cdr[clean_cdr.calling_nbr.isin(_known) & clean_cdr.called_nbr.isin(_known)]

# Row masks telling on which side of the call the row's client_nbr sits.
_client_is_caller = target.client_nbr == target.calling_nbr
_client_is_callee = target.client_nbr == target.called_nbr

# Numbers that match the row's client_nbr on at least one retained record.
_test = cudf.concat([
    target.calling_nbr[_client_is_caller].unique(),
    target.called_nbr[_client_is_callee].unique(),
]).unique()

# Numbers that sit on the opposite side of those same records.
_control = cudf.concat([
    target.called_nbr[_client_is_caller].unique(),
    target.calling_nbr[_client_is_callee].unique(),
]).unique()

# Share of `_test` numbers that also occur in `_control`.
_test.isin(_control).sum() / len(_test)

0.9886039400000853

In [4]:
# Reverse check: share of `_control` numbers that also occur in `_test`.
_control.isin(_test).sum() / len(_control)

0.9711253650177848

In [5]:
# Drop every number that occurs on both sides so the two groups are disjoint.
test = _test[~_test.isin(_control)]
control = _control[~_control.isin(_test)]

# Size of the resulting test group.
len(test)

2672

In [6]:
# Size of the resulting control group.
len(control)

6892

# Helper Class

In [7]:
class TCInspecter:
    """Compare user attributes between a test group and a control group.

    The inspector slices a user-info frame into two groups by ``client_nbr``
    and offers box plots for the continuous features, per-group means for the
    binary flags and stacked share bars for the categorical variables.

    All plots go through the ``.hvplot`` accessor, so the matching hvplot
    integration (``hvplot.cudf`` in this notebook) must be imported first.
    """

    # Columns drawn as box plots by continuous_raw_plot / continuous_clean_plot.
    continuous_features = [
        'age',
        'tenure',
        'phone_price',
        'arpu',
        'mou_total',
        'mou_local_callout',
        'network_usage_time',
        'mou_dist_callout'
    ]

    # Columns summarised by their per-group mean in binary_prob.
    binary_features = [
        'male_flag',
        'evdo_support_flag',
        'use_evdo_flag',
        'use_onex_flag',
        'e9_service_flag',
        'e6_service_flag',
        'e9_service_premium_flag',
        '8card_service_flag',
        'smart_phone_flag',
        'govern_worker_flag',
        'business_purpose_flag',
        'red_mark_flag',
        'govern_cluster_flag',
        'govern_industry_flag',
        'vpn_support_flag'
    ]

    # Columns summarised by per-group value shares in plot_category.
    categorical_variables = [
        'register_district',
        'phone_level',
        'born_area_code',
        'phone_brand'
    ]

    def __init__(self, df, test, control):
        """Split ``df`` into the two groups to compare.

        Parameters
        ----------
        df : DataFrame
            Frame with a ``client_nbr`` column plus the feature columns
            listed on the class.
        test, control : collection of client numbers
            Anything accepted by ``Series.isin``; rows of ``df`` whose
            ``client_nbr`` is contained are assigned to the respective group.
        """
        self.df = df
        self.test = df[df.client_nbr.isin(test)]
        self.control = df[df.client_nbr.isin(control)]

    @staticmethod
    def filter_outlier(data):
        """Return the values of ``data`` strictly inside the 1.5 * IQR fences.

        Both bounds are exclusive, so a series whose inter-quartile range is
        zero (for example a constant column) ends up empty.

        Raises
        ------
        AttributeError
            If no value is left after filtering. The exception type is kept
            as-is for backward compatibility with existing callers.
        """
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1

        lower = q1 - 1.5*iqr
        upper = q3 + 1.5*iqr

        output = data[(data > lower) & (data < upper)]
        if len(output) == 0:
            raise AttributeError('no values left after removing IQR outliers')
        return output

    def _groups(self):
        """Return the (title, frame) pairs in display order: test, control."""
        return (('test', self.test), ('control', self.control))

    @staticmethod
    def _compose(panels, cols=None):
        """Chain ``panels`` left to right with ``+`` and optionally set columns."""
        layout = panels[0]
        for panel in panels[1:]:
            layout += panel
        if cols is None:
            return layout
        # A lone panel is returned unchanged when it does not offer .cols()
        # (presumably only composed layouts do -- TODO confirm).
        return layout.cols(cols) if hasattr(layout, 'cols') else layout

    def continuous_raw_plot(self):
        """Box plots of every continuous feature, test next to control."""
        pairs = [
            self._compose([
                frame[feature].hvplot.box(title=title)
                for title, frame in self._groups()
            ])
            for feature in self.continuous_features
        ]
        return self._compose(pairs, cols=2)

    def continuous_clean_plot(self):
        """Box plots of every continuous feature after outlier removal.

        Each group is filtered independently with ``filter_outlier``. A group
        whose filtered series is empty is left out of the figure; a feature
        for which both groups are empty is skipped entirely. Errors raised by
        the plotting call itself are not suppressed.

        Raises
        ------
        AttributeError
            If no feature has any value left in either group.
        """
        per_feature = []
        for feature in self.continuous_features:
            panels = []
            for title, frame in self._groups():
                # Only the filter is guarded, so genuine plotting failures
                # still surface instead of silently dropping a panel.
                try:
                    cleaned = self.filter_outlier(frame[feature])
                except AttributeError:
                    continue
                panels.append(cleaned.hvplot.box(title=title))
            if panels:
                per_feature.append(self._compose(panels))

        if not per_feature:
            raise AttributeError(
                'no continuous feature has values left after removing IQR outliers'
            )
        return self._compose(per_feature, cols=2)

    @property
    def binary_prob(self):
        """Per-group mean of every binary feature.

        Returns a cudf DataFrame indexed by feature name with the columns
        ``test`` and ``control``.
        """
        df = cudf.DataFrame({
            'test': [self.test[col].mean() for col in self.binary_features],
            'control': [self.control[col].mean() for col in self.binary_features]
        })
        df.index = self.binary_features

        return df

    def _category_share(self, col):
        """Return the long-format value shares of ``col`` for both groups.

        The result has the columns ``col``, ``pct`` and ``group``.
        """
        parts = []
        for group, frame in self._groups():
            share = (
                (frame[col].value_counts() / len(frame))
                # Name the value column explicitly rather than relying on the
                # name value_counts() gives its result, which differs between
                # library versions.
                .to_frame(name='pct')
                .reset_index()
                # An unnamed index surfaces as 'index'; when the index is
                # already named after the column this rename is a no-op.
                .rename(columns={'index': col})
            )
            share['group'] = group
            parts.append(share)
        return cudf.concat(parts)

    def plot_category(self):
        """Stacked share bars (test vs control), one chart per categorical variable."""
        bars = [
            self._category_share(col).hvplot.bar(y='pct', x='group', by=col, stacked=True)
            for col in self.categorical_variables
        ]
        return self._compose(bars, cols=1)

# Plot Continuous Variables

In [8]:
# Build the inspector over the user-info frame using the two disjoint groups.
agent = TCInspecter(userinfo, test, control)

In [9]:
# Box plots of every continuous feature, test next to control, outliers included.
agent.continuous_raw_plot()

In [10]:
# Same box plots after dropping values outside the 1.5 * IQR fences of each group.
agent.continuous_clean_plot()

# Inspect Binary Variables

In [11]:
# Mean of each binary flag in the test and control groups.
agent.binary_prob

Unnamed: 0,test,control
male_flag,0.683757,0.661492
evdo_support_flag,0.613024,0.812246
use_evdo_flag,0.462575,0.699071
use_onex_flag,0.175524,0.283952
e9_service_flag,0.54491,0.67368
e6_service_flag,0.15494,0.037145
e9_service_premium_flag,0.0,0.001161
8card_service_flag,0.007485,0.007545
smart_phone_flag,0.48241,0.700087
govern_worker_flag,0.318488,0.347649


# Inspect Categorical Variables

In [12]:
# Stacked bars of the value shares per group, one chart per categorical variable.
agent.plot_category()