# This notebook is also probably a bit janky. You might not be able to run it.
Takes the original collection of ad data, groups it by phone with some summary stats, and adds some additional phone-level columns. It isn't *quite* the process Jeff was using, but it's darn close.


## Setup

In [1]:
from itertools import chain
import ujson as json
import multiprocessing as mp
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier

from helpers import disaggregated_df
from helpers import aggregated_df
from helpers import dummify_df
from helpers import phone_str_to_dd_format

## Read Svebor's Merge

In [2]:
# Load the ad-level merged table; the shape is printed before and after the
# derived column is added so the extra column is visible in the output.
merged_ads_path = '../../data/merged/data_to_use_by_ad_v3_with_exp_imgs.csv'
df = pd.read_csv(merged_ads_path)
print(df.shape)

# An ad "has images" whenever its images_count value is present (non-null).
df['has_images'] = ~df['images_count'].isnull()
print(df.shape)

(191949, 63)
(191949, 64)


## Join in Steve's Data

In [3]:
# Load the phone-level aggregates and keep only the phone key plus the
# summary columns that get joined onto the ad-level data.
steve = pd.read_csv('../../data/phone_aggregates/phones.csv')
steve_cols = ['n_ads',
              'n_distinct_locations',
              'location_tree_length',
              'n_outcall',
              'n_incall',
              'n_incall_and_outcall',
              'n_cooccurring_phones',
              'average_n_days_before_revisit']

# `.ix` was deprecated in pandas 0.20 and removed in 1.0. This selection is
# purely label-based (all rows, named columns), so `.loc` is the direct
# replacement and works on both old and new pandas.
steve_phone = steve.loc[:, ['phone'] + steve_cols].drop_duplicates()
print(steve_phone.shape)

(1527450, 9)


In [4]:
# Left-join the phone aggregates onto the ads so every ad row is kept,
# whether or not its phone appears in steve_phone.
df = pd.merge(df, steve_phone, how='left', on='phone')
print(df.columns)

Index(['class', 'phone', 'dd_id', 'price', 'duration_in_mins', 'flag', 'age',
       'area', 'area_type', 'ethnicity', 'cdr_id', 'price_per_min', 'Disease',
       'Year', 'Cases', 'Rate', 'MSA', 'property', 'rape', 'violent',
       'male_wage_mean', 'male_wage_p05', 'male_wage_p10', 'male_wage_p25',
       'male_wage_p50', 'male_wage_p75', 'male_wage_p90', 'male_wage_p95',
       'male_N', 'male_epop', 'qcew_code', 'msa', 'population', 'unemployment',
       'lt_highschool', 'highschool', 'some_college', 'college_plus',
       'frac_white', 'avg_commute', 'female_wage_mean', 'wage_var.wage',
       'female_wage_p05', 'female_wage_p10', 'female_wage_p25',
       'female_wage_p50', 'female_wage_p75', 'female_wage_p90',
       'female_wage_p95', 'female_N', 'wage_sum.wght', 'female_epop',
       'swnauthemp', 'swnftemp', 'ad_p10_msa', 'ad_p90_msa', 'ad_mean_msa',
       'ad_median_msa', 'ad_count_msa', 'ad_p50_msa', 'images_count',
       'exp_ads_from_simimages_count', 'similar_images_

## Sort out aggregation of continuous variables

In [5]:
# Ad-level numeric columns that are summarised per phone with describe().
numerical_vars = [
    'age',
    'price',
    'duration_in_mins',
    'price_per_min',
    'images_count',
    'exp_ads_from_simimages_count',
    'similar_images_count',
]

# Columns that came from the phone aggregates file; these are collapsed to
# one value per phone with max().
phone_level_vars = [
    'n_ads',
    'n_distinct_locations',
    'location_tree_length',
    'n_outcall',
    'n_incall',
    'n_incall_and_outcall',
    'n_cooccurring_phones',
]

In [6]:
# Summary statistics (describe) of every numeric ad-level column, per phone.
per_phone_stats = df.groupby('phone')[numerical_vars].describe()
# NOTE(review): the recorded output shape is (567, 57), i.e. one row per
# phone. That presumably relies on describe() returning the statistics as an
# index level for unstack() to pivot into columns (older pandas behaviour) —
# confirm against the installed pandas version before re-running.
numerical = per_phone_stats.unstack().reset_index()
print(numerical.shape)

(567, 57)


In [7]:
# Collapse the phone-level columns to a single row per phone by taking the
# per-phone maximum of each column.
# NOTE: this rebinds `phone_level_vars` from the list of column names to the
# resulting DataFrame. The right-hand side is evaluated first, so the list is
# still used for column selection here, but re-running this cell without
# re-running the cell that defines the list will select with a DataFrame
# instead of a list of names.
phone_level_vars = df.groupby('phone')[phone_level_vars].max().reset_index()
print(phone_level_vars.shape)

(567, 8)


## Sort out categorical variables

In [8]:
# One indicator column per distinct `flag` value, placed alongside the phone
# key so the indicators can be grouped by phone.
flag_dummies = pd.concat([df['phone'], pd.get_dummies(df['flag'])], axis=1)

# The per-phone mean of each indicator is the share of that phone's ads
# carrying the flag.
flags_by_phone = flag_dummies.groupby('phone')
discrete = flags_by_phone.mean().reset_index()

## Merge all and clean up.

In [18]:
# Combine the three per-phone tables; outer joins keep every phone that
# appears in any of them.
stats_and_flags = pd.merge(numerical, discrete, on='phone', how='outer')
phone_level = pd.merge(stats_and_flags, phone_level_vars, on='phone', how='outer')
print(phone_level.shape)

# Index by phone so the per-phone has_images series (also indexed by phone)
# aligns on assignment. A phone has images if any of its ads does.
phone_level = phone_level.set_index('phone')
any_images_by_phone = df.groupby('phone')['has_images'].max()
phone_level['has_images'] = any_images_by_phone
print(phone_level.shape)

(567, 128)
(567, 128)


In [19]:
# Move `phone` back from the index into an ordinary column.
phone_level = phone_level.reset_index()
print(phone_level.shape)

(567, 129)


In [17]:
# Replace every remaining missing value in the phone-level table with 0.
phone_level = phone_level.fillna(value=0)

In [None]:
# Persist the phone-level table twice: as CSV (without the row index) and as
# a pickle.
phone_csv_path = '../../data/merged/data_to_use_by_phone_v4.csv'
phone_pkl_path = '../../data/merged/data_to_use_by_phone_v4.pkl'
phone_level.to_csv(phone_csv_path, index=False)
phone_level.to_pickle(phone_pkl_path)