# This Notebook
# ... Is Super Junky. As long as this note is here, maybe assume
# you can't run it.

In here, everything on which we're planning to train gets merged together and dumped to a CSV and a pickle for quick reference.

This pickle is **keyed on dd_id**. But it's **phone numbers** that are relevant for the classifier.


------

# Setup
## Imports

In [1]:
import ujson as json
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from itertools import chain
from tqdm import tqdm, tqdm_pandas
from sqlalchemy import create_engine
%matplotlib inline

from helpers import phone_str_to_dd_format
from helpers import disaggregated_df
from helpers import aggregated_df
from helpers import dummify_df
from helpers import df_of_tables_for_dd_ids

# Read Test Data

In [2]:
# Each line of the evaluation file is parsed as one JSON record. The context
# manager guarantees the file handle is closed once parsing is done.
with open('../../data/test_ht_data/ht_evaluation_NOCLASS.json', 'r') as infile:
    jsns = [json.loads(x) for x in infile]
print('{} test entries'.format(len(jsns)))

# Hoist the nested ad id to a top-level key so it becomes its own column.
for jsn in jsns:
    jsn['cdr_id'] = jsn['ad']['_id']
print('{} unique test CDR IDs'.format(len(set(jsn['cdr_id'] for jsn in jsns))))

# `.ix` was removed from pandas; `.loc` is the label-based equivalent here.
test_df = pd.DataFrame.from_records(jsns).loc[:, ['phone', 'url', 'cdr_id']]
print(test_df.shape)

# The raw records are no longer needed once the frame exists.
del jsns

5650 test entries
5548 unique test CDR IDs
(5650, 3)


In [3]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,phone,url,cdr_id
0,[2149051457],http://fortworth.backpage.com/online/classifie...,1807741462BCBBEDA63F5AB7DCF8B78F89ACA7069D31D6...
1,[2149051457],http://fortworth.backpage.com/online/classifie...,167A757A134C36D9BFFC8CA368AFEDC812A8EA8BCF6341...
2,[2149051457],http://massagetroll.com/dallas-massages/214-90...,BF6AA2B8EFB36A2F886EBC77CCD2AC64A50D68FC9B01D7...
3,[2149051457],http://dallas.backpage.com/online/classifieds/...,6517C7C4ACF83CBDACAC85123C03C4735B865774EF8C6E...
4,[2149051457],http://massagetroll.com/dallas-massages/214-90...,DDD6A267A5E067029B91B94390A83A7859C8655F15CA69...


In [4]:
# Collapse each ad's list of phones into a single comma-delimited string, then
# hand it to the project helper with the same delimiter
# (presumably yielding one row per phone -- verify against helpers.py).
test_df['phone'] = test_df['phone'].apply(','.join)
test_df = disaggregated_df(test_df, 'phone', ',')
print(test_df.shape)

# Run every phone through the project's formatter before counting.
test_df['phone'] = test_df['phone'].apply(phone_str_to_dd_format)
print('{} unique CDR-phone pairs.'.format(len(test_df.drop_duplicates())))
print('{} unique CDR IDs.'.format(test_df['cdr_id'].nunique()))
print('{} unique phones.'.format(test_df['phone'].nunique()))

(5550, 3)
5550 unique CDR-phone pairs.
5548 unique CDR IDs.
37 unique phones.


### Missing Phones!

**Only 29** of these 37 phones are in the Lattice / Deep Dive dump:

In [5]:
# Test phones known to be present in the Lattice / Deep Dive dump.
phones_in_dd = {
    '(214) 643-0854', '(214) 905-1457', '(225) 572-9627', '(281) 384-0032',
    '(281) 818-8756', '(325) 267-2364', '(401) 212-7204', '(401) 523-2205',
    '(469) 478-6538', '(469) 671-6959', '(501) 952-3516', '(504) 613-5108',
    '(520) 312-2202', '(561) 727-7483', '(585) 284-9482', '(619) 419-9933',
    '(702) 208-8444', '(702) 706-4860', '(713) 598-4991', '(806) 239-8076',
    '(806) 544-8003', '(806) 730-5586', '(817) 727-8494', '(832) 247-9222',
    '(832) 757-8556', '(832) 918-7312', '(832) 994-9119', '(912) 318-2015',
    '(912) 318-2157',
}


The missing ones:

In [6]:
# Test phones that do not appear in the known DeepDive phone set.
missing_phones = set(test_df['phone'].unique()).difference(phones_in_dd)
print(missing_phones)

{'(419) 327-7637', '(562) 285-2815', '(361) 413-5273', '(832) 403-2271', '(562) 415-3838', '(361) 442-3991', '(484) 727-4685', '(361) 717-4718'}


### Missing Ads!

**Only 4,345** of the 5,548 unique CDR IDs in the test data are present in the Lattice / Deep Dive dump:

In [7]:
# Render every CDR ID as a single-quoted SQL string literal, doubling any
# embedded single quote. Double quotes denote identifiers in SQL; SQLite only
# falls back to treating them as strings as a legacy quirk, and that fallback
# can be disabled at build time.
cdr_str_list = ','.join("'{}'".format(str(x).replace("'", "''"))
                        for x in test_df.cdr_id.unique())

# NOTE(review): the database path is absolute and machine-specific.
sql_con = create_engine('sqlite:////Users/pmlandwehr/wkbnch/memex/memex_queries/dd_dump.db')
# Template: select <columns> from <table> where <column> in (<values>)
query_str_fmt = 'select {} from {} where {} in ({})'.format
query_str = query_str_fmt('*', 'dd_id_to_cdr_id', 'cdr_id', cdr_str_list)

# Fetch the dd_id_to_cdr_id rows whose cdr_id appears in the test data.
cdr_df = pd.read_sql(query_str, sql_con)
print(cdr_df.shape)

(4345, 2)


# Read DeepDive Data For Test
## Get all ads that match on phones

In [8]:
# Engine for the local SQLite dump, plus a reusable query template of the form
# select <columns> from <table> where <column> in (<values>).
# NOTE(review): the database path is absolute and machine-specific.
sql_con = create_engine('sqlite:////Users/pmlandwehr/wkbnch/memex/memex_queries/dd_dump.db')
query_str_fmt = 'select {} from {} where {} in ({})'.format

In [9]:
# Render every phone as a single-quoted SQL string literal, doubling any
# embedded single quote. Double quotes denote identifiers in SQL; SQLite only
# accepts them as strings as a legacy quirk.
phone_str_list = ','.join("'{}'".format(str(x).replace("'", "''"))
                          for x in test_df.phone.unique())
query_str = query_str_fmt('*', 'dd_id_to_phone', 'phone', phone_str_list)

# Fetch the dd_id_to_phone rows whose phone appears in the test data.
df = pd.read_sql(query_str, sql_con)
print(df.shape)

(5658, 2)


In [10]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates(keep='first')
print(df.shape)

(3034, 2)


In [11]:
# Ask the project helper for the listed tables, restricted to the dd_ids that
# matched on phone.
df_2 = df_of_tables_for_dd_ids(
    list(df['dd_id'].unique()),
    ['dd_id_to_price_duration', 'dd_id_to_flag', 'dd_id_to_age',
     'dd_id_to_cbsa', 'dd_id_to_ethnicity', 'dd_id_to_cdr_id'],
    sql_con)
print(df_2.shape)

(3020, 9)


In [12]:
# Outer join on dd_id so phone rows without attributes (and vice versa) survive.
df_3 = pd.merge(df, df_2, how='outer', on='dd_id')

**Clean up**

In [13]:
# Both inputs are now folded into df_3; drop the references.
del df, df_2

## Merge in other cdr_ids
Even though we know the Lattice database doesn't include *all* of the requisite CDR IDs, it does include some of them that are apparently not synced up with these phones. Let's add them in.

In [14]:
# CDR IDs found by the direct CDR lookup but absent from the phone-matched frame.
cdr_ids_to_get = set(cdr_df['cdr_id'].unique()).difference(set(df_3['cdr_id'].unique()))
print(len(cdr_ids_to_get))

1467


In [15]:
# Look up the dd_ids behind the still-missing CDR IDs and pull the same tables
# as for the phone-matched ads, plus the phone table itself.
# `.ix` was removed from pandas; `.loc` handles this boolean-mask + column-label
# selection identically.
df_4 = df_of_tables_for_dd_ids(list(cdr_df.loc[cdr_df.cdr_id.isin(cdr_ids_to_get), 'dd_id']),
                                ['dd_id_to_phone',
                                'dd_id_to_price_duration',
                                'dd_id_to_flag',
                                'dd_id_to_age',
                                'dd_id_to_cbsa',
                                'dd_id_to_ethnicity',
                                'dd_id_to_cdr_id'],
                               sql_con)
print(df_4.shape)

(1499, 10)


In [16]:
# No explicit key is given, so pandas joins on every column the frames share.
df_5 = pd.merge(df_3, df_4, how='outer')
print(df_5.shape)

(4536, 10)


**Clean up**

In [17]:
# Both inputs are now folded into df_5; drop the references.
del df_3, df_4

For those ads with prices and durations, let's add a price per minute value

In [18]:
# Element-wise price divided by duration in minutes.
df_5['price_per_min'] = df_5['price'].div(df_5['duration_in_mins'])

# Join Deep Dive Data with Greg's HT Data
## STD Data
It looks like MSA, CBSA, and Name all map to the same values, so we could drop some of these columns.

In [19]:
# Load the STD spreadsheet.
std_df = pd.read_excel(io='../../data/greg_correlates/std.xlsx')

In [20]:
# Preview the first five STD rows.
std_df.head(n=5)

Unnamed: 0,MSA,CBSA,Name,Disease,Year,Cases,Rate
0,31000US12060,12060,"Atlanta-Sandy Springs-Roswell, GA",Chlamydia,2009,20337,370.2
1,31000US12420,12420,"Austin-Round Rock, TX",Chlamydia,2009,8456,495.9
2,31000US12580,12580,"Baltimore-Columbia-Towson, MD",Chlamydia,2009,12883,478.8
3,31000US13820,13820,"Birmingham-Hoover, AL",Chlamydia,2009,6120,541.1
4,31000US14460,14460,"Boston-Cambridge-Newton, MA-NH",Chlamydia,2009,13285,289.5


In [21]:
# Left-join the selected STD columns onto the ads by matching the ad's `area`
# to the STD `Name`.
# `.ix` was removed from pandas; `.loc` is the label-based equivalent here.
df_5 = df_5.merge(std_df.loc[:, ['Name', 'Disease', 'Year', 'Cases', 'Rate', 'MSA']],
                  left_on='area',
                  right_on='Name',
                  how='left')
# `Name` only served as the join key.
del df_5['Name']
print(df_5.shape)

(25158, 16)


**Clean up**

In [22]:
# The needed STD columns are already merged into df_5; drop the reference.
del std_df

## MSA Characteristics
Note that we're using the **yearly** version of the file. We could also use the **monthly** version. I'm primarily choosing yearly because monthly had some import issues that it doesn't seem worth wrangling at this second.

In [23]:
# Load the MSA characteristics file.
msa_df = pd.read_csv(filepath_or_buffer='../../data/greg_correlates/msa_characteristics.csv')

In [24]:
# Preview the first five MSA rows.
msa_df.head(n=5)

Unnamed: 0,census_msa_code,property,rape,violent,male_wage_mean,male_wage_p05,male_wage_p10,male_wage_p25,male_wage_p50,male_wage_p75,...,wage_sum.wght,female_epop,swnauthemp,swnftemp,ad_p10_msa,ad_p90_msa,ad_mean_msa,ad_median_msa,ad_count_msa,ad_p50_msa
0,31000US10180,5538.6,63.0,593.8,,,,,,,...,,,197.0,179.0,80.0,275.219812,169.981313,150.0,1148.0,150.0
1,31000US10420,21815.75,273.75,2182.0,30.566309,6.35,9.3,14.8,22.5,35.5,...,363283.0,0.463908,1034.0,975.0,83.333333,206.404629,134.60709,115.0,4503.0,115.0
2,31000US10500,6969.6,40.6,961.0,,,,,,,...,,,,,80.0,251.629542,150.384258,140.0,1565.0,140.0
3,31000US10580,21216.2,175.4,2430.0,31.147955,7.0,9.75,15.0,25.0,37.5,...,434363.0,0.472414,,,90.0,252.108219,165.44329,157.268464,3711.0,157.268464
4,31000US10740,37465.25,453.25,6314.5,26.628241,5.7,7.5,12.0,20.25,34.0,...,451024.0,0.400557,1347.0,1208.0,87.5,244.352807,156.882813,144.974193,9978.0,144.974193


In [25]:
# Left-join the MSA characteristics onto the ads via the MSA code.
df_5 = pd.merge(df_5, msa_df, how='left',
                left_on='MSA', right_on='census_msa_code')
# The right-hand join key is not kept in the result.
df_5 = df_5.drop('census_msa_code', axis=1)
print(df_5.shape)

(25158, 59)


**Clean up**

In [26]:
# The MSA characteristics are already merged into df_5; drop the reference.
del msa_df

# Save the Results

**NOTE** that I haven't done any sort of join with the "missing" phones or "missing" cdr ids from the test.
This is a deliberate choice. We're keeping these files limited to things that map with deep dive.

In [27]:
# Write the merged frame twice: as a CSV without the index column, and as a pickle.
df_5.to_csv(path_or_buf='../../data/merged/partial_test_data_to_use_by_ad.csv',
            index=False)
df_5.to_pickle(path='../../data/merged/partial_test_data_to_use_by_ad.pkl')