In [3]:
from itertools import chain
# To make it work on OS X
import matplotlib
matplotlib.use('TkAgg')
# --
import ujson as json
import multiprocessing as mp
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_fscore_support

from helpers import disaggregated_df
from helpers import aggregated_df
from helpers import dummify_df

# Render matplotlib figures inline in the notebook output.
# NOTE(review): matplotlib.use('TkAgg') is also called in the import section;
# presumably this magic decides the backend actually used inside Jupyter — confirm.
%matplotlib inline
# Apply seaborn's default styling, using a monospace font for plot text.
sns.set(font='monospace')

In [4]:
def format_jsns_to_key_count(jsns):
    """Turn parsed JSON-lines records into ``(key, count)`` pairs.

    Each element of ``jsns`` is expected to be a mapping; only its first key
    is considered. The count is the length of the value stored under that
    key, wrapped in a one-element numpy array.

    Parameters
    ----------
    jsns : iterable of dict
        Parsed records, one per input line.

    Returns
    -------
    list of (key, numpy.ndarray)
        One tuple per record, in input order.
    """
    def _key_and_count(entry):
        # Only the first key of the record is used.
        first_key = list(entry.keys())[0]
        return (first_key, np.asarray([len(entry[first_key])]))

    return [_key_and_count(entry) for entry in jsns]

# Load image, similar-image, and expanded-ad counts

In [5]:
#jsns = [json.loads(x) for x in open('../../data/images_expanded/ht_evaluation_NOCLASS_images.jl', 'r')]
# Parse the JSON-lines file (one JSON object per line). The context manager
# closes the file handle deterministically, and blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing the parser.
with open('../../data/images_expanded/ht_evaluation_images.jl', 'r') as jl_file:
    jsns = [json.loads(line) for line in jl_file if line.strip()]
# Reduce every record to (first key, number of items stored under that key).
records_images = format_jsns_to_key_count(jsns)

In [6]:
# Sanity check: number of (key, count) records built from the images file.
print(len(records_images))

4557


In [7]:
# One row per record: the record key becomes the index and the count becomes
# the single 'images_count' column. Built with the plain DataFrame constructor
# because DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0.
# Each count is a one-element array, hence count[0].
df_images = pd.DataFrame(
    {'images_count': [int(count[0]) for _, count in records_images]},
    index=[key for key, _ in records_images],
)

In [8]:
#print(df_images.head())
# Row count of the frame; should equal the number of records loaded.
# len(index) replaces Index.get_values(), which was removed in pandas 1.0.
print(len(df_images.index))

4557


In [9]:
#jsns = [json.loads(x) for x in open('../../data/images_expanded/ht_evaluation_NOCLASS_similar_images_count_th100_dist_th0.1.jsonl', 'r')]
# Parse the JSON-lines file of similar images (one JSON object per line).
# The context manager closes the file handle; blank lines are skipped.
with open('../../data/images_expanded/ht_evaluation_similar_images_count_th100_dist_th0.1.jsonl', 'r') as jsonl_file:
    jsns = [json.loads(line) for line in jsonl_file if line.strip()]
# Reduce every record to (first key, number of items stored under that key).
records_images_similar = format_jsns_to_key_count(jsns)
# Key -> index, count -> 'similar_images_count'. The plain constructor is used
# because DataFrame.from_items was removed in pandas 1.0; each count is a
# one-element array, hence count[0].
df_images_similar = pd.DataFrame(
    {'similar_images_count': [int(count[0]) for _, count in records_images_similar]},
    index=[key for key, _ in records_images_similar],
)

In [10]:
# Row count of the similar-images frame.
# len(index) replaces Index.get_values(), which was removed in pandas 1.0.
print(len(df_images_similar.index))

4557


In [11]:
#jsns = [json.loads(x) for x in open('../../data/images_expanded/ht_evaluation_NOCLASS_expanded_ads_from_similar_images_count_th100_dist_th0.1.jsonl', 'r')]
# Parse the JSON-lines file of ads expanded from similar images (one JSON
# object per line). The context manager closes the file handle; blank lines
# are skipped.
with open('../../data/images_expanded/ht_evaluation_expanded_ads_from_similar_images_count_th100_dist_th0.1.jsonl', 'r') as jsonl_file:
    jsns = [json.loads(line) for line in jsonl_file if line.strip()]
# Reduce every record to (first key, number of items stored under that key).
records_exp_ads_from_simimages = format_jsns_to_key_count(jsns)
# Key -> index, count -> 'exp_ads_from_simimages_count'. The plain constructor
# is used because DataFrame.from_items was removed in pandas 1.0; each count is
# a one-element array, hence count[0].
df_exp_ads_from_simimages = pd.DataFrame(
    {'exp_ads_from_simimages_count': [int(count[0]) for _, count in records_exp_ads_from_simimages]},
    index=[key for key, _ in records_exp_ads_from_simimages],
)

In [12]:
# Row count of the expanded-ads frame, followed by a preview of its first rows.
# len(index) replaces Index.get_values(), which was removed in pandas 1.0.
print(len(df_exp_ads_from_simimages.index))
print(df_exp_ads_from_simimages.head())

4557
                                                    exp_ads_from_simimages_count
727D6028F94666B2238C0E5A9532C1CD9CA680DB36839D8...                          2672
869DB34ABF57BFD8ECE843E790D7A2D881079DA0A02B350...                           342
4BED77C8D79B28F9E5A097B149E972F0F0B3BB32025F278...                            48
7A81A5306AA6BAC08488851B7477836873465D840E1C7C1...                          1188
E070910AC34A470BBD239DF9783A297247C4DAABA26EE82...                           216


## Merge image counts into the per-ad evaluation data

In [13]:
# Load the per-ad evaluation table that the image counts are merged into below.
df = pd.read_csv('../../data/eval/partial_test_data_to_use_by_ad.csv')

In [14]:
# List the available columns; 'cdr_id' is the one used as the merge key later.
print(df.columns)

Index(['dd_id', 'phone', 'price', 'duration_in_mins', 'flag', 'age', 'area',
       'area_type', 'ethnicity', 'cdr_id', 'price_per_min', 'Disease', 'Year',
       'Cases', 'Rate', 'MSA', 'property', 'rape', 'violent', 'male_wage_mean',
       'male_wage_p05', 'male_wage_p10', 'male_wage_p25', 'male_wage_p50',
       'male_wage_p75', 'male_wage_p90', 'male_wage_p95', 'male_N',
       'male_epop', 'qcew_code', 'msa', 'population', 'unemployment',
       'lt_highschool', 'highschool', 'some_college', 'college_plus',
       'frac_white', 'avg_commute', 'female_wage_mean', 'wage_var.wage',
       'female_wage_p05', 'female_wage_p10', 'female_wage_p25',
       'female_wage_p50', 'female_wage_p75', 'female_wage_p90',
       'female_wage_p95', 'female_N', 'wage_sum.wght', 'female_epop',
       'swnauthemp', 'swnftemp', 'ad_p10_msa', 'ad_p90_msa', 'ad_mean_msa',
       'ad_median_msa', 'ad_count_msa', 'ad_p50_msa'],
      dtype='object')


In [15]:
# Left-join each count frame onto df, matching df's 'cdr_id' column against
# the index of the count frame. Every row of df is kept; rows without a match
# get NaN in the added count column.
print(df.shape[1], df.columns)
df_with_img_count = (
    df.merge(df_images, how='left', left_on='cdr_id', right_index=True)
      .merge(df_exp_ads_from_simimages, how='left', left_on='cdr_id', right_index=True)
      .merge(df_images_similar, how='left', left_on='cdr_id', right_index=True)
)
print(df_with_img_count.shape[1], df_with_img_count.columns)
# Compare shapes before and after the merges.
print(df.shape)
print(df_with_img_count.shape)
# Open question: missing all the negatives?

59 Index(['dd_id', 'phone', 'price', 'duration_in_mins', 'flag', 'age', 'area',
       'area_type', 'ethnicity', 'cdr_id', 'price_per_min', 'Disease', 'Year',
       'Cases', 'Rate', 'MSA', 'property', 'rape', 'violent', 'male_wage_mean',
       'male_wage_p05', 'male_wage_p10', 'male_wage_p25', 'male_wage_p50',
       'male_wage_p75', 'male_wage_p90', 'male_wage_p95', 'male_N',
       'male_epop', 'qcew_code', 'msa', 'population', 'unemployment',
       'lt_highschool', 'highschool', 'some_college', 'college_plus',
       'frac_white', 'avg_commute', 'female_wage_mean', 'wage_var.wage',
       'female_wage_p05', 'female_wage_p10', 'female_wage_p25',
       'female_wage_p50', 'female_wage_p75', 'female_wage_p90',
       'female_wage_p95', 'female_N', 'wage_sum.wght', 'female_epop',
       'swnauthemp', 'swnftemp', 'ad_p10_msa', 'ad_p90_msa', 'ad_mean_msa',
       'ad_median_msa', 'ad_count_msa', 'ad_p50_msa'],
      dtype='object')
62 Index(['dd_id', 'phone', 'price', 'duration_in_mins

In [16]:
# Number of rows that received no similar-images count from the left merge.
print(df_with_img_count['similar_images_count'].isnull().sum())

6348


In [17]:
# Preview the merge key next to the newly added count columns.
# reindex(columns=...) is used instead of .loc[:, [...]] because 'class' is not
# among the columns printed for df above: reindex fills a missing column with
# NaN on every pandas version, whereas .loc with a missing label raises
# KeyError on pandas >= 1.0.
preview_columns = ['cdr_id', 'class', 'images_count', 'similar_images_count', 'exp_ads_from_simimages_count']
print(df_with_img_count.reindex(columns=preview_columns).head())

                                              cdr_id  class  images_count  \
0  DE06D9924AD4166B1D3293455FA56E76D6D4BFDF29563F...    NaN           4.0   
1  DE06D9924AD4166B1D3293455FA56E76D6D4BFDF29563F...    NaN           4.0   
2  DE06D9924AD4166B1D3293455FA56E76D6D4BFDF29563F...    NaN           4.0   
3  DE06D9924AD4166B1D3293455FA56E76D6D4BFDF29563F...    NaN           4.0   
4  DE06D9924AD4166B1D3293455FA56E76D6D4BFDF29563F...    NaN           4.0   

   similar_images_count  exp_ads_from_simimages_count  
0                   0.0                           0.0  
1                   0.0                           0.0  
2                   0.0                           0.0  
3                   0.0                           0.0  
4                   0.0                           0.0  


In [18]:
# Persist the merged table; index=False keeps the row index out of the CSV.
df_with_img_count.to_csv('../../data/eval/partial_test_data_to_use_by_ad_with_exp_imgs_v2.csv', index=False)