In [1]:
import os
# Move the working directory two levels up; the relative paths used below
# (e.g. '../data/') are resolved from this new location.
# Presumably this also makes the local `classes` package importable -- TODO confirm.
# NOTE: the path is relative, so re-running this cell in the same kernel moves
# up two more levels each time.
os.chdir('../..')

In [2]:
from classes.matching import Matching
from classes.helpers import *
import numpy as np
import json
import pandas as pd
import networkx as nx
from datetime import datetime
import gzip
import json

from sklearn.linear_model import LogisticRegression

import ast

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Prefix for every CSV/JSON file read or written below; it is relative to the
# working directory set by os.chdir in the first cell.
data_folder = '../data/'

# Interpolated into the names of the tmp/ files (time_series_{}_valid.csv,
# propensity_{}.csv, ids_propensity_{}.json).
# Presumably the minimum number of ratings a beer needed to be kept upstream -- TODO confirm.
min_nbr_ratings = 5

# Prepare the DataFrame

In [3]:
# Per-beer time series. The CSV has a two-row header, so columns are addressed
# as (site, field) with site in {'ba', 'rb'}.
ts = pd.read_csv(data_folder + 'tmp/time_series_{}_valid.csv'.format(min_nbr_ratings), header=[0,1])

# The list-valued fields are stored as strings in the CSV; parse them back
# into Python objects one whole column at a time. This replaces the per-cell
# DataFrame.set_value call (deprecated, removed in pandas 1.0) and the chained
# ts.loc[i][key1][key2] lookup, which built a full row for every single cell.
for key1 in ['ba', 'rb']:
    for key2 in ['dates', 'ratings', 'z_scores']:
        ts[(key1, key2)] = ts[(key1, key2)].apply(ast.literal_eval)

# Matched beers, also with a two-level (site, field) header.
df = pd.read_csv(data_folder + 'matched/beers.csv', header=[0,1])

# Brewery tables of both sites, keyed by site.
brews = {
    'ba': pd.read_csv(data_folder + 'ba/breweries.csv'),
    'rb': pd.read_csv(data_folder + 'rb/breweries.csv'),
}

In [4]:
# First z-score of every beer, per site. Reading the column directly avoids
# the original ts.iloc[i] / diffs[key][i] lookups, which used index *labels*
# as *positions* and were therefore only right for a default RangeIndex.
diffs = {key: [scores[0] for scores in ts[(key, 'z_scores')]] for key in ['ba', 'rb']}

# 15th / 85th percentile of those first z-scores, per site.
thresholds = {
    key: {
        'low': np.percentile(diffs[key], 15),
        'high': np.percentile(diffs[key], 85),
    }
    for key in ['ba', 'rb']
}

for key in ['ba', 'rb']:
    first_scores = np.asarray(diffs[key])

    # 'H' strictly above the high threshold, 'M' from the low threshold
    # (inclusive) up to the high one (inclusive), 'L' otherwise.
    classes = np.where(
        first_scores > thresholds[key]['high'],
        'H',
        np.where(first_scores >= thresholds[key]['low'], 'M', 'L'),
    )

    # Computed before the new columns are attached to the frame.
    nbr_rats = ts[(key, 'ratings')].map(len)

    ts[(key, 'class')] = classes
    ts[(key, 'nbr_ratings')] = nbr_rats

In [5]:
# Index the two lookup tables once instead of running a boolean scan over the
# whole frame for every beer. drop_duplicates keeps the first occurrence,
# which is the row the original `index[0]` lookup selected.
beers_ba = df['ba'].drop_duplicates(subset='beer_id').set_index('beer_id')
brewery_location = brews['ba'].drop_duplicates(subset='id').set_index('id')['location']

# One list per output column; turned into a DataFrame in the next cell.
df_json = {'beer_id_ba': [], 'ba': [], 'rb': [], 'abv': [], 'location': [], 'style': []}

for raw_id, class_ba, class_rb in zip(ts[('ba', 'beer_id')], ts[('ba', 'class')], ts[('rb', 'class')]):
    beer_id = int(raw_id)

    df_json['beer_id_ba'].append(beer_id)
    df_json['ba'].append(class_ba)
    df_json['rb'].append(class_rb)

    # Raises KeyError if the beer is missing from the matched table
    # (the original raised IndexError in the same situation).
    beer = beers_ba.loc[beer_id]

    df_json['abv'].append(beer['abv'])
    df_json['style'].append(beer['style'])

    # Any location string that mentions the United States is collapsed
    # into the single value 'United States'.
    place = brewery_location.loc[int(beer['brewery_id'])]
    if 'United States' in place:
        place = 'United States'

    df_json['location'].append(place)

In [6]:
# One row per beer, one column per key of df_json.
prop_df = pd.DataFrame(df_json)

In [7]:
# Persist the covariate table (without the index) under tmp/.
propensity_csv = data_folder + 'tmp/propensity_{}.csv'.format(min_nbr_ratings)
prop_df.to_csv(propensity_csv, index=False)

# Compute the propensity score

In [8]:
# Reload the table written by the previous cell.
# NOTE: this rebinds `df`, which until here held the matched-beers table, so
# the earlier cells that read `df` cannot be re-run after this one.
saved_table = data_folder + 'tmp/propensity_{}.csv'.format(min_nbr_ratings)
df = pd.read_csv(saved_table)

In [9]:
# Distinct values of each categorical column, in order of first appearance.
style = df['style'].unique().tolist()
location = df['location'].unique().tolist()

In [10]:
# Replace each category by its position in the `style` / `location` lists.
# Mapping the column through a dict avoids the original df.iloc[i] lookup
# (index label used as a position, plus a whole-row build and a list scan
# for every element).
df['style_nbr'] = df['style'].map({name: code for code, name in enumerate(style)})
df['location_nbr'] = df['location'].map({name: code for code, name in enumerate(location)})

## Match the beers on the propensity score

In [11]:
# For every pair of classes, match the beers rated (first, second) by (ba, rb)
# with the beers rated (second, first), pairing beers whose propensity scores
# are closest. ids[cl] collects the BeerAdvocate ids of all matched beers.
ids = {}
for cl in ['H-L', 'H-M', 'M-L']:
    first_class, second_class = cl[0], cl[-1]

    # Beers on which the two sites disagree for this pair of classes, in
    # either direction. The explicit copy lets us add columns below without
    # pandas' SettingWithCopyWarning.
    disagrees = (
        ((df['ba'] == first_class) & (df['rb'] == second_class))
        | ((df['ba'] == second_class) & (df['rb'] == first_class))
    )
    subdf = df[disagrees].copy()

    # Treatment indicator: 1 when BeerAdvocate holds the first class of the pair.
    is_treated = (subdf['ba'] == first_class).values
    y = is_treated.astype(int)
    subdf['y'] = y

    # NOTE(review): style_nbr and location_nbr are arbitrary integer codes of
    # categorical values, yet the model treats them as ordered numbers.
    # One-hot encoding may be more appropriate -- confirm before changing,
    # since it would alter the matching.
    X = np.array(subdf[['abv', 'style_nbr', 'location_nbr']])

    logistic = LogisticRegression()
    logistic.fit(X, y)

    # Column 0 of predict_proba is kept as the score, as before. Only
    # differences between scores are used below.
    subdf['prop'] = logistic.predict_proba(X)[:, 0]

    # Score lookup built once, instead of filtering subdf for every candidate
    # pair. Assumes beer_id_ba is unique within subdf; with duplicates the
    # last score wins here, whereas the original float(Series) call failed.
    beer_ids = subdf['beer_id_ba'].tolist()
    prop_of = dict(zip(beer_ids, subdf['prop'].tolist()))

    list0 = [beer for beer, treated in zip(beer_ids, is_treated) if treated]
    list1 = [beer for beer, treated in zip(beer_ids, is_treated) if not treated]

    # Complete bipartite graph between the two groups. Weights are negated
    # distances, so a maximum-weight matching minimises the total distance.
    graph = nx.Graph()
    graph.add_nodes_from(list0, bipartite=0)
    graph.add_nodes_from(list1, bipartite=1)

    edges = [
        (i, j, -abs(prop_of[i] - prop_of[j]))
        for i in list0
        for j in list1
    ]
    graph.add_weighted_edges_from(edges)

    # Solve the min weight matching with max cardinality
    res = nx.max_weight_matching(graph, maxcardinality=True)

    # Older networkx releases return a {node: mate} dict listing every pair
    # in both directions; newer ones return a set of (u, v) edges. Normalise
    # to the dict form so the loop below works with either.
    if isinstance(res, dict):
        mate = res
    else:
        mate = {}
        for u, v in res:
            mate[u] = v
            mate[v] = u

    # Every pair is visited from both ends, hence the 0.5 factor and the
    # division by two in the printed count.
    ids_tmp = []
    total_distance = 0
    for node, partner in mate.items():
        total_distance += 0.5 * abs(prop_of[node] - prop_of[partner])
        ids_tmp.append(int(node))

    # Save the matching
    ids[cl] = ids_tmp

    # Print some info
    print('{}: value = {:.3f} ({} matched on ({},{}))'.format(cl, total_distance, int(len(ids_tmp)/2), len(list0), len(list1)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


H-L: value = 3.192 (68 matched on (79,68))
H-M: value = 8.345 (652 matched on (667,652))
M-L: value = 1.790 (669 matched on (669,681))


In [23]:
# Make sure every stored id is a plain Python int before serialising to JSON.
for pair, members in list(ids.items()):
    ids[pair] = list(map(int, members))

In [26]:
# Write the matched ids, keyed by class pair, to a JSON file under tmp/.
ids_json_path = data_folder + 'tmp/ids_propensity_{}.json'.format(min_nbr_ratings)
with open(ids_json_path, 'w') as handle:
    json.dump(ids, handle)

In [19]:
# Display the matched beer ids of the 'M-L' pair.
# NOTE(review): this prints the entire list; a slice such as ids['M-L'][:10]
# would keep the saved notebook output short.
ids['M-L']

[127049,
 95723,
 25119,
 72799,
 97325,
 118878,
 70265,
 268169,
 42196,
 207152,
 71668,
 60727,
 67259,
 107198,
 23864,
 214172,
 23279,
 48078,
 2598,
 6825,
 43490,
 131961,
 63219,
 2578,
 38795,
 68452,
 7778,
 4718,
 58788,
 3248,
 5964,
 68026,
 6142,
 95714,
 2583,
 9749,
 10570,
 24653,
 22114,
 138170,
 17144,
 91337,
 5612,
 192008,
 62937,
 39390,
 73704,
 33000,
 120351,
 183819,
 228961,
 71390,
 140104,
 86315,
 15014,
 48519,
 118002,
 174214,
 68714,
 4935,
 111620,
 255648,
 95555,
 104952,
 92436,
 144086,
 93765,
 31306,
 149295,
 173181,
 206262,
 223048,
 141390,
 60862,
 83757,
 136791,
 96714,
 167989,
 69447,
 37457,
 113574,
 96076,
 203080,
 26325,
 93603,
 86945,
 50604,
 251869,
 102240,
 16002,
 151663,
 66865,
 242043,
 171478,
 246984,
 142635,
 50261,
 96830,
 172351,
 3297,
 21213,
 139001,
 51792,
 146282,
 47258,
 57710,
 227948,
 220231,
 122116,
 60441,
 4598,
 6878,
 113863,
 39518,
 36862,
 123750,
 229153,
 57491,
 101696,
 40451,
 80369,
 2