In [None]:
import numpy as np
import pandas as pd

from bokeh.plotting import output_notebook, show, figure

from sklearn.metrics import classification_report, accuracy_score
try:
    # `train_test_split` lives in `sklearn.model_selection` since
    # scikit-learn 0.18; the old `sklearn.cross_validation` module was
    # removed in 0.20, so prefer the current location.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for installations that predate `model_selection`.
    from sklearn.cross_validation import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Activation

# Render bokeh figures inline in the notebook output cells.
output_notebook()
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Plot colour assigned to each binary label value.
# NOTE(review): presumably 0 is the negative outcome and 1 the positive one;
# confirm against the cells that consume this mapping.
label_colors = {
    0: 'red',
    1: 'green',
}

In [None]:
# Load the raw speed-dating table (pandas reads the zipped CSV directly).
data = pd.read_csv('./Speed Dating Data.csv.zip')

# Columns removed before any analysis, grouped by the reason for removal.
drop_columns = [
    # redundant data
    'id',  # subset of iid
    'idg',  # subset of iid
    'partner',  # subset of pid
    'field',  # redundant to field_cd
    'from',  # redundant to zipcode
    'career',  # redundant to career_c
    'undergra',  # not "redundant", but we prefer the avg_sat, as it describes the university
    # can't understand
    'positin1',
    # future data: updated after a few dates
    'attr1_s', 'sinc1_s', 'intel1_s', 'fun1_s', 'amb1_s', 'shar1_s',  # what do you value?
    'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s',  # how do you rate yourself?
    # future data: one day after the event
    'satis_2',  # satisfaction
    'length',  # opinion on dates length
    'numdat_2',  # opinion on number of dates
    'attr7_2', 'sinc7_2', 'intel7_2', 'fun7_2', 'amb7_2', 'shar7_2',  # what affected your decisions?
    'attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2',  # what do you value?
    'attr4_2', 'sinc4_2', 'intel4_2', 'fun4_2', 'amb4_2', 'shar4_2',  # what do you think others from your sex value?
    'attr2_2', 'sinc2_2', 'intel2_2', 'fun2_2', 'amb2_2', 'shar2_2',  # what do you think the opposite sex values?
    'attr3_2', 'sinc3_2', 'intel3_2', 'fun3_2', 'amb3_2',  # how do you rate yourself?
    'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2',  # how do you think others rate you?
    # future data: 3-4 weeks after matches sent
    'you_call',  # contacted by you
    'them_cal',  # contacted by them
    'date_3',  # had any date?
    'numdat_3',  # how many dates?
    'num_in_3',  # how many dates? (asked again; purpose unclear)
    'attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3',  # what do you value?
    'attr7_3', 'sinc7_3', 'intel7_3', 'fun7_3', 'amb7_3', 'shar7_3',  # what affected your decisions?
    'attr4_3', 'sinc4_3', 'intel4_3', 'fun4_3', 'amb4_3', 'shar4_3',  # what do you think others from your sex value?
    'attr2_3', 'sinc2_3', 'intel2_3', 'fun2_3', 'amb2_3', 'shar2_3',  # what do you think the opposite sex values?
    'attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3',  # how do you rate yourself?
    'attr5_3', 'sinc5_3', 'intel5_3', 'fun5_3', 'amb5_3',  # how do you think others rate you?
    # suspicious:
    'match_es'  # how many matches will you get. Probably asked after the event?
]

# Pass the label list itself. The previous form handed `drop` a DataFrame
# slice, which only worked because iterating a DataFrame yields its column
# names (and it copied every dropped column first). A missing label still
# raises KeyError, as before.
data = data.drop(drop_columns, axis=1)

# Mapping from the terse column codes of the raw file to descriptive names.
# Columns not listed here keep their original name.
better_column_names = {
    # identifiers, experiment setup and pair-level facts
    'iid': 'subject_id',
    'condtn': 'choice_type',
    'pid': 'partner_subject_id',
    'int_corr': 'interests_similarity',
    'samerace': 'same_race',

    # partner: demographics and decision
    'age_o': 'partner_age',
    'race_o': 'partner_race',
    'dec_o': 'partner_wants_match',

    # partner: stated preferences
    'pf_o_att': 'partner_values_attractive',
    'pf_o_sin': 'partner_values_sincere',
    'pf_o_int': 'partner_values_intelligent',
    'pf_o_fun': 'partner_values_fun',
    'pf_o_amb': 'partner_values_ambitious',
    'pf_o_sha': 'partner_values_shared_interests',

    # partner: ratings given to the subject
    'attr_o': 'partner_rated_attractive',
    'sinc_o': 'partner_rated_sincere',
    'intel_o': 'partner_rated_intelligent',
    'fun_o': 'partner_rated_fun',
    'amb_o': 'partner_rated_ambitious',
    'shar_o': 'partner_rated_shared_interests',
    'like_o': 'partner_liked',
    'prob_o': 'partner_guess_yes',
    'met_o': 'partner_met_before',

    # subject: background and habits
    'field_cd': 'field_id',
    'mn_sat': 'university_avg_sat',
    'tuition': 'university_tuition',
    'imprace': 'importance_same_race',
    'imprelig': 'importance_same_religion',
    'zipcode': 'zip',
    'income': 'zip_avg_income',
    'date': 'dating_type',
    'go_out': 'go_out_type',
    'career_c': 'career_id',

    # subject: interests
    'sports': 'interest_sports',
    'tvsports': 'interest_tv_sports',
    'exercise': 'interest_exercise',
    'dining': 'interest_dining',
    'museums': 'interest_museums',
    'art': 'interest_art',
    'hiking': 'interest_hiking',
    'gaming': 'interest_gaming',
    'clubbing': 'interest_clubbing',
    'reading': 'interest_reading',
    'tv': 'interest_tv',
    'theater': 'interest_theater',
    'movies': 'interest_movies',
    'concerts': 'interest_concerts',
    'music': 'interest_music',
    'shopping': 'interest_shopping',
    'yoga': 'interest_yoga',

    # subject: expectations for the event
    'exphappy': 'happy_expectance',
    'expnum': 'been_liked_expectance',

    # subject: what do you value?
    'attr1_1': 'values_attractive',
    'sinc1_1': 'values_sincere',
    'intel1_1': 'values_intelligent',
    'fun1_1': 'values_fun',
    'amb1_1': 'values_ambitious',
    'shar1_1': 'values_shared_interests',

    # subject: what do you think others from your sex value?
    'attr4_1': 'same_sex_values_attractive',
    'sinc4_1': 'same_sex_values_sincere',
    'intel4_1': 'same_sex_values_intelligent',
    'fun4_1': 'same_sex_values_fun',
    'amb4_1': 'same_sex_values_ambitious',
    'shar4_1': 'same_sex_values_shared_interests',

    # subject: what do you think the opposite sex values?
    'attr2_1': 'opposite_sex_values_attractive',
    'sinc2_1': 'opposite_sex_values_sincere',
    'intel2_1': 'opposite_sex_values_intelligent',
    'fun2_1': 'opposite_sex_values_fun',
    'amb2_1': 'opposite_sex_values_ambitious',
    'shar2_1': 'opposite_sex_values_shared_interests',

    # subject: how do you rate yourself?
    'attr3_1': 'own_rate_attractive',
    'sinc3_1': 'own_rate_sincere',
    'intel3_1': 'own_rate_intelligent',
    'fun3_1': 'own_rate_fun',
    'amb3_1': 'own_rate_ambitious',

    # subject: how do you think others rate you?
    'attr5_1': 'others_rate_attractive',
    'sinc5_1': 'others_rate_sincere',
    'intel5_1': 'others_rate_intelligent',
    'fun5_1': 'others_rate_fun',
    'amb5_1': 'others_rate_ambitious',

    # subject: decision and ratings given to the partner
    'dec': 'wants_match',
    'attr': 'rated_attractive',
    'sinc': 'rated_sincere',
    'intel': 'rated_intelligent',
    'fun': 'rated_fun',
    'amb': 'rated_ambitious',
    'shar': 'rated_shared_interests',
    'like': 'liked',
    'prob': 'guess_yes',
    'met': 'met_before',
}

# Apply the new names; `data` ends up bound to the renamed frame.
data = data.rename(columns=better_column_names)

# Put the columns into a reading-friendly order. Selecting with an explicit
# list also discards any column that is not named below.
data = data[[
    # --- experiment data ---
    'wave', 'round', 'order', 'position', 'choice_type',

    # --- subject data ---
    'subject_id', 'gender', 'age', 'race',
    'zip', 'zip_avg_income',
    'field_id', 'career_id',
    'university_avg_sat', 'university_tuition',
    'dating_type', 'go_out_type', 'goal',
    'happy_expectance', 'been_liked_expectance',
    'importance_same_race', 'importance_same_religion',
    # interests
    'interest_sports', 'interest_tv_sports', 'interest_exercise',
    'interest_dining', 'interest_museums', 'interest_art',
    'interest_hiking', 'interest_gaming', 'interest_clubbing',
    'interest_reading', 'interest_tv', 'interest_theater',
    'interest_movies', 'interest_concerts', 'interest_music',
    'interest_shopping', 'interest_yoga',
    # what do you value?
    'values_attractive', 'values_sincere', 'values_intelligent',
    'values_fun', 'values_ambitious', 'values_shared_interests',
    # what do you think others from your sex value?
    'same_sex_values_attractive', 'same_sex_values_sincere',
    'same_sex_values_intelligent', 'same_sex_values_fun',
    'same_sex_values_ambitious', 'same_sex_values_shared_interests',
    # what do you think the opposite sex values?
    'opposite_sex_values_attractive', 'opposite_sex_values_sincere',
    'opposite_sex_values_intelligent', 'opposite_sex_values_fun',
    'opposite_sex_values_ambitious', 'opposite_sex_values_shared_interests',
    # how do you rate yourself?
    'own_rate_attractive', 'own_rate_sincere', 'own_rate_intelligent',
    'own_rate_fun', 'own_rate_ambitious',
    # how do you think others rate you?
    'others_rate_attractive', 'others_rate_sincere', 'others_rate_intelligent',
    'others_rate_fun', 'others_rate_ambitious',

    # --- partner data ---
    'partner_subject_id', 'partner_age', 'partner_race',
    # what does the partner value?
    'partner_values_attractive', 'partner_values_sincere',
    'partner_values_intelligent', 'partner_values_fun',
    'partner_values_ambitious', 'partner_values_shared_interests',

    # --- coincidences ---
    'interests_similarity', 'same_race',

    # --- date results ---
    'match',
    # you rated the partner
    'met_before',
    'rated_attractive', 'rated_sincere', 'rated_intelligent',
    'rated_fun', 'rated_ambitious', 'rated_shared_interests',
    # your thoughts on the date
    'liked', 'guess_yes', 'wants_match',
    # the partner rated you
    'partner_met_before',
    'partner_rated_attractive', 'partner_rated_sincere',
    'partner_rated_intelligent', 'partner_rated_fun',
    'partner_rated_ambitious', 'partner_rated_shared_interests',
    # partner thoughts on the date
    'partner_liked', 'partner_guess_yes', 'partner_wants_match',
]]

# TODO: the coded columns (career, field) seem to be wrong

# Preview the first rows of the cleaned frame.
data.head()

In [None]:
# Scratch check of the binning arithmetic: a single-column toy frame.
d = pd.DataFrame(
    [[10], [20], [30], [30], [30], [30], [30], [50], [90]]
)
# `pd.cut(..., labels=False)` returns the integer index of the equal-width bin
# each value falls in; index * bin width + minimum maps it back to roughly the
# lower edge of that bin, in the units of the original column.
d[0].min() + pd.cut(d[0], 3, labels=False) * ((d[0].max() - d[0].min()) / 3)

In [None]:
def graph_dots(x, y, color, groups_x=None, groups_y=None):
    """Plot two columns of the notebook-level ``data`` frame as sized, coloured dots.

    Rows with a missing value in any of the three columns are discarded. The
    remaining rows are grouped by their (optionally binned) ``x`` and ``y``
    values, and every group is drawn as one circle:

    * its colour encodes the group's mean of ``color`` on a red-to-green ramp,
      red at the minimum of ``data[color]`` and green at its maximum;
    * its size grows with the number of rows in the group.

    Parameters
    ----------
    x, y : str
        Names of the columns shown on the horizontal and vertical axes.
    color : str
        Name of the numeric column whose per-group mean drives the dot colour.
    groups_x, groups_y : int, optional
        Number of equal-width bins (``pd.cut``) to collapse the respective axis
        into. A falsy value keeps the raw values.

    Returns
    -------
    None
        The figure is rendered with bokeh's ``show``.
    """
    f = figure()

    f.xaxis.axis_label = x
    f.yaxis.axis_label = y

    # Use the `color` parameter here: the previous code referenced
    # `color_from`, a name that is not defined in this function.
    graph_data = data[[x, y, color]].dropna()

    if graph_data.empty:
        # Nothing to bin or aggregate; show the labelled but empty figure
        # instead of failing inside pd.cut / the colour computation.
        show(f)
        return

    def discretize(column, groups_count=None):
        """Replace each value by (roughly) the lower edge of its equal-width bin."""
        if not groups_count:
            return column
        lowest = column.min()
        bin_width = (column.max() - lowest) / groups_count
        # labels=False gives the integer bin index of every value.
        return pd.cut(column, groups_count, labels=False) * bin_width + lowest

    graph_data[x] = discretize(graph_data[x], groups_x)
    graph_data[y] = discretize(graph_data[y], groups_y)

    pre_group = graph_data.groupby([x, y], as_index=False)
    grouped = pre_group.mean()

    # The colour ramp is anchored on the whole column, not just plotted rows.
    max_color = data[color].max()
    min_color = data[color].min()
    color_span = max_color - min_color

    def get_real_color(value):
        """Map a value to a '#rrggbb' string between red (min) and green (max)."""
        # A constant column has no range; pin it to the low (red) end rather
        # than dividing by zero.
        fraction = (value - min_color) / color_span if color_span else 0.0
        green = 255 * fraction
        red = 255 - green
        return "#%02x%02x%02x" % (int(red), int(green), 0)

    grouped['real_color'] = grouped[color].apply(get_real_color)

    # Rows per (x, y) group; `color` has no NaN left, so its count is the size.
    count = pre_group.agg('count')[color]
    count_span = count.max() - count.min()
    if count_span:
        grouped['count_proportion'] = count / count_span
    else:
        # All groups are equally populated: draw them at one common size
        # instead of dividing by zero.
        grouped['count_proportion'] = 1.0

    f.circle(grouped[x],
             grouped[y],
             color=grouped['real_color'],
             size=3 + grouped['count_proportion'] * 15)
    show(f)
    

# Draw one dot plot per (subject column, partner column) pair, coloured by
# the match rate, with both axes collapsed into 30 bins.
for x_column, y_column in [
    ('own_rate_attractive', 'partner_values_attractive'),
    ('values_intelligent', 'partner_values_intelligent'),
    ('liked', 'partner_liked'),
]:
    graph_dots(
        x=x_column,
        y=y_column,
        color='match',
        groups_x=30,
        groups_y=30,
    )