In [1]:
import numpy as np

In [4]:
import pandas as pd
import re
# Three parallel lists of six column names each; the loops in this notebook
# walk them by position.
# 'intelligence_parter' is kept exactly as spelled because it is used as a
# column key -- presumably it matches the CSV header; verify before "fixing".
rating_partner_participant = [
    'attractive_partner',
    'sincere_partner',
    'intelligence_parter',
    'funny_partner',
    'ambition_partner',
    'shared_interests_partner',
]
partner_cols = [
    'pref_o_attractive',
    'pref_o_sincere',
    'pref_o_intelligence',
    'pref_o_funny',
    'pref_o_ambitious',
    'pref_o_shared_interests',
]
participant_cols = [
    'attractive_important',
    'sincere_important',
    'intelligence_important',
    'funny_important',
    'ambition_important',
    'shared_interests_important',
]

In [None]:
# Load the raw data set and reset the two counters that remove_quotes and
# convert_lowercase update through `global`.
dating = pd.read_csv('dating-full.csv')
count_quotes = count_lowercase = 0

In [None]:
def remove_quotes(column):
    """Strip leading/trailing single quotes from every string in ``column``.

    Side effect: adds the number of regex matches of a single-quoted span
    (``'.*'``) found in ``column`` to the module-level ``count_quotes``.

    Returns the stripped string Series.
    """
    global count_quotes
    quoted_matches = column.str.count("\'.*\'").sum()
    # NOTE(review): the count is skipped when it equals the running total.
    # This looks like a workaround for DataFrame.apply calling the function
    # twice on the first column in older pandas -- confirm. It also drops a
    # genuine column whose count happens to equal the total so far.
    if quoted_matches != count_quotes:
        count_quotes = count_quotes + quoted_matches
    return column.str.strip('\'')

def convert_lowercase(column):
    """Return ``column`` with every string lower-cased.

    Side effect: overwrites (does not accumulate) the module-level
    ``count_lowercase`` with ``len(column)`` minus the number of cells for
    which ``str.islower()`` is true.
    """
    global count_lowercase
    already_lower = column.str.islower().sum()
    count_lowercase = len(column) - already_lower
    return column.str.lower()

In [None]:
# Strip surrounding single quotes column by column; remove_quotes also
# updates count_quotes as it goes.
quoted_columns = ['race', 'race_o', 'field']
dating[quoted_columns] = dating[quoted_columns].apply(remove_quotes)

In [None]:
# Lower-case the 'field' column; convert_lowercase records the number of
# affected cells in count_lowercase.
dating['field'] = convert_lowercase(dating['field'])

In [None]:
# Report the counters filled in by remove_quotes and convert_lowercase.
print ('Quotes removed from', count_quotes, 'cells')
print('Standardized', count_lowercase , 'cells to lower case.')

In [None]:
# Column name -> {category label: integer code}, filled in by get_encoding.
global_encoder_by_field = {}


def get_encoding(column):
    """Label-encode ``column`` and remember the mapping that was used.

    The column is cast to pandas' ``category`` dtype; each category's position
    in ``cat.categories`` becomes its code. The label -> code table is stored
    in ``global_encoder_by_field`` under the column's name.

    Returns the Series of integer codes (``cat.codes``).
    """
    as_category = column.astype('category')
    global_encoder_by_field[as_category.name] = {
        label: code for code, label in enumerate(as_category.cat.categories)
    }
    return as_category.cat.codes

In [None]:
# Replace the categorical text columns with integer codes; the mappings end
# up in global_encoder_by_field.
categorical_columns = ['race', 'race_o', 'gender', 'field']
dating[categorical_columns] = dating[categorical_columns].apply(get_encoding)

In [None]:
# Look up a few sample labels in the mappings recorded by get_encoding.
# Each lookup raises KeyError if the label is absent from that column.
print ('Value assigned for male in column gender:', global_encoder_by_field['gender']['male'])
print ('Value assigned for European/Caucasian-American in column race:', global_encoder_by_field['race']['European/Caucasian-American'])
print ('Value assigned for Latino/Hispanic American in column race o:', global_encoder_by_field['race_o']['Latino/Hispanic American'])
print ('Value assigned for law in column field:', global_encoder_by_field['field']['law'])

In [None]:
# Running per-row totals; they start as plain zeros and become Series once
# the first column is added in the next cell.
total_partner = total_participant = 0

In [None]:
# Sum the six preference columns row by row, separately for the partner
# (pref_o_*) and participant (*_important) groups.
for partner_col, participant_col in zip(partner_cols, participant_cols):
    total_partner = total_partner + dating[partner_col]
    total_participant = total_participant + dating[participant_col]

In [None]:
# Divide every preference column by its group's row total, so each group of
# six sums to 1 per row. Re-running this cell divides again (not idempotent).
for partner_col, participant_col in zip(partner_cols, participant_cols):
    dating[partner_col] = dating[partner_col] / total_partner
    dating[participant_col] = dating[participant_col] / total_participant

In [None]:
# Print the mean of each normalised preference column, participant columns
# first. The mean is sum / row count, so missing values (if any) still count
# in the denominator.
for attribute in participant_cols:
    scores = dating[attribute]
    participant_mean = scores.sum() / len(scores)
    print ('Mean of ', attribute, ':', round(participant_mean, 2))
for attribute in partner_cols:
    scores = dating[attribute]
    partner_mean = scores.sum() / len(scores)
    print ('Mean of ', attribute, ':', round(partner_mean, 2))

In [None]:
# Persist the cleaned, encoded and normalised frame (without the index).
dating.to_csv('dating.csv', index = False)

In [None]:
def get_print_participant_mean(dataframe):
    """Print and return the mean of each participant preference column.

    For every name in ``participant_cols`` the mean is computed as column sum
    divided by row count, printed rounded to two decimals, and collected.

    Returns the list of unrounded means, in ``participant_cols`` order.
    """
    mean_scores = []
    for attribute in participant_cols[:6]:
        scores = dataframe[attribute]
        attribute_mean = scores.sum() / len(scores)
        print ('Mean of ', attribute, ':', round(attribute_mean, 2))
        mean_scores.append(attribute_mean)
    return mean_scores

# Split by encoded gender and compute the per-attribute means for each group.
# NOTE(review): assumes get_encoding assigned 0 to female and 1 to male --
# confirm against global_encoder_by_field['gender'].
gender_codes = dating['gender']
dating_female = dating[gender_codes == 0]
dating_male = dating[gender_codes == 1]
female_mean_scores = get_print_participant_mean(dating_female)
male_mean_scores = get_print_participant_mean(dating_male)

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one pair of bars (female, male) per preference attribute.
ind = np.arange(6)   # x position of each attribute group
width = 0.35         # width of a single bar
p1 = plt.bar(ind, female_mean_scores, width, color='pink')
p2 = plt.bar(ind + width, male_mean_scores, width, color='blue')
plt.ylabel('Mean Scores')
plt.title('Preference scores of participants by gender')
# Centre each tick label between the two bars of its group.
plt.xticks(ind + width / 2, participant_cols, rotation=80)
plt.yticks(np.arange(0, 0.5, 0.05))
plt.legend((p1[0], p2[0]), ('Female', 'Male'))

# Save BEFORE showing: once plt.show() returns, the current figure has been
# released, so a later savefig() writes an empty canvas.
plt.savefig('gender_barplot')
plt.show()

In [None]:
def get_distinct_values_rating_partner(dataframe, attribute):
    """Print how many distinct values ``attribute`` has and return them.

    The printed number comes from ``nunique()`` and the returned array from
    ``unique()`` (values in order of first appearance).
    """
    ratings = dataframe[attribute]
    print (ratings.nunique())
    distinct_values = ratings.unique()
    return distinct_values

In [None]:
# Distinct rating values for each of the six partner-rating columns, in
# rating_partner_participant order.
unique_values = [
    get_distinct_values_rating_partner(dating, attribute)
    for attribute in rating_partner_participant[:6]
]

In [None]:
# Dump the distinct rating values collected for each attribute.
print (unique_values)

In [None]:
def get_success_rate(dataframe, attribute, value):
    """Fraction of rows with ``attribute == value`` whose ``decision`` is 1.

    Parameters:
        dataframe: frame containing ``attribute`` and a ``decision`` column.
        attribute: name of the column to filter on.
        value: the attribute value to select.

    Returns:
        A float in [0, 1]. Returns ``nan`` when no row matches ``value``; this
        happens for a NaN taken from ``unique()``, since NaN never compares
        equal. The rate is undefined in that case, and previously the call
        raised ZeroDivisionError.
    """
    matching_rows = dataframe[dataframe[attribute] == value]
    num_matching = len(matching_rows)
    if num_matching == 0:
        return float('nan')
    num_success = int((matching_rows['decision'] == 1).sum())
    return num_success * 1.0 / num_matching

In [None]:
# Spot check: success rate for rows whose first partner-rating column equals 1.
get_success_rate(dating, rating_partner_participant[0], 1)

In [None]:
# For each partner-rating column, the success rate at every distinct rating
# value; the inner lists line up with the entries of unique_values.
success_rates_all_attributes = [
    [get_success_rate(dating, attribute, value) for value in attribute_values]
    for attribute, attribute_values in zip(rating_partner_participant[:6], unique_values[:6])
]

In [None]:
# Show the success rates computed for the first partner-rating column.
print (success_rates_all_attributes[0])

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One scatter plot per partner-rating column: rating value against success rate.
area = np.pi * 3   # marker size; the same for every plot
for i in range(6):
    attribute = rating_partner_participant[i]
    plt.figure(figsize=(8, 8))
    plt.scatter(unique_values[i], success_rates_all_attributes[i], s=area)

    plt.title('Scatter Plot for Partners who Perform Well on ' + attribute)
    plt.xlabel('Attribute Value for ' + attribute)
    plt.ylabel('Success Rate')

    plt.xticks(np.arange(0, 11, 1))
    plt.yticks(np.arange(0, 1.1, 0.1))

    # Save BEFORE showing: once plt.show() returns, the current figure has
    # been released, so a later savefig() writes an empty canvas.
    plt.savefig('scatter_plot_' + attribute)
    plt.show()
    # Release the figure so six of them do not pile up in memory.
    plt.close()

In [5]:
dating = pd.read_csv("dating.csv")

# Clean data for columns gaming and reading: any value above range_highest
# is replaced by range_highest.
column = 'gaming'
range_highest = 10
for capped_column in (column, 'reading'):
    above_range = dating[capped_column] > range_highest
    dating.loc[above_range, capped_column] = range_highest


In [6]:
def get_binned_column(dataframe, column, num_bins, bin_range):
    """Discretise ``dataframe[column]`` in place and return the binned column.

    Parameters:
        dataframe: frame to modify; ``column`` is overwritten with bin labels.
        column: name of the column to bin.
        num_bins: number of bins; the labels are ``0 .. num_bins - 1``.
        bin_range: sequence of bin edges handed to ``pd.cut`` (the lowest edge
            is inclusive).

    Returns the binned column as stored back in ``dataframe``.
    """
    bin_labels = np.arange(num_bins)
    binned = pd.cut(
        dataframe[column],
        bin_range,
        labels=bin_labels,
        retbins=False,
        include_lowest=True,
    )
    dataframe[column] = binned
    return dataframe[column]

In [7]:
# Columns that stay as they are (already discrete codes / the label).
non_binned_cols = ['gender', 'race', 'race_o', 'samerace', 'field', 'decision']
age_cols = ['age', 'age_o']
num_bins = 5

# Bin every remaining column into num_bins equal-width bins over a range that
# depends on the column group.
# The edges come from np.linspace(low, high, num_bins + 1), which always gives
# exactly num_bins + 1 edges. The earlier np.arange(low, high + pad, step)
# form produced an extra edge for some bin counts (e.g. 20 or 50), which made
# pd.cut reject the labels.
for column in list(dating.columns):
    if column in non_binned_cols:
        continue

    if column in age_cols:
        low, high = 18, 58
    elif column in partner_cols or column in participant_cols:
        # normalised preference scores
        low, high = 0, 1
    elif column == 'interests_correlate':
        low, high = -1, 1
    else:
        # default range for every other column
        low, high = 0, 10

    bin_range = np.linspace(low, high, num_bins + 1)
    dating[column] = get_binned_column(dating, column, num_bins, bin_range)

In [8]:
# Print, for every binned column, the number of rows in each bin
# (unsorted value_counts order).
for column in dating:
    if column in non_binned_cols:
        continue
    count = dating[column].value_counts(sort=False)
    bin_counts = count.tolist()
    print (column, ": ", bin_counts)

age :  [3710, 2932, 97, 0, 5]
age_o :  [3704, 2899, 136, 0, 5]
importance_same_race :  [2980, 1213, 977, 1013, 561]
importance_same_religion :  [3203, 1188, 1110, 742, 501]
pref_o_attractive :  [4333, 1987, 344, 51, 29]
pref_o_sincere :  [5500, 1225, 19, 0, 0]
pref_o_intelligence :  [4601, 2062, 81, 0, 0]
pref_o_funny :  [5616, 1103, 25, 0, 0]
pref_o_ambitious :  [6656, 88, 0, 0, 0]
pref_o_shared_interests :  [6467, 277, 0, 0, 0]
attractive_important :  [4323, 2017, 328, 57, 19]
sincere_important :  [5495, 1235, 14, 0, 0]
intelligence_important :  [4606, 2071, 67, 0, 0]
funny_important :  [5588, 1128, 28, 0, 0]
ambition_important :  [6644, 100, 0, 0, 0]
shared_interests_important :  [6494, 250, 0, 0, 0]
attractive :  [18, 276, 1462, 4122, 866]
sincere :  [33, 117, 487, 2715, 3392]
intelligence :  [34, 185, 1049, 3190, 2286]
funny :  [0, 19, 221, 3191, 3313]
ambition :  [84, 327, 1070, 2876, 2387]
attractive_partner :  [284, 948, 2418, 2390, 704]
sincere_partner :  [94, 353, 1627, 3282,

In [None]:
# Persist the binned frame (without the index).
dating.to_csv("dating-binned.csv", index = False)

In [None]:
# Split dataset: a fixed-seed 20% sample becomes the test set and the
# remaining rows become the training set; both are written without the index.
testset = dating.sample(frac=0.2, random_state=47)
trainset = dating.drop(index=testset.index)
for split_frame, split_path in ((testset, "testSet.csv"), (trainset, "trainingSet.csv")):
    split_frame.to_csv(split_path, index=False)

In [None]:
trainset = pd.read_csv('trainingSet.csv')
testset = pd.read_csv('testSet.csv')

# Per-column value counts of the training rows, kept separately for
# decision == 1 and decision == 0. The row filters do not depend on the
# column, so they are computed once up front.
rows_decision_1 = trainset[trainset['decision'] == 1]
rows_decision_0 = trainset[trainset['decision'] == 0]
count_dict_yes = {}
count_dict_no = {}
for column in trainset:
    count_dict_yes[column] = rows_decision_1[column].value_counts(sort=False).to_dict()
    count_dict_no[column] = rows_decision_0[column].value_counts(sort=False).to_dict()
    print ('for column', column, 'bin count for decision 1 are',
           count_dict_yes[column])
    print ('for column', column, 'bin count for decision 0 are',
           count_dict_no[column])

In [None]:
def get_probability(attribute, value, decision):
    """Laplace-smoothed estimate of P(attribute == value | decision).

    Computed as ``(count + 1) / (class_total + k)``, where ``count`` is the
    number of training rows of the given class with this value, ``class_total``
    is the number of training rows of that class, and ``k`` is the number of
    distinct values of ``attribute`` seen in the training data.

    ``k`` is taken over BOTH classes. Counting only the values seen in the
    requested class (the previous behaviour) made ``k`` too small whenever a
    value was missing from that class, so the probabilities over all values
    did not sum to 1.

    Parameters:
        attribute: column name present in ``count_dict_yes`` / ``count_dict_no``.
        value: attribute value to score; unseen values get a count of 0.
        decision: class label, 1 or 0.

    Returns:
        The smoothed probability as a float, or None when ``decision`` is
        neither 1 nor 0 (unchanged from the previous behaviour).
    """
    if decision == 1:
        class_counts = count_dict_yes[attribute]
        class_total = count_dict_yes['decision'][1]
    elif decision == 0:
        class_counts = count_dict_no[attribute]
        class_total = count_dict_no['decision'][0]
    else:
        return None

    # Number of distinct values this attribute takes anywhere in the training data.
    num_values = len(set(count_dict_yes[attribute]) | set(count_dict_no[attribute]))
    value_count = class_counts.get(value, 0)
    return (value_count + 1.0) / (class_total + num_values)

In [None]:
# Class priors: the share of training rows with decision 1 / decision 0.
num_rows_dec_1 = count_dict_yes['decision'][1]
num_rows_dec_0 = count_dict_no['decision'][0]
num_rows_total = num_rows_dec_0 + num_rows_dec_1
prior_probability_dec_1 = num_rows_dec_1 / num_rows_total
prior_probability_dec_0 = num_rows_dec_0 / num_rows_total

In [None]:
def inference_row(row):
    """Classify one row with naive Bayes and report whether it was right.

    For each class, the smoothed conditional probabilities of every attribute
    (every key of ``count_dict_yes`` except 'decision') are multiplied
    together and then multiplied by the class prior. Plain products are used,
    not log-probabilities. The row is predicted as 1 only when the class-1
    score is strictly larger; ties go to 0.

    Returns 1 if the prediction equals ``row['decision']``, otherwise 0.
    """
    attributes = [name for name in count_dict_yes if name != 'decision']

    def class_score(decision, prior):
        # Multiply the likelihoods first and the prior last.
        score = 1
        for name in attributes:
            score *= get_probability(name, row[name], decision)
        return score * prior

    prob_dec_1 = class_score(1, prior_probability_dec_1)
    prob_dec_0 = class_score(0, prior_probability_dec_0)

    predicted_value = 1 if prob_dec_1 > prob_dec_0 else 0
    return 1 if predicted_value == row['decision'] else 0

In [None]:
def inference(dataset):
    """Run ``inference_row`` on every row of ``dataset`` and tally the results.

    Side effect: adds (or overwrites) a 'correct_prediction' column on
    ``dataset`` holding 1 for a correct prediction and 0 otherwise.

    Returns a dict mapping each value present in that column to its row
    count; a key is missing when no row has that outcome.
    """
    per_row_result = dataset.apply(inference_row, axis=1)
    dataset['correct_prediction'] = per_row_result
    return dataset['correct_prediction'].value_counts().to_dict()

In [None]:
# Evaluate on the training set; this also adds a 'correct_prediction' column to trainset.
correct_predictions = inference(trainset)

In [None]:
# Accuracy = correct / (correct + wrong). value_counts() omits outcomes that
# never occur, so read both tallies with a default of 0; otherwise a run with
# every prediction right (or every one wrong) raises KeyError.
num_correct = correct_predictions.get(1, 0)
num_wrong = correct_predictions.get(0, 0)
print (num_correct / (num_correct + num_wrong))

In [None]:
# Raw tallies: {1: number correct, 0: number wrong}.
print (correct_predictions)

In [None]:
# Debug listing: the attribute names that inference_row iterates over.
for column in count_dict_yes:
    print (column)

In [None]:
# Debug listing: the current trainset columns. This includes
# 'correct_prediction' once inference(trainset) has been run.
for column in trainset:
    print (column)