In [1]:
import numpy as np

In [2]:
import pandas as pd
import re

In [None]:
# Counters updated by the cleaning helpers defined in the next cell.
count_quotes, count_lowercase = 0, 0
# Raw input table.
dating = pd.read_csv('dating-full.csv')

In [None]:
def remove_quotes(column):
    """Strip leading and trailing single quotes from every string in ``column``.

    Side effect: adds the number of cells whose text actually changed to the
    module-level ``count_quotes`` counter.

    Parameters
    ----------
    column : pandas.Series
        Column of strings (accessed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        The column with surrounding single quotes removed.
    """
    global count_quotes
    stripped = column.str.strip('\'')
    # Count the cells that really changed. Cells that are missing after the
    # string operation are masked out, because NaN != NaN would otherwise be
    # reported as a change.
    changed = (stripped != column) & stripped.notna()
    # Accumulate unconditionally so that two columns with the same number of
    # quoted cells are both included in the total.
    count_quotes += int(changed.sum())
    return stripped

def convert_lowercase(column):
    """Lower-case every string in ``column``.

    Side effect: sets the module-level ``count_lowercase`` to the number of
    cells whose text was actually changed by the conversion.

    Parameters
    ----------
    column : pandas.Series
        Column of strings (accessed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        The lower-cased column.
    """
    global count_lowercase
    lowered = column.str.lower()
    # Only cells that differ after lower-casing count as standardized; missing
    # values and strings without cased characters are left out of the tally.
    changed = (lowered != column) & lowered.notna()
    count_lowercase = int(changed.sum())
    return lowered

In [None]:
# Strip surrounding single quotes from the three listed columns; remove_quotes
# also updates the global count_quotes counter as it goes.
dating[['race','race_o','field']] = dating[['race','race_o','field']].apply(remove_quotes)

In [None]:
# Lower-case the 'field' column; convert_lowercase records the affected cell
# count in the global count_lowercase.
dating[['field']] = dating[['field']].apply(convert_lowercase)

In [None]:
# Report the counters filled in by remove_quotes and convert_lowercase.
print ('Quotes removed from', count_quotes, 'cells')
print('Standardized', count_lowercase , 'cells to lower case.')

In [None]:
# Maps column name -> {category label: integer code}, filled in by get_encoding.
global_encoder_by_field = {}
def get_encoding(column):
    """Return the integer category codes for ``column``.

    The column is converted to a pandas categorical. The label -> code mapping
    (position of each label in ``cat.categories``) is stored in
    ``global_encoder_by_field`` under the column's name.
    """
    as_category = column.astype('category')
    global_encoder_by_field[as_category.name] = {
        label: code for code, label in enumerate(as_category.cat.categories)
    }
    return as_category.cat.codes

In [None]:
# Replace the four categorical columns with their integer category codes; the
# label -> code maps are recorded in global_encoder_by_field by get_encoding.
dating[['race','race_o','gender','field']] = dating[['race','race_o','gender','field']].apply(get_encoding)

In [None]:
# Look up a few label -> code assignments recorded by get_encoding.
# Each lookup raises KeyError if that label is not a category of its column.
print ('Value assigned for male in column gender:', global_encoder_by_field['gender']['male'])
print ('Value assigned for European/Caucasian-American in column race:', global_encoder_by_field['race']['European/Caucasian-American'])
print ('Value assigned for Latino/Hispanic American in column race o:', global_encoder_by_field['race_o']['Latino/Hispanic American'])
print ('Value assigned for law in column field:', global_encoder_by_field['field']['law'])

In [None]:
# The two lists are index-aligned: entry i of each list is used together
# when the row totals are accumulated and the columns are normalised.
partner_cols = [
    'pref_o_attractive',
    'pref_o_sincere',
    'pref_o_intelligence',
    'pref_o_funny',
    'pref_o_ambitious',
    'pref_o_shared_interests',
]
participant_cols = [
    'attractive_important',
    'sincere_important',
    'intelligence_important',
    'funny_important',
    'ambition_important',
    'shared_interests_important',
]

In [None]:
# Running row totals for the two groups of preference columns.
total_partner = total_participant = 0

In [None]:
# Per-row sum of the six partner-preference columns and of the six
# participant-preference columns. The totals are reset here so that
# re-running this cell recomputes them instead of adding onto the result
# of a previous run.
total_partner = 0
total_participant = 0
for partner_col, participant_col in zip(partner_cols, participant_cols):
    total_partner += dating[partner_col]
    total_participant += dating[participant_col]

In [None]:
# Divide each preference column by its group's per-row total.
# NOTE: not idempotent - running this cell twice divides twice.
for partner_col, participant_col in zip(partner_cols, participant_cols):
    dating[partner_col] /= total_partner
    dating[participant_col] /= total_participant

In [None]:
# Print the mean of every preference column, participant columns first.
# The mean is sum / row count, so missing values (skipped by sum) still
# count in the denominator.
for column_group in (participant_cols, partner_cols):
    for name in column_group[:6]:
        column_mean = dating[name].sum()/len(dating[name])
        print ('Mean of ', name, ':', round(column_mean, 2))

In [None]:
# Write the processed table to dating.csv, omitting the index column.
dating.to_csv('dating.csv', index = False)

In [None]:
def get_print_participant_mean(dataframe):
    """Print and return the mean of the first six ``participant_cols`` columns.

    Each mean is the column sum divided by the number of rows in
    ``dataframe``. The means are returned as a list in the same order as
    ``participant_cols``.
    """
    mean_scores = []
    for position in range(6):
        name = participant_cols[position]
        values = dataframe[name]
        column_mean = values.sum()/len(values)
        print ('Mean of ', name, ':', round(column_mean, 2))
        mean_scores.append(column_mean)
    return mean_scores

# Split rows by encoded gender and compute the participant means per group.
# Presumably code 0 is female and 1 is male, going by the variable names;
# confirm against global_encoder_by_field['gender'].
dating_female = dating.loc[dating['gender'] == 0]
dating_male = dating.loc[dating['gender'] == 1]
female_mean_scores = get_print_participant_mean(dating_female)
male_mean_scores = get_print_participant_mean(dating_male)

In [None]:
import matplotlib.pyplot as plt
# Grouped bar chart: one pair of bars (female, male) per participant column.
ind = np.arange(6)
width = 0.35
fig, ax = plt.subplots()
p1 = ax.bar(ind, female_mean_scores, width, color='pink')
p2 = ax.bar(ind + width, male_mean_scores, width, color='blue')
ax.set_ylabel('Mean Scores')
ax.set_title('Preference scores of participants by gender')
# Centre each tick between the two bars of its pair.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(tuple(participant_cols[:6]), rotation=80)
ax.set_yticks(np.arange(0, 0.5, 0.05))
ax.legend((p1[0], p2[0]), ('Female', 'Male'))

plt.show()

In [None]:
def get_distinct_values_rating_partner(dataframe, attribute):
    """Print the number of distinct values in ``dataframe[attribute]`` and
    return those values as given by ``Series.unique()``."""
    values = dataframe[attribute]
    print (values.nunique())
    return values.unique()

In [None]:
# Rating columns to analyse; the names are used verbatim as column keys.
rating_partner_participant = [
    'attractive_partner',
    'sincere_partner',
    'intelligence_parter',
    'funny_partner',
    'ambition_partner',
    'shared_interests_partner',
]
# unique_values[i] holds the distinct values of rating_partner_participant[i].
unique_values = [
    get_distinct_values_rating_partner(dating, attribute)
    for attribute in rating_partner_participant
]

In [None]:
# Show the distinct values collected for each rating column.
print (unique_values)

In [None]:
def get_success_rate(dataframe, attribute, value):
    """Fraction of rows with ``dataframe[attribute] == value`` whose
    ``'decision'`` column equals 1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``attribute`` and a ``'decision'`` column.
    attribute : str
        Column to filter on.
    value :
        Value that ``attribute`` must equal.

    Returns
    -------
    float
        The success rate, or NaN when no row matches. No row matches when
        ``value`` is itself NaN, since NaN never compares equal.
    """
    matching = dataframe[dataframe[attribute] == value]
    if len(matching) == 0:
        # Avoid ZeroDivisionError; NaN marks "no data" for this value.
        return float('nan')
    successes = matching[matching['decision'] == 1]
    return len(successes)*1.0/len(matching)

In [None]:
# Spot check: success rate for rows where the first rating column equals 1.
get_success_rate(dating, rating_partner_participant[0], 1)

In [None]:
# success_rates_all_attributes[i][j] is the success rate for
# rating_partner_participant[i] at its j-th distinct value, unique_values[i][j].
success_rates_all_attributes = [
    [get_success_rate(dating, attribute, value) for value in observed_values]
    for attribute, observed_values in zip(rating_partner_participant[:6], unique_values[:6])
]

In [None]:
# Success rates for the first rating column, one per distinct value.
print (success_rates_all_attributes[0])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Marker area is the same for every figure, so compute it once.
area = np.pi*3
# One scatter plot per rating column: distinct rating value vs. success rate.
for i in range(6):
    fig = plt.figure(figsize=(8,8))
    plt.scatter(unique_values[i], success_rates_all_attributes[i], s=area)

    plt.title('Scatter Plot for Partners who Perform Well on ' + rating_partner_participant[i])
    plt.xlabel('Attribute Value for ' + rating_partner_participant[i])
    plt.ylabel('Success Rate')

    plt.xticks(np.arange(0,11,1))
    plt.yticks(np.arange(0,1.1,0.1))
    # Save before show(): once the figure has been shown it may be closed or
    # cleared, and saving afterwards can write an empty image.
    fig.savefig('scatter_plot_' + rating_partner_participant[i])
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
dating = pd.read_csv("dating.csv")
# Clean data for columns gaming and reading: cap every value above
# range_highest at range_highest.
range_highest = 10
column = 'gaming'
dating[column] = dating[column].clip(upper=range_highest)
dating['reading'] = dating['reading'].clip(upper=range_highest)


In [None]:
def get_binned_column(dataframe, column, num_bins, bin_range):
    """Replace ``dataframe[column]`` with its bin index and return the new column.

    ``bin_range`` holds the bin edges passed to ``pd.cut``. The bins are
    labelled ``0 .. num_bins - 1`` and the lowest edge is inclusive. The
    dataframe is modified in place.
    """
    bin_labels = np.arange(num_bins)
    binned = pd.cut(dataframe[column], bins=bin_range, labels=bin_labels,
                    include_lowest=True, retbins=False)
    dataframe[column] = binned
    return dataframe[column]

In [None]:
# Columns that are left as they are (not binned).
non_binned_cols = ['gender', 'race', 'race_o', 'samerace', 'field', 'decision']
age_cols = ['age', 'age_o']
num_bins = 5
for column in dating:
    if column in non_binned_cols:
        continue
    # Pick the value range for this column; the default is 0..10.
    if column in age_cols:
        low, high = 18, 58
    elif column in partner_cols or column in participant_cols:
        low, high = 0, 1
    elif column == 'interests_correlate':
        low, high = -1, 1
    else:
        low, high = 0, 10
    # linspace always yields exactly num_bins + 1 edges ending on `high`,
    # which is what the num_bins labels used by get_binned_column require.
    bin_range = np.linspace(low, high, num_bins + 1)
    dating[column] = get_binned_column(dating, column, num_bins, bin_range)
    # The per-bin counts are printed by the following cell, so nothing is
    # printed here.

In [None]:
# Print, for every binned column, the number of rows in each bin.
for column in dating:
    if column in non_binned_cols:
        continue
    count = dating[column].value_counts(sort=False)
    print (column, ": ", count.tolist())

In [None]:
# Write the binned table to dating-binned.csv, omitting the index column.
dating.to_csv("dating-binned.csv", index = False)

In [None]:
# Split dataset: a seeded 20% random sample becomes the test set and the
# remaining rows become the training set.
testset = dating.sample(frac=0.2, random_state=47)
trainset = dating.loc[~dating.index.isin(testset.index)]
trainset.to_csv("trainingSet.csv", index=False)
testset.to_csv("testSet.csv", index=False)

In [3]:
trainset = pd.read_csv('trainingSet.csv')
testset = pd.read_csv('testSet.csv')
# count_dict_yes[column][value] / count_dict_no[column][value]: number of
# training rows with decision 1 / decision 0 that have `value` in `column`.
count_dict_yes, count_dict_no = {}, {}
rows_decision_1 = trainset[trainset['decision'] == 1]
rows_decision_0 = trainset[trainset['decision'] == 0]
for column in trainset:
    count_dict_yes[column] = rows_decision_1[column].value_counts(sort=False).to_dict()
    count_dict_no[column] = rows_decision_0[column].value_counts(sort=False).to_dict()
    print ('for column', column, 'bin count for decision 1 are',
           count_dict_yes[column])
    print ('for column', column, 'bin count for decision 0 are',
           count_dict_no[column])

for column gender bin count for decision 1 are {0: 969, 1: 1366}
for column gender bin count for decision 0 are {0: 1676, 1: 1384}
for column age bin count for decision 1 are {0: 1288, 2: 40, 4: 2, 1: 1005}
for column age bin count for decision 0 are {0: 1701, 2: 37, 4: 1, 1: 1321}
for column age_o bin count for decision 1 are {0: 1339, 2: 35, 4: 1, 1: 960}
for column age_o bin count for decision 0 are {0: 1650, 2: 75, 4: 4, 1: 1331}
for column race bin count for decision 1 are {0: 615, 2: 1242, 4: 179, 1: 143, 3: 156}
for column race bin count for decision 0 are {0: 698, 2: 1822, 4: 176, 1: 126, 3: 238}
for column race_o bin count for decision 1 are {0: 464, 2: 1401, 4: 148, 1: 120, 3: 202}
for column race_o bin count for decision 0 are {0: 846, 2: 1625, 4: 208, 1: 158, 3: 223}
for column samerace bin count for decision 1 are {0: 1353, 1: 982}
for column samerace bin count for decision 0 are {0: 1862, 1: 1198}
for column importance_same_race bin count for decision 1 are {0: 1123, 2: 3

for column music bin count for decision 1 are {0: 27, 2: 356, 4: 1000, 1: 75, 3: 877}
for column music bin count for decision 0 are {0: 22, 2: 519, 4: 1243, 1: 84, 3: 1192}
for column shopping bin count for decision 1 are {0: 407, 2: 639, 4: 372, 1: 360, 3: 557}
for column shopping bin count for decision 0 are {0: 492, 2: 710, 4: 607, 1: 484, 3: 767}
for column yoga bin count for decision 1 are {0: 765, 2: 503, 4: 245, 1: 441, 3: 381}
for column yoga bin count for decision 0 are {0: 1058, 2: 595, 4: 271, 1: 657, 3: 479}
for column interests_correlate bin count for decision 1 are {0: 2, 2: 862, 4: 218, 1: 252, 3: 1001}
for column interests_correlate bin count for decision 0 are {0: 12, 2: 1146, 4: 253, 1: 365, 3: 1284}
for column expected_happy_with_sd_people bin count for decision 1 are {0: 93, 2: 1133, 4: 116, 1: 390, 3: 603}
for column expected_happy_with_sd_people bin count for decision 0 are {0: 172, 2: 1463, 4: 98, 1: 641, 3: 686}
for column like bin count for decision 1 are {0: 5

In [4]:
def get_probability(attribute, value, decision):
    """Laplace-smoothed estimate of P(attribute == value | decision).

    Counts come from the module-level ``count_dict_yes`` (decision 1) and
    ``count_dict_no`` (decision 0) tables.

    Parameters
    ----------
    attribute : str
        Column name present in both count tables.
    value :
        Attribute value; a value never seen for the class gets count 0.
    decision : int
        Class label, 0 or 1.

    Returns
    -------
    float
        ``(count + 1) / (class_size + number_of_distinct_values)``.

    Raises
    ------
    ValueError
        If ``decision`` is neither 0 nor 1.
    """
    if decision == 1:
        class_counts = count_dict_yes[attribute]
        class_size = count_dict_yes['decision'][1]
    elif decision == 0:
        class_counts = count_dict_no[attribute]
        class_size = count_dict_no['decision'][0]
    else:
        raise ValueError('decision must be 0 or 1, got %r' % (decision,))
    # Smooth over every value seen for this attribute in either class. Using
    # only the values seen in one class would make the smoothed probabilities
    # of that class sum to more than 1 whenever it is missing a value.
    distinct_values = len(set(count_dict_yes[attribute]) | set(count_dict_no[attribute]))
    return (class_counts.get(value, 0)+1.0)/(class_size+distinct_values)

In [5]:
# Class priors: share of training rows with decision 1 and with decision 0.
rows_with_decision_1 = count_dict_yes['decision'][1]
rows_with_decision_0 = count_dict_no['decision'][0]
prior_probability_dec_1 = rows_with_decision_1/(rows_with_decision_0+rows_with_decision_1)
prior_probability_dec_0 = rows_with_decision_0/(rows_with_decision_0+rows_with_decision_1)

In [25]:
def inference_row(row):
    """Classify one row and return 1 if the prediction matches ``row['decision']``, else 0.

    For each class the score is the product of ``get_probability`` over
    every column in ``count_dict_yes`` except ``'decision'``, multiplied by
    the class prior. Probabilities are multiplied directly (no logarithms).
    The row is predicted as 1 only when the decision-1 score is strictly
    greater than the decision-0 score.
    """
    score_dec_1 = 1
    score_dec_0 = 1
    for name in count_dict_yes:
        if name == 'decision':
            continue
        score_dec_1 *= get_probability(name, row[name], 1)
        score_dec_0 *= get_probability(name, row[name], 0)

    # Multiply by the prior probabilities last.
    score_dec_1 *= prior_probability_dec_1
    score_dec_0 *= prior_probability_dec_0

    predicted_value = 1 if score_dec_1 > score_dec_0 else 0
    return 1 if predicted_value == row['decision'] else 0

In [26]:
def inference(dataset):
    """Classify every row of ``dataset`` and tally the outcomes.

    Adds a ``'correct_prediction'`` column (1 = correct, 0 = wrong) to
    ``dataset`` in place and returns ``{outcome: row count}``.
    """
    dataset['correct_prediction'] = dataset.apply(inference_row, axis=1)
    outcome_counts = dataset['correct_prediction'].value_counts()
    return outcome_counts.to_dict()

In [29]:
# Classify the training rows; inference() also adds a 'correct_prediction'
# column to trainset as a side effect.
correct_predictions = inference(trainset)

In [30]:
# Accuracy = correct / (correct + wrong). An outcome that never occurred is
# absent from the dict, so default it to 0 instead of raising KeyError.
n_correct = correct_predictions.get(1, 0)
n_wrong = correct_predictions.get(0, 0)
print (n_correct/(n_correct + n_wrong))

0.77516218721038


In [None]:
# Raw tally returned by inference(): {1: correct rows, 0: wrong rows}.
print (correct_predictions)

In [None]:
# List the columns that have a count table; inference_row iterates these.
for column in count_dict_yes:
    print (column)

In [None]:
# List the columns currently in trainset; this includes 'correct_prediction'
# once inference(trainset) has been run.
for column in trainset:
    print (column)