# Speed Dating with Tinder
#### <i>Author: Delphine César<i>

# Table of contents

<ul>
   <li><a href="#import">I - Import of librairies and dataset</a></li>
   <li><a href="#info">II - Dataset information</a></li>
      <ul>
         <li><a href="#missing">1 - Focus on missing values</a></li>
      </ul>
   <li><a href="#engineering">III - Data engineering</a></li>
      <ul>
         <li><a href="#waves">1 - Waves selection</a></li>
         <li><a href="#renaming">2 - Renaming useful column values</a></li>
      </ul>
   <li><a href="#group">IV - Descriptive analysis of the group</a></li>
   <li><a href="#dates">V - Dates analysis</a></li>
      <ul>
         <li><a href="#attributes">1 - Focus on attributes</a></li>
         <li><a href="#races">2 - Focus on race</a></li>
         <li><a href="#order">3 - Focus on date order</a></li>
         <li><a href="#after">4 - Happily ever after?</a></li>
      </ul>
</ul>

<a id='import'></a>
### I - Import of librairies and dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
dataset = pd.read_csv("Speed+Dating+Data.csv", encoding = "ISO-8859-1")

In [3]:
pd.set_option('display.max_columns', None)

<a id='info'></a>
### II - Dataset information

In [4]:
# Basic stats
print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Number of columns : {}".format(dataset.shape[1]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

Number of rows : 8378

Number of columns : 195

Display of dataset: 


Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,partner,pid,match,int_corr,samerace,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field,field_cd,undergra,mn_sat,tuition,race,imprace,imprelig,from,zipcode,income,goal,date,go_out,career,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,amb5_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,attr1_s,sinc1_s,intel1_s,fun1_s,amb1_s,shar1_s,attr3_s,sinc3_s,intel3_s,fun3_s,amb3_s,satis_2,length,numdat_2,attr7_2,sinc7_2,intel7_2,fun7_2,amb7_2,shar7_2,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,attr4_2,sinc4_2,intel4_2,fun4_2,amb4_2,shar4_2,attr2_2,sinc2_2,intel2_2,fun2_2,amb2_2,shar2_2,attr3_2,sinc3_2,intel3_2,fun3_2,amb3_2,attr5_2,sinc5_2,intel5_2,fun5_2,amb5_2,you_call,them_cal,date_3,numdat_3,num_in_3,attr1_3,sinc1_3,intel1_3,fun1_3,amb1_3,shar1_3,attr7_3,sinc7_3,intel7_3,fun7_3,amb7_3,shar7_3,attr4_3,sinc4_3,intel4_3,fun4_3,amb4_3,shar4_3,attr2_3,sinc2_3,intel2_3,fun2_3,amb2_3,shar2_3,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,1,11.0,0,0.14,0,27.0,2.0,35.0,20.0,20.0,20.0,0.0,5.0,0,6.0,8.0,8.0,8.0,8.0,6.0,7.0,4.0,2.0,21.0,Law,1.0,,,,4.0,2.0,4.0,Chicago,60521,69487.0,2.0,7.0,1.0,lawyer,,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,2.0,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,35.0,20.0,15.0,20.0,5.0,5.0,6.0,8.0,8.0,8.0,7.0,,,,,,1,6.0,9.0,7.0,7.0,6.0,5.0,7.0,6.0,2.0,4.0,,,,,,,,,,,,6.0,2.0,1.0,,,,,,,19.44,16.67,13.89,22.22,11.11,16.67,,,,,,,,,,,,,6.0,7.0,8.0,7.0,6.0,,,,,,1.0,1.0,0.0,,,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,,,,,,,,,,,,,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,2,12.0,0,0.54,0,22.0,2.0,60.0,0.0,0.0,40.0,0.0,0.0,0,7.0,8.0,10.0,7.0,7.0,5.0,8.0,4.0,2.0,21.0,Law,1.0,,,,4.0,2.0,4.0,Chicago,60521,69487.0,2.0,7.0,1.0,lawyer,,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,2.0,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,35.0,20.0,15.0,20.0,5.0,5.0,6.0,8.0,8.0,8.0,7.0,,,,,,1,7.0,8.0,7.0,8.0,5.0,6.0,7.0,5.0,1.0,4.0,,,,,,,,,,,,6.0,2.0,1.0,,,,,,,19.44,16.67,13.89,22.22,11.11,16.67,,,,,,,,,,,,,6.0,7.0,8.0,7.0,6.0,,,,,,1.0,1.0,0.0,,,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,,,,,,,,,,,,,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,3,13.0,1,0.16,1,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,1,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,21.0,Law,1.0,,,,4.0,2.0,4.0,Chicago,60521,69487.0,2.0,7.0,1.0,lawyer,,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,2.0,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,35.0,20.0,15.0,20.0,5.0,5.0,6.0,8.0,8.0,8.0,7.0,,,,,,1,5.0,8.0,9.0,8.0,5.0,7.0,7.0,,1.0,4.0,,,,,,,,,,,,6.0,2.0,1.0,,,,,,,19.44,16.67,13.89,22.22,11.11,16.67,,,,,,,,,,,,,6.0,7.0,8.0,7.0,6.0,,,,,,1.0,1.0,0.0,,,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,,,,,,,,,,,,,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,4,14.0,1,0.61,0,23.0,2.0,30.0,5.0,15.0,40.0,5.0,5.0,1,7.0,8.0,9.0,8.0,9.0,8.0,7.0,7.0,2.0,21.0,Law,1.0,,,,4.0,2.0,4.0,Chicago,60521,69487.0,2.0,7.0,1.0,lawyer,,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,2.0,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,35.0,20.0,15.0,20.0,5.0,5.0,6.0,8.0,8.0,8.0,7.0,,,,,,1,7.0,6.0,8.0,7.0,6.0,8.0,7.0,6.0,2.0,4.0,,,,,,,,,,,,6.0,2.0,1.0,,,,,,,19.44,16.67,13.89,22.22,11.11,16.67,,,,,,,,,,,,,6.0,7.0,8.0,7.0,6.0,,,,,,1.0,1.0,0.0,,,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,,,,,,,,,,,,,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,5,15.0,1,0.21,0,24.0,3.0,30.0,10.0,20.0,10.0,10.0,20.0,1,8.0,7.0,9.0,6.0,9.0,7.0,8.0,6.0,2.0,21.0,Law,1.0,,,,4.0,2.0,4.0,Chicago,60521,69487.0,2.0,7.0,1.0,lawyer,,9.0,2.0,8.0,9.0,1.0,1.0,5.0,1.0,5.0,6.0,9.0,1.0,10.0,10.0,9.0,8.0,1.0,3.0,2.0,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,35.0,20.0,15.0,20.0,5.0,5.0,6.0,8.0,8.0,8.0,7.0,,,,,,1,5.0,6.0,7.0,7.0,6.0,6.0,6.0,6.0,2.0,4.0,,,,,,,,,,,,6.0,2.0,1.0,,,,,,,19.44,16.67,13.89,22.22,11.11,16.67,,,,,,,,,,,,,6.0,7.0,8.0,7.0,6.0,,,,,,1.0,1.0,0.0,,,15.0,20.0,20.0,15.0,15.0,15.0,,,,,,,,,,,,,,,,,,,5.0,7.0,7.0,7.0,7.0,,,,,



Basics statistics: 


Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,partner,pid,match,int_corr,samerace,age_o,race_o,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha,dec_o,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,like_o,prob_o,met_o,age,field,field_cd,undergra,mn_sat,tuition,race,imprace,imprelig,from,zipcode,income,goal,date,go_out,career,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,exphappy,expnum,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr5_1,sinc5_1,intel5_1,fun5_1,amb5_1,dec,attr,sinc,intel,fun,amb,shar,like,prob,met,match_es,attr1_s,sinc1_s,intel1_s,fun1_s,amb1_s,shar1_s,attr3_s,sinc3_s,intel3_s,fun3_s,amb3_s,satis_2,length,numdat_2,attr7_2,sinc7_2,intel7_2,fun7_2,amb7_2,shar7_2,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,attr4_2,sinc4_2,intel4_2,fun4_2,amb4_2,shar4_2,attr2_2,sinc2_2,intel2_2,fun2_2,amb2_2,shar2_2,attr3_2,sinc3_2,intel3_2,fun3_2,amb3_2,attr5_2,sinc5_2,intel5_2,fun5_2,amb5_2,you_call,them_cal,date_3,numdat_3,num_in_3,attr1_3,sinc1_3,intel1_3,fun1_3,amb1_3,shar1_3,attr7_3,sinc7_3,intel7_3,fun7_3,amb7_3,shar7_3,attr4_3,sinc4_3,intel4_3,fun4_3,amb4_3,shar4_3,attr2_3,sinc2_3,intel2_3,fun2_3,amb2_3,shar2_3,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
count,8378.0,8377.0,8378.0,8378.0,8378.0,8378.0,8378.0,8378.0,6532.0,8378.0,8378.0,8368.0,8378.0,8220.0,8378.0,8274.0,8305.0,8289.0,8289.0,8289.0,8280.0,8271.0,8249.0,8378.0,8166.0,8091.0,8072.0,8018.0,7656.0,7302.0,8128.0,8060.0,7993.0,8283.0,8315,8296.0,4914,3133.0,3583.0,8315.0,8299.0,8299.0,8299,7314.0,4279.0,8299.0,8281.0,8299.0,8289,8240.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8299.0,8277.0,1800.0,8299.0,8299.0,8299.0,8289.0,8279.0,8257.0,6489.0,6489.0,6489.0,6489.0,6489.0,6467.0,8299.0,8299.0,8299.0,8299.0,8289.0,8289.0,8273.0,8273.0,8273.0,8273.0,8273.0,4906.0,4906.0,4906.0,4906.0,4906.0,8378.0,8176.0,8101.0,8082.0,8028.0,7666.0,7311.0,8138.0,8069.0,8003.0,7205.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4000.0,4000.0,4000.0,4000.0,4000.0,7463.0,7463.0,7433.0,1984.0,1955.0,1984.0,1984.0,1955.0,1974.0,7445.0,7463.0,7463.0,7463.0,7463.0,7463.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,5775.0,7463.0,7463.0,7463.0,7463.0,7463.0,4377.0,4377.0,4377.0,4377.0,4377.0,3974.0,3974.0,3974.0,1496.0,668.0,3974.0,3974.0,3974.0,3974.0,3974.0,3974.0,2016.0,2016.0,2016.0,2016.0,2016.0,2016.0,2959.0,2959.0,2959.0,2959.0,2959.0,2959.0,2959.0,2959.0,2959.0,2959.0,2959.0,2016.0,3974.0,3974.0,3974.0,3974.0,3974.0,2016.0,2016.0,2016.0,2016.0,2016.0
unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,259,,241,68.0,115.0,,,,269,409.0,261.0,,,,367,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Business,,UC Berkeley,1400.0,26908.0,,,,New York,0.0,55080.0,,,,Finance,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,521,,107,403.0,241.0,,,,522,355.0,124.0,,,,202,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,283.675937,8.960248,0.500597,17.327166,1.828837,11.350919,16.872046,9.042731,9.295775,8.927668,8.963595,283.863767,0.164717,0.19601,0.395799,26.364999,2.756653,22.495347,17.396867,20.270759,17.459714,10.685375,11.84593,0.419551,6.190411,7.175256,7.369301,6.400599,6.778409,5.47487,6.134498,5.208251,1.960215,26.358928,,7.662488,,,,2.757186,3.784793,3.651645,,,,2.122063,5.006762,2.158091,,5.277791,6.425232,4.575491,6.245813,7.783829,6.985781,6.714544,5.737077,3.881191,5.745993,7.678515,5.304133,6.776118,7.919629,6.825401,7.851066,5.631281,4.339197,5.534131,5.570556,22.514632,17.396389,20.265613,17.457043,10.682539,11.845111,26.39436,11.071506,12.636308,15.566805,9.780089,11.014845,30.362192,13.273691,14.416891,18.42262,11.744499,11.854817,7.084733,8.294935,7.70446,8.403965,7.578388,6.941908,7.927232,8.284346,7.426213,7.617611,0.419909,6.189995,7.175164,7.368597,6.400598,6.777524,5.474559,6.134087,5.207523,0.948769,3.207814,20.791624,15.434255,17.243708,15.260869,11.144619,12.457925,7.21125,8.082,8.25775,7.6925,7.58925,5.71151,1.843495,2.338087,32.819556,13.529923,15.293851,18.868448,7.286957,12.156028,26.217194,15.865084,17.813755,17.654765,9.913436,12.760263,26.806234,11.929177,12.10303,15.16381,9.342511,11.320866,29.344369,13.89823,13.958265,17.967233,11.909735,12.887976,7.125285,7.931529,8.238912,7.602171,7.486802,6.827964,7.394106,7.838702,7.279415,7.332191,0.780825,0.981631,0.37695,1.230615,0.934132,24.384524,16.588583,19.411346,16.233415,10.898075,12.699142,31.330357,15.654266,16.679563,16.418155,7.823909,12.207837,25.610341,10.751267,11.524839,14.276783,9.207503,11.253802,24.970936,10.923285,11.952687,14.959108,9.526191,11.96627,7.240312,8.093357,8.388777,7.658782,7.391545,6.81002,7.615079,7.93254,7.155258,7.048611
std,158.583367,5.491329,0.500029,10.940735,0.376673,5.995903,4.358458,5.514939,5.650199,5.477009,5.491068,158.584899,0.370947,0.303539,0.489051,3.563648,1.230689,12.569802,7.044003,6.782895,6.085526,6.126544,6.362746,0.493515,1.950305,1.740575,1.550501,1.954078,1.79408,2.156163,1.841258,2.129354,0.245925,3.566763,,3.758935,,,,1.230905,2.845708,2.805237,,,,1.407181,1.444531,1.105246,,3.30952,2.619024,2.801874,2.418858,1.754868,2.052232,2.263407,2.570207,2.620507,2.502218,2.006565,2.529135,2.235152,1.700927,2.156283,1.791827,2.608913,2.717612,1.734059,4.762569,12.587674,7.0467,6.783003,6.085239,6.124888,6.362154,16.297045,6.659233,6.717476,7.328256,6.998428,6.06015,16.249937,6.976775,6.263304,6.577929,6.886532,6.167314,1.395783,1.40746,1.564321,1.076608,1.778315,1.498653,1.627054,1.283657,1.779129,1.773094,0.493573,1.950169,1.740315,1.550453,1.953702,1.794055,2.156363,1.841285,2.129565,0.989889,2.444813,12.968524,6.915322,6.59642,5.356969,5.514028,5.921789,1.41545,1.455741,1.179317,1.626839,1.793136,1.820764,0.975662,0.63124,17.15527,7.977482,7.292868,8.535963,6.125187,8.241906,14.388694,6.658494,6.535894,6.129746,5.67555,6.651547,16.402836,6.401556,5.990607,7.290107,5.856329,6.296155,14.551171,6.17169,5.398621,6.100307,6.313281,5.615691,1.37139,1.503236,1.18028,1.5482,1.744634,1.411096,1.588145,1.280936,1.647478,1.521854,1.611694,1.382139,0.484683,1.294557,0.753902,13.71212,7.471537,6.124502,5.163777,5.900697,6.557041,17.55154,9.336288,7.880088,7.231325,6.100502,8.615985,17.477134,5.740351,6.004222,6.927869,6.385852,6.516178,17.007669,6.226283,7.01065,7.935509,6.403117,7.012067,1.576596,1.610309,1.459094,1.74467,1.961417,1.507341,1.504551,1.340868,1.672787,1.717988
min,1.0,1.0,0.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,-0.83,0.0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,,1.0,,,,1.0,0.0,1.0,,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0,2.0,2.0,1.0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,4.0,3.0,2.0,1.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,4.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,2.0,1.0,2.0,2.0,4.0,1.0,1.0
25%,154.0,4.0,0.0,8.0,2.0,7.0,14.0,4.0,4.0,4.0,4.0,154.0,0.0,-0.02,0.0,24.0,2.0,15.0,15.0,17.39,15.0,5.0,9.52,0.0,5.0,6.0,6.0,5.0,6.0,4.0,5.0,4.0,2.0,24.0,,5.0,,,,2.0,1.0,1.0,,,,1.0,4.0,1.0,,2.0,4.0,2.0,5.0,7.0,6.0,5.0,4.0,2.0,4.0,7.0,3.0,5.0,7.0,5.0,7.0,4.0,2.0,5.0,2.0,15.0,15.0,17.39,15.0,5.0,9.52,10.0,6.0,8.0,10.0,5.0,7.0,20.0,10.0,10.0,15.0,6.0,10.0,6.0,8.0,7.0,8.0,7.0,6.0,7.0,8.0,6.0,7.0,0.0,5.0,6.0,6.0,5.0,6.0,4.0,5.0,4.0,0.0,2.0,14.81,10.0,10.0,10.0,7.0,9.0,7.0,7.0,8.0,7.0,7.0,5.0,1.0,2.0,20.0,10.0,10.0,10.0,0.0,5.0,16.67,10.0,15.0,15.0,5.0,10.0,10.0,8.0,8.0,9.0,5.0,7.0,19.15,10.0,10.0,15.0,10.0,10.0,7.0,7.0,8.0,7.0,7.0,6.0,6.0,7.0,6.0,6.0,0.0,0.0,0.0,1.0,1.0,15.22,10.0,16.67,14.81,5.0,10.0,20.0,10.0,10.0,10.0,0.0,5.0,10.0,7.0,7.0,9.0,5.0,7.0,10.0,7.0,7.0,9.0,6.0,5.0,7.0,7.0,8.0,7.0,6.0,6.0,7.0,7.0,6.0,6.0
50%,281.0,8.0,1.0,16.0,2.0,11.0,18.0,8.0,9.0,8.0,8.0,281.0,0.0,0.21,0.0,26.0,2.0,20.0,18.37,20.0,18.0,10.0,10.64,0.0,6.0,7.0,7.0,7.0,7.0,6.0,6.0,5.0,2.0,26.0,,8.0,,,,2.0,3.0,3.0,,,,2.0,5.0,2.0,,6.0,7.0,4.0,6.0,8.0,7.0,7.0,6.0,3.0,6.0,8.0,6.0,7.0,8.0,7.0,8.0,6.0,4.0,6.0,4.0,20.0,18.18,20.0,18.0,10.0,10.64,25.0,10.0,10.0,15.0,10.0,10.0,25.0,15.0,15.0,20.0,10.0,10.0,7.0,8.0,8.0,8.0,8.0,7.0,8.0,8.0,8.0,8.0,0.0,6.0,7.0,7.0,7.0,7.0,6.0,6.0,5.0,0.0,3.0,17.65,15.79,18.42,15.91,10.0,12.5,7.0,8.0,8.0,8.0,8.0,6.0,1.0,2.0,30.0,10.0,15.0,20.0,5.0,10.0,20.0,16.67,19.05,18.37,10.0,13.0,25.0,10.0,10.0,15.0,10.0,10.0,25.0,15.0,15.0,18.52,10.0,13.95,7.0,8.0,8.0,8.0,8.0,7.0,8.0,8.0,7.0,7.0,0.0,1.0,0.0,1.0,1.0,20.0,16.67,20.0,16.33,10.0,14.29,25.0,15.0,18.0,17.0,10.0,10.0,20.0,10.0,10.0,12.0,9.0,10.0,20.0,10.0,10.0,15.0,10.0,10.0,7.0,8.0,8.0,8.0,8.0,7.0,8.0,8.0,7.0,7.0
75%,407.0,13.0,1.0,26.0,2.0,15.0,20.0,13.0,14.0,13.0,13.0,408.0,0.0,0.43,1.0,28.0,4.0,25.0,20.0,23.81,20.0,15.0,16.0,1.0,8.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,2.0,28.0,,10.0,,,,4.0,6.0,6.0,,,,2.0,6.0,3.0,,7.0,9.0,7.0,8.0,9.0,9.0,8.0,8.0,6.0,8.0,9.0,7.0,9.0,9.0,8.0,9.0,8.0,7.0,7.0,8.0,25.0,20.0,23.81,20.0,15.0,16.0,35.0,15.0,16.0,20.0,15.0,15.0,40.0,18.75,20.0,20.0,15.0,15.63,8.0,9.0,9.0,9.0,9.0,8.0,9.0,9.0,9.0,9.0,1.0,8.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,2.0,4.0,25.0,20.0,20.0,20.0,15.0,16.28,8.0,9.0,9.0,9.0,9.0,7.0,3.0,3.0,40.0,20.0,20.0,24.0,10.0,20.0,30.0,20.0,20.0,20.0,15.0,16.67,40.0,15.0,15.0,20.0,10.0,15.0,38.46,19.23,17.39,20.0,15.09,16.515,8.0,9.0,9.0,9.0,9.0,8.0,8.0,9.0,8.0,8.0,1.0,1.0,1.0,1.0,1.0,30.0,20.0,20.0,20.0,15.0,16.67,40.0,20.0,20.0,20.0,10.0,20.0,37.0,15.0,15.0,20.0,10.0,15.0,35.0,15.0,15.0,20.0,10.0,15.0,8.0,9.0,9.0,9.0,9.0,8.0,9.0,9.0,8.0,8.0





<a id='missing'></a>
##### 1 - Focus on missing values

In [5]:
information = dataset[['iid', 'id',	'gender', 'idg', 'condtn', 'wave', 'round', 'position', 'positin1', 'order', 'partner', 'pid', 'match', 'int_corr', 'samerace', 'age_o', 'race_o', 'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb', 'pf_o_sha', 'dec_o', 'attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o', 'like_o', 'prob_o', 'met_o']]
signup = dataset[['age', 'field', 'field_cd', 'undergra', 'mn_sat', 'tuition', 'race', 'imprace', 'imprelig', 'from', 'zipcode', 'income', 'goal', 'date', 'go_out', 'career', 'career_c', 'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga', 'exphappy', 'expnum', 'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1', 'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1', 'attr3_1', 'sinc3_1','fun3_1','intel3_1','amb3_1','attr5_1','sinc5_1','intel5_1','fun5_1','amb5_1']]
scorecard = dataset[['dec', 'attr', 'sinc', 'intel', 'fun', 'amb', 'shar', 'like', 'prob', 'met', 'match_es', 'attr1_s', 'sinc1_s', 'intel1_s', 'fun1_s', 'amb1_s', 'shar1_s','attr3_s', 'sinc3_s','intel3_s','fun3_s','amb3_s']]
followup = dataset[['satis_2', 'length', 'numdat_2', 'attr7_2', 'sinc7_2', 'intel7_2', 'fun7_2', 'amb7_2', 'shar7_2', 'attr1_2', 'sinc1_2', 'intel1_2', 'fun1_2', 'amb1_2', 'shar1_2', 'attr4_2', 'sinc4_2', 'intel4_2', 'fun4_2', 'amb4_2', 'shar4_2', 'attr2_2', 'sinc2_2', 'intel2_2', 'fun2_2', 'amb2_2', 'shar2_2', 'attr3_2', 'sinc3_2', 'intel3_2', 'fun3_2', 'amb3_2', 'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2']]
followup2 = dataset[['you_call', 'them_cal', 'date_3', 'numdat_3', 'num_in_3', 'attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3', 'attr7_3', 'sinc7_3', 'intel7_3', 'fun7_3', 'amb7_3', 'shar7_3', 'attr4_3', 'sinc4_3', 'intel4_3', 'fun4_3', 'amb4_3', 'shar4_3', 'attr2_3', 'sinc2_3', 'intel2_3', 'fun2_3', 'amb2_3', 'shar2_3', 'attr3_3',  'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3', 'attr5_3', 'sinc5_3', 'intel5_3', 'fun5_3', 'amb5_3']]


There are two types of data in this dataset. In order to have a better understanding of the missing data, I've separated them into 4 datasets. The data saved in the "information" dataset are certainly data filled in by the event organizers. The other 4 datasets correspond to forms filled in by participants at various times:
- signup": at the time of registration 
- scorecard": during the dates
- followup": the day after the event
- followup2": 3-4 weeks after the event

In [6]:
total_missing = information.isnull().sum().sum()
total_cells = np.product(information.shape)
percentage_missing = (total_missing / total_cells) * 100
print(f"The percentage of missing values for the information section is: {percentage_missing:.2f}%")

total_missing = signup.isnull().sum().sum()
total_cells = np.product(signup.shape)
percentage_missing = (total_missing / total_cells) * 100
print(f"The percentage of missing values in the signup form for is: {percentage_missing:.2f}%")

total_missing = scorecard.isnull().sum().sum()
total_cells = np.product(scorecard.shape)
percentage_missing = (total_missing / total_cells) * 100
print(f"The percentage of missing values in the scorecard form for is: {percentage_missing:.2f}%")

total_missing = followup.isnull().sum().sum()
total_cells = np.product(followup.shape)
percentage_missing = (total_missing / total_cells) * 100
print(f"The percentage of missing values in the first follow up form for is: {percentage_missing:.2f}%")

total_missing = followup2.isnull().sum().sum()
total_cells = np.product(followup2.shape)
percentage_missing = (total_missing / total_cells) * 100
print(f"The percentage of missing values in the second follow up form for is: {percentage_missing:.2f}%")

The percentage of missing values for the information section is: 2.43%
The percentage of missing values in the signup form for is: 10.82%
The percentage of missing values in the scorecard form for is: 28.53%
The percentage of missing values in the first follow up form for is: 33.08%
The percentage of missing values in the second follow up form for is: 64.94%


we can see that there's very little missing data when it comes to information entered by users. On the other hand, for data entered by participants, the number of missing data increases with time. This can be explained, on the one hand, by the fact that not all fields were mandatory, but also by a loss of interest over time.

<a id='engineering'></a>
### III - Data engineering

<a id='waves'></a>
##### 1 - Waves selection

Since waves 6 to 9 were given a different grading system concerning attributes, I have decided the remove these 4 waves from the dataset not to interfere with the analysis

In [7]:
dataset = dataset[(dataset['wave'] != 6) & (dataset['wave'] != 7) & (dataset['wave'] != 8) & (dataset['wave'] != 9)]

<a id='renaming'></a>
##### 2 - Renaming useful column values

In [8]:
dataset["gender"] = dataset['gender'].apply(lambda x: "Female" if x == 0
                                            else "Male" if x == 1
                                            else "Nan")

dataset["race"] = dataset['race'].apply(lambda x: "Black/African American" if x == 1
                                        else "European/Caucasian-American" if x == 2 
                                        else "Latino/Hispanic American" if x == 3
                                        else "Asian/Pacific Islander/Asian-American" if x == 4
                                        else "Native American" if x == 5
                                        else "Other" if x == 6
                                        else "Nan")

dataset["field_cd"] = dataset['field_cd'].apply(lambda x: "Law" if x == 1
                                        else "Math" if x == 2 
                                        else "Social Science, Psychologist" if x == 3
                                        else "Medical Science, Pharmaceuticals, and Bio Tech" if x == 4
                                        else "Engineering" if x == 5
                                        else "English/Creative Writing/ Journalism" if x == 6
                                        else "History/Religion/Philosophy" if x == 7
                                        else "Business/Econ/Finance" if x == 8
                                        else "Education, Academia" if x == 9
                                        else "Nan")

dataset['goal'] = dataset['goal'].apply(lambda x: "Seemed like a fun night out" if x==1 
                               else "To meet new people" if x == 2
                               else "To get a date" if x==3 
                               else "Looking for a serious relationship" if x == 4
                               else "To say I did it" if x == 5
                               else "Other" if x == 6
                               else "Nan")

dataset['date'] = dataset['date'].apply(lambda x: "Several times a week" if x==1 
                               else "Twice a week" if x == 2
                               else "Once a week" if x==3 
                               else "Twice a month" if x == 4
                               else "Once a month" if x == 5
                               else "Several times a year" if x == 6
                                else "Almost never" if x == 7
                               else "Nan")

dataset['go_out'] = dataset['go_out'].apply(lambda x: "Several times a week" if x==1 
                               else "Twice a week" if x == 2
                               else "Once a week" if x==3 
                               else "Twice a month" if x == 4
                               else "Once a month" if x == 5
                               else "Several times a year" if x == 6
                                else "Almost never" if x == 7
                               else "Nan")

dataset['samerace'] = dataset['samerace'].apply(lambda x: "Yes" if x==1 
                               else "No" if x ==0
                               else "Nan")

dataset['date_3'] = dataset['date_3'].apply(lambda x: "Yes" if x==1 
                               else "No" if x==0
                               else "Nan")

<a id='group'></a>
### IV - Descriptive analysis of the group

In [25]:
# deduplicate data for analysis
deduplicated_dataset = dataset[['iid', 'gender', 'race', 'imprace', 'age', 'field_cd','goal', 'date', 'go_out',
                                'attr1_1', 'sinc1_1','intel1_1','fun1_1','amb1_1','shar1_1', 
                                'attr7_2', 'sinc7_2','intel7_2','fun7_2','amb7_2','shar7_2',
                                'you_call', 'them_cal', 'date_3']]

deduplicated_dataset.drop_duplicates(subset ="iid", keep = 'first', inplace=True)
deduplicated_dataset.shape



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(449, 24)

In [10]:
# Number of people 
total = len(deduplicated_dataset)
print(f"{total} people participated in the experiment")

449 people participated in the experiment


In [11]:
# Males / females distribution
count_gender = deduplicated_dataset['gender'].value_counts()

male_nb = count_gender['Male']
female_nb = count_gender['Female']

print(f"{male_nb} men and {female_nb} women participated in the experiement")

226 men and 223 women participated in the experiement


In [12]:
# Distribution by gender
fig_gender = px.pie(deduplicated_dataset, names='gender', title= 'Repartition by gender', width = 1000, height=400)
fig_gender.update_layout(title_x=0.5, legend=dict(x= 0.7))
fig_gender.show(); 

# Repartition by race
fig_race = px.pie(deduplicated_dataset, names='race',title='Repartition by race', width = 1000, height=400)
fig_race.update_layout(title_x=0.5, legend=dict(x= 0.8))
fig_race.show()

# Repartition by field
fig_field = px.pie(deduplicated_dataset, names='field_cd',title='Repartition by field', width = 1000, height=400)
fig_field.update_layout(title_x=0.5, legend=dict(x= 0.85))
fig_field.show()

# Age distribution
fig_age = px.histogram(deduplicated_dataset, x='age', title='Age distribution', width = 1000, height=400)
fig_age.update_layout(title_x=0.5)
fig_age.show()

# Age distribution by gender
fig_age_gender = px.box(deduplicated_dataset, x = 'gender', y = 'age', title='Age distribution by gender', width = 1000, height=500)
fig_age_gender.update_layout(title_x=0.5)
fig_age_gender.show()

An almost equal number of men and women took part in the event, mainly from Caucasians. 
On average, the men were slightly older.

In [13]:
# Dating and going out  habits of the participants
goal_data = deduplicated_dataset['goal'].value_counts()
date_data = deduplicated_dataset['date'].value_counts()
go_out_data = deduplicated_dataset['go_out'].value_counts()

# Reasons to participate in this event
fig = go.Figure()
fig.add_trace(go.Bar(x=goal_data.index, y=goal_data.values))
fig.update_layout(title="What is your primary goal in participating in this event?", title_x =0.5, width = 1000, height=400)
fig.show()

# Frequence of dates
fig = go.Figure()
fig.add_trace(go.Bar(x=date_data.index, y=date_data.values))
fig.update_layout(title="In general, how frequently do you go on dates?", title_x =0.5, width = 1000, height=400)
fig.show()


# Frequence of going out
fig = go.Figure()
fig.add_trace(go.Bar(x=go_out_data.index, y=go_out_data.values))
fig.update_layout(title="How often do you go out (not necessarily on dates)?", title_x =0.5, width = 1000, height=400)
fig.show()

The main reason people signed up for the event isn't really to find love, but rather to spend an evening out and meet new people. Overall, they don't regularly go on dates but tend to go out often.

<a id='dates'></a>
### V - Dates analysis

In [14]:
# Number of dates 
dates = int(len(dataset)/2)
print(f"There were {dates} dates during the experiment")

There were 3408 dates during the experiment


In [15]:
# Match percentage
fig_match = px.pie(dataset, names='match', title= 'Match percentage', width = 1000, height=400)
fig_match.update_layout(title_x=0.5, legend=dict(x= 0.7))
fig_match.show(); 

16.5% of dates ended in a match

In [16]:
# Decision analysis
male_data = dataset[dataset['gender'] == 'Male']
female_data = dataset[dataset['gender'] == 'Female']

male_counts = male_data['dec'].value_counts()
female_counts = female_data['dec'].value_counts()

fig_male = go.Figure(data=[go.Pie(labels=male_counts.index, values=male_counts.values)])
fig_male.update_layout(title_text="Decision after the date - Males", title_x=0.5, legend=dict(x= 0.7), width = 1000, height=300)

fig_female = go.Figure(data=[go.Pie(labels=female_counts.index, values=female_counts.values)])
fig_female.update_layout(title_text="Decision after the date - Females", title_x=0.5, legend=dict(x= 0.7), width = 1000, height=300)

fig_male.show()
fig_female.show()

We can see that women are more picky and less likely to match than men

<a id='attributes'></a>
##### 1 - Focus on attributes

In [28]:
dataset_male = deduplicated_dataset[deduplicated_dataset['gender'] == "Male"]
dataset_female = deduplicated_dataset[deduplicated_dataset['gender'] == "Female"]

gender_attributes = deduplicated_dataset[['gender','attr1_1', 'sinc1_1','intel1_1','fun1_1','amb1_1','shar1_1']].groupby('gender').mean()

male_attributes_before = dataset_male[['attr1_1', 'sinc1_1','intel1_1','fun1_1','amb1_1','shar1_1']].mean()
male_attributes_after = dataset_male[['attr7_2', 'sinc7_2','intel7_2','fun7_2','amb7_2','shar7_2']].mean()
female_attributes_before = dataset_female[['attr1_1', 'sinc1_1','intel1_1','fun1_1','amb1_1','shar1_1']].mean()
female_attributes_after = dataset_female[['attr7_2', 'sinc7_2','intel7_2','fun7_2','amb7_2','shar7_2']].mean()

attributes = ['Attractive','Sincere','Intelligent','Fun','Ambitious','Shared Interests']
gender_attributes.columns = attributes

# Male
trace1 = go.Bar(
    y=list(male_attributes_before),
    x=gender_attributes.columns.values,
    name='Before',
    )
    
trace2 = go.Bar(
    y=list(male_attributes_after),
    x=gender_attributes.columns,
    name='After'
)

data1 = [trace1, trace2]
layout = go.Layout(
    title="Before vs After the dates - Men",
    title_x = 0.5, 
    width = 1000, 
    height=400
)
fig = go.Figure(data=data1, layout=layout)
py.iplot(fig, filename='barchart')

# Female
trace1 = go.Bar(
    y=list(female_attributes_before),
    x=gender_attributes.columns.values,
    name='Before',
    )
    
trace2 = go.Bar(
    y=list(female_attributes_after),
    x=gender_attributes.columns,
    name='After'
)

data1 = [trace1, trace2]
layout = go.Layout(
    title="Before vs After the dates - Women",
    title_x = 0.5, 
    width = 1000, 
    height=400
)
fig = go.Figure(data=data1, layout=layout)
py.iplot(fig, filename='barchart')

Here, I wanted to analyze the importance of the various attributes for the participants by comparing their answers at sign-up with those they gave 3-4 weeks after the dates.
The most obvious difference is the one given by the women on attractiveness, which was much more important than what they declared at sign-up.

<a id='races'></a>
##### 2 - Focus on race

In [20]:
# Importance of race
race_imp = deduplicated_dataset['imprace'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=race_imp.index, y=race_imp.values))
fig.update_layout(title="How important is it to you (on a scale of 1-10) that a person you date be of the same racial/ethnic background?", title_x =0.5, width = 1000, height=400)
fig.show()

In [21]:
# Decision analysis
samerace_data = dataset[dataset['samerace'] == 'Yes']
differentsamerace_data = dataset[dataset['samerace'] == 'No']

samerace_data = samerace_data['match'].value_counts()
differentrace_data = differentsamerace_data['match'].value_counts()

fig_same_race = go.Figure(data=[go.Pie(labels=samerace_data.index, values=samerace_data.values)])
fig_same_race.update_layout(title_text="Same race couples match", title_x=0.5, legend=dict(x= 0.7), width = 1000, height=300)

fig_differentrace = go.Figure(data=[go.Pie(labels=differentrace_data.index, values=differentrace_data.values)])
fig_differentrace.update_layout(title_text="Different race couples match", title_x=0.5, legend=dict(x= 0.7), width = 1000, height=300)

fig_same_race.show()
fig_differentrace.show()

a majority of participants said that sharing the same race as their partner wasn't very important to them, although we can see that there is a slightly higher proportion of matches for same-race couples.

<a id='order'></a>
##### 3 - Focus on date order

In [22]:
dec_mean_by_order = dataset.groupby('order')['dec'].mean().reset_index()

fig = px.line(dec_mean_by_order, x='order', y='dec')
fig.update_xaxes(title='Date position')
fig.update_yaxes(title='Decision mean')
fig.update_layout(title="Date order importance on decision", title_x =0.5, width = 1000, height=400)
fig.show()

Meeting your partner in first position is the best way to get a match

<a id='after'></a>
##### 4 - Happily ever after ?

In [23]:
deduplicated_dataset["Called"] = (deduplicated_dataset["you_call"] > 0) & (deduplicated_dataset["them_cal"] > 0)

deduplicated_dataset["Called"] = deduplicated_dataset["Called"].apply(lambda x: "Yes" if x==True 
                                                                                else "No" if x==False
                                                                                else "Nan")

fig_match = px.pie(deduplicated_dataset, names='Called', title= 'Percentage of participants who gave or received a phone call after the event', width = 1000, height=300)
fig_match.update_layout(title_x=0.5, legend=dict(x= 0.6))
fig_match.show(); 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
fig_match = px.pie(deduplicated_dataset, names='date_3', title= 'Percentage of participants who went on a second date after the event', width = 1000, height=300)
fig_match.update_layout(title_x=0.5, legend=dict(x= 0.6))
fig_match.show();

I waas curious to know what happened after the event, 10% of participants got on the phone with at least one of their matches and 28,3% got on a second date. 