In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
df = pd.read_csv('data/speed_dating_data.csv', encoding = "ISO-8859-1")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [4]:
df1 = df.dropna()

In [5]:
df1

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3


Drop all male records to eliminate duplicates in interaction records

In [6]:
df = df[df.gender == 0]

# Exploration

## Verify: match iff dec & dec_o

In [7]:
# test_df = pd.concat([
#     df.match,
#     df.dec,
#     df.dec_o
# ], axis=1)

# test_df['expected'] = test_df.apply(lambda row: row.dec&row.dec_o == row.match, axis=1)
# assert test_df.expected.sum()==len(test_df.index)

## Use self evaluation or perception by others as proxy?

In [8]:
# test_df = pd.concat([
#     df.iid, df.pid,
#     df.loc[:, 'attr3_1':'amb3_1'],  # How do you think you measure up?
#     df.loc[:, 'attr5_1':'amb5_1'],  # How do others perceive you?
# ], axis=1)

# test_df.info()

Too many missing values for "How do others perceive you" results.

# Data Cleaning / Feature Engineering

In [9]:
X = pd.concat([
    df.iid, df.partner, df.pid, # id of self and partner, to be removed later
    df.gender,
    df.age, df.age_o,
    df.int_corr, # correlation of interests
    df.samerace,
    df.goal,
    df.date,
    df.exphappy, # expected happiness with people you will meet
    df.loc[:, 'attr3_1':'amb3_1'],  # self evaluation
    df.loc[:, 'attr':'shar'],  # evaluation of partner
    df.loc[:, 'attr1_1':'shar1_1'],  # what's important to you, sum to 100 
    df.loc[:, 'pf_o_att':'pf_o_sha'], # what's important to partner, sum to 100
], axis=1)

y = pd.concat([
    df.match,  # Label for two-way prediction, whether two people will be a good match
    df.dec_o  # Label for one way prediction, whether your partner will say "yes"
], axis=1)

In [10]:
X1 = X.dropna()
X1

Unnamed: 0,iid,partner,pid,gender,age,age_o,int_corr,samerace,goal,date,...,intel1_1,fun1_1,amb1_1,shar1_1,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha
0,1,1,11.0,0,21.0,27.0,0.14,0,2.0,7.0,...,20.0,15.0,15.0,15.0,35.0,20.0,20.0,20.0,0.0,5.0
1,1,2,12.0,0,21.0,22.0,0.54,0,2.0,7.0,...,20.0,15.0,15.0,15.0,60.0,0.0,0.0,40.0,0.0,0.0
2,1,3,13.0,0,21.0,22.0,0.16,1,2.0,7.0,...,20.0,15.0,15.0,15.0,19.0,18.0,19.0,18.0,14.0,12.0
3,1,4,14.0,0,21.0,23.0,0.61,0,2.0,7.0,...,20.0,15.0,15.0,15.0,30.0,5.0,15.0,40.0,5.0,5.0
4,1,5,15.0,0,21.0,24.0,0.21,0,2.0,7.0,...,20.0,15.0,15.0,15.0,30.0,10.0,20.0,10.0,10.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7889,530,18,548.0,0,22.0,30.0,0.23,0,2.0,7.0,...,10.0,15.0,5.0,30.0,40.0,10.0,20.0,10.0,0.0,20.0
7890,530,19,549.0,0,22.0,28.0,0.29,0,2.0,7.0,...,10.0,15.0,5.0,30.0,20.0,20.0,20.0,20.0,0.0,20.0
7891,530,20,550.0,0,22.0,30.0,-0.20,0,2.0,7.0,...,10.0,15.0,5.0,30.0,30.0,3.0,30.0,30.0,3.0,4.0
7892,530,21,551.0,0,22.0,27.0,-0.32,0,2.0,7.0,...,10.0,15.0,5.0,30.0,40.0,20.0,20.0,20.0,0.0,0.0


In [11]:
assert y.match.count() == len(X.index)
assert y.dec_o.count() == len(X.index)

In [12]:
print("Baseline accuracy for match: %f" % (1 - df.match.sum()/df.match.count()))
print("Baseline accuracy for decision: %f" % (1 - df.dec_o.sum()/df.dec_o.count()))

Baseline accuracy for match: 0.835086
Baseline accuracy for decision: 0.525335


In [13]:
df['1_1total'] = df.loc[:, 'attr1_1':'shar1_1'].sum(axis=1)

### pid (partner's unique ID)

In [14]:
# X[X.pid.isnull()]  # The missing pid comes from same person in one night

In [15]:
# # Assign a new pid to the person missing
# X.pid.fillna(X.pid.max() + 1, inplace=True)
# assert X.pid.isna().sum() == 0

### importance ratings: fill with 100/6 

In [16]:
# X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum()

In [17]:
# X.loc[:, 'attr1_1':'pf_o_sha'].fillna(100/6, inplace=True) # doesn't work

# X.attr1_1.fillna(100/6, inplace=True)
# X.sinc1_1.fillna(100/6, inplace=True)
# X.intel1_1.fillna(100/6, inplace=True)
# X.fun1_1.fillna(100/6, inplace=True)
# X.amb1_1.fillna(100/6, inplace=True)
# X.shar1_1.fillna(100/6, inplace=True)
# X.pf_o_att.fillna(100/6, inplace=True)
# X.pf_o_sin.fillna(100/6, inplace=True)
# X.pf_o_int.fillna(100/6, inplace=True)
# X.pf_o_fun.fillna(100/6, inplace=True)
# X.pf_o_amb.fillna(100/6, inplace=True)
# X.pf_o_sha.fillna(100/6, inplace=True)

In [18]:
# assert X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum().sum() == 0

In [19]:
# X.date.isna().sum()

### goal: convert to indicator variables

In [20]:
one_hot = pd.get_dummies(X.goal, 
                         prefix='goal', 
                         dummy_na=True, 
                         drop_first=False)

X = X.drop('goal', axis=1)
X = X.join(one_hot)

### Train-Test Split

In [21]:
X.drop(columns=['iid', 'partner', 'pid'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    stratify=y)

### age: fill with median among gender

In [22]:
# X_train.age.isnull().sum()

In [23]:
# # fill missing age values with median age among gender
# X_train['age'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))
# X_train['age_o'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))

# X_test['age'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))
# X_test['age_o'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))

In [24]:
# X_train['age'].mean()

In [25]:
# X_train['age'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.mean()))
# X_train['age_o'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.mean()))

# X_test['age'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.mean()))
# X_test['age_o'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.mean()))

In [26]:
# X_train['age'].mean()

In [27]:
# assert X_train.age.isna().sum() == 0
# assert X_train.age_o.isna().sum() == 0

# assert X_test.age.isna().sum() == 0
# assert X_test.age_o.isna().sum() == 0

### int_corr, date, exphappy, self evaluation: fill with median

In [28]:
# for c in X_train:
#     X_train[c] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))
#     X_test[c] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))

In [29]:
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [30]:
# # imp = SimpleImputer(missing_values=np.nan, strategy='mean') # .40 best
# # imp = IterativeImputer(max_iter=10) # .50 best
# imp = SimpleImputer(missing_values=np.nan, strategy='median') #.44
# X_train_i = pd.DataFrame(imp.fit_transform(X_train))
# X_train_i.columns = X_train.columns
# X_train_i.index = X_train.index

In [31]:
# # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# # imp = IterativeImputer(max_iter=10)
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# X_test_i = pd.DataFrame(imp.fit_transform(X_test))
# X_test_i.columns = X_test.columns
# X_test_i.index = X_test.index

In [32]:
X_train

Unnamed: 0,gender,age,age_o,int_corr,samerace,date,exphappy,attr3_1,sinc3_1,fun3_1,...,pf_o_fun,pf_o_amb,pf_o_sha,goal_1.0,goal_2.0,goal_3.0,goal_4.0,goal_5.0,goal_6.0,goal_nan
3875,0,28.0,27.0,0.61,1,7.0,4.0,8.0,8.0,7.0,...,20.00,10.00,0.00,0,1,0,0,0,0,0
7451,0,26.0,30.0,0.43,0,7.0,5.0,7.0,9.0,6.0,...,30.00,3.00,4.00,0,1,0,0,0,0,0
1953,0,24.0,23.0,-0.20,1,7.0,5.0,5.0,5.0,7.0,...,16.28,16.28,16.28,0,1,0,0,0,0,0
3457,0,34.0,25.0,0.02,0,6.0,7.0,7.0,9.0,7.0,...,15.00,0.00,25.00,0,1,0,0,0,0,0
5794,0,24.0,23.0,0.36,1,4.0,2.0,7.0,7.0,10.0,...,20.00,10.00,15.00,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009,0,28.0,28.0,0.65,1,5.0,4.0,8.0,8.0,8.0,...,20.45,11.36,11.36,1,0,0,0,0,0,0
2651,0,25.0,26.0,0.52,0,1.0,6.0,9.0,9.0,9.0,...,12.77,6.38,21.28,1,0,0,0,0,0,0
5824,0,27.0,31.0,0.26,1,6.0,4.0,7.0,9.0,7.0,...,20.00,0.00,20.00,0,0,1,0,0,0,0
56,0,23.0,30.0,0.47,0,3.0,4.0,5.0,7.0,8.0,...,10.00,5.00,10.00,1,0,0,0,0,0,0


In [33]:
assert X_train.isna().sum().sum() == 0
assert X_test.isna().sum().sum() == 0

In [34]:
X_train.to_csv('data/X_train.csv', index=False)

In [35]:
X_test.to_csv('data/X_test.csv', index=False)

In [36]:
y_train.to_csv('data/y_train.csv', index=False)#, header=True)

In [37]:
y_test.to_csv('data/y_test.csv', index=False)#, header=True)