In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/speed_dating_data.csv', encoding = "ISO-8859-1")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


# Exploration

## Verify: match iff dec & dec_o

In [4]:
test_df = pd.concat([
    df.match,
    df.dec,
    df.dec_o
], axis=1)

test_df['expected'] = test_df.apply(lambda row: row.dec&row.dec_o == row.match, axis=1)
assert test_df.expected.sum()==len(test_df.index)

## Use self evaluation or perception by others as proxy?

In [5]:
test_df = pd.concat([
    df.iid, df.pid,
    df.loc[:, 'attr3_1':'amb3_1'],  # How do you think you measure up?
    df.loc[:, 'attr5_1':'amb5_1'],  # How do others perceive you?
], axis=1)

test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 12 columns):
iid         8378 non-null int64
pid         8368 non-null float64
attr3_1     8273 non-null float64
sinc3_1     8273 non-null float64
fun3_1      8273 non-null float64
intel3_1    8273 non-null float64
amb3_1      8273 non-null float64
attr5_1     4906 non-null float64
sinc5_1     4906 non-null float64
intel5_1    4906 non-null float64
fun5_1      4906 non-null float64
amb5_1      4906 non-null float64
dtypes: float64(11), int64(1)
memory usage: 785.6 KB


Too many missing values for "How do others perceive you" results.

# Data Cleaning / Feature Engineering

In [6]:
X = pd.concat([
    df.iid, df.partner, df.pid, # id of self and partner, to be removed later
    df.gender,
    df.age, df.age_o,
    df.int_corr, # correlation of interests
    df.samerace,
    df.goal,
    df.date,
    df.exphappy, # expected happiness with people you will meet
    df.loc[:, 'attr3_1':'amb3_1'],  # self evaluation
    df.loc[:, 'attr':'shar'],  # evaluation of partner
    df.loc[:, 'attr1_1':'shar1_1'],  # what's important to you, sum to 100 
    df.loc[:, 'pf_o_att':'pf_o_sha'], # what's important to partner, sum to 100
], axis=1)

y = pd.concat([
    df.match,  # Label for two-way prediction, whether two people will be a good match
    df.dec_o  # Label for one way prediction, whether your partner will say "yes"
], axis=1)

In [7]:
assert y.match.count() == len(X.index)
assert y.dec_o.count() == len(X.index)

In [8]:
print("Baseline accuracy for match: %f" % (1 - df.match.sum()/df.match.count()))
print("Baseline accuracy for decision: %f" % (1 - df.dec_o.sum()/df.dec_o.count()))

Baseline accuracy for match: 0.835283
Baseline accuracy for decision: 0.580449


### pid (partner's unique ID)

In [9]:
X[X.pid.isnull()]  # The missing pid comes from same person in one night

Unnamed: 0,iid,partner,pid,gender,age,age_o,int_corr,samerace,goal,date,...,intel1_1,fun1_1,amb1_1,shar1_1,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha
1755,122,7,,1,22.0,,-0.12,0,1.0,3.0,...,10.0,20.0,5.0,10.0,,,,,,
1765,123,7,,1,18.0,,-0.29,0,2.0,5.0,...,20.0,5.0,5.0,10.0,,,,,,
1775,124,7,,1,22.0,,-0.05,0,2.0,4.0,...,10.0,10.0,10.0,20.0,,,,,,
1785,125,7,,1,21.0,,0.15,0,1.0,3.0,...,20.0,15.0,12.0,18.0,,,,,,
1795,126,7,,1,22.0,,0.01,0,4.0,6.0,...,15.0,10.0,5.0,5.0,,,,,,
1805,127,7,,1,22.0,,0.38,0,6.0,5.0,...,25.0,15.0,5.0,5.0,,,,,,
1815,128,7,,1,21.0,,-0.05,0,1.0,3.0,...,10.0,20.0,10.0,5.0,,,,,,
1825,129,7,,1,,,0.09,0,3.0,3.0,...,20.0,20.0,,,,,,,,
1835,130,7,,1,20.0,,-0.4,0,1.0,6.0,...,10.0,,,,,,,,,
1845,131,7,,1,19.0,,-0.14,0,2.0,7.0,...,20.0,20.0,10.0,15.0,,,,,,


In [10]:
# Assign a new pid to the person missing
X.pid.fillna(X.pid.max() + 1, inplace=True)
assert X.pid.isna().sum() == 0

### importance ratings: fill with 100/6 

In [11]:
# X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum()

In [12]:
# X.loc[:, 'attr1_1':'pf_o_sha'].fillna(100/6, inplace=True) # doesn't work

# X.attr1_1.fillna(100/6, inplace=True)
# X.sinc1_1.fillna(100/6, inplace=True)
# X.intel1_1.fillna(100/6, inplace=True)
# X.fun1_1.fillna(100/6, inplace=True)
# X.amb1_1.fillna(100/6, inplace=True)
# X.shar1_1.fillna(100/6, inplace=True)
# X.pf_o_att.fillna(100/6, inplace=True)
# X.pf_o_sin.fillna(100/6, inplace=True)
# X.pf_o_int.fillna(100/6, inplace=True)
# X.pf_o_fun.fillna(100/6, inplace=True)
# X.pf_o_amb.fillna(100/6, inplace=True)
# X.pf_o_sha.fillna(100/6, inplace=True)

In [13]:
# assert X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum().sum() == 0

### goal: convert to indicator variables

In [14]:
one_hot = pd.get_dummies(X.goal, 
                         prefix='goal', 
                         dummy_na=True, 
                         drop_first=True)

X = X.drop('goal', axis=1)
X = X.join(one_hot)

### Train-Test Split

In [15]:
X.drop(columns=['iid', 'partner', 'pid'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    stratify=y)

### age: fill with median among gender

In [16]:
X_train.age.isnull().sum()

76

In [17]:
# fill missing age values with median age among gender
X_train['age'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))
X_train['age_o'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))

X_test['age'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))
X_test['age_o'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [18]:
assert X_train.age.isna().sum() == 0
assert X_train.age_o.isna().sum() == 0

assert X_test.age.isna().sum() == 0
assert X_test.age_o.isna().sum() == 0

### int_corr, date, exphappy, self evaluation: fill with median

In [19]:
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [20]:
assert X_train.isna().sum().sum() == 0
assert X_test.isna().sum().sum() == 0

In [21]:
X_train.to_csv('data/X_train.csv', index=False)

In [22]:
X_test.to_csv('data/X_test.csv', index=False)

In [23]:
y_train.to_csv('data/y_train.csv', index=False)#, header=True)

In [24]:
y_test.to_csv('data/y_test.csv', index=False)#, header=True)