In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
# import matplotlib.pyplot as plt
# import statsmodels.api as sm
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
# from sklearn.linear_model import LogisticRegression

In [38]:
df = pd.read_csv('speed_dating_data.csv', encoding = "ISO-8859-1")

# Exploration

## Verify: match iff dec & dec_o

In [39]:
test_df = pd.concat([
    df.match,
    df.dec,
    df.dec_o
], axis=1)

test_df['expected'] = test_df.apply(lambda row: row.dec&row.dec_o == row.match, axis=1)
assert test_df.expected.sum()==len(test_df.index)

## Use self evaluation or perception by others as proxy?

In [73]:
test_df = pd.concat([
    df.iid, df.pid,
    df.loc[:, 'attr3_1':'amb3_1'],  # How do you think you measure up?
    df.loc[:, 'attr5_1':'amb5_1'],  # How do others perceive you?
], axis=1)

test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 17 columns):
iid         8378 non-null int64
pid         8368 non-null float64
attr3_1     8273 non-null float64
sinc3_1     8273 non-null float64
fun3_1      8273 non-null float64
intel3_1    8273 non-null float64
amb3_1      8273 non-null float64
attr5_1     4906 non-null float64
sinc5_1     4906 non-null float64
intel5_1    4906 non-null float64
fun5_1      4906 non-null float64
amb5_1      4906 non-null float64
attr_o      8166 non-null float64
sinc_o      8091 non-null float64
intel_o     8072 non-null float64
fun_o       8018 non-null float64
amb_o       7656 non-null float64
dtypes: float64(16), int64(1)
memory usage: 1.1 MB


Too many missing values for "How do others perceive you" results.

# Cleanup

In [92]:
X = pd.concat([
    df.iid, df.partner, df.pid, # id of self and partner, to be removed later
    df.gender,
    df.age, df.age_o,
    df.int_corr, # correlation of interests
    df.samerace,
    df.goal,
    df.date,
    df.exphappy, # expected happiness with people you will meet
    df.loc[:, 'attr3_1':'amb3_1'],  # self evaluation
    df.loc[:, 'attr':'shar'],  # evaluation of partner
    df.loc[:, 'attr1_1':'shar1_1'],  # what's important to you, sum to 100 
    df.loc[:, 'pf_o_att':'pf_o_sha'], # what's important to partner, sum to 100
], axis=1)

y = df.match

## Fill Missing Values

### pid (partner's unique ID)

In [77]:
X[X.pid.isnull()]  # The missing pid comes from same person in one night

Unnamed: 0,iid,partner,pid,gender,age,age_o,int_corr,samerace,goal,date,...,intel1_1,fun1_1,amb1_1,shar1_1,pf_o_att,pf_o_sin,pf_o_int,pf_o_fun,pf_o_amb,pf_o_sha
1755,122,7,,1,22.0,,-0.12,0,1.0,3.0,...,10.0,20.0,5.0,10.0,,,,,,
1765,123,7,,1,18.0,,-0.29,0,2.0,5.0,...,20.0,5.0,5.0,10.0,,,,,,
1775,124,7,,1,22.0,,-0.05,0,2.0,4.0,...,10.0,10.0,10.0,20.0,,,,,,
1785,125,7,,1,21.0,,0.15,0,1.0,3.0,...,20.0,15.0,12.0,18.0,,,,,,
1795,126,7,,1,22.0,,0.01,0,4.0,6.0,...,15.0,10.0,5.0,5.0,,,,,,
1805,127,7,,1,22.0,,0.38,0,6.0,5.0,...,25.0,15.0,5.0,5.0,,,,,,
1815,128,7,,1,21.0,,-0.05,0,1.0,3.0,...,10.0,20.0,10.0,5.0,,,,,,
1825,129,7,,1,,,0.09,0,3.0,3.0,...,20.0,20.0,,,,,,,,
1835,130,7,,1,20.0,,-0.4,0,1.0,6.0,...,10.0,,,,,,,,,
1845,131,7,,1,19.0,,-0.14,0,2.0,7.0,...,20.0,20.0,10.0,15.0,,,,,,


In [78]:
# Assign a new pid to the person missing
X.pid.fillna(X.pid.max() + 1, inplace=True)
assert X.pid.isna().sum() == 0

### importance ratings: fill with 100/6 

In [93]:
X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum()

attr1_1      79
sinc1_1      79
intel1_1     79
fun1_1       89
amb1_1       99
shar1_1     121
pf_o_att     89
pf_o_sin     89
pf_o_int     89
pf_o_fun     98
pf_o_amb    107
pf_o_sha    129
dtype: int64

In [100]:
# X.loc[:, 'attr1_1':'pf_o_sha'].fillna(100/6, inplace=True) # doesn't work

X.attr1_1.fillna(100/6, inplace=True)
X.sinc1_1.fillna(100/6, inplace=True)
X.intel1_1.fillna(100/6, inplace=True)
X.fun1_1.fillna(100/6, inplace=True)
X.amb1_1.fillna(100/6, inplace=True)
X.shar1_1.fillna(100/6, inplace=True)
X.pf_o_att.fillna(100/6, inplace=True)
X.pf_o_sin.fillna(100/6, inplace=True)
X.pf_o_int.fillna(100/6, inplace=True)
X.pf_o_fun.fillna(100/6, inplace=True)
X.pf_o_amb.fillna(100/6, inplace=True)
X.pf_o_sha.fillna(100/6, inplace=True)

In [101]:
X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum()

attr1_1     0
sinc1_1     0
intel1_1    0
fun1_1      0
amb1_1      0
shar1_1     0
pf_o_att    0
pf_o_sin    0
pf_o_int    0
pf_o_fun    0
pf_o_amb    0
pf_o_sha    0
dtype: int64

### goal: fill with 6 (Other)

### Train-Test Split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.1,
                                                    stratify=y)

### age: fill with median among gender

In [51]:
X_train.age.isnull().sum()

84

In [52]:
# fill missing age values with median age among gender
X_train['age'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))
X_train['age_o'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))

X_test['age'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))
X_test['age_o'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [53]:
assert X_train.age.isna().sum() == 0
assert X_train.age_o.isna().sum() == 0

assert X_test.age.isna().sum() == 0
assert X_test.age_o.isna().sum() == 0

### int_corr

In [54]:
X_train.int_corr.isna().sum()

139

In [60]:
X_train.int_corr.fillna(X_train.int_corr.median, inplace=True)
X_test.int_corr.fillna(X_test.int_corr.median, inplace=True)

In [61]:
assert X_train.int_corr.isna().sum() == 0
assert X_test.int_corr.isna().sum() == 0

In [69]:
# X_train.date.isna().sum()
X_train.date.value_counts()
# X_train.date.median()

6.0    1897
4.0    1849
5.0    1377
7.0    1281
3.0     700
2.0     270
1.0      83
Name: date, dtype: int64

In [102]:
X_train.goal.isna().sum()

67

In [None]:
# median: date, exphappy, self_eval, int_corr
# one hot: goal

# Baseline Logistic Regression
No feature engineering;<br>
Treat ordinal variables as continuous, drop all other categorical variables<br>
Remove attributes that might be linearly dependent on others<br>

In [17]:
df2 = pd.concat([
    df.age,
    df.int_corr, # correlation of interests
    df.samerace,
    df.goal,
    df.date,
    df.exphappy, # expected happiness with people you will meet
    df.loc[:, 'attr3_1':'amb3_1'],  # self eval, assuming to be accurate
    df.dec_o # decision of partner
], axis=1)

In [18]:
df2.isna().sum()

age          95
int_corr    158
samerace      0
goal         79
date         97
exphappy    101
attr3_1     105
sinc3_1     105
fun3_1      105
intel3_1    105
amb3_1      105
dec_o         0
dtype: int64

In [13]:
X = df2.iloc[:,:-1]
y = df2.dec_o

In [14]:
lr = LogisticRegression()
lr.fit(X,y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [7]:
df3 = pd.concat([df.id, df.age, df.age_o], axis=1)
df3['age_diff'] = df3.age -df3.age_o
df3

Unnamed: 0,id,age,age_o,age_diff
0,1.0,21.0,27.0,-6.0
1,1.0,21.0,22.0,-1.0
2,1.0,21.0,22.0,-1.0
3,1.0,21.0,23.0,-2.0
4,1.0,21.0,24.0,-3.0
...,...,...,...,...
8373,22.0,25.0,26.0,-1.0
8374,22.0,25.0,24.0,1.0
8375,22.0,25.0,29.0,-4.0
8376,22.0,25.0,22.0,3.0
