## Speed Dating Data Set

In [1]:
# do the neccessary imports
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff

In [2]:
#transforming arff file in csv
df = pd.read_csv('data/speed-dating/speeddating.csv')
print(df.dtypes)
df.shape

id                     int64
has_null               int64
wave                   int64
gender                object
age                   object
                       ...  
d_guess_prob_liked    object
met                   object
decision               int64
decision_o             int64
match                  int64
Length: 124, dtype: object


  exec(code_obj, self.user_global_ns, self.user_ns)


(8378, 124)

In [3]:
df[df.isin(["?"]).any(axis=1)].shape

(7330, 124)

## About Ratings

When a column is giving a rating, for example any column that has ```importance``` or ```pref_o_``` also include scales, which are weird and we need to figure out how to normalize everything. 

# Missing values

Some rows in these rating columns also have missing values which can't simply be thrown out. Instead we have to look at the context, for example for the missing values in ```importance_same_race``` we can fill them in by taking the median/mean of the ratings that people of the same race have given.

In [4]:
df.replace('?', np.nan, inplace=True)

In [5]:
df[['race','importance_same_race']][df['race'].isna() == True]
# 63 rows with no race and no importance of race so we just drop these


Unnamed: 0,race,importance_same_race
828,,
829,,
830,,
831,,
832,,
...,...,...
5127,,
5128,,
5129,,
5130,,


In [6]:
df = df[df['race'].notna()]

We try to divide df temporarily in to races to omit the nan value of importance of race by the mode. For the group other, we do the same

Update: As it looks like, only Europeans/Caucasian-Americans have empty values in this dataset so we can just fill them with the mode of the whole dataset

In [7]:
df['importance_same_race'][df['importance_same_race'].isna() == True]

312    NaN
313    NaN
314    NaN
315    NaN
316    NaN
317    NaN
318    NaN
319    NaN
320    NaN
321    NaN
322    NaN
323    NaN
324    NaN
325    NaN
326    NaN
327    NaN
Name: importance_same_race, dtype: object

In [8]:
#First convert column to int instead of string
df['importance_same_race'] = df['importance_same_race'].fillna(100).astype(int)

In [9]:
#We replaced nan with value 100 for conversion to int then replaced 100 with the mode
df['importance_same_race'].replace(100, df['importance_same_race'][df['race'] == 'European/Caucasian-American'].mode()[0], inplace=True)

In [10]:
# We do the same thing for religion
# Note there are missing values for these columns only for the europeans
df['importance_same_religion'] = df['importance_same_religion'].fillna(100).astype(int)
df['importance_same_religion'].replace(100, df['importance_same_religion'][df['race'] == 'European/Caucasian-American'].mode()[0], inplace=True)

### Dealing with NaN for ```preference_of...```

For this case the number of NaN is also not that big so we could actually drop them since the dataset is relatively big. We are losing at most 192 values

In [11]:
df.dropna(subset=['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests'], inplace=True)

In [12]:
df.shape

(8186, 124)

## Casting strings to float and rounding float values to int for ```preferece_o```

In [13]:
df[['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']] = df[['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']].astype(float)

In [14]:
df[['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']] = df[['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']].round()
df[['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']] = df[['pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests']].astype(int)

### Handling age

In [15]:
df['age'].fillna(1000, inplace=True)
df['age_o'].fillna(1000, inplace=True)

In [16]:
df[['age', 'age_o']] = df[['age', 'age_o']].astype(int)

In [17]:
df['age'].replace(1000, df['age'].median(), inplace=True)
df['age_o'].replace(1000, df['age_o'].median(), inplace=True)

### Handling Duplicate Fields in Field

In [18]:
df['field'] = df['field'].str.upper()

In [19]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [20]:
df['field_encoded'] = le.fit_transform(df['field'])
le.classes_

#Many fields are still the same so we sub with regex

array(['ACTING', 'AFRICAN-AMERICAN STUDIES/HISTORY', 'AMERICAN STUDIES',
       'AMERICAN STUDIES [MASTERS]', 'ANTHROPOLOGY',
       'ANTHROPOLOGY/EDUCATION', 'APPLIED MATHS/ECONS',
       'APPLIED PHYSIOLOGY & NUTRITION', 'ARCHITECTURE', 'ART EDUCATION',
       'ART HISTORY', 'ART HISTORY/MEDICINE', 'ARTS ADMINISTRATION',
       'BILINGUAL EDUCATION', 'BIOCHEMISTRY',
       'BIOCHEMISTRY & MOLECULAR BIOPHYSICS', 'BIOCHEMISTRY/GENETICS',
       'BIOLOGY', 'BIOLOGY PHD', 'BIOMEDICAL ENGINEERING',
       'BIOMEDICAL INFORMATICS', 'BIOMEDICINE', 'BIOTECHNOLOGY',
       'BUSINESS', 'BUSINESS & INTERNATIONAL AFFAIRS',
       'BUSINESS ADMINISTRATION',
       'BUSINESS AND INTERNATIONAL AFFAIRS [MBA/MIA DUAL DEGREE]',
       'BUSINESS CONSULTING', 'BUSINESS SCHOOL',
       'BUSINESS [FINANCE & MARKETING]', 'BUSINESS [MBA]',
       'BUSINESS- MBA', 'BUSINESS/ FINANCE/ REAL ESTATE', 'BUSINESS/LAW',
       'BUSINESS; MARKETING', 'BUSINESS; MEDIA', 'CELL BIOLOGY',
       'CHEMISTRY', 'CLASSICS',

In [21]:
le.classes_.size

219

In [22]:
df['field'] = df['field'].replace('.*BUSINESS.*|MBA.*|ECONOMICS.*|.*FINANCE.*', 'BUSINESS/ECONOMICS/FINANCE', regex=True)
df['field'] = df['field'].replace('.*INTERNATIONAL AFFAIRS.*|SIPA.*', 'INTERNATIONAL AFFAIRS', regex=True)
df['field'] = df['field'].replace('LAW.*', 'LAW', regex=True)
df['field'] = df['field'].replace('OPERATIONS RESEARCH.*', 'OPERATIONS RESEARCH', regex=True)
df['field'] = df['field'].replace('PHILOSOPHY.*', 'PHILOSOPHY', regex=True)
df['field'] = df['field'].replace('PHYSICS.*', 'PHYSICS', regex=True)
df['field'] = df['field'].replace('.*INDUSTRIAL ENGINEERING.*', 'INDUSTRIAL ENGINEERING', regex=True)
df['field'] = df['field'].replace('.*MATH.*|.*STAT.*', 'MATHEMATICS', regex=True)
df['field'] = df['field'].replace('ART.*', 'ART', regex=True)
df['field'] = df['field'].replace('.*BIO.*', 'BIOLOGY', regex=True)
df['field'] = df['field'].replace('.*AMERICAN.*', 'AMERICAN STUDIES', regex=True)
df['field'] = df['field'].replace('CLIMATE.*|ENVIRON.*|.*EARTH.*', 'ENVIRONMENTAL SCIENCE', regex=True)
df['field'] = df['field'].replace('.*WRITING.*', 'WRITING', regex=True)
df['field'] = df['field'].replace('.*SOCI.*', 'SOCIOLOGY/SOCIAL STUDIES', regex=True)
df['field'] = df['field'].replace('.*NEURO.*', 'NEUROSCIENCE', regex=True)
df['field'] = df['field'].replace('.*ENGLISH.*|.*GERMAN.*|.*POLISH.*|.*FRENCH.*|.*LANG.*|.*CHINE.*|.*JAP.*', 'LANGUAGES', regex=True)
df['field'] = df['field'].replace('.*HIST.*', 'HISTORY', regex=True)
df['field'] = df['field'].replace('.*PSYCH.*', 'PSYCHOLOGY', regex=True)
df['field'] = df['field'].replace('.*ANTH.*', 'ANTHROPOLOGY', regex=True)
df['field'] = df['field'].replace('.*EDU.*', 'EDUCATION', regex=True)
df['field'] = df['field'].replace('.*THEA.*', 'THEATER', regex=True)
df['field'] = df['field'].replace('.*RELI.*', 'RELIGION', regex=True)


In [23]:
df['field_encoded'] = le.fit_transform(df['field'])
le.classes_.size

86