#  Capstone 2: Data Wrangling<a id='2_Data_wrangling'></a>

## Data Collection

### 1.1 Imports<a id='1.1_Imports'></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

### 1.2 Load Data<a id='1.2_Load_Data'></a>

In [2]:
sd_data = pd.read_csv('../raw_data/speed_dating_data.csv', encoding='unicode_escape')
career_category = pd.read_csv('../raw_data/career_categories.csv', encoding='unicode_escape')
from_countries = pd.read_csv('../raw_data/from_countries.csv', encoding='unicode_escape')

In [3]:
sd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [4]:
sd_data.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


In [5]:
sd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [6]:
sd_data.columns

Index(['iid', 'id', 'gender', 'idg', 'condtn', 'wave', 'round', 'position',
       'positin1', 'order',
       ...
       'attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3', 'attr5_3',
       'sinc5_3', 'intel5_3', 'fun5_3', 'amb5_3'],
      dtype='object', length=195)

In [7]:
# list of unwanted columns, consists of columns missing data and/or part of surveys after the actual speed dating event or
unwanted_cols = [
    'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 
    'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1', 
    'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1', 
    'attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1', 
    'attr5_1', 'sinc5_1', 'intel5_1', 'fun5_1', 'amb5_1', 
    'attr1_s', 'sinc1_s', 'intel1_s', 'fun1_s', 'amb1_s', 'shar1_s',
    'attr3_s', 'sinc3_s', 'intel3_s', 'fun3_s', 'amb3_s', 
    'attr7_2', 'sinc7_2', 'intel7_2', 'fun7_2', 'amb7_2', 'shar7_2',
    'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1',
    'attr4_2', 'sinc4_2', 'intel4_2', 'fun4_2', 'amb4_2', 'shar4_2',
    'attr2_2', 'sinc2_2', 'intel2_2', 'fun2_2', 'amb2_2', 'shar2_2',
    'attr3_2', 'sinc3_2', 'intel3_2', 'fun3_2', 'amb3_2', 
    'attr5_2', 'sinc5_2', 'intel5_2', 'fun5_2', 'amb5_2', 
    'attr1_3', 'sinc1_3', 'intel1_3', 'fun1_3', 'amb1_3', 'shar1_3',
    'attr7_3', 'sinc7_3', 'intel7_3', 'fun7_3', 'amb7_3', 'shar7_3',
    'attr4_3', 'sinc4_3', 'intel4_3', 'fun4_3', 'amb4_3', 'shar4_3',
    'attr2_3', 'sinc2_3', 'intel2_3', 'fun2_3', 'amb2_3', 'shar2_3',
    'attr3_3', 'sinc3_3', 'intel3_3', 'fun3_3', 'amb3_3', 
    'attr5_3', 'sinc5_3', 'intel5_3', 'fun5_3', 'amb5_3',
    'satis_2', 'length', 'numdat_2', 'you_call', 'them_cal', 'date_3', 
    'numdat_3', 'num_in_3'
]

# drop unwanted columns in data
sd_data.drop(unwanted_cols, axis=1, inplace=True)
sd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 86 columns):
iid         8378 non-null int64
id          8377 non-null float64
gender      8378 non-null int64
idg         8378 non-null int64
condtn      8378 non-null int64
wave        8378 non-null int64
round       8378 non-null int64
position    8378 non-null int64
positin1    6532 non-null float64
order       8378 non-null int64
partner     8378 non-null int64
pid         8368 non-null float64
match       8378 non-null int64
int_corr    8220 non-null float64
samerace    8378 non-null int64
age_o       8274 non-null float64
race_o      8305 non-null float64
pf_o_att    8289 non-null float64
pf_o_sin    8289 non-null float64
pf_o_int    8289 non-null float64
pf_o_fun    8280 non-null float64
pf_o_amb    8271 non-null float64
pf_o_sha    8249 non-null float64
dec_o       8378 non-null int64
attr_o      8166 non-null float64
sinc_o      8091 non-null float64
intel_o     8072 non-null float64

In [8]:
# look at missing data
missing = pd.concat([sd_data.isnull().sum(), 100 * sd_data.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
iid,0,0.000000
gender,0,0.000000
idg,0,0.000000
condtn,0,0.000000
wave,0,0.000000
...,...,...
undergra,3464,41.346383
income,4099,48.925758
tuition,4795,57.233230
mn_sat,5245,62.604440


In [9]:
# columns missing values are greater than 0
missing[missing['%'] != 0].sort_values(by='count')

Unnamed: 0,count,%
id,1,0.011936
pid,10,0.119360
field,63,0.751969
race,63,0.751969
race_o,73,0.871330
...,...,...
undergra,3464,41.346383
income,4099,48.925758
tuition,4795,57.233230
mn_sat,5245,62.604440


In [10]:
# columns missing values are greater than or equal to 10%
missing[missing['%'] >= 10.0].sort_values(by='count')

Unnamed: 0,count,%
sinc1_2,915,10.921461
intel1_2,915,10.921461
fun1_2,915,10.921461
amb1_2,915,10.921461
shar1_2,915,10.921461
attr1_2,933,11.136309
zipcode,1064,12.699928
shar,1067,12.735736
shar_o,1076,12.843161
match_es,1173,14.000955


In [11]:
# columns with significant amount of missing data
missing_cols = missing[missing['%'] >= 20.0].index
missing_cols

Index(['positin1', 'undergra', 'mn_sat', 'tuition', 'income', 'expnum'], dtype='object')

In [12]:
# drop columns with significant amount of missing data
sd_data.drop(missing_cols, axis=1, inplace=True)
sd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 80 columns):
iid         8378 non-null int64
id          8377 non-null float64
gender      8378 non-null int64
idg         8378 non-null int64
condtn      8378 non-null int64
wave        8378 non-null int64
round       8378 non-null int64
position    8378 non-null int64
order       8378 non-null int64
partner     8378 non-null int64
pid         8368 non-null float64
match       8378 non-null int64
int_corr    8220 non-null float64
samerace    8378 non-null int64
age_o       8274 non-null float64
race_o      8305 non-null float64
pf_o_att    8289 non-null float64
pf_o_sin    8289 non-null float64
pf_o_int    8289 non-null float64
pf_o_fun    8280 non-null float64
pf_o_amb    8271 non-null float64
pf_o_sha    8249 non-null float64
dec_o       8378 non-null int64
attr_o      8166 non-null float64
sinc_o      8091 non-null float64
intel_o     8072 non-null float64
fun_o       8018 non-null float64

In [13]:
missing = pd.concat([sd_data.isnull().sum(), 100 * sd_data.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
iid,0,0.000000
dec_o,0,0.000000
samerace,0,0.000000
match,0,0.000000
partner,0,0.000000
...,...,...
attr1_2,933,11.136309
zipcode,1064,12.699928
shar,1067,12.735736
shar_o,1076,12.843161


In [14]:
sd_data

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,order,partner,...,like,prob,met,match_es,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2
0,1,1.0,0,1,1,1,10,7,4,1,...,7.0,6.0,2.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67
1,1,1.0,0,1,1,1,10,7,3,2,...,7.0,5.0,1.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67
2,1,1.0,0,1,1,1,10,7,10,3,...,7.0,,1.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67
3,1,1.0,0,1,1,1,10,7,5,4,...,7.0,6.0,2.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67
4,1,1.0,0,1,1,1,10,7,7,5,...,6.0,6.0,2.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,552,22.0,1,44,2,21,22,14,5,18,...,2.0,5.0,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00
8374,552,22.0,1,44,2,21,22,13,4,19,...,4.0,4.0,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00
8375,552,22.0,1,44,2,21,22,19,10,20,...,6.0,5.0,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00
8376,552,22.0,1,44,2,21,22,3,16,21,...,5.0,5.0,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00


In [15]:
# intial account of first person in dataframe
# refer to cell 38 to see how its changes after merging data
sd_data[sd_data['iid'] == 1]['career']

0    lawyer
1    lawyer
2    lawyer
3    lawyer
4    lawyer
5    lawyer
6    lawyer
7    lawyer
8    lawyer
9    lawyer
Name: career, dtype: object

In [16]:
# Q: What is your intended career?
# lowercase everything
sd_data['career'].value_counts().sort_values()

a research position           5
engineering professional      5
MBA                           5
Asset Management              5
teaching                      5
                           ... 
Consulting                  147
Professor                   148
Lawyer                      154
professor                   199
Finance                     202
Name: career, Length: 367, dtype: int64

In [17]:
sd_data['career']

0                                                  lawyer
1                                                  lawyer
2                                                  lawyer
3                                                  lawyer
4                                                  lawyer
                              ...                        
8373    assistant master of the universe (otherwise it...
8374    assistant master of the universe (otherwise it...
8375    assistant master of the universe (otherwise it...
8376    assistant master of the universe (otherwise it...
8377    assistant master of the universe (otherwise it...
Name: career, Length: 8378, dtype: object

In [18]:
career_category

Unnamed: 0,career,career_category
0,"Academia, Research, Banking, Life",academic
1,academia,academic
2,Academic,academic
3,Academic or Research staff,academic
4,research in industry or academia,academic
...,...,...
516,writer,writer
517,writer/teacher,writer
518,writer/producer,writer
519,Writer/Editor,writer


In [20]:
len(career_category.career.unique())

367

In [21]:
career_category.drop_duplicates()
##career_category.groupby(by='career').first

Unnamed: 0,career,career_category
0,"Academia, Research, Banking, Life",academic
1,academia,academic
2,Academic,academic
3,Academic or Research staff,academic
4,research in industry or academia,academic
...,...,...
516,writer,writer
517,writer/teacher,writer
518,writer/producer,writer
519,Writer/Editor,writer


In [22]:
# merging career_category with left join on career column
sd_data = sd_data.merge(career_category.drop_duplicates(), how='left', left_on='career', right_on='career')

In [23]:
sd_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8378 entries, 0 to 8377
Data columns (total 81 columns):
iid                8378 non-null int64
id                 8377 non-null float64
gender             8378 non-null int64
idg                8378 non-null int64
condtn             8378 non-null int64
wave               8378 non-null int64
round              8378 non-null int64
position           8378 non-null int64
order              8378 non-null int64
partner            8378 non-null int64
pid                8368 non-null float64
match              8378 non-null int64
int_corr           8220 non-null float64
samerace           8378 non-null int64
age_o              8274 non-null float64
race_o             8305 non-null float64
pf_o_att           8289 non-null float64
pf_o_sin           8289 non-null float64
pf_o_int           8289 non-null float64
pf_o_fun           8280 non-null float64
pf_o_amb           8271 non-null float64
pf_o_sha           8249 non-null float64
dec_o        

In [24]:
sd_data['career_category'].value_counts().sort_values()

environmentalist         9
design                  10
energy                  16
economist               19
professional sports     20
healthcare              20
public service          20
curator                 23
sex industry            32
tech                    35
counselor               37
pathologist             44
epidemiologist          48
real estate             49
management              50
trading                 51
music industry          57
other                   57
art                     59
ceo                     61
government              67
nonprofit               67
nutrition               78
journalism              80
diplomat                83
entertainment           84
acting                  88
development work       102
global development     106
film                   107
entrepreneur           110
marketing              117
politics               130
physician              139
medicine               139
engineer               153
psychologist           190
s

In [25]:
sd_data['career_category'].value_counts().sum()

8362

In [26]:
# Q: Where are you from originally (before coming to Columbia)? 
sd_data['from'].value_counts().sort_values()

Pougkeepsie NY      5
china               5
sofia, bg           5
Greenwich, CT       5
Europe              5
                 ... 
Italy             132
China             139
California        301
New Jersey        365
New York          522
Name: from, Length: 269, dtype: int64

In [27]:
from_countries

Unnamed: 0,from,from_countries
0,Argentina,Argentina
1,Argentina,Argentina
2,Australia,Australia
3,Azerbaijan,Azerbaijan
4,Bangladesh,Bangladesh
...,...,...
531,,Unknown
532,J.P. Morgan,Unknown
533,International Student,Unknown
534,way too little space here. world citizen.,Unknown


In [28]:
# merging from_countries with left join on from column
sd_data = sd_data.merge(from_countries.drop_duplicates(), how='left', left_on='from', right_on='from')

In [29]:
sd_data[['from', 'from_countries']]

Unnamed: 0,from,from_countries
0,Chicago,USA
1,Chicago,USA
2,Chicago,USA
3,Chicago,USA
4,Chicago,USA
...,...,...
8585,France,France
8586,France,France
8587,France,France
8588,France,France


In [30]:
sd_data['from_countries'].value_counts().sort_values()

Europe            5
Uzbekistan       10
Puerto Rico      10
Azerbaijan       10
Belgium          15
Romania          15
Brazil           16
Panama           18
Switzerland      19
Uruguay          19
Costa Rica       20
Yugoslavia       20
Australia        20
Sweden           20
Cameroon         20
Hungary          21
Czechia          21
Iceland          21
Nepal            21
Siberia          21
Indonesia        22
Iran             22
Persia           22
Bulgaria         25
Argentina        28
Chile            31
Poland           31
South Korea      38
Bangladesh       40
Colombia         50
Singapore        52
Philippines      53
Japan            55
Florida          55
Greece           61
Mexico           74
Germany          80
Hong Kong        81
Russia           84
Israel          104
Spain           105
Canada          121
France          122
Philadephia     132
Taiwan          133
UK              153
Unknown         156
Italy           200
China           203
India           205


In [31]:
sd_data['from_countries'].value_counts().sum()

8576

In [32]:
sd_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8590 entries, 0 to 8589
Data columns (total 82 columns):
iid                8590 non-null int64
id                 8589 non-null float64
gender             8590 non-null int64
idg                8590 non-null int64
condtn             8590 non-null int64
wave               8590 non-null int64
round              8590 non-null int64
position           8590 non-null int64
order              8590 non-null int64
partner            8590 non-null int64
pid                8580 non-null float64
match              8590 non-null int64
int_corr           8429 non-null float64
samerace           8590 non-null int64
age_o              8483 non-null float64
race_o             8516 non-null float64
pf_o_att           8498 non-null float64
pf_o_sin           8498 non-null float64
pf_o_int           8498 non-null float64
pf_o_fun           8489 non-null float64
pf_o_amb           8480 non-null float64
pf_o_sha           8456 non-null float64
dec_o        

In [33]:
sd_data

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,order,partner,...,met,match_es,attr1_2,sinc1_2,intel1_2,fun1_2,amb1_2,shar1_2,career_category,from_countries
0,1,1.0,0,1,1,1,10,7,4,1,...,2.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67,law,USA
1,1,1.0,0,1,1,1,10,7,3,2,...,1.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67,law,USA
2,1,1.0,0,1,1,1,10,7,10,3,...,1.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67,law,USA
3,1,1.0,0,1,1,1,10,7,5,4,...,2.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67,law,USA
4,1,1.0,0,1,1,1,10,7,7,5,...,2.0,4.0,19.44,16.67,13.89,22.22,11.11,16.67,law,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8585,552,22.0,1,44,2,21,22,14,5,18,...,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00,other,France
8586,552,22.0,1,44,2,21,22,13,4,19,...,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00,other,France
8587,552,22.0,1,44,2,21,22,19,10,20,...,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00,other,France
8588,552,22.0,1,44,2,21,22,3,16,21,...,0.0,3.0,70.00,0.00,15.00,10.00,0.00,5.00,other,France
