In [1]:
## Importing pandas for data analysis

import pandas as pd

In [2]:
## Read the OkCupid profiles CSV into a pandas dataframe

csv_path = 'profiles.csv'
df = pd.read_csv(csv_path)

In [3]:
## List the OkCupid variables (the dataframe's column names).
## Each column has to be reviewed to see whether it needs
## cleaning before we analyze it and make assumptions.

## We need to be able to recognize, filter, and adjust
## values that are irrelevant or incorrect so that our
## results are as accurate as possible.

df.columns.tolist()

['age',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'essay0',
 'essay1',
 'essay2',
 'essay3',
 'essay4',
 'essay5',
 'essay6',
 'essay7',
 'essay8',
 'essay9',
 'ethnicity',
 'height',
 'income',
 'job',
 'last_online',
 'location',
 'offspring',
 'orientation',
 'pets',
 'religion',
 'sex',
 'sign',
 'smokes',
 'speaks',
 'status']

In [4]:
## Summary statistics for the age variable.
## The count shows 59946 non-missing ages. Nothing stands out
## apart from the maximum of 110, which may deserve a second look.

age = df.loc[:, ['age']]
age.describe()

Unnamed: 0,age
count,59946.0
mean,32.34029
std,9.452779
min,18.0
25%,26.0
50%,30.0
75%,37.0
max,110.0


In [5]:
## body_type is paired with sex so that counting the
## non-missing sex values in each group gives the number
## of profiles per body type.
## Everything seems normal here.

body_type = df[['body_type', 'sex']]
body_type_grouped = (
    body_type
    .groupby('body_type')
    .count()
    .reset_index()
)
body_type_grouped.rename(index=str, columns=dict(sex="count"))

Unnamed: 0,body_type,count
0,a little extra,2629
1,athletic,11819
2,average,14652
3,curvy,3924
4,fit,12711
5,full figured,1009
6,jacked,421
7,overweight,444
8,rather not say,198
9,skinny,1777


In [6]:
## Same approach as for body_type: count profiles per diet.
## Everything seems normal here.

diet = df[['diet', 'sex']]
diet_grouped = (
    diet
    .groupby('diet')
    .count()
    .reset_index()
)
diet_grouped.rename(index=str, columns=dict(sex="count"))

Unnamed: 0,diet,count
0,anything,6183
1,halal,11
2,kosher,11
3,mostly anything,16585
4,mostly halal,48
5,mostly kosher,86
6,mostly other,1007
7,mostly vegan,338
8,mostly vegetarian,3444
9,other,331


In [7]:
## After diet, drinks and drugs seem normal as well.
## Education, however, deserves a closer look.

## An interesting category here is 'space camp'.
## A quick search shows that 'space camp' does exist,
## but I doubt that all of these responses are honest.
## I will keep this in mind when comparing it
## with other responses.

education = df[['education', 'sex']]
education_grouped = (
    education
    .groupby('education')
    .count()
    .reset_index()
)
education_grouped.rename(index=str, columns=dict(sex="count"))

Unnamed: 0,education,count
0,college/university,801
1,dropped out of college/university,995
2,dropped out of high school,102
3,dropped out of law school,18
4,dropped out of masters program,140
5,dropped out of med school,12
6,dropped out of ph.d program,127
7,dropped out of space camp,523
8,dropped out of two-year college,191
9,graduated from college/university,23959


In [8]:
## Essays 0-9 are fill-in-the-blank prompts.
## The titles are:
##     0: Self Summary
##     1: What I'm doing with my life
##     2: I'm really good at
##     3: The first thing people usually notice about me
##     4: Favorite books, movies, shows, music, and food
##     5: The six things I could never do without
##     6: I spend a lot of time thinking about
##     7: On a typical Friday night I am
##     8: The most private thing I am willing to admit
##     9: You should message me if...

## Start with the non-essay responses of the user at index 0
non_essay_columns = [
    'age', 'body_type', 'diet', 'drinks', 'drugs', 'education',
    'ethnicity', 'height', 'income', 'job', 'last_online', 'location',
    'offspring', 'orientation', 'pets', 'religion', 'sex', 'sign',
    'smokes', 'speaks', 'status',
]
df.loc[0, non_essay_columns]

age                                                      22
body_type                                    a little extra
diet                                      strictly anything
drinks                                             socially
drugs                                                 never
education                     working on college/university
ethnicity                                      asian, white
height                                                   75
income                                                   -1
job                                          transportation
last_online                                2012-06-28-20-30
location                    south san francisco, california
offspring      doesn&rsquo;t have kids, but might want them
orientation                                        straight
pets                              likes dogs and likes cats
religion              agnosticism and very serious about it
sex                                     

In [9]:
## Essay 0 (Self Summary) as written by the first user

df['essay0'].iloc[0]

"about me:<br />\n<br />\ni would love to think that i was some some kind of intellectual:\neither the dumbest smart guy, or the smartest dumb guy. can't say i\ncan tell the difference. i love to talk about ideas and concepts. i\nforge odd metaphors instead of reciting cliches. like the\nsimularities between a friend of mine's house and an underwater\nsalt mine. my favorite word is salt by the way (weird choice i\nknow). to me most things in life are better as metaphors. i seek to\nmake myself a little better everyday, in some productively lazy\nway. got tired of tying my shoes. considered hiring a five year\nold, but would probably have to tie both of our shoes... decided to\nonly wear leather shoes dress shoes.<br />\n<br />\nabout you:<br />\n<br />\nyou love to have really serious, really deep conversations about\nreally silly stuff. you have to be willing to snap me out of a\nlight hearted rant with a kiss. you don't have to be funny, but you\nhave to be able to make me laugh. you

In [10]:
## Essay 1 (What I'm doing with my life) as written by the first user

df['essay1'].iloc[0]

'currently working as an international agent for a freight\nforwarding company. import, export, domestic you know the\nworks.<br />\nonline classes and trying to better myself in my free time. perhaps\na hours worth of a good book or a video game on a lazy sunday.'

In [11]:
## Essay 2 (I'm really good at) as written by the first user

df['essay2'].iloc[0]

'making people laugh.<br />\nranting about a good salting.<br />\nfinding simplicity in complexity, and complexity in simplicity.'

In [12]:
## Essay 3 (The first thing people usually notice about me)
## as written by the first user

df['essay3'].iloc[0]

'the way i look. i am a six foot half asian, half caucasian mutt. it\nmakes it tough not to notice me, and for me to blend in.'

In [13]:
## Essay 4 (Favorite books, movies, shows, music, and food)
## as written by the first user

df['essay4'].iloc[0]

"books:<br />\nabsurdistan, the republic, of mice and men (only book that made me\nwant to cry), catcher in the rye, the prince.<br />\n<br />\nmovies:<br />\ngladiator, operation valkyrie, the producers, down periscope.<br />\n<br />\nshows:<br />\nthe borgia, arrested development, game of thrones, monty\npython<br />\n<br />\nmusic:<br />\naesop rock, hail mary mallon, george thorogood and the delaware\ndestroyers, felt<br />\n<br />\nfood:<br />\ni'm down for anything."

In [14]:
## Essay 5 (The six things I could never do without)
## as written by the first user

df['essay5'].iloc[0]

'food.<br />\nwater.<br />\ncell phone.<br />\nshelter.'

In [15]:
## Essay 6 (I spend a lot of time thinking about)
## as written by the first user

df['essay6'].iloc[0]

'duality and humorous things'

In [16]:
## Essay 7 (On a typical Friday night I am)
## as written by the first user

df['essay7'].iloc[0]

'trying to find someone to hang out with. i am down for anything\nexcept a club.'

In [17]:
## Essay 8 (The most private thing I am willing to admit)
## as written by the first user

df['essay8'].iloc[0]

'i am new to california and looking for someone to wisper my secrets\nto.'

In [18]:
## Essay 9 (You should message me if...) as written by the first user

df['essay9'].iloc[0]

'you want to be swept off your feet!<br />\nyou are tired of the norm.<br />\nyou want to catch a coffee or a bite.<br />\nor if you want to talk philosophy.'

In [19]:
## Next, ethnicity.
## Generally, I would consider 1-3 ethnicities
## a reasonable number for a person to label
## themselves with. When a person selects more
## than 3 ethnicities, I would assume that they
## are not taking the questions seriously.
## As with the users selecting 'space camp',
## I am going to take note of this as well.

ethnicity = df[['ethnicity', 'sex']]
ethnicity_grouped = (
    ethnicity
    .groupby('ethnicity')
    .count()
    .reset_index()
)
ethnicity_grouped.rename(index=str, columns=dict(sex="count"))

Unnamed: 0,ethnicity,count
0,asian,6134
1,"asian, black",59
2,"asian, black, hispanic / latin",2
3,"asian, black, hispanic / latin, other",2
4,"asian, black, hispanic / latin, white",2
5,"asian, black, hispanic / latin, white, other",1
6,"asian, black, indian",1
7,"asian, black, indian, hispanic / latin, other",1
8,"asian, black, native american",9
9,"asian, black, native american, hispanic / latin",2


In [20]:
## Height is recorded in inches.
## The mean height per sex is very close to the national
## average, so the next step is to look at the extremes.

height = df.loc[:, ['height', 'sex']]
height_grouped = height.groupby(by='sex')
mean_height_by_sex = height_grouped.mean()
mean_height_by_sex.reset_index()

Unnamed: 0,sex,height
0,f,65.103873
1,m,70.443492


In [21]:
## Minimum height per sex: the shortest female is listed
## at 4 inches tall and the shortest male at 1 inch tall.

height = df.loc[:, ['height', 'sex']]
height_grouped = height.groupby(by='sex')
min_height_by_sex = height_grouped.min()
min_height_by_sex.reset_index()

Unnamed: 0,sex,height
0,f,4.0
1,m,1.0


In [22]:
## Maximum height per sex: the tallest female and the
## tallest male are both listed at 95 inches tall.
## Just for reference, the average NBA basketball player
## stands at 79 inches tall.

height = df.loc[:, ['height', 'sex']]
height_grouped = height.groupby(by='sex')
max_height_by_sex = height_grouped.max()
max_height_by_sex.reset_index()

Unnamed: 0,sex,height
0,f,95.0
1,m,95.0


In [23]:
## These two extremes suggest that some of the
## responses for height may be wrong.
## According to US data from cdc.gov, the average height
## is 63.8 inches for females and 69.3 inches for males.

## Any height more than 5 standard deviations away
## from the average will be replaced with a NaN value.

## Coincidentally, the standard deviations for men and
## women average out to about 3.00 inches. So heights
## that are over or under the US average by more than
## 15.00 inches are going to be replaced with NaN values.

height_std_by_sex = height_grouped.std()
height_std_by_sex

Unnamed: 0_level_0,height
sex,Unnamed: 1_level_1
f,2.926502
m,3.076521


In [24]:
## For income, a response of '-1' means
## "prefer not to answer". These are going to
## be changed to NaN so that they do not factor
## into the average. It is also worth noting that
## quite a few millionaires appear to be using
## OkCupid. Although impressive, these extreme
## values will later be cross-checked against
## other variables with extreme values.

income = df[['income', 'sex']]
income_grouped = (
    income
    .groupby('income')
    .count()
    .reset_index()
)
income_grouped.rename(index=str, columns=dict(sex="count"))

Unnamed: 0,income,count
0,-1,48442
1,20000,2952
2,30000,1048
3,40000,1005
4,50000,975
5,60000,736
6,70000,707
7,80000,1111
8,100000,1621
9,150000,631


In [25]:
## Since jobs relate to income, it's good
## to look at those two variables together.
## Everything seems to be okay here; there
## are just lots of missing values (and
## income still shows '-1' for unanswered).

## From here all the way to 'status',
## the variables seem normal as well. With
## all of the questionable variables noted,
## it is time to do some comparisons.

job = df[['job','income']]

## Preview only the first rows instead of dumping the whole
## frame; `job` itself still holds every row for later cells.
job.head(10)

Unnamed: 0,job,income
0,transportation,-1
1,hospitality / travel,80000
2,,-1
3,student,20000
4,artistic / musical / writer,-1
5,computer / hardware / software,-1
6,,-1
7,artistic / musical / writer,-1
8,,-1
9,student,-1
