<h1>Exploring OkCupid Dataset</h1>

<h2>Packages installed</h2>

In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import helper as hp ###function I coded for this project
from IPython.display import display
%matplotlib inline
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

print('pandas version is {}.'.format(pd.__version__))
print('numpy version is {}.'.format(np.__version__))
print('scikit-learn version is {}.'.format(sklearn.__version__))
print('seaborn version is {}.'.format(sns.__version__))
print('matplotlib version is {}.'.format(matplotlib.__version__))

<h2>Loading the Dataset</h2>

In [None]:
data = pd.read_csv("profiles.csv")
print("This set has {} data points and {} features.".format(*data.shape))

In [None]:
for i in data.columns:
    print (i, end = ", ")

<h2>Dividing the Population by Gender</h2>

In [None]:
males = data[data['sex'] == 'm'].copy()
females = data[data['sex'] == 'f'].copy()

In [None]:
print("This data set has {} males and {} females".format(males['sex'].count(), females['sex'].count()))
print("male:female ratio = {0:.{1}f}".format(males['sex'].count() / females['sex'].count(), 2))

<h2>Basic Summary (both populations)</h2>

In [None]:
data.describe()

<h2>Basic Summary (Males)</h2>

In [None]:
males.describe()

<h2>Basic Summary (Females)</h2>

In [None]:
females.describe()

In [None]:
import scipy.stats as stats
print(stats.ttest_ind(males['age'], females['age'], nan_policy = 'omit'))
print(stats.ttest_ind(males['height'], females['height'], nan_policy = 'omit'))
print(stats.ttest_ind(males['income'], females['income'], nan_policy = 'omit'))

<h4>Comments on Basic Summaries by Sex</h4>
<p>t-testing yielded low p-values below 0.005. This suggests we reject the hypothesis that average age and average height are the same across users on OkCupid. 50% of men from this dataset fall within 5'8" - 6'0" and are between the ages of 26 - 36. Women fall within 5'3" - 5'7" and are between the ages 26 - 37.</p>

In [None]:
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#vectorizer = TfidfVectorizer()
#experimenting = data[~data['essay1'].isNaN()]
#type(experimenting['essay1'][13])
#tfidf = vectorizer.fit_transform(data['essay1'])

<h2>Types of Bodies</h2>

In [None]:
hp.word_counter(data['body_type'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['body_type'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['body_type'])

<h4>Comments on Body Types</h4>
<p>Average, athletic, and fit people make up 65.3% of the reported body types in the dataset.</p>

<h2>How Educated are people</h2>

<h4>Varying replies to this question. Lots of output to display.</h4>

In [None]:
#hp.word_counter(data['education'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['education'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['education'])

<h2>Diets</h2>

In [None]:
hp.word_counter(data['diet'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['diet'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['diet'])

<h2>Drinks</h2>

In [None]:
hp.word_counter(data['drinks'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['drinks'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['drinks'])

<h2>Drugs</h2>

In [None]:
hp.word_counter(data['drugs'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['drugs'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['drugs'])

<h2>Job</h2>

<h4>Here you can see the bias of the dataset in that it was pulled from the greater San Francisco (tech sector jobs).</h4>

In [None]:
#hp.word_counter(data['job'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['job'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['job'])

<h2>Offspring</h2>

<h4>Varying responses. Too much to output</h4>

In [None]:
#hp.word_counter(data['offspring'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['offspring'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['offspring'])

<h2>Location</h2>

<h4>Locations are extremely varied and create too much output.</h4>

In [None]:
#hp.word_counter(data['location'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['location'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['location'])

<h2>Orientation</h2>

In [None]:
hp.word_counter(data['orientation'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['orientation'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['orientation'])

<h2>Pets</h2>

<h4>Same issue as the location. Too much to output and varying amounts of responses.</h4>

In [None]:
#hp.word_counter(data['pets'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['pets'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['pets'])

<h2>Religion</h2>

In [None]:
#hp.word_counter(data['religion'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['religion'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['religion'])

<h2>Sign</h2>

In [None]:
#hp.word_counter(data['sign'])

<h3>Males</h3>

In [None]:
#hp.word_counter(males['sign'])

<h3>Females</h3>

In [None]:
#hp.word_counter(females['sign'])

<h2>Smokes</h2>

In [None]:
hp.word_counter(data['smokes'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['smokes'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['smokes'])

<h2>Status</h2>

In [None]:
hp.word_counter(data['status'])

<h3>Males</h3>

In [None]:
hp.word_counter(males['status'])

<h3>Females</h3>

In [None]:
hp.word_counter(females['status'])

<h2>Natural Language Processing: Analyzing what people write about themselves</h2>

In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import helper as hp ###function I coded for this project
from IPython.display import display
%matplotlib inline

data = pd.read_csv("profiles.csv")

In [2]:
import random

In [3]:
import re

In [4]:
stringer = 'essay'
lister = []
mod_lister = []
for i in range(0,10):
    lister.append(stringer + str(i))
    mod_lister.append(stringer + str(i) + '_mod')
print(lister)
print(mod_lister)

['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
['essay0_mod', 'essay1_mod', 'essay2_mod', 'essay3_mod', 'essay4_mod', 'essay5_mod', 'essay6_mod', 'essay7_mod', 'essay8_mod', 'essay9_mod']


In [5]:
for i, j in zip(lister, mod_lister):
    data[j] = data[i].replace(np.nan, '', regex=True)
    data[j] = data[j].apply(hp.remove_tags)
    display(data[j])

0        about me:    i would love to think that i was ...
1        i am a chef: this is what that means.  1. i am...
2        i'm not ashamed of much, but writing public te...
3                i work in a library and go to school. . .
4        hey how's it going? currently vague on the pro...
5        i'm an australian living in san francisco, but...
6        life is about the little things. i love to lau...
7                                                         
8                                                         
9        my names jake.  i'm a creative guy and i look ...
10       update: i'm seeing someone, so off the market ...
11       i was born in wisconsin, grew up in iowa, and ...
12                                       bang my shit bang
13                                                        
14                                                        
15                                                        
16       i just moved to the bay area from austin, tx (.

0        currently working as an international agent fo...
1        dedicating everyday to being an unbelievable b...
2        i make nerdy software for musicians, artists, ...
3                reading things written by old dead people
4                               work work work work + play
5        building awesome stuff. figuring out what's im...
6                               digging up buried treasure
7        writing. meeting new people, spending time wit...
8        oh goodness. at the moment i have 4 jobs, so i...
9        i have an apartment. i like to explore and che...
10       i have three jobs. i've been doing sound and l...
11       i'm currently the youngest member on an intern...
12                                                        
13                                                        
14       i have an awesome career working as a senior m...
15       dancing, playing, exploring, smiling, and doin...
16       making music, programming, getting back into a.

0        making people laugh.  ranting about a good sal...
1        being silly. having ridiculous amonts of fun w...
2        improvising in different contexts. alternating...
3        playing synthesizers and organizing books acco...
4        creating imagery to look at:  http://bagsbrown...
5        imagining random shit. laughing at aforementio...
6        frolicking  witty banter  using my camera to e...
7        remembering people's birthdays, sending cards,...
8                                                         
9        i'm good at finding creative solutions to prob...
10       hugging, kissing, laughing, motivating people,...
11       i'm really good at a little bit of everything....
12                                                        
13                                                        
14       listening. helping others. being patient. comm...
15       obscure dances from the '30's and '40's, laugh...
16       i'm from louisiana, so cooking and eating are .

0        the way i look. i am a six foot half asian, ha...
1                                                         
2        my large jaw and large glasses are the physica...
3                        socially awkward but i do my best
4                  i smile a lot and my inquisitive nature
5        i have a big smile. i also get asked if i'm we...
6                                    i am the last unicorn
7        i'm rather approachable (a byproduct of being ...
8        i'm freakishly blonde and have the same name a...
9                                                i'm short
10                                     my huge goofy smile
11       the way i dress. some days it's hats, other da...
12                                                        
13                                                        
14       well, i get the most compliments on my butt, s...
15                                           you tell me:)
16       lately, i keep getting asked "are you with the.

0        books:  absurdistan, the republic, of mice and...
1        i am die hard christopher moore fan. i don't r...
2        okay this is where the cultural matrix gets so...
3        bataille, celine, beckett. . .  lynch, jarmusc...
4        music: bands, rappers, musicians  at the momen...
5        books: to kill a mockingbird, lord of the ring...
6        i like books. ones with pictures. reading them...
7        i like: alphabetized lists, aquariums, autobio...
8        i am always willing to try new foods and am no...
9        i like some tv. i love summer heights high and...
10       i'm constantly reading, i read at what my frie...
11       books = yes. avid reader.  moves = eternal sun...
12                                                        
13                                                        
14       books:  my all-time favorite book is george or...
15       i don't really pick favorites. variety is the ...
16       movies/tv/etc:  the big lebowski (and other co.

0                     food.  water.  cell phone.  shelter.
1        delicious porkness in all of its glories.  my ...
2        movement  conversation  creation  contemplatio...
3                                                         
4                                                         
5        like everyone else, i love my friends and fami...
6        laughter  amazing people in my life  color  cu...
7        friends, family, notebook/pen, books, music, t...
8        sports/my softball glove  coffee. because nobo...
9        music, my guitar  contrast  good food  my bike...
10            family  friends  food  women  music  reading
11       guitar - even if i don't play it all the time,...
12                                                        
13                                                        
14       1. my family  2. italian or mexican food  3. m...
15       i work with children with disabilities for a l...
16       (in no particular order)    - good food  - mus.

0                              duality and humorous things
1                                                         
2                                                         
3                               cats and german philosophy
4                                                         
5        what my contribution to the world is going to ...
6        synchronicity    there is this whole other rea...
7                         things that amuse and inspire me
8                                                         
9                                                         
10       snowboarding, food, women, goofy nerd stuff, a...
11       a little bit of everything. but mostly social ...
12                                                        
13                                                        
14       to be honest, i spend way too much time thinki...
15                     how to live a joyful, playful life.
16       - methodologies for practicing creative skills.

0        trying to find someone to hang out with. i am ...
1                                                         
2        viewing. listening. dancing. talking. drinking...
3                                                         
4                                                         
5                                     out with my friends!
6        plotting to take over the world with my army o...
7        out and about or relaxing at home with a good ...
8        in or out... drinking with friends, maybe a ba...
9                                                         
10       having dinner and drinks with friends and/or w...
11       hanging out with a small group of friends--sta...
12                                                        
13                                                        
14       unwinding from my work week. hanging out with ...
15       i'm usually pretty exhausted by friday. so usu...
16       i just moved here and am still getting to know.

0        i am new to california and looking for someone...
1        i am very open and will share just about anyth...
2        when i was five years old, i was known as "the...
3                                                         
4                                                         
5        i cried on my first day at school because a bi...
6                                  my typical friday night
7                                                         
8        potential friends/lovers/people who come in co...
9                                                         
10       i used to wish for a jetpack when blowing out ...
11       i'm picky when it comes to dating. i know what...
12                                                        
13                                                        
14       hmmm...not sure if i want to admit this, but p...
15       i hated cleaning the litter box so much, i tau...
16       i am in my 30's and still cannot grow a mustac.

0        you want to be swept off your feet!  you are t...
1                                                         
2        you are bright, open, intense, silly, ironic, ...
3                                    you feel so inclined.
4                                                         
5                                          you're awesome.
6                                                         
7                                                         
8        http://www.youtube.com/watch?v=4dxbwzuwsxk let...
9                                   you can rock the bells
10       you are a complex woman with healthy self-este...
11       if you know who you are, who you want, where y...
12                                                        
13                                                        
14            ...you genuinely think we'd be a good match.
15                                                        
16       you want to help me assemble ikea stuff and/or.

In [6]:
data.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status', 'essay0_mod',
       'essay1_mod', 'essay2_mod', 'essay3_mod', 'essay4_mod', 'essay5_mod',
       'essay6_mod', 'essay7_mod', 'essay8_mod', 'essay9_mod'],
      dtype='object')

In [13]:
for i in range(0,201):
    print(data['essay0_mod'][np.random.randint(0, data.shape[0])], '\n\n')

my name is harrison, i'm 19 years old and new to san francisco. i was born and raised in boston, but started moving around at a young age. i have just come back to the states after a sick 3 year residence in england. 


things that i love: lemonade, foxes, trashy pop music, public radio, embarrassing my nephew by singing to him in public, fog, puns, metaphysics, all of those stupid ghost hunting shows, daisies, giving awesome gifts, shark week, soapboxing about the importance of comprehensive sex education, roy blount jr., clever graffiti, celebrity gossip, plato, champagne, new sneakers, quotes, the color green, milk duds.    things that i hate: james blunt, the smell of hair dye, moths, cats and dogs talking like people, pears, when it rains and the hems of my jeans get wet and stay that way all day, jello, when girls talk about how they just can't seem to make female friends because girls are so catty, ticket scalpers, weak hugs, people who are perpetually late, ketchup on hotdogs. 