In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import pickle

In [4]:
df = pd.read_csv('./data/profiles.csv')

print(df.shape,'\n',
      df.columns,'\n')
df.head()

(59946, 31) 
 Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object') 



Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...",...,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38,thin,anything,socially,,graduated from masters program,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,...,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23,thin,vegetarian,socially,,working on college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,...,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29,athletic,,socially,never,graduated from college/university,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,...,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


In [5]:
for i in np.arange(30):
    
    print(df['essay2'][i],'\n')

making people laugh.<br />
ranting about a good salting.<br />
finding simplicity in complexity, and complexity in simplicity. 

being silly. having ridiculous amonts of fun wherever. being a
smart ass. ohh and i can cook. ;) 

improvising in different contexts. alternating between being
present and decidedly outside of a moment, or trying to hold both
at once. rambling intellectual conversations that hold said
conversations in contempt while seeking to find something that
transcends them. being critical while remaining generous. listening
to and using body language--often performed in caricature or large
gestures, if not outright interpretive dance. dry, dark, and
raunchy humor. 

playing synthesizers and organizing books according to the library
of congress classification system 

creating imagery to look at:<br />
http://bagsbrown.blogspot.com/<br />
http://stayruly.blogspot.com/ 

imagining random shit. laughing at aforementioned random shit.
being goofy. articulating what i think 

In [6]:
desc = df['essay2'].dropna().values

In [7]:
len(desc)

50308

In [8]:
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

In [9]:
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
ex_stop = ['abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 
           'anyon', 'anyth', 'anywher', 'becam', 'becaus', 'becom', 'befor', 
           'besid', 'cri', 'describ', 'dure', 'els', 'elsewher', 'empti', 
           'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'forti', 
           'henc', 'hereaft', 'herebi', 'howev', 'hundr', 'inde', 'mani', 
           'meanwhil', 'moreov', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 
           'onli', 'otherwis', 'ourselv', 'perhap', 'pleas', 'sever', 'sinc', 
           'sincer', 'sixti', 'someon', 'someth', 'sometim', 'somewher', 'themselv', 
           'thenc', 'thereaft', 'therebi', 'therefor', 'togeth', 'twelv', 'twenti', 
           'veri', 'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 
           'wherev', 'whi', 'yourselv','anywh', 'el', 'elsewh', 'everywh', 'ind', 
           'otherwi', 'plea', 'somewh']

stop_words = text.ENGLISH_STOP_WORDS.union(ex_stop).union(punc)
vectorizer = TfidfVectorizer(stop_words = stop_words, tokenizer=tokenize, max_features=6000)
X = vectorizer.fit_transform(desc)

In [10]:
vectorizer.get_feature_names()

["'s",
 'ab',
 'abalon',
 'abandon',
 'abil',
 'abl',
 'abroad',
 'absolut',
 'absorb',
 'abstract',
 'absurd',
 'abund',
 'abus',
 'academ',
 'academia',
 'acceler',
 'accent',
 'accept',
 'access',
 'accessor',
 'accessori',
 'accid',
 'accident',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'accordion',
 'account',
 'accumul',
 'accur',
 'accuraci',
 'accus',
 'ace',
 'ach',
 'achiev',
 'acknowledg',
 'acoust',
 'acquaint',
 'acquir',
 'acquisit',
 'acrobat',
 'acronym',
 'acroyoga',
 'acryl',
 'act',
 'action',
 'activ',
 'activist',
 'actor',
 'actress',
 'actual',
 'acut',
 'ad',
 'adam',
 'adapt',
 'add',
 'addict',
 'addit',
 'address',
 'adept',
 'adequ',
 'adhd',
 'adject',
 'adjust',
 'administ',
 'administr',
 'admir',
 'admit',
 'adob',
 'adobo',
 'adolesc',
 'adopt',
 'ador',
 'adrenalin',
 'adult',
 'advanc',
 'advantag',
 'adventur',
 'advers',
 'advertis',
 'advic',
 'advis',
 'advisor',
 'advoc',
 'advocaci',
 'aerial',
 'aerob',
 'aesthet',
 'affair',
 'affe

In [11]:
df['location'].value_counts()

san francisco, california       31064
oakland, california              7214
berkeley, california             4212
san mateo, california            1331
palo alto, california            1064
                                ...  
orange, california                  1
hacienda heights, california        1
waterford, california               1
union city, california              1
astoria, new york                   1
Name: location, Length: 199, dtype: int64

In [12]:
kmeans = KMeans(n_clusters = 15, n_init = 5, n_jobs = -1)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=15, n_init=5, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [96]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : br, make, peopl, thing, good, i'm, cook, listen, laugh, friend
1 : good, i'm, realli, br, pretti, friend, peopl, thing, like, think
2 : make, br, listen, friend, good, danc, write, thing, time, life
3 : feel, comfort, make, peopl, good, laugh, eas, listen, i'm, br
4 : thing, lot, fix, new, good, tri, make, learn, listen, peopl
5 : play, guitar, music, good, br, game, make, sport, cook, love
6 : cook, danc, love, listen, bake, friend, good, make, laugh, food
7 : stuff, fix, lot, br, thing, good, make, cook, comput, like
8 : peopl, make, laugh, smile, friend, good, listen, new, thing, love
9 : ilink, href, class, br, cook, i'm, good, danc, make, thing
10 : listen, friend, good, peopl, talk, laugh, advic, problem, cook, understand
11 : mind, set, open, thing, read, heart, peopl, listen, good, pretti
12 : job, good, peopl, cook, friend, make, listen, laugh, br, i'm
13 : laugh, make, peopl, smile, good, cook, listen, time, friend, i'm
14 : fun, make, laugh, peopl, listen, thing, cook, f

In [95]:
words = vectorizer.get_feature_names()

In [22]:

filename = 'means.sav'
pickle.dump(kmeans, open(filename, 'wb'))

In [6]:
new_vect = pickle.load(open(filename, 'rb'))

AttributeError: Can't get attribute 'tokenize' on <module '__main__'>

In [13]:
new_string = ['I love going out for tacos, hiking and craft cocktails']

In [20]:
vectorizer.transform(new_string)

<1x6000 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [19]:
(new_vect.transform(new_string).todense()).shape

(1, 6000)

In [26]:
sf = (df[df['location'] == 'san francisco, california'][['essay2']]
      .dropna()
      .reset_index()
      .drop(columns=['index']))

In [28]:
sf.to_csv('sf_data_descriptions.csv')

In [19]:
tokenize(new_string[0])

['i', 'love', 'go', 'out', 'for', 'taco', 'hike', 'and', 'craft', 'cocktail']

In [24]:
len(str(new_string))

58