In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
import pickle

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv('profiles.csv')

In [3]:
data.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')

In [4]:
data.iloc[0][['essay' + str(i) for i in range(10)]]

essay0    about me:<br />\n<br />\ni would love to think...
essay1    currently working as an international agent fo...
essay2    making people laugh.<br />\nranting about a go...
essay3    the way i look. i am a six foot half asian, ha...
essay4    books:<br />\nabsurdistan, the republic, of mi...
essay5    food.<br />\nwater.<br />\ncell phone.<br />\n...
essay6                          duality and humorous things
essay7    trying to find someone to hang out with. i am ...
essay8    i am new to california and looking for someone...
essay9    you want to be swept off your feet!<br />\nyou...
Name: 0, dtype: object

In [5]:
data.dtypes

age              int64
body_type       object
diet            object
drinks          object
drugs           object
education       object
essay0          object
essay1          object
essay2          object
essay3          object
essay4          object
essay5          object
essay6          object
essay7          object
essay8          object
essay9          object
ethnicity       object
height         float64
income           int64
job             object
last_online     object
location        object
offspring       object
orientation     object
pets            object
religion        object
sex             object
sign            object
smokes          object
speaks          object
status          object
dtype: object

In [6]:
data.isnull().sum()

age                0
body_type       5296
diet           24395
drinks          2985
drugs          14080
education       6628
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
offspring      35561
orientation        0
pets           19921
religion       20226
sex                0
sign           11056
smokes          5512
speaks            50
status             0
dtype: int64

In [7]:
X = data[['essay' + str(i) for i in range(10)]]

In [8]:
X = X.fillna('')

In [9]:
X = X.apply(lambda row: ' '.join(row), axis=1).to_frame()

In [10]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, 'lxml')
    stripped_text = soup.get_text()
    return stripped_text

In [11]:
def custom_tokenizer(text):
    text = strip_html_tags(text)
    
    remove_punct = str.maketrans('', '', string.punctuation)
    text = text.translate(remove_punct)
    
    remove_digits = str.maketrans('', '', string.digits)
    text = text.lower().translate(remove_digits)
    
    tokens = word_tokenize(text)
    
    stop_words = stopwords.words('english')
    tokens_stop = [y for y in tokens if y not in stop_words]
    
    stemmer = SnowballStemmer('english')
    tokens_stem = [stemmer.stem(y) for y in tokens_stop]
    
    return tokens_stem

In [21]:
data[data.sex=='m'].age.describe()

count    35829.000000
mean        32.018588
std          9.032881
min         18.000000
25%         26.000000
50%         30.000000
75%         36.000000
max        109.000000
Name: age, dtype: float64

In [22]:
data[data.sex=='f'].age.describe()

count    24117.000000
mean        32.818220
std         10.025385
min         18.000000
25%         26.000000
50%         30.000000
75%         37.000000
max        110.000000
Name: age, dtype: float64

In [12]:
cv = CountVectorizer(tokenizer=custom_tokenizer)
X_cv = cv.fit_transform(X[0])

KeyboardInterrupt: 

In [None]:
X_cv

In [None]:
with open('words_sparse_matrix.pkl', 'wb') as handle:
    pickle.dump(X_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
with open('countvec.pkl', 'wb') as f:
    pickle.dump(cv, f, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
with open('words_sparse_matrix.pkl', 'rb') as handle:
    X_cv = pickle.load(handle)

In [19]:
cv.get_feature_names()

['aa',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaaaa',
 'aaaaaaaaa',
 'aaaaaaaaaaaaaaaaaaaaaaand',
 'aaaaaaaaaaaaaaaaaaahhhhhhhhh',
 'aaaaaaaaaaaand',
 'aaaaaaaaaaand',
 'aaaaaaaaaajdkjdjdjsjjsjsjdjsndndjdjsnsnmsmsmddmmsmsmdmdmdmdmdmdnxnxnckcjffkf',
 'aaaaaaaaaand',
 'aaaaaaaaall',
 'aaaaaaaah',
 'aaaaaaaand',
 'aaaaaaahhhhhh',
 'aaaaaaand',
 'aaaaaaauaiagajakalpplth',
 'aaaaaag',
 'aaaaaah',
 'aaaaaahhh',
 'aaaaaalmost',
 'aaaaaand',
 'aaaaaandeat',
 'aaaaah',
 'aaaaall',
 'aaaaalll',
 'aaaaalllllllll',
 'aaaaalllrright',
 'aaaaand',
 'aaaaannndddd',
 'aaaaannnnnd',
 'aaaaawwwwwwwwwwww',
 'aaaaayyyyyi',
 'aaaaggh',
 'aaaah',
 'aaaahh',
 'aaaahhh',
 'aaaahhhh',
 'aaaallrighti',
 'aaaand',
 'aaaandit',
 'aaaandthat',
 'aaaani',
 'aaaannnd',
 'aaaannnddddd',
 'aaaanyhoo',
 'aaaanyth',
 'aaaanyway',
 'aaaaoooh',
 'aaaasss',
 'aaaawwweessooomme',
 'aaaawwww',
 'aaaayyy',
 'aaabuuunnncch',
 'aaagh',
 'aaah',
 'aaahahaha',
 'aaahh',
 'aaahhh',
 'aaahyay',
 'aaaiiiieee',
 'aaaladdin',
 'aaallllll',
 'a