In [2]:
from os import listdir
from os.path import isfile, join
mypath = 'blogs/'
files_names = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [3]:
import xml.etree.ElementTree as ET
tree = ET.parse(mypath+files_names[1])
root = tree.getroot()

In [78]:
%%time
raw_posts = []
labels = []

for f in files_names:

    data = {}
    tags = {}


    if 'fe' in f:
        tags['gender'] = 0
    else:
        tags['gender'] = 1

    tags['age']=int(f.split('.')[2])
    tags['field']=(f.split('.')[3])
    tags['zodiac']=(f.split('.')[4])


    for child in root:
        if child.tag == 'date':
            data['date'] = child.text
        elif child.tag == 'post':
            data['text'] = child.text
            raw_posts.append(data)
            labels.append(tags)
            data = {}



CPU times: user 1.25 s, sys: 432 ms, total: 1.68 s
Wall time: 1.68 s


In [6]:
import re
import nltk
from random import shuffle

In [7]:
def get_words(text, stop=False):
    raw_text =  re.split(' ',text)
    unwanted = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~£\n'
    raw_text2 = ''
    for k in raw_text:
        # Ignore urls
        if k.startswith('http') or k.startswith('http'):
            continue
        elif k.startswith('@'):     
            continue
        elif k.startswith('#'):
            continue
        else:
            raw_text2 += ' '
            raw_text2 += k
    words_raw = nltk.word_tokenize(raw_text2.lower())
    words = [w for w in words_raw if w not in unwanted]
    if stop: 
        words = [w for w in words if w not in stopwords.words('english')]
    return words

In [9]:
%%time
y = []
for item in labels:
    if item['age'] < 18:
        y.append(0)
    elif item['age'] < 28:
        y.append(1)
    else: 
        y.append(2)

CPU times: user 408 ms, sys: 48 ms, total: 456 ms
Wall time: 455 ms


In [69]:
%%time
y_gen = []
for item in labels:
    y_gen.append(item['gender'])

CPU times: user 264 ms, sys: 0 ns, total: 264 ms
Wall time: 260 ms


In [10]:
%%time
x_raw = []
for item in raw_posts:
    x_raw.append(raw_posts[0]['date']+raw_posts[0]['text'])

CPU times: user 776 ms, sys: 384 ms, total: 1.16 s
Wall time: 1.16 s


## Gendim models

In [11]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence, TaggedDocument, TaggedLineDocument
from gensim.models import Doc2Vec

# numpy
import numpy

# random
from random import shuffle

# classifier
from sklearn.linear_model import LogisticRegression



In [12]:

class LabeledLineSentence(object):
    def __init__(self, prefix, sources):
        self.sources = sources
        self.prefix = prefix
        k1 = self.to_array()
        
    
    def __iter__(self):
        for item_no, line in enumerate(self.sources):
            yield LabeledSentence(get_words(line), [self.prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for item_no, line in enumerate(self.sources):
            self.sentences.append(LabeledSentence(get_words(line), [self.prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [106]:
%%time
split = 1000*1000
idx = list(range(len(x_raw))) 
shuffle(idx)

x_rand = []
y_rand = []
for i in idx[:split]:
    x_rand.append(x_raw[i])
    y_rand.append(y[i])

CPU times: user 2.31 s, sys: 0 ns, total: 2.31 s
Wall time: 2.31 s


In [63]:
%%time
sentences = LabeledLineSentence('age', x_rand)

CPU times: user 1min 52s, sys: 0 ns, total: 1min 52s
Wall time: 1min 52s


In [64]:
%%time
model = Doc2Vec(min_count=1, window=6, size=255, sample=1e-4, negative=5, workers=20)
model.build_vocab(sentences)

CPU times: user 1min 53s, sys: 0 ns, total: 1min 53s
Wall time: 1min 53s


In [65]:
%%time
for epoch in range(10):
    print("Epoch ", epoch)
    model.train(sentences.sentences_perm())

Epoch  0
Epoch  1
Epoch  2
Epoch  3
Epoch  4
Epoch  5
Epoch  6
Epoch  7
Epoch  8
Epoch  9
CPU times: user 28min 14s, sys: 5min 52s, total: 34min 7s
Wall time: 19min 50s


In [68]:
%%time
idx = list(range(len(x_rand))) 

x_rand_norm = []

for i in idx:
    x_rand_norm.append(model.docvecs['age_{}'.format(i)])


split_stamp = int(0.8 * len(x_rand_norm))  
x_train = x_rand_norm[:split_stamp]
y_train = y_rand[:split_stamp]

x_test = x_rand_norm[split_stamp:]
y_test = y_rand[split_stamp:]


CPU times: user 208 ms, sys: 0 ns, total: 208 ms
Wall time: 207 ms


## ML models

In [21]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV



In [67]:
%%time
svm = SVC()
parameters = {
    'degree':[3,4,5]
}

clf = GridSearchCV(svm, parameters, cv=3, n_jobs=20)
clf.fit(x_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()

print("Best score: %0.3f" % clf.best_score_)

print("Test score")
print(clf.score(x_test, y_test))


Best parameters set found on development set:

{'degree': 3}

Best score: 0.427
Test score
0.4293
CPU times: user 1h 2min 7s, sys: 3.36 s, total: 1h 2min 10s
Wall time: 1h 37min 35s


## Gender identification

In [79]:
%%time
y_gen = []
for item in labels:
    y_gen.append(item['gender'])

CPU times: user 296 ms, sys: 4 ms, total: 300 ms
Wall time: 297 ms


In [110]:
%%time
split = 100*1000
idx = list(range(len(x_raw))) 
shuffle(idx)

x_rand = []
y_rand = []
for i in idx[:split]:
    x_rand.append(x_raw[i])
    y_rand.append(y_gen[i])


CPU times: user 1.61 s, sys: 0 ns, total: 1.61 s
Wall time: 1.61 s


In [97]:

sentences = LabeledLineSentence('age', x_rand)



model = Doc2Vec(min_count=1, window=8, size=350, sample=1e-4, negative=5, workers=20)
model.build_vocab(sentences)

for epoch in range(30):
    print("Epoch ", epoch)
    model.train(sentences.sentences_perm())
    

idx = list(range(len(x_rand))) 

x_rand_norm = []

for i in idx:
    x_rand_norm.append(model.docvecs['age_{}'.format(i)])


split_stamp = int(0.8 * len(x_rand_norm))  
x_train = x_rand_norm[:split_stamp]
y_train = y_rand[:split_stamp]

x_test = x_rand_norm[split_stamp:]
y_test = y_rand[split_stamp:]


Epoch  0
Epoch  1
Epoch  2
Epoch  3
Epoch  4
Epoch  5
Epoch  6
Epoch  7
Epoch  8
Epoch  9
Epoch  10
Epoch  11
Epoch  12
Epoch  13
Epoch  14
Epoch  15
Epoch  16
Epoch  17
Epoch  18
Epoch  19
Epoch  20
Epoch  21
Epoch  22
Epoch  23
Epoch  24
Epoch  25
Epoch  26
Epoch  27
Epoch  28
Epoch  29
CPU times: user 28min 4s, sys: 5min 11s, total: 33min 15s
Wall time: 18min 57s


In [94]:
%%time
svm = SVC()
parameters = {
    'degree':[3,4,5]
}

clf = GridSearchCV(svm, parameters, cv=3, n_jobs=20)
clf.fit(x_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()

print("Best score: %0.3f" % clf.best_score_)

print("Test score")
print(clf.score(x_test, y_test))


Best parameters set found on development set:

{'degree': 3}

Best score: 0.503
Test score
0.498
CPU times: user 33.8 s, sys: 940 ms, total: 34.8 s
Wall time: 1min


In [98]:
%%time
svm = SVC()
parameters = {
    'degree':[3,4,5]
}

#clf = GridSearchCV(svm, parameters, cv=3, n_jobs=20)
#clf.fit(x_train, y_train)
svm.fit(x_train, y_train)

print("Test score")
print(svm.score(x_test, y_test))


Test score
0.497666666667
CPU times: user 5min 40s, sys: 0 ns, total: 5min 40s
Wall time: 5min 40s


## TfIdf

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [123]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer='word', ngram_range=(1,3), stop_words='english')),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('nb', SVC()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [124]:
split_stamp = int(0.8 * len(x_rand))  
x_train = x_rand[:split_stamp]
y_train = y_rand[:split_stamp]

x_test = x_rand[split_stamp:]
y_test = y_rand[split_stamp:]

In [125]:
%%time
pipeline.fit(x_train, y_train)

CPU times: user 35min 16s, sys: 0 ns, total: 35min 16s
Wall time: 35min 16s


Pipeline(steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        s...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [126]:
%%time
pipeline.score(x_test, y_test)

CPU times: user 6min 29s, sys: 0 ns, total: 6min 29s
Wall time: 6min 29s


0.49590000000000001