# Load the dataset

In [1]:
from pos import count_pos
import sentiment_feature_extraction
import os, xml.etree.ElementTree as et, itertools
from bs4 import BeautifulSoup

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
def extract_targets(datapath):
    files_gender = {}
    files_age = {}
    with open(datapath+'truth.txt', 'rb') as truth:
        lines = truth.readlines()
        for line in lines:
            files_gender[line.split(':::')[0]+'.xml'] = line.split(':::')[1]
            files_age[line.split(':::')[0]+'.xml'] = line.split(':::')[2]
    return files_gender, files_age

In [4]:
def extract_tweets(language, year):
    if year == '2015':
        datapath = 'data/pan15-author-profiling-training-dataset-'+language+'-2015-04-23/'
        files_gender, files_age = extract_targets(datapath)
        files = os.listdir(datapath)
        tweets = []
        for f in files:
            if f.endswith('.xml'):
                posts = []
                texts = ''
                tree = et.parse(datapath+f)
                documents = tree.iterfind('document')
                for d in documents:
                    texts += d.text+'\n'
                post = {}
                post['text'] = texts
                post['gender'] = files_gender[f]
                post['age'] = files_age[f]
                posts.append(post)
                tweets.append(posts)
        tweets = list(itertools.chain(*tweets))
    elif year == '2016':
        datapath = 'data/pan16-author-profiling-training-dataset-'+language+'-2016-02-29/'
        files_gender, files_age = extract_targets(datapath)
        files = os.listdir(datapath)
        tweets = []
        for f in files:
            if f.endswith('.xml'):
                posts = []
                texts = ''
                root = et.parse(datapath+f)
                for d in root.find('documents').findall('document'):
                    if d.text!=None:
                        soup = BeautifulSoup(d.text, 'html.parser').get_text()
                        texts += soup+'\n'
                post = {}
                post['text'] = texts
                post['gender'] = files_gender[f]
                post['age'] = files_age[f]
                posts.append(post)
                tweets.append(posts)
        tweets = list(itertools.chain(*tweets))    
    return tweets

In [5]:
language = 'english' #spanish, english, english-nltk, dutch
year = '2016' #2015, 2016

In [6]:
tweets = extract_tweets(language, year)

# Extract Features

In [7]:
"""from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import cross_validation, linear_model
hv = HashingVectorizer(ngram_range=(1, 2), binary=True)
data = []
gender = []
age = []
for t in tweets:
    data.append(t['text'])
    gender.append(t['gender'])
    age.append(t['age'])

features = hv.transform(data)"""

"from sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn import cross_validation, linear_model\nhv = HashingVectorizer(ngram_range=(1, 2), binary=True)\ndata = []\ngender = []\nage = []\nfor t in tweets:\n    data.append(t['text'])\n    gender.append(t['gender'])\n    age.append(t['age'])\n\nfeatures = hv.transform(data)"

In [10]:
with open('EmoticonSentimentLexicon.txt') as emoticon_lexicon:
    lines = emoticon_lexicon.readlines()
    emot_dict = {}
    for l in lines:
        line = l.split('\t')
        emot_dict[line[0]] = int(line[1].split('\r')[0])

In [11]:
import csv

age_lexicon = {}
gender_lexicon = {}

with open('lexica/emnlp14age.csv') as age_lexicon_file:
    age_reader = csv.reader(age_lexicon_file, delimiter=',', quotechar='"')
    age_reader.next()
    age_intercept = age_reader.next()[1]
    for row in age_reader:
        age_lexicon[row[0]] = float(row[1])
        
with open('lexica/emnlp14gender.csv') as gender_lexicon_file:
    gender_reader = csv.reader(gender_lexicon_file, delimiter=',', quotechar='"')
    gender_reader.next()
    gender_intercept = gender_reader.next()[1]
    for row in gender_reader:
        gender_lexicon[row[0]] = float(row[1])  

In [12]:
import re
gender = []
age = []
pos_tags = []
tag_count = []
i = 0
for t in tweets:
    text = t['text']
    emoticons = sentiment_feature_extraction.emoticons_from_dictionary(text, emot_dict)
    if text.split()<=0.0:
        length = len(text.split())
    else:
        length = 1.0
    caps = ('CAPS', float(sentiment_feature_extraction.caps_words(text)))
    elongated = ('ELONGATED', float(sentiment_feature_extraction.elonganted_words(text)))
    exclamation_interrogation_dict = sentiment_feature_extraction.exclamation_and_interrogation(text)
    excl = ('!', exclamation_interrogation_dict['!'])
    interr = ('?', exclamation_interrogation_dict['?'])
    omg_count = ('OMG', len(re.findall('omg+', text, re.I)))
    heart_count = ('<3', len(re.findall('<3+', text)))
    lol_count = ('lol', len(re.findall('lo+l', text, re.I)))
    lmfao_count = ('lmfao', len(re.findall('lmfa+o+', text, re.I)))
    emoticon_count = ('EMOTCOUNT', emoticons['number_emoticons'])
    emoticon_score = ('EMOTSCORE', emoticons['score_emoticons'])
    mention_count = ('@COUNT', len(re.findall('@username', text)))
    hashtag_count = ('#COUNT', len(re.findall('#', text)))
    rt_count = ('RT', len(re.findall('RT @username', text)))
    url_count = ('URL', len(re.findall('http[s]?://', text)))
    pic_count = ('PIC', len(re.findall('pic.twitter.com', text)))
    avg_text_length = ('TEXTLEN', length/len(text.split('\n')))
    words_length = 0
    for word in text.split():
        words_length += len(word)        
    avg_word_length = ('WORDLEN', words_length/length)
    count = count_pos(text, language)
    count_dict = dict(count)
    extrav_score = 0.0
    sum_tags = ['NN', 'JJ', 'IN', 'DT']
    sub_tags = ['PRP', 'VB', 'VBD', 'VBG', 'VBZ', 'VBP', 'VBN', 'RB', 'UH']
    for tag in sum_tags:
        if count_dict.has_key(tag):
            extrav_score += count_dict[tag]
    for tag in sub_tags:
        if count_dict.has_key(tag):
            extrav_score -= count_dict[tag]    
    extraversion = ('EXTRAV', (extrav_score+100)/2.0)
    
    if language == 'english':
        bf_words = 'wife|gf|girlfriend|dw'
        gf_words = 'husband|bf|boyfriend|hubby|dh'
    elif language == 'spanish':
        bf_words = 'mujer|novia|esposa'
        gf_words = 'marido|novio|esposo'
    elif language == 'dutch':
        bf_words = 'vrouw|vriendin'
        gf_words = 'man|bf|vriend'
        
    bf_count = ('GFCOUNT', len(re.findall(bf_words, text, re.I)))
    gf_count = ('BFCOUNT', len(re.findall(gf_words, text, re.I)))
        
    count.extend((caps, elongated, excl, interr, omg_count, heart_count, lol_count, lmfao_count, emoticon_count, 
            emoticon_score, mention_count, hashtag_count, rt_count, url_count, pic_count, avg_text_length, 
            avg_word_length, extraversion, bf_count, gf_count))
    
    if language == 'english':
        male_rationales = ('MALRAT', len(re.findall('bro|dude|homie', text, re.I)))
        female_rationales = ('FEMRAT', len(re.findall('cute', text, re.I)))
        
        gender_lex_count = 0.0
        age_lex_count = 0.0
        for word in text.split():
            if word in gender_lexicon.keys():
                gender_lex_count += gender_lexicon[word]
            if word in age_lexicon.keys():
                age_lex_count += age_lexicon[word]
        gender_lex = ('GLEX', gender_intercept+gender_lex_count/length)
        age_lex = ('ALEX', age_intercept+age_lex_count/length)
        
        count.extend((male_rationales, female_rationales, gender_lex, age_lex))
    
    tag_count.append(count)
    gender.append(t['gender'])
    age.append(t['age'])
    i += 1
    print str(i)+'/'+str(len(tweets))



TypeError: cannot concatenate 'str' and 'float' objects

In [13]:
# Save features

import pickle
#datapath = 'data/pan16-author-profiling-training-dataset-'+language+'-'+year+'-02-29/'
datapath = 'data/pan15-author-profiling-training-dataset-'+language+'-'+year+'-04-23/'
if not os.path.isdir(datapath+'data'):
    os.makedirs(datapath+'data')
with open(datapath+'data/tag_count.p', 'wb') as tagfile:
    pickle.dump(tag_count, tagfile)
with open(datapath+'data/gender.p', 'wb') as genderfile:
    pickle.dump(gender, genderfile)
with open(datapath+'data/age.p', 'wb') as agefile:
    pickle.dump(age, agefile)

In [14]:
# Load features

import pickle
#datapath = 'data/pan16-author-profiling-training-dataset-'+language+'-'+year+'-02-29/'
datapath = 'data/pan15-author-profiling-training-dataset-'+language+'-'+year+'-04-23/'
with open(datapath+'data/tag_count.p', 'rb') as tagfile:
    tag_count = pickle.load(tagfile)
with open(datapath+'data/gender.p', 'rb') as genderfile:
    gender = pickle.load(genderfile)
with open(datapath+'data/age.p', 'rb') as agefile:
    age = pickle.load(agefile)

In [15]:
pos_tags = []
for post in tag_count: 
    for tag in post:
        if tag[0] not in pos_tags:
            pos_tags.append(tag[0])
print pos_tags

complete_tag_count = []
for post in tag_count:
    p = dict(post)
    for pos in pos_tags:
        if pos not in p:
            post.append((pos, 0))
    post = sorted(post)
    complete_tag_count.append([i[1] for i in post])

[]


# Train and evaluate classifiers

In [16]:
tag_total = np.array(complete_tag_count)
gender_total = np.array(gender)
#remove_index = pos_tags.index('EXTRAV')
#filtered_tags = np.zeros(shape=(len(tag_total),65))
#for t in range(0, len(tag_total)-1):
#    filtered_tags[t] = np.delete(tag_total[t], remove_index)
#tag_total = filtered_tags

In [17]:
from sklearn import cross_validation, svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, grid_search
from sklearn.multiclass import OneVsRestClassifier

age_total = np.array(age)
#features_total = np.array(features)

predicted_class = age_total # gender_total, age_total

clf1 = linear_model.LogisticRegression()
clf2 = ensemble.RandomForestClassifier(n_estimators=100)
clf3 = tree.DecisionTreeClassifier(max_depth=3)
clf4 = svm.SVC(kernel='linear', probability=True, C=0.05)
clf5 = naive_bayes.GaussianNB()
clf6 = naive_bayes.BernoulliNB()
clf7 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf8 = ensemble.AdaBoostClassifier(n_estimators=100)
clf9 = OneVsRestClassifier(clf4)

eclf = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('dt', clf3), ('kn', clf4),
                                             ('svcl', clf5), ('gnb', clf6), ('gbc', clf7), ('ada', clf8), ('multi', clf9)], voting='soft')
eclf2 = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('dt', clf3), ('kn', clf4),
                                             ('svcl', clf5), ('gnb', clf6), ('gbc', clf7), ('ada', clf8), ('multi', clf9)], voting='hard')

In [19]:
cv = cross_validation.KFold(tag_total.shape[0], 3)

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, eclf, eclf2], ['Logistic Regression', 'Random Forest',
    'Decision Tree', 'SVC Linear','Gaussian NB', 'Bernoulli NB', 'Gradient Boosting Classifier', 'AdaBoost', 'One vs Rest', 
    'Soft Voting Ensemble', 'Hard Voting Ensemble']):
    scores = cross_validation.cross_val_score(clf, tag_total, predicted_class, cv=cv, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

ValueError: Cannot have number of folds n_folds=3 greater than the number of samples: 0.

In [15]:
"""cv = cross_validation.KFold(tag_total.shape[0], 3)

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, eclf, grid], ['Logistic Regression', 'Random Forest',
                                                 'Decision Tree', 'SVC Linear','Gaussian NB', 'Bernoulli NB', 
                                                'Gradient Boosting Classifier', 'AdaBoost', 'One vs Rest', 'Voting Ensemble', 'Grid Search']):
    scores = cross_validation.cross_val_score(clf, features, predicted_class, cv=cv, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))"""

TypeError: Singleton array array(<152x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 220223 stored elements in Compressed Sparse Row format>, dtype=object) cannot be considered a valid collection.

In [None]:
clf8.fit(tag_total, predicted_class)
for c in range(1,len(clf8.feature_importances_)):
    print pos_tags[c], clf8.feature_importances_[c]

In [None]:
clf3.fit(tag_total, gender_total)
from sklearn.externals.six import StringIO
with open("gender.dot", 'w') as f:
    f = tree.export_graphviz(clf3, out_file=f, filled=True, class_names=['Female', 'Male'], feature_names=pos_tags)