# Load the dataset

In [1]:
# -*- coding: utf-8 -*-
import os
#os.chdir('/home/croman/PycharmProjects/gender')
from pos import count_pos
import sentiment_feature_extraction
import os, xml.etree.ElementTree as et, itertools
from bs4 import BeautifulSoup
import numpy as np
import extract_features_from_text as extractor

In [2]:
def extract_targets(datapath):
    files_gender = {}
    files_age = {}
    with open(datapath+'truth.txt', 'rb') as truth:
        lines = truth.readlines()
        for line in lines:
            files_gender[line.split(':::')[0]+'.xml'] = line.split(':::')[1]
            files_age[line.split(':::')[0]+'.xml'] = line.split(':::')[2]
    return files_gender, files_age

In [3]:
def extract_tweets(language, year):
    print 'Extracting tweets...'
    if year == '2015':
        datapath = 'data/pan15-author-profiling-training-dataset-'+language+'-2015-04-23/'
        files_gender, files_age = extract_targets(datapath)
        files = os.listdir(datapath)
        tweets = []
        for f in files:
            if f.endswith('.xml'):
                posts = []
                texts = ''
                tree = et.parse(datapath+f)
                documents = tree.iterfind('document')
                for d in documents:
                    texts += d.text+'\n'
                post = {}
                post['text'] = texts
                post['gender'] = files_gender[f]
                post['age'] = files_age[f]
                posts.append(post)
                tweets.append(posts)
        tweets = list(itertools.chain(*tweets))
    elif year == '2016':
        datapath = 'data/pan16-author-profiling-training-dataset-'+language+'-2016-02-29/'
        files_gender, files_age = extract_targets(datapath)
        files = os.listdir(datapath)
        tweets = []
        i=1
        for f in files:
            if f.endswith('.xml'):                
                posts = []
                texts = ''
                root = et.parse(datapath+f)
                for d in root.find('documents').findall('document'):
                    if d.text!=None:
                        soup = BeautifulSoup(d.text, 'html.parser').get_text()
                        texts += soup+'\n'
                post = {}
                post['text'] = texts
                post['gender'] = files_gender[f]
                post['age'] = files_age[f]
                posts.append(post)
                tweets.append(posts)
            print str(i)+'/'+str(len(files))
            i+=1
        tweets = list(itertools.chain(*tweets))    
    return tweets

# Extract Features

In [4]:
"""from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import cross_validation, linear_model
hv = HashingVectorizer(ngram_range=(1, 2), binary=True)
data = []
gender = []
age = []
for t in tweets:
    data.append(t['text'])
    gender.append(t['gender'])
    age.append(t['age'])

features = hv.transform(data)"""

"from sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn import cross_validation, linear_model\nhv = HashingVectorizer(ngram_range=(1, 2), binary=True)\ndata = []\ngender = []\nage = []\nfor t in tweets:\n    data.append(t['text'])\n    gender.append(t['gender'])\n    age.append(t['age'])\n\nfeatures = hv.transform(data)"

In [5]:
def load_lexicons():
    with open('EmoticonSentimentLexicon.txt') as emoticon_lexicon:
        import csv

        lines = emoticon_lexicon.readlines()
        global emot_dict
        emot_dict = {}
        for l in lines:
            line = l.split('\t')
            emot_dict[line[0]] = int(line[1].split('\r')[0])    

        global age_lexicon
        global gender_lexicon
        age_lexicon = {}
        gender_lexicon = {}
    with open('lexica/emnlp14age.csv') as age_lexicon_file:
        age_reader = csv.reader(age_lexicon_file, delimiter=',', quotechar='"')
        age_reader.next()
        global age_intercept
        age_intercept = age_reader.next()[1]
        for row in age_reader:
            age_lexicon[row[0]] = float(row[1])
    with open('lexica/emnlp14gender.csv') as gender_lexicon_file:
        gender_reader = csv.reader(gender_lexicon_file, delimiter=',', quotechar='"')
        gender_reader.next()
        global gender_intercept
        gender_intercept = gender_reader.next()[1]
        for row in gender_reader:
            gender_lexicon[row[0]] = float(row[1])

In [6]:
def extract_features(tweets, language):
    import re
    print 'Extracting features...'
    gender = []
    age = []
    pos_tags = []
    tag_count = []
    i = 0
    for t in tweets:
        text = t['text']
        emoticons = sentiment_feature_extraction.emoticons_from_dictionary(text, emot_dict)
        if text.split()<=0.0:
            length = len(text.split())
        else:
            length = 1.0
        caps = ('CAPS', float(sentiment_feature_extraction.caps_words(text)))
        elongated = ('ELONGATED', float(sentiment_feature_extraction.elonganted_words(text)))
        exclamation_interrogation_dict = sentiment_feature_extraction.exclamation_and_interrogation(text)
        excl = ('!', exclamation_interrogation_dict['!'])
        interr = ('?', exclamation_interrogation_dict['?'])
        omg_count = ('OMG', len(re.findall('omg+', text, re.I)))
        heart_count = ('<3', len(re.findall('<3+', text)))
        lol_count = ('lol', len(re.findall('lo+l', text, re.I)))
        lmfao_count = ('lmfao', len(re.findall('lmfa+o+', text, re.I)))
        emoticon_count = ('EMOTCOUNT', emoticons['number_emoticons'])
        emoticon_score = ('EMOTSCORE', emoticons['score_emoticons'])
        mention_count = ('@COUNT', len(re.findall('@username', text)))
        hashtag_count = ('#COUNT', len(re.findall('#', text)))
        rt_count = ('RT', len(re.findall('RT @username', text)))
        url_count = ('URL', len(re.findall('http[s]?://', text)))
        pic_count = ('PIC', len(re.findall('pic.twitter.com', text)))
        avg_text_length = ('TEXTLEN', length/len(text.split('\n')))
        words_length = 0
        for word in text.split():
            words_length += len(word)        
        avg_word_length = ('WORDLEN', words_length/length)
        count = count_pos(text, language)
        count_dict = dict(count)
        extrav_score = 0.0
        sum_tags = ['NN', 'JJ', 'IN', 'DT']
        sub_tags = ['PRP', 'VB', 'VBD', 'VBG', 'VBZ', 'VBP', 'VBN', 'RB', 'UH']
        for tag in sum_tags:
            if count_dict.has_key(tag):
                extrav_score += count_dict[tag]
        for tag in sub_tags:
            if count_dict.has_key(tag):
                extrav_score -= count_dict[tag]    
        extraversion = ('EXTRAV', (extrav_score+100)/2.0)

        if language == 'english':
            bf_words = 'wife|gf|girlfriend|dw'
            gf_words = 'husband|bf|boyfriend|hubby|dh'
        elif language == 'spanish':
            bf_words = 'mujer|novia|esposa'
            gf_words = 'marido|novio|esposo'
        elif language == 'dutch':
            bf_words = 'vrouw|vriendin'
            gf_words = 'man|bf|vriend'

        bf_count = ('GFCOUNT', len(re.findall(bf_words, text, re.I)))
        gf_count = ('BFCOUNT', len(re.findall(gf_words, text, re.I)))

        count.extend((caps, elongated, excl, interr, omg_count, heart_count, lol_count, lmfao_count, emoticon_count, 
                emoticon_score, mention_count, hashtag_count, rt_count, url_count, pic_count, avg_text_length, 
                avg_word_length, extraversion, bf_count, gf_count))

        if language == 'english':
            male_rationales = ('MALRAT', len(re.findall('bro|dude|homie', text, re.I)))
            female_rationales = ('FEMRAT', len(re.findall('cute', text, re.I)))

            gender_lex_count = 0.0
            age_lex_count = 0.0
            for word in text.split():
                if word in gender_lexicon.keys():
                    gender_lex_count += gender_lexicon[word]
                if word in age_lexicon.keys():
                    age_lex_count += age_lexicon[word]
            gender_lex = ('GLEX', float(gender_intercept)+gender_lex_count/length)
            age_lex = ('ALEX', float(age_intercept)+age_lex_count/length)
            
            """selected_features_LIWC = extractor.extract_features_text(text, dictionary_LIWC_en)
            selected_categories_LIWC = extractor.features_to_categories(selected_features_LIWC, dictionary_LIWC_en)
            tuples_LIWC = extractor.parse_dict_to_tuples(selected_categories_LIWC)"""

            count.extend((male_rationales, female_rationales, gender_lex, age_lex))
            #count.extend(tuples_LIWC)

        if t['gender']=='M':
            g = 0
        elif t['gender']=='F':
            g = 1
        if t['age']=='18-24':
            a = 0
        elif t['age']=='25-34':
            a = 1
        elif t['age']=='35-49':
            a = 2
        elif t['age']=='50-64':
            a = 3
        elif t['age']=='65-xx':
            a = 4
        #count.extend([('GENDER', g), ('AGE', a)])

        tag_count.append(count)
        gender.append(t['gender'])
        age.append(t['age'])
        i += 1
        print str(i)+'/'+str(len(tweets))
    return tag_count, gender, age

In [7]:
# Save features
def save_features(language, year, tag_count, gender, age):
    import pickle

    if year == '2015':
        datapath = 'data/pan15-author-profiling-training-dataset-'+language+'-2015-04-23/'
    elif year == '2016':
        datapath = 'data/pan16-author-profiling-training-dataset-'+language+'-2016-02-29/'

    if not os.path.isdir(datapath+'data'):
        os.makedirs(datapath+'data')
    with open(datapath+'data/tag_count.p', 'wb') as tagfile:
        pickle.dump(tag_count, tagfile)
    with open(datapath+'data/gender.p', 'wb') as genderfile:
        pickle.dump(gender, genderfile)
    with open(datapath+'data/age.p', 'wb') as agefile:
        pickle.dump(age, agefile)

In [8]:
# Load features
def load_features(language, year):
    import pickle

    if year == '2015':
        datapath = 'data/pan15-author-profiling-training-dataset-'+language+'-2015-04-23/'
    elif year == '2016':
        datapath = 'data/pan16-author-profiling-training-dataset-'+language+'-2016-02-29/'

    with open(datapath+'data/tag_count.p', 'rb') as tagfile:
        tag_count = pickle.load(tagfile)
    with open(datapath+'data/gender.p', 'rb') as genderfile:
        gender = pickle.load(genderfile)
    with open(datapath+'data/age.p', 'rb') as agefile:
        age = pickle.load(agefile)
        
    return tag_count, gender, age

In [9]:
def complete_tags(tag_count):
    pos_tags = []
    for post in tag_count: 
        for tag in post:
            if tag[0] not in pos_tags:
                pos_tags.append(tag[0])
    #print pos_tags

    complete_tag_count = []
    for post in tag_count:
        p = dict(post)
        for pos in pos_tags:
            if pos not in p:
                post.append((pos, 0))
        post = sorted(post)
        complete_tag_count.append([i[1] for i in post])
        
    return complete_tag_count

In [10]:
def complete_mixed_tags(tag_count1, tag_count2):
    pos_tags = []
    for post in tag_count1: 
        for tag in post:
            if tag[0] not in pos_tags:
                pos_tags.append(tag[0])
    for post in tag_count2:
        for tag in post:
            if tag[0] not in pos_tags:
                pos_tags.append(tag[0])
    #print pos_tags

    complete_tag_count1 = []
    complete_tag_count2 = []
    
    for post in tag_count1:
        p = dict(post)
        for pos in pos_tags:
            if pos not in p:
                post.append((pos, 0))
        post = sorted(post)
        complete_tag_count1.append([i[1] for i in post])
        
    for post in tag_count2:
        p = dict(post)
        for pos in pos_tags:
            if pos not in p:
                post.append((pos, 0))
        post = sorted(post)
        complete_tag_count2.append([i[1] for i in post])
        
    return complete_tag_count1, complete_tag_count2

# Train and evaluate classifiers

In [11]:
"""tag_total = np.array(complete_tag_count)
gender_total = np.array(gender)
age_total = np.array(age)

remove_index = pos_tags.index('EXTRAV')
remove_index = [3,4,8,9,11,12,14,15,18,19,20,22,23,24,25,26,29,31,32,33,34,35,36,37,38,39,41,42,44,47,48,49,
                50,52,53,54,55,56,57,58,59,60,61,62]
filtered_tags = np.zeros(shape=(len(tag_total),29))
for t in range(0, len(tag_total)-1):
    filtered_tags[t] = np.delete(tag_total[t], remove_index)
tag_total = filtered_tags"""

"tag_total = np.array(complete_tag_count)\ngender_total = np.array(gender)\nage_total = np.array(age)\n\nremove_index = pos_tags.index('EXTRAV')\nremove_index = [3,4,8,9,11,12,14,15,18,19,20,22,23,24,25,26,29,31,32,33,34,35,36,37,38,39,41,42,44,47,48,49,\n                50,52,53,54,55,56,57,58,59,60,61,62]\nfiltered_tags = np.zeros(shape=(len(tag_total),29))\nfor t in range(0, len(tag_total)-1):\n    filtered_tags[t] = np.delete(tag_total[t], remove_index)\ntag_total = filtered_tags"

In [12]:
def train_and_evaluate(complete_tag_count, prediction, predicted_class, nonpredicted_class):    
    from sklearn import cross_validation, svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, grid_search
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn import metrics

    tag_total = np.array(complete_tag_count)
    predicted_final = np.array(predicted_class)
    nonpredicted_final = np.array(nonpredicted_class)
    """if prediction == 'age':
        remove_index = len(tag_total[0])-1
    elif prediction == 'gender':
        remove_index = len(tag_total[0])-2
    filtered_tags = np.zeros(shape=(len(tag_total),len(tag_total[0])-1))
    for t in range(0, len(tag_total)-1):
        filtered_tags[t] = np.delete(tag_total[t], remove_index)
    print tag_total
    tag_total = filtered_tags
    print tag_total"""
    #features_total = np.array(features)

    clf1 = linear_model.LogisticRegression(n_jobs=9)
    clf2 = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=9)
    clf3 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=1, random_state=0, criterion='entropy',
                                        n_jobs=9)
    clf4 = tree.DecisionTreeClassifier(max_depth=3)
    clf5 = svm.SVC(kernel='linear', probability=True, C=0.05)
    clf6 = naive_bayes.GaussianNB()
    clf7 = naive_bayes.BernoulliNB()
    clf8 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1, random_state=0)
    clf9 = ensemble.AdaBoostClassifier(n_estimators=100)
    clf10 = OneVsRestClassifier(clf4, n_jobs=9)

    eclf = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('ext', clf3), ('dt', clf4), ('kn', clf5),
                                                 ('svcl', clf6), ('gnb', clf7), ('gbc', clf8), ('ada', clf9), ('multi', clf10)
                                                 ], voting='soft')
    eclf2 = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('ext', clf3), ('dt', clf4), ('kn', clf5),
                                                 ('svcl', clf6), ('gnb', clf7), ('gbc', clf8), ('ada', clf9), ('multi', clf10)
                                                 ], voting='hard')
    
    #cv = cross_validation.KFold(tag_total.shape[0], 3)
    cv = cross_validation.StratifiedKFold(predicted_final, 10)

    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, eclf, eclf2], ['Logistic Regression', 
        'Random Forest', 'Extra Trees', 'Decision Tree', 'SVC Linear','Gaussian NB', 'Bernoulli NB', 'Gradient Boosting Classifier',
        'AdaBoost', 'One vs Rest', 'Soft Voting Ensemble', 'Hard Voting Ensemble']):
        """scores = cross_validation.cross_val_score(clf, tag_total, predicted_final, cv=cv, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))"""
        if prediction == 'age':
            results = cross_validation.cross_val_predict(clf, tag_total, nonpredicted_final, cv=cv)           
            final_tags = []
            for i in range(len(tag_total)):
                user = tag_total[i]
                user_gender = results[i]
                if user_gender == 'M' or user_gender == 'MALE':
                    g = 0
                elif user_gender == 'F' or user_gender == 'FEMALE':
                    g = 1
                user = np.append(user, g)
                final_tags.append(user)
        else:
            final_tags = tag_total
        scores = cross_validation.cross_val_score(clf, final_tags, predicted_final, cv=cv, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [13]:
def train_and_predict(train_tags, test_tags, prediction, training_class):    
    from sklearn import cross_validation, svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, grid_search
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.ensemble import BaggingClassifier

    train_total = np.array(train_tags)
    test_total = np.array(test_tags)
    final_class = np.array(training_class)
    
    #features_total = np.array(features)
    #print train_total
    #print '--------------------------------'
    #print test_total

    clf1 = linear_model.LogisticRegression(n_jobs=9)
    clf2 = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=9)
    clf3 = ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=1, random_state=0, criterion='entropy',
                                        n_jobs=9)
    clf4 = tree.DecisionTreeClassifier(max_depth=3)
    clf5 = svm.SVC(kernel='linear', probability=True, C=0.05)
    clf6 = naive_bayes.GaussianNB()
    clf7 = naive_bayes.BernoulliNB()
    clf8 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=1, random_state=0)
    clf9 = ensemble.AdaBoostClassifier(n_estimators=100)
    clf10 = OneVsRestClassifier(clf4, n_jobs=9)

    eclf = ensemble.VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('ext', clf3), ('dt', clf4), ('kn', clf5),
                                                 ('svcl', clf6), ('gnb', clf7), ('gbc', clf8), ('ada', clf9), ('multi', clf10)
                                                 ], voting='soft')
    
    eclf.fit(train_total, final_class)
    predicted_tags = eclf.predict(test_total)
    final_tags = []
    if prediction == 'age':
        for tag in predicted_tags:
            if tag == '18-24':
                final_tags.append(0)
            elif tag == '25-34':
                final_tags.append(1)
            elif tag == '35-49':
                final_tags.append(2)
            elif tag == '50-64':
                final_tags.append(3)
            elif tag == '65-xx':
                final_tags.append(4)
            elif tag == '50-XX':
                final_tags.append(3)
            else:
                print tag
    elif prediction == 'gender':
        for tag in predicted_tags:
            if tag == 'M':
                final_tags.append(0)
            elif tag == 'F':
                final_tags.append(1)
    return final_tags

In [14]:
def features(language, year):
    tweets = extract_tweets(language, year)
    load_lexicons()
    global dictionary_LIWC_en, dictionary_LIWC_es, dictionary_LIWC_nl
    dictionary_LIWC_en = extractor.load_LIWC('dictionaries/english/')
    dictionary_LIWC_es = extractor.load_LIWC('dictionaries/spanish/')
    dictionary_LIWC_nl = extractor.load_LIWC('dictionaries/dutch/')
    tags, gender, age = extract_features(tweets, language)
    save_features(language, year, tags, gender, age)

In [15]:
def add_tags(tags_list, new_tags):
    new_list = []
    for i in range(0, len(tags_list)):
        tags = tags_list[i]
        tags.append(new_tags[i])
        new_list.append(tags)
    return new_list

In [16]:
def evaluate(language, year, prediction):
    tags, gender, age = load_features(language, year)
    if prediction == 'age':
        predicted_class = age
        training_class = gender
    elif prediction == 'gender':
        predicted_class = gender
        training_class = age
    final_tags = complete_tags(tags)
    train_and_evaluate(final_tags, prediction, predicted_class, training_class)

In [17]:
def predict(language, train_year, test_year, prediction):
    train_tags, train_gender, train_age = load_features(language, train_year)
    test_tags, test_gender, test_age = load_features(language, test_year)
    if prediction == 'age':
        training_class = train_age
    elif prediction == 'gender':
        training_class = train_gender
    final_train_tags, final_test_tags = complete_mixed_tags(train_tags, test_tags)
    return train_and_predict(final_train_tags, final_test_tags, prediction, training_class)

In [None]:
def mixed_evaluate(language, train_year, test_year, prediction):
    tags, gender, age = load_features(language, test_year)
    if prediction == 'age':
        pretraining = 'gender'
        pretraining_class = gender
    elif prediction == 'gender':
        pretraining = 'age'
        pretraining_class = age
    class_tags = predict(language, train_year, test_year, pretraining)
    if prediction == 'age':
        predicted_class = age
    if prediction == 'gender':
        predicted_class = gender
    print len(tags), len(class_tags)
    final_tags = complete_tags(tags)
    final_tags = add_tags(final_tags, class_tags)
    train_and_evaluate(final_tags, prediction, predicted_class)

In [None]:
language = 'spanish' #spanish, english, english-nltk, dutch
year = '2016' #2015, 2016
#features(language, year)
evaluate(language, year, 'age')

Accuracy: 0.50 (+/- 0.09) [Logistic Regression]
Accuracy: 0.50 (+/- 0.09) [Random Forest]

In [2]:
language = 'spanish'
train_year = '2015'
test_year = '2016'
prediction = 'gender'
mixed_evaluate(language, train_year, test_year, prediction)

NameError: name 'mixed_evaluate' is not defined

In [None]:
language = 'spanish'
year = '2016'
#features(language, year)
evaluate(language, year, 'age')

Accuracy: 0.73 (+/- 0.08) [Logistic Regression]
Accuracy: 0.74 (+/- 0.08) [Random Forest]
Accuracy: 0.73 (+/- 0.08) [Extra Trees]
Accuracy: 0.76 (+/- 0.06) [Decision Tree]

In [72]:
tags, gender, age = load_features(language, year)
final_tags = complete_tags(tags)
print tags
print pos_tags
print final_tags

[[(u'MD', 0.008408071748878924), ('VB', 0.10594170403587444), (u':', 0.005605381165919282), (u'VBG', 0.0016816143497757848), ('JJ', 0.04035874439461883), ('NN', 0.28475336322869954), (u'CC', 0.020179372197309416), (u'PRP$', 0.010089686098654708), (u'.', 0.14405829596412556), (u'"', 0.011210762331838564), (u'PRP', 0.033071748878923765), ('RB', 0.05100896860986547), (u'IN', 0.13565022421524664), (u'WP$', 0.027466367713004484), (u'Zu', 0.0005605381165919282), (u'DT', 0.08800448430493274), (u'CD', 0.010650224215246636), (u',', 0.018497757847533634), (u'UH', 0.002802690582959641), ('CAPS', 9.0), ('ELONGATED', 16.0), ('!', 47), ('?', 9), ('OMG', 0), ('<3', 0), ('lol', 0), ('lmfao', 0), ('EMOTCOUNT', 1), ('EMOTSCORE', 1), ('@COUNT', 67), ('#COUNT', 12), ('RT', 0), ('URL', 2), ('PIC', 0), ('TEXTLEN', 0.009900990099009901), ('WORDLEN', 7060.0), ('EXTRAV', 50.17713004484305), ('GFCOUNT', 1), ('BFCOUNT', 0), ('GENDER', 0), ('AGE', 0), (u'(', 0), (u')', 0)], [(u'MD', 0.008690254500310366), (u'PRP$

In [15]:
"""cv = cross_validation.KFold(tag_total.shape[0], 3)

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, eclf, grid], ['Logistic Regression', 'Random Forest',
                                                 'Decision Tree', 'SVC Linear','Gaussian NB', 'Bernoulli NB', 
                                                'Gradient Boosting Classifier', 'AdaBoost', 'One vs Rest', 'Voting Ensemble', 'Grid Search']):
    scores = cross_validation.cross_val_score(clf, features, predicted_class, cv=cv, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))"""

TypeError: Singleton array array(<152x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 220223 stored elements in Compressed Sparse Row format>, dtype=object) cannot be considered a valid collection.

In [None]:
clf8.fit(tag_total, predicted_class)
for c in range(1,len(clf8.feature_importances_)):
    print pos_tags[c], clf8.feature_importances_[c]

In [None]:
clf3.fit(tag_total, gender_total)
from sklearn.externals.six import StringIO
with open("gender.dot", 'w') as f:
    f = tree.export_graphviz(clf3, out_file=f, filled=True, class_names=['Female', 'Male'], feature_names=pos_tags)