In [1]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
import pandas as pd
import gensim
import numpy as np
import re

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

from gensim.models import KeyedVectors

from model import Model

In [3]:
cow = KeyedVectors.load_word2vec_format("./models/cow/cow-320.txt", binary=False)
#rou = KeyedVectors.load_word2vec_format("./models/roularta/roularta-320.txt", binary=False)
UD = Model('./models/dutch_ud.udpipe')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.base import clone

In [5]:
def xml_to_df(filename):
    with open('./data/{}.txt'.format(filename)) as f:
        return pd.DataFrame(
            re.findall('<doc id=\"(.*?)\" genre=\"(.*?)\" gender=\"(.*?)\">\n(.*?)</doc>', f.read(), re.DOTALL),
            columns=['id','genre','gender','text'])
    
def get_chains(text):
    try:
        sentences = UD.tokenize(text)
        for s in sentences:
            UD.tag(s)
            UD.parse(s)
        conllu = UD.write(sentences, "conllu")
        rows = conllu.split('\n')
        string_pos, string_rel_type, string_lemmas = '', '', ''
        for row in rows:
            if '#' not in row and row != '':
                row = row.split('\t')
                string_pos = string_pos + row[3] + ' '
                string_rel_type = string_rel_type + row[7] + ' '
                string_lemmas = string_lemmas + row[2] + ' '
            else:
                string_pos += '\n'
                string_rel_type += '\n'
                string_lemmas += '\n'
        return (re.sub('\n+','\n', string_lemmas[:-1]).strip(), 
                re.sub('\n+','\n', string_pos[:-1]).strip(), 
                re.sub('\n+','\n', string_rel_type[:-1].replace(':','_')).strip())
    except:
        return '', '', ''

In [54]:
news = xml_to_df('GxG_News')
twitter = xml_to_df('GxG_Twitter')
youtube = xml_to_df('GxG_YouTube')

In [55]:
%%time
data = np.array([list(get_chains(i)) for i in news['text'].values]).reshape(news.shape[0],3).T
news['lemmatized'], news['pos'], news['rel'] = data[0], data[1], data[2]

CPU times: user 4min 34s, sys: 234 ms, total: 4min 34s
Wall time: 4min 34s


In [56]:
news.head()

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel
0,1,news,M,KVV begint aan nieuw voetbalhoofdstuk\nZelzate...,Kvv beginnen aan nieuw voetbalhoofdstuk Zelzat...,NOUN VERB ADP ADJ NOUN PROPN PUNCT ADP DET PRO...,nsubj root case amod obl appos punct case det ...
1,2,news,M,"'t Endezomer "" het laatste zomerse Meetjeslan...","het Endezomer "" het laat zomers Meetjeslands m...",DET PROPN PUNCT DET ADJ ADJ ADJ NOUN PUNCT X P...,det flat_name punct det amod amod amod nsubj p...
2,3,news,M,Kamperen bij het kanaal in de Zouten\nBerendre...,kamperen bij het kanaal in de zouten Berendrec...,VERB ADP DET NOUN ADP DET PROPN PROPN PUNCT DE...,nsubj case det obl case det obl flat_name punc...
3,4,news,F,Sintdreiger komt weg met boete\nUTRECHT - Hij ...,Sintdreiger komen weg met boete Utrecht - hij ...,PROPN VERB ADV ADP NOUN PROPN PUNCT PRON VERB ...,nsubj root compound_prt case obl appos punct n...
4,5,news,F,‘In Irak zal ik óók het belang van de Nederlan...,‘In Irak zullen ik óóken het belang van de Ned...,ADP PROPN AUX PRON ADJ DET NOUN ADP DET ADJ NO...,case obl aux nsubj advmod det obj case det amo...


In [62]:
print (news.pos.values[1])

DET PROPN PUNCT DET ADJ ADJ ADJ NOUN PUNCT X PUNCT PROPN PROPN PROPN PUNCT DET NOUN ADP PROPN CCONJ PRON NOUN AUX ADV VERB SCONJ PRON PUNCT ADV ADJ SCONJ ADJ PUNCT DET ADJ NOUN AUX VERB PUNCT 
DET NOUN AUX ADV ADJ ADP PUNCT


In [59]:
%%time
data = np.array([list(get_chains(i)) for i in youtube['text'].values]).reshape(youtube.shape[0],3).T
youtube['lemmatized'], youtube['pos'], youtube['rel'] = data[0], data[1], data[2]

CPU times: user 3min 27s, sys: 1.25 s, total: 3min 28s
Wall time: 3min 28s


In [60]:
%%time
data = np.array([list(get_chains(i)) for i in twitter['text'].values]).reshape(twitter.shape[0],3).T
twitter['lemmatized'], twitter['pos'], twitter['rel'] = data[0], data[1], data[2]

CPU times: user 4min 55s, sys: 219 ms, total: 4min 56s
Wall time: 4min 56s


In [61]:
news.to_csv('./data/news.csv', sep='\t', index=False)
youtube.to_csv('./data/youtube.csv', sep='\t', index=False)
twitter.to_csv('./data/twitter.csv', sep='\t', index=False)

In [12]:
news = pd.read_csv('./data/news.csv', sep='\t').fillna('')
youtube = pd.read_csv('./data/youtube.csv', sep='\t').fillna('')
twitter= pd.read_csv('./data/twitter.csv', sep='\t').fillna('')

In [13]:
news.head()

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel
0,1,news,M,KVV begint aan nieuw voetbalhoofdstuk\nZelzate...,Kvv beginnen aan nieuw voetbalhoofdstuk Zelzat...,NOUN VERB ADP ADJ NOUN PROPN PUNCT ADP DET PRO...,nsubj root case amod obl appos punct case det ...
1,2,news,M,"'t Endezomer "" het laatste zomerse Meetjeslan...","het Endezomer "" het laat zomers Meetjeslands m...",DET PROPN PUNCT DET ADJ ADJ ADJ NOUN PUNCT X P...,det flat_name punct det amod amod amod nsubj p...
2,3,news,M,Kamperen bij het kanaal in de Zouten\nBerendre...,kamperen bij het kanaal in de zouten Berendrec...,VERB ADP DET NOUN ADP DET PROPN PROPN PUNCT DE...,nsubj case det obl case det obl flat_name punc...
3,4,news,F,Sintdreiger komt weg met boete\nUTRECHT - Hij ...,Sintdreiger komen weg met boete Utrecht - hij ...,PROPN VERB ADV ADP NOUN PROPN PUNCT PRON VERB ...,nsubj root compound_prt case obl appos punct n...
4,5,news,F,‘In Irak zal ik óók het belang van de Nederlan...,‘In Irak zullen ik óóken het belang van de Ned...,ADP PROPN AUX PRON ADJ DET NOUN ADP DET ADJ NO...,case obl aux nsubj advmod det obj case det amo...


In [7]:
news_test = xml_to_df('GxG_News_test')
twitter_test = xml_to_df('GxG_Twitter_test')
youtube_test = xml_to_df('GxG_YouTube_test')

data = np.array([list(get_chains(i)) for i in news_test['text'].values]).reshape(news_test.shape[0],3).T
news_test['lemmatized'], news_test['pos'], news_test['rel'] = data[0], data[1], data[2]

data = np.array([list(get_chains(i)) for i in twitter_test['text'].values]).reshape(twitter_test.shape[0],3).T
twitter_test['lemmatized'], twitter_test['pos'], twitter_test['rel'] = data[0], data[1], data[2]

data = np.array([list(get_chains(i)) for i in youtube_test['text'].values]).reshape(youtube_test.shape[0],3).T
youtube_test['lemmatized'], youtube_test['pos'], youtube_test['rel'] = data[0], data[1], data[2]

In [8]:
news_test.head()

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel
0,1,news,?,Doodrijder juf Mare: ’Alsof er uit het niets i...,Doodrijder juf Mare : ’Alsof er uit het niets ...,NOUN PROPN PROPN PUNCT VERB ADV ADP DET PRON P...,nsubj appos flat_name punct root advmod case f...
1,2,news,?,"Wie zag Lady?\n Wetteren - Lady, een akita te...","wie zien Lady ? \nWetteren - Lady , een akita ...",PRON VERB X PUNCT \nVERB PUNCT X PUNCT DET NOU...,nsubj root obj punct \nroot punct parataxis pu...
2,3,news,?,Lasershooten in het speelbos als alternatief k...,Lasershooten in het speelbos als alternatief k...,NOUN ADP DET NOUN ADP ADJ NOUN PROPN VERB DET ...,nsubj case det nmod mark amod nmod appos root ...
3,4,news,?,Aannemers tonen geen interesse voor Torendraai...,Aannemer tonen geen interesse voor Torendraaie...,NOUN VERB DET NOUN ADP NOUN PRON VERB ADP NOUN...,nsubj root det obj case nmod nsubj acl case ob...
4,5,news,?,Het verkeer aan de Brielpoort bij het stemmen\...,het verkeer aan de Brielpoort bij het stem Dei...,DET NOUN ADP DET PROPN ADP DET NOUN PROPN PUNC...,det nsubj case det nmod case det nmod appos pu...


In [14]:
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
from gensim.corpora import Dictionary

def vectors(docs, model):
    docs = [doc.split() for doc in docs]
    docs_dict = Dictionary(docs)
    docs_dict.filter_extremes(no_below=20, no_above=0.8)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf  = model_tfidf[docs_corpus]
    docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
    tfidf_emb_vecs = np.vstack([model[docs_dict[i]] if docs_dict[i] in model else np.zeros(320) 
                                for i in range(len(docs_dict)) ])
    docs_emb = np.dot(docs_vecs, tfidf_emb_vecs) 
    return docs_emb

In [15]:
news_w2v = vectors(news.lemmatized.values, cow)
np.save('news_w2v', news_w2v)

In [16]:
youtube_w2v = vectors(youtube.lemmatized.values, cow)
np.save('youtube_w2v', youtube_w2v)

In [17]:
twitter_w2v = vectors(twitter.lemmatized.values, cow)
np.save('twitter_w2v', twitter_w2v)

In [24]:
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, news_w2v, news.gender.values, cv=10).mean()

0.6162028189202102

In [25]:
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, youtube_w2v, youtube.gender.values, cv=10).mean()

0.5793570028644656

In [26]:
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, twitter_w2v, twitter.gender.values, cv=10).mean()

0.6089

In [9]:
def _normalized(text):
    if text.startswith('@'):
        return '@@'
    elif text.startswith('#'):
        return '##'
    elif text.startswith('http') or text.startswith('www'):
        return '~~'
    else:
        return text
    
def normalized(text):
    text = [_normalized(i) for i in tknzr.tokenize(text)]
    text = ' '.join(text)
    return text

In [28]:
news['tokenized'] = news['text'].apply(normalized)
twitter['tokenized'] = twitter['text'].apply(normalized)
youtube['tokenized'] = youtube['text'].apply(normalized)

In [29]:
twitter.head()

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel,tokenized
0,1,twitter,F,Ik weet ook gewoon niet meer wie en wat ik ben...,ik weten ook gewoon niet veel wie en wat ik zi...,PRON VERB ADV ADJ ADV PRON PRON CCONJ PRON PRO...,nsubj root advmod advmod advmod obl nsubj cc c...,Ik weet ook gewoon niet meer wie en wat ik ben...
1,2,twitter,M,"@HaroldinhoXL Foto’s van laten maken, voor die...","@HaroldinhoXL Foto’s van laten maken , voor di...",PROPN PROPN ADP AUX VERB PUNCT ADP DET NOUN SC...,nsubj flat_name compound_prt aux root punct ca...,"@@ Foto ’ s van laten maken , voor die tijd da..."
2,3,twitter,M,@Rijkswaterstaat dacht dat de werken bij vught...,@Rijkswaterstaan denken dat de werk bij vught ...,AUX VERB SCONJ DET NOUN ADP NOUN ADP NUM NOUN ...,cop root mark det nsubj case obl case nummod n...,@@ dacht dat de werken bij vught tot 12 aug zo...
3,4,twitter,M,Wat een zaadpot. Als dit onze concurrentie is....,wat een zaadpot . \nals dit ons concurrentie z...,PRON DET NOUN PUNCT \nSCONJ PRON PRON NOUN AUX...,root det nsubj punct \nmark nsubj nmod_poss ad...,Wat een zaadpot . Als dit onze concurrentie is...
4,5,twitter,M,Ik voel een leuke trip aankomen ;-) https://t....,ik voelen een leuk trip aankomen ;- ) https://...,PRON VERB DET ADJ NOUN VERB PUNCT PUNCT SYM,nsubj root det amod obj ccomp punct punct obj,Ik voel een leuke trip aankomen ;-) ~~


In [10]:
news_test['tokenized'] = news_test['text'].apply(normalized)
twitter_test['tokenized'] = twitter_test['text'].apply(normalized)
youtube_test['tokenized'] = youtube_test['text'].apply(normalized)

In [11]:
news_test.head()

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel,tokenized
0,1,news,?,Doodrijder juf Mare: ’Alsof er uit het niets i...,Doodrijder juf Mare : ’Alsof er uit het niets ...,NOUN PROPN PROPN PUNCT VERB ADV ADP DET PRON P...,nsubj appos flat_name punct root advmod case f...,Doodrijder juf Mare : ’ Alsof er uit het niets...
1,2,news,?,"Wie zag Lady?\n Wetteren - Lady, een akita te...","wie zien Lady ? \nWetteren - Lady , een akita ...",PRON VERB X PUNCT \nVERB PUNCT X PUNCT DET NOU...,nsubj root obj punct \nroot punct parataxis pu...,"Wie zag Lady ? Wetteren - Lady , een akita tee..."
2,3,news,?,Lasershooten in het speelbos als alternatief k...,Lasershooten in het speelbos als alternatief k...,NOUN ADP DET NOUN ADP ADJ NOUN PROPN VERB DET ...,nsubj case det nmod mark amod nmod appos root ...,Lasershooten in het speelbos als alternatief k...
3,4,news,?,Aannemers tonen geen interesse voor Torendraai...,Aannemer tonen geen interesse voor Torendraaie...,NOUN VERB DET NOUN ADP NOUN PRON VERB ADP NOUN...,nsubj root det obj case nmod nsubj acl case ob...,Aannemers tonen geen interesse voor Torendraai...
4,5,news,?,Het verkeer aan de Brielpoort bij het stemmen\...,het verkeer aan de Brielpoort bij het stem Dei...,DET NOUN ADP DET PROPN ADP DET NOUN PROPN PUNC...,det nsubj case det nmod case det nmod appos pu...,Het verkeer aan de Brielpoort bij het stemmen ...


In [12]:
def cv_encoder(text):
    def _cv(letter):
        if letter in 'eyuioaàèìòùÀÈÌÒÙáéíóúýÁÉÍÓÚÝâêîôûÂÊÎÔÛãñõÃÑÕäëïöüÿÄËÏÖÜŸåÅæÆœŒøØ':
            return 'v'
        elif letter in 'qwrtpasdfghjklzxcvbnmQWRTPSDFGHJKLZXCVBNMçÇðÐ':
            return 'c'
        else:
            return letter
            
    text = ''.join([_cv(letter) for letter in text])
    return text

def ul_encoder(text):
    def _ul(letter):
        if letter.isupper():
            return 'U'
        elif letter.isalpha():
            return 'L'
        else:
            return letter
    text = ''.join([_ul(letter) for letter in text])
    return text

def len_encoder(text):
    def _len(word):
        if word.isalpha():
            return str(len(word))
        else:
            return word
    text = text.split()
    text = ' '.join([_len(letter) for letter in text])
    return text
    

In [31]:
text = twitter.tokenized.values[1]
print (text)
cv_encoder(text), ul_encoder(text), len_encoder(text)

@@ Foto ’ s van laten maken , voor die tijd dat de badkamer vetnieuwd moet worden …


('@@ cvcv ’ c cvc cvcvc cvcvc , cvvc cvv cvcc cvc cv cvccvcvc cvccvvvcc cvvc cvccvc …',
 '@@ ULLL ’ L LLL LLLLL LLLLL , LLLL LLL LLLL LLL LL LLLLLLLL LLLLLLLLL LLLL LLLLLL …',
 '@@ 4 ’ 1 3 5 5 , 4 3 4 3 2 8 9 4 6 …')

In [32]:
%%time 

news['cv'] = news['tokenized'].apply(cv_encoder)
news['ul'] = news['tokenized'].apply(ul_encoder)
news['len'] = news['tokenized'].apply(len_encoder)

twitter['cv'] = twitter['tokenized'].apply(cv_encoder)
twitter['ul'] = twitter['tokenized'].apply(ul_encoder)
twitter['len'] = twitter['tokenized'].apply(len_encoder)

youtube['cv'] = youtube['tokenized'].apply(cv_encoder)
youtube['ul'] = youtube['tokenized'].apply(ul_encoder)
youtube['len'] = youtube['tokenized'].apply(len_encoder)

CPU times: user 2.17 s, sys: 3.14 ms, total: 2.18 s
Wall time: 2.18 s


In [13]:
%%time 

news_test['cv'] = news_test['tokenized'].apply(cv_encoder)
news_test['ul'] = news_test['tokenized'].apply(ul_encoder)
news_test['len'] = news_test['tokenized'].apply(len_encoder)

twitter_test['cv'] = twitter_test['tokenized'].apply(cv_encoder)
twitter_test['ul'] = twitter_test['tokenized'].apply(ul_encoder)
twitter_test['len'] = twitter_test['tokenized'].apply(len_encoder)

youtube_test['cv'] = youtube_test['tokenized'].apply(cv_encoder)
youtube_test['ul'] = youtube_test['tokenized'].apply(ul_encoder)
youtube_test['len'] = youtube_test['tokenized'].apply(len_encoder)

CPU times: user 1.39 s, sys: 12 ms, total: 1.4 s
Wall time: 1.4 s


In [14]:
youtube_test.head()

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel,tokenized,cv,ul,len
0,21885,youtube,?,Deze video bekijken terwijl je een vrouw bent😭...,deze video bekijken terwijl je een vrouw bent😭😂!﻿,DET NOUN VERB SCONJ PRON DET NOUN NOUN,det obj root mark nsubj det advcl nmod,Deze video bekijken terwijl je een vrouw bent ...,cvcv cvcvv cvcvccvc cvccvcc cv vvc ccvvc cvcc ...,ULLL LLLLL LLLLLLLL LLLLLLL LL LLL LLLLL LLLL ...,4 5 8 7 2 3 5 4 😭 😂 ! ﻿
1,21709,youtube,?,Nee hoor een kat heeft geen maatje nodig\nIk h...,nee hoor een kat hebben geen maat nodig \nik h...,INTJ INTJ DET NOUN VERB DET NOUN ADJ \nPRON AU...,obl fixed det nsubj root det obj xcomp \nnsubj...,Nee hoor een kat heeft geen maatje nodig Ik he...,cvv cvvc vvc cvc cvvcc cvvc cvvccv cvcvc Ic cv...,ULL LLLL LLL LLL LLLLL LLLL LLLLLL LLLLL UL LL...,3 4 3 3 5 4 6 5 2 3 3 3 3 2 2 2 2 5 4 6 ﻿ 5 7 ...
2,18586,youtube,?,Subb op mij﻿\n,Subb op mij﻿,AUX ADP NOUN,root case obl,Subb op mij ﻿,cvcc vc cvc ﻿,ULLL LL LLL ﻿,4 2 3 ﻿
3,28421,youtube,?,Mijn irritatie us dat er te veel kleine kinder...,mijn irritatie us dat er te veel klein kind zi...,PRON NOUN ADJ SCONJ ADV ADV PRON ADJ NOUN AUX ...,nmod_poss nsubj amod mark advmod advmod advmod...,Mijn irritatie us dat er te veel kleine kinder...,cvcc vccvcvcvv vc cvc vc cv cvvc ccvvcv cvccvc...,ULLL LLLLLLLLL LL LLL LL LL LLLL LLLLLL LLLLLL...,4 9 2 3 2 2 4 6 8 4 3 5 7 4 ﻿ 2 3 2 4 5 6 ﻿ 6 ...
4,26799,youtube,?,Wie heeft er ook geen schoenen aan doe dan ff ...,wie hebben er ook geen schoen aan doen dan ff ...,PRON AUX ADV ADV DET NOUN ADP NOUN ADV ADJ DET...,nsubj root advmod advmod det obj case obl advm...,Wie heeft er ook geen schoenen aan doe dan ff ...,cvv cvvcc vc vvc cvvc cccvvcvc vvc cvv cvc cc ...,ULL LLLLL LL LLL LLLL LLLLLLLL LLL LLL LLL LL ...,3 5 2 3 4 8 3 3 3 2 3 4 🙏 🏿 ﻿


In [33]:
youtube.head(2)

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel,tokenized,cv,ul,len
0,22988,youtube,M,12:55 stickers van zijn lelijke kut hoofd😂😂﻿\n,12:55 sticker van zijn lelijk kut hoofd😂😂﻿,NUM NOUN ADP PRON ADJ NOUN NOUN,nummod nsubj case nmod_poss amod nmod root,12:55 stickers van zijn lelijke kut hoofd 😂 😂 ﻿,12:55 ccvccvcc cvc cvcc cvcvccv cvc cvvcc 😂 😂 ﻿,12:55 LLLLLLLL LLL LLLL LLLLLLL LLL LLLLL 😂 😂 ﻿,12:55 8 3 4 7 3 5 😂 😂 ﻿
1,22192,youtube,M,Top video! ;) Jullie maken letterlijk de aller...,top video ! ; ) Jullië maken letterlijk de all...,ADP NOUN PUNCT PUNCT PUNCT PRON VERB ADJ DET A...,case parataxis punct punct punct nsubj paratax...,Top video ! ;) Jullie maken letterlijk de alle...,cvc cvcvv ! ;) cvccvv cvcvc cvccvccvcc cv vccv...,ULL LLLLL ! ;) ULLLLL LLLLL LLLLLLLLLL LL LLLL...,3 5 ! ;) 6 5 10 2 12 6 3 2 4 3 4 > ... ﻿ 3 4 2...


In [34]:
news.to_csv('./data/news.csv', sep='\t', index=False)
youtube.to_csv('./data/youtube.csv', sep='\t', index=False)
twitter.to_csv('./data/twitter.csv', sep='\t', index=False)

In [15]:
news_test.to_csv('./data/news_test.csv', sep='\t', index=False)
youtube_test.to_csv('./data/youtube_test.csv', sep='\t', index=False)
twitter_test.to_csv('./data/twitter_test.csv', sep='\t', index=False)

In [40]:
def run_all(data, classifier, dense=False):
    for i in ['lemmatized','pos','rel','cv','ul','len']:
        vectorizer = TfidfVectorizer(analyzer='word', token_pattern='\S+', ngram_range=(1, 4), 
                             max_df=0.9, min_df=5, max_features=20000)

        vectorizer.fit(data[i].values)

        X = vectorizer.transform(data[i].values)
        if dense: X = X.toarray()
        #print (X.shape)
        y = [1 if i=='F' else 0 for i in data.gender.values]

        clf = clone(classifier)
        score = cross_val_score(clf, X, y, cv=10).mean()
        print (i, score)

In [41]:
%%time 
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
print ('news')
run_all(news, clf)
print ('\ntwitter')
run_all(twitter, clf)
print ('\nyoutube')
run_all(youtube, clf)

news
lemmatized 0.6653786430960345
pos 0.5742713807931199
rel 0.5786192068800764
cv 0.6254837553750596
ul 0.5976827520305781
len 0.5781115623506927

twitter
lemmatized 0.6215
pos 0.5677
rel 0.55305
cv 0.5866999999999999
ul 0.59005
len 0.5809

youtube
lemmatized 0.5980753108073822
pos 0.5479533228168104
rel 0.5384574724308979
cv 0.6058766588344309
ul 0.5830871510886071
len 0.5836292484363107
CPU times: user 4min 6s, sys: 5min 42s, total: 9min 48s
Wall time: 2min 10s


In [42]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time 

from sklearn.svm import SVC

clf = SVC(random_state=23)
print ('news')
run_all(news, clf)
print ('\ntwitter')
run_all(twitter, clf)
print ('\nyoutube')
run_all(youtube, clf)

In [44]:
from catboost import CatBoostClassifier

In [None]:
%%time 

clf = CatBoostClassifier(learning_rate=1, #тоже верхняя граница
                           depth=6, #это верхняя граница, если сделать выше, то с малым lr обучается оч долго
                           iterations=100, #с запасом, вряд ли столько нужно
                           random_state=23, #сид
                           early_stopping_rounds=10, #столько итераций с запасом, лучше остановиться
                           loss_function='Logloss', 
                           custom_loss='F1',
                           
                           verbose=0) #без вывода
print ('news')
run_all(news, clf, dense=True)
print ('twitter')
run_all(twitter, clf, dense=True)
print ('youtube')
run_all(youtube, clf, dense=True)

In [18]:
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
from gensim.corpora import Dictionary
def vectors(docs, model, target):
    docs = [doc.split() for doc in docs]
    docs_dict = Dictionary(docs)
    docs_dict.filter_extremes(no_below=5, no_above=0.8)
    docs_dict.compactify()
    
    target = [doc.split() for doc in target]

    docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
    docs_target = [docs_dict.doc2bow(doc) for doc in target] #target
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf  = model_tfidf[docs_target] #target
    docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
    tfidf_emb_vecs = np.vstack([model[docs_dict[i]] if docs_dict[i] in model else np.zeros(320) 
                                for i in range(len(docs_dict)) ])
    X_train = np.dot(docs_vecs, tfidf_emb_vecs) 
    
    docs_tfidf  = model_tfidf[docs_corpus]
    docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
    tfidf_emb_vecs = np.vstack([model[docs_dict[i]] if docs_dict[i] in model else np.zeros(320) 
                                for i in range(len(docs_dict)) ])
    X_test = np.dot(docs_vecs, tfidf_emb_vecs) 
    return X_train, X_test

In [17]:
news = pd.read_csv('./data/news.csv', sep='\t').fillna('')
youtube = pd.read_csv('./data/youtube.csv', sep='\t').fillna('')
twitter= pd.read_csv('./data/twitter.csv', sep='\t').fillna('')

# News In-genre

In [5]:
news.head(1)

Unnamed: 0,id,genre,gender,text,lemmatized,pos,rel,tokenized,cv,ul,len
0,1,news,M,KVV begint aan nieuw voetbalhoofdstuk\nZelzate...,Kvv beginnen aan nieuw voetbalhoofdstuk Zelzat...,NOUN VERB ADP ADJ NOUN PROPN PUNCT ADP DET PRO...,nsubj root case amod obl appos punct case det ...,KVV begint aan nieuw voetbalhoofdstuk Zelzate ...,ccc cvcvcc vvc cvvvc cvvccvccvvccccvc cvccvcv ...,UUU LLLLLL LLL LLLLL LLLLLLLLLLLLLLLL ULLLLLL ...,3 6 3 5 16 7 - 2 2 7 3 6 2 2 8 6 5 3 7 3 2 17 ...


In [54]:
news_w2v = vectors(news.lemmatized.values, cow)
#np.save('news_w2v', news_w2v)

In [20]:
from tqdm import tqdm_notebook as tqdm

In [21]:
from scipy.sparse import coo_matrix, hstack

In [19]:
def pipeline1(data, target_df, model):
    X_train = None
    X_test = None
    for i in tqdm(['lemmatized','pos','rel','cv','ul','len']):
        vectorizer = TfidfVectorizer(analyzer='word', token_pattern='\S+', ngram_range=(1, 2), 
                             max_df=0.9, min_df=5, max_features=20000)
        vectorizer.fit(data[i].values)
        X_t = vectorizer.transform(target_df[i].values)#.toarray()
        if X_train is not None:
            X_train = hstack([X_train, X_t])
        else:
            X_train = X_t
            
        X_t = vectorizer.transform(data[i].values)#.toarray()
        if X_test is not None:
            X_test = hstack([X_test, X_t])
        else:
            X_test = X_t
    
    for i in tqdm(['tokenized', 'cv','ul']):
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4), 
                             max_df=0.9, min_df=5, max_features=20000)
        vectorizer.fit(data[i].values)
        X_t = vectorizer.transform(target_df[i].values)#.toarray()
        X_train = hstack([X_train, X_t])
        
        X_t = vectorizer.transform(data[i].values)#.toarray()
        X_test = hstack([X_test, X_t])
            
    X_t, X_t2 = vectors(data.lemmatized.values, model, target_df.lemmatized.values)
    X_train = hstack([X_train, X_t])
    X_test = hstack([X_test, X_t2])
    
    y_train = [1 if i=='F' else 0 for i in data.gender.values]
    y_test = [1 if i=='F' else 0 for i in target_df.gender.values]
    return X_train, X_test, y_train, y_test

In [78]:
%%time
X_train, X_test, y_train, y_test = pipeline1(news, news, cow)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


CPU times: user 56.8 s, sys: 9.59 s, total: 1min 6s
Wall time: 1min 4s


In [81]:
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

0.6506474221905441

In [22]:
def pipeline2(data, target_df, model):
    X_train = None
    X_test = None
    for i in tqdm(['pos','rel','cv','ul','len']):
        vectorizer = TfidfVectorizer(analyzer='word', token_pattern='\S+', ngram_range=(1, 4), 
                             max_df=0.9, min_df=5, max_features=20000)
        vectorizer.fit(data[i].values)
        X_t = vectorizer.transform(target_df[i].values)
        if X_train is not None:
            X_train = hstack([X_train, X_t])
        else:
            X_train = X_t
            
        X_t = vectorizer.transform(data[i].values)
        if X_test is not None:
            X_test = hstack([X_test, X_t])
        else:
            X_test = X_t
    
    for i in tqdm(['tokenized', 'cv','ul']):
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4), 
                             max_df=0.9, min_df=5, max_features=20000)
        vectorizer.fit(data[i].values)
        X_t = vectorizer.transform(target_df[i].values)
        X_train = hstack([X_train, X_t])
        
        X_t = vectorizer.transform(data[i].values)
        X_test = hstack([X_test, X_t])
            
    #X_t, X_t2 = vectors(data.lemmatized.values, model, target_df.lemmatized.values)
    #X_train = np.hstack([X_train, X_t])
    #X_test = np.hstack([X_test, X_t2])
    
    y_train = [1 if i=='F' else 0 for i in data.gender.values]
    y_test = [1 if i=='F' else 0 for i in target_df.gender.values]
    return X_train, X_test, y_train, y_test

In [60]:
del X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = pipeline1(twitter, twitter, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.6191000000000001

In [61]:
del X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = pipeline2(twitter, twitter, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.6073999999999999

# News cross-genre

In [62]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline1(pd.concat([twitter, youtube]), news, cow)

clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.537117903930131

In [63]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline2(pd.concat([twitter, youtube]), news, cow)

clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.5338427947598253

# Twitter in-genre

In [51]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline1(twitter, twitter, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.6191000000000001

In [52]:
del X_train, X_test, y_train, y_test
X_test, X_train, y_train, y_test = pipeline2(twitter, twitter, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.6073999999999999

# Twitter cross-genre

In [54]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline1(pd.concat([news, youtube]), twitter, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.5612

In [55]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline2(pd.concat([news, youtube]), twitter, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.54995

# Youtube in-genre

In [56]:
del X_train, X_test, y_train, y_test
X_test, X_train, y_train, y_test = pipeline1(youtube, youtube, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.6169976312581928

In [57]:
del X_train, X_test, y_train, y_test
X_test, X_train, y_train, y_test = pipeline2(youtube, youtube, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
cross_val_score(clf, X_train, y_train, cv=5).mean()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.6160499505553895

# Youtube cross-genre

In [58]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline1(pd.concat([news, twitter]), youtube, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.5332338578404775

In [59]:
del X_train, X_test, y_train, y_test

X_test, X_train, y_train, y_test = pipeline2(pd.concat([news, twitter]), youtube, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0.5356755290287575

# Submissions

In [36]:
#news in-genre
X_test, X_train, y_train, y_test = pipeline1(news, news_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':news_test.id, 'z_G':result})
result.to_csv('Glazunov_IN_news_1', index=False, header=False, sep=' ')

#news in-genre
X_test, X_train, y_train, y_test = pipeline2(news, news_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':news_test.id, 'z_G':result})
result.to_csv('Glazunov_IN_news_2', index=False, header=False, sep=' ')

#news cross-genre
X_test, X_train, y_train, y_test = pipeline1(pd.concat([twitter, youtube]), news_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':news_test.id, 'z_G':result})
result.to_csv('Glazunov_CROSS_news_1', index=False, header=False, sep=' ')

#news cross-genre
X_test, X_train, y_train, y_test = pipeline2(pd.concat([twitter, youtube]), news_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':news_test.id, 'z_G':result})
result.to_csv('Glazunov_CROSS_news_2', index=False, header=False, sep=' ')

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [37]:
#twitter in-genre
X_test, X_train, y_train, y_test = pipeline1(twitter, twitter_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':twitter_test.id, 'z_G':result})
result.to_csv('Glazunov_IN_twitter_1', index=False, header=False, sep=' ')

#twitter in-genre
X_test, X_train, y_train, y_test = pipeline2(twitter, twitter_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':twitter_test.id, 'z_G':result})
result.to_csv('Glazunov_IN_twitter_2', index=False, header=False, sep=' ')

#twitter cross-genre
X_test, X_train, y_train, y_test = pipeline1(pd.concat([news, youtube]), twitter_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':twitter_test.id, 'z_G':result})
result.to_csv('Glazunov_CROSS_twitter_1', index=False, header=False, sep=' ')

#twitter cross-genre
X_test, X_train, y_train, y_test = pipeline2(pd.concat([news, youtube]), twitter_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':twitter_test.id, 'z_G':result})
result.to_csv('Glazunov_CROSS_twitter_2', index=False, header=False, sep=' ')

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [38]:
#youtube in-genre
X_test, X_train, y_train, y_test = pipeline1(youtube, youtube_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':youtube_test.id, 'z_G':result})
result.to_csv('Glazunov_IN_youtube_1', index=False, header=False, sep=' ')

#youtube in-genre
X_test, X_train, y_train, y_test = pipeline2(youtube, youtube_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':youtube_test.id, 'z_G':result})
result.to_csv('Glazunov_IN_youtube_2', index=False, header=False, sep=' ')

#youtube cross-genre
X_test, X_train, y_train, y_test = pipeline1(pd.concat([news, twitter]), youtube_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':youtube_test.id, 'z_G':result})
result.to_csv('Glazunov_CROSS_youtube_1', index=False, header=False, sep=' ')

#youtube cross-genre
X_test, X_train, y_train, y_test = pipeline2(pd.concat([news, twitter]), youtube_test, cow)
clf = LogisticRegression(random_state=23, solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
result = ['F' if i ==1 else 'M' for i in clf.predict(X_test)]
result = pd.DataFrame({'id':youtube_test.id, 'z_G':result})
result.to_csv('Glazunov_CROSS_youtube_2', index=False, header=False, sep=' ')

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


