In [1]:
import pandas as pd
import xml.etree.ElementTree as et
import io
import os
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
import unidecode

In [2]:
def iter_docs(author, id, genero, variedad):
    author_doc = {}
    author_doc['id'] = id
    author_doc['genero'] = genero
    author_doc['variedad'] = variedad
    
    total = 0
    tweet = ''
    for doc in author.iter('document'):
        doc_dict = author_doc.copy()
        tweet = tweet +  " " + doc.text.replace("\n", "")
        total += 1
    
    author_doc['total'] = total
    author_doc['tweet'] = tweet
    return author_doc

In [3]:
data = {}
truth = {}

training = 'training'
test = 'test'

for dirname in [training, test]:
    data[dirname] = pd.DataFrame()
    truth[dirname] = pd.read_csv('./'+dirname+'/truth.txt', sep=":::", names = ["id", "genero", "pais"], header=None, engine='python')

    for filename in os.listdir(dirname):
        if filename == "truth.txt":
            continue
        f = './'+dirname+'/'+filename
        parsed = et.parse(f)
        
        t = truth[dirname].loc[truth[dirname]['id'] == filename.replace(".xml", "")]
        for index, row in t.iterrows():
            genero = row[1]
            variedad = row[2]

        data[dirname] = data[dirname].append([iter_docs(parsed.getroot(), filename.replace(".xml", ""), genero, variedad)])

In [4]:
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)

def clean(text):
    tweet = text.replace("\\n", "")
    #tokenizamos el tweet
    tokens = tokenizer.tokenize(tweet)
    
    #quitmos acentos
    return unidecode.unidecode(" ".join(tokens))

for dirname in [training, test]:
    data[dirname]['ctweet'] = data[dirname].apply(lambda row: clean(row['tweet']), axis=1)

data[training].head()

Unnamed: 0,genero,id,total,tweet,variedad,ctweet
0,female,f4c9c348fe3345a221b211a662e44f6b,100,ELN asesina a un policía y meten bomba en su ...,colombia,eln asesina a un policia y meten bomba en su c...
0,male,4c8b44a69798168e2d05d5e7d30f567c,100,NO DUERMAN QUE SE VA!!!! https://t.co/QxP5vjF...,argentina,no duerman que se va ! ! ! https://t.co/QxP5vj...
0,male,8bdf815f5b76cfebc25ae51d5f5ab054,100,@santorendon por lo que cuenta es como primo ...,colombia,por lo que cuenta es como primo lejano de #har...
0,male,35ccb63279d819182acbe99ace6106c6,100,raspando la olla https://t.co/DpdOOroknp si n...,venezuela,raspando la olla https://t.co/DpdOOroknp si no...
0,male,3b8bff5ba21d4e76c2e239b8e4a30c1d,100,Así mismo es... https://t.co/ugqXINZHYi Así e...,mexico,asi mismo es ... https://t.co/ugqXINZHYi asi e...


In [5]:
#nuevas características
emoticons_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)
def emoticons(text):
    count = 0
    tokens = tokenizer.tokenize(text)
    for t in tokens:
        search = emoticons_pattern.findall(t)
        count += len(search)
    return count

mentions_pattern = re.compile("@\w+")
def mentions(text):
    users = mentions_pattern.findall(text)
    return len(users)

hashtags_pattern = re.compile("#\w+")
def hashtags(text):
    hashes = hashtags_pattern.findall(text)
    return len(hashes)

url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
def readurl(text):
    urls = url_pattern.findall(text)
    return len(urls)

def tokenscount(text):
    tokens = tokenizer.tokenize(text)
    return len(tokens)

retweets_pattern = re.compile("(RT|via)((?:\b\W*@\w+)+)")
def retweets(text):
    rts = retweets_pattern.findall(text)
    return len(rts)

In [6]:
transformaciones = [
    lambda x: len(x),
    lambda x: x.count(" "),
    lambda x: x.count("."),
    lambda x: x.count("!"),
    lambda x: x.count("?"),
    lambda x: len(x) / (x.count(" ") + 1),
    lambda x: x.count(" ") / (x.count(".") + 1),
    lambda x: len(re.findall("\d", x)),
    lambda x: len(re.findall("[A-Z]", x)),
    lambda x: emoticons(x),
    lambda x: mentions(x),
    lambda x: hashtags(x),
    lambda x: readurl(x),
    lambda x: tokenscount(x),
    lambda x: retweets(x)
]

meta = {}
for d in [training, test]:
    meta[d] = []
    for func in transformaciones:
        meta[d].append(data[d]['tweet'].apply(func)/data[d]['total'])
        
meta[training] = np.asarray(meta[training]).T
meta[test] = np.asarray(meta[test]).T

In [8]:
meta[training][0:1]

array([[  1.18540000e+02,   1.61400000e+01,   6.40000000e-01,
          5.20000000e-01,   7.00000000e-02,   7.33993808e-02,
          2.48307692e-01,   6.90000000e-01,   7.44000000e+00,
          5.00000000e-02,   1.12000000e+00,   8.80000000e-01,
          3.40000000e-01,   1.65400000e+01,   0.00000000e+00]])

In [18]:
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
import time
from sklearn import decomposition
from nltk.stem.porter import PorterStemmer
import nltk 
import string

In [23]:
def printModels():
    for key in models:
        start = time.time()
        model = models[key].fit(features, y_train.ravel())
        predicted = model.predict(features_test)
        end = time.time()
        print(key, accuracy_score(y_test,predicted), "Time:", end-start)
        
        
stemmer = PorterStemmer()
trans_table = {ord(c): None for c in string.punctuation + string.digits} 
def tokenize(text):
    tokens = [word for word in nltk.word_tokenize(text.translate(trans_table)) if len(word) > 1] #if len(word) > 1 because I only want to retain words that are at least two characters before stemming, although I can't think of any such words that are not also stopwords
    stems = [stemmer.stem(item) for item in tokens]
    return stems

In [29]:
#genero
vectorizer = TfidfVectorizer(analyzer="word", stop_words=stopwords.words('spanish'),\
                            ngram_range=(1,2), min_df=10, max_features=8000,norm='l2',\
                            use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
  
y_train=data[training]['genero'].values
X_train = vectorizer.fit_transform(data[training]['ctweet'])

features = np.hstack([meta[training], X_train.todense()])
                      
#creando bolsa de palabras por género
#data['male'] = data[training][data[training].genero=='male']
#data['female'] = data[training][data[training].genero=='female']

#vectorizer_male = TfidfVectorizer(analyzer="word", max_features=200, tokenizer=tokenize)
#vectorizer_male.fit(data['male']['ctweet'])
#X_train_male = vectorizer_male.transform(data[training]['ctweet'])

#vectorizer_female = TfidfVectorizer(analyzer="word", max_features=200, tokenizer=tokenize)
#vectorizer_female.fit(data['female']['ctweet'])
#X_train_female = vectorizer_female.transform(data[training]['ctweet'])

#features = np.hstack([meta[training], X_train.todense(), X_train_male.todense(), X_train_female.todense()])

print(features[:1])

y_test=data[test]['genero'].values
X_test=vectorizer.transform(data[test]['ctweet'])

#creando bolsa de palabras por género
#X_test_male = vectorizer_male.transform(data[test]['ctweet'])
#X_test_female = vectorizer_female.transform(data[test]['ctweet'])

#features_test = np.hstack([meta[test], X_test.todense(), X_test_male.todense(), X_test_female.todense()])
features_test = np.hstack([meta[test], X_test.todense()])

[[ 118.54   16.14    0.64 ...,    0.      0.      0.  ]]


In [30]:
models={
    "L": LinearSVC(),
    "NB": MultinomialNB(),
    "R": RandomForestClassifier(),
    "KN": KNeighborsClassifier(),
    "LO": LogisticRegression(),
    "NE": MLPClassifier()
}

    
printModels()

#con bolsa de palabras por género
#L 0.681428571429 Time: 4.463599920272827
#NB 0.677857142857 Time: 0.08524703979492188
#R 0.673571428571 Time: 0.7116379737854004
#KN 0.587142857143 Time: 6.8814311027526855
#LO 0.775 Time: 0.902728796005249
#NE 0.774285714286 Time: 69.29499626159668

L 0.592142857143 Time: 3.0182721614837646
NB 0.686428571429 Time: 0.1158747673034668
R 0.645714285714 Time: 0.7247958183288574
KN 0.577142857143 Time: 6.471596002578735
LO 0.769285714286 Time: 0.6107280254364014
NE 0.775714285714 Time: 75.25347208976746


In [31]:
#variedad del castellano
vectorizer = TfidfVectorizer(analyzer="word", stop_words=stopwords.words('spanish'),\
                            min_df=50, max_features=8000,norm='l2',\
                            use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
  
y_train=data[training]['variedad'].values
X_train = vectorizer.fit_transform(data[training]['ctweet'])

y_test=data[test]['variedad'].values
X_test=vectorizer.transform(data[test]['ctweet'])

features = np.hstack([meta[training], X_train.todense()])
features_test = np.hstack([meta[test], X_test.todense()])


In [135]:
models={
    "L": LinearSVC(),
    "NB": MultinomialNB(),
    "R": RandomForestClassifier(),
    "KN": KNeighborsClassifier(),
    "LO": LogisticRegression(),
    "NE": MLPClassifier()
}


printModels()

L 0.676428571429 Time: 14.360452890396118
NB 0.732142857143 Time: 0.04561209678649902
R 0.831428571429 Time: 0.46947312355041504
KN 0.23 Time: 3.02897310256958
LO 0.936428571429 Time: 2.371469020843506
NE 0.937857142857 Time: 23.478174924850464
