In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

### Parse Data

In [2]:
def parse_data(path):
    # load data
    root = ET.parse(path).getroot()    
    ids = []
    tweets = []
    labels = []
    for tag in root.findall('tweet'):
        tweetid = tag.find('tweetid')
        content = tag.find('content')
        value = tag.find('sentiment/polarity/value')
        ids.append(tweetid.text)
        tweets.append(content.text)
        labels.append(value.text)
        
    return ids, tweets, labels

def parse_data_test(path):
    # load data
    root = ET.parse(path).getroot()    
    ids = []
    tweets = []
    for tag in root.findall('tweet'):
        tweetid = tag.find('tweetid')
        content = tag.find('content')
        ids.append(tweetid.text)
        tweets.append(content.text)
        
    return ids, tweets

In [3]:
id_train, x_train, y_train = parse_data('TASS2017_T1_training.xml')
id_dev, x_dev, y_dev = parse_data('TASS2017_T1_development.xml')
id_test, x_test = parse_data_test("TASS2017_T1_test.xml")

### Preprocess Data

In [4]:
reUser = re.compile(r'@+\w+')
reHashtag = re.compile(r'#+\w+')
reWeb = re.compile(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')

def clean_text(text):
    res = []
    for element in text:
        aux = []    

        # Remove stopwords
        for word in element.split():
            if word not in stopwords.words("spanish"):
                aux.append(word)
        element = " ".join(aux)
        # Normalize user tags
        for item in re.finditer(reUser, element):
            element = reUser.sub('#user', element)
        # Normalize hastags
        for item in re.finditer(reHashtag, element):
            element = reHashtag.sub('#hastag', element)
        # Normalize urls
        for item in re.finditer(reWeb, element):
            element = reWeb.sub('#web', element)
        # Remove punctuation
        element = element.translate(str.maketrans('', '', string.punctuation))

        res.append(element)
    return res
    
train = clean_text(x_train)
dev = clean_text(x_dev)
test = clean_text(x_test)

In [5]:
tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True, preserve_case=False)

train_clean = list(map(" ".join, map(tokenizer.tokenize, train)))
dev_clean = list(map(" ".join, map(tokenizer.tokenize, dev)))
test_clean = list(map(" ".join, map(tokenizer.tokenize, test)))

In [6]:
dev_clean

['hastag 100010 verdad voy decir petarda quiero mismo ✨',
 'hastag hastag hastag aún leído caerán prontito',
 'al final sido 3h bueno mañana fiesta así que no quejo',
 'hastag tiempo cosas ahora mismo',
 'hastag ves brillo coso hace sepan kk',
 'tengo perrina adorable sabéis acompaña habitación voy dormir',
 'hastag es ojeando año pasado tampoco muchas canciones jajajajaja',
 'bueno batalla final conquista después faltaría revelación',
 'hastag ¿ mañana sábado 31 en día vives mañana miércoles 31',
 'hastag caminante mar niebla cuadros favoritos portada',
 'hastag ¡ sí y encantado ¿ tú visto ¿ cuándo comentamos',
 'hastag se olvidaban grandes hastag hastag a ver si interesa hilillo',
 'hastag mejor si pones link cuenta costado encontrarte',
 'hastag por tenía pensado verla después segunda daredevil',
 'hastag lado manita usas vea negro',
 'llevo despierto 8 puto mosquito volando puta oreja',
 'hastag qué estupendo y ¿ cómo encargo ¿ es estupendísima versión barce',
 'cosas enamora tosta

### Experiments

In [6]:
# LogisticRegression

pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression())
])
    
param_grid = {"tfidf__ngram_range" : [(1,2),(1,3),(2,3),(3,4),(3,5),(3,6),(4,5)],
              "tfidf__max_df":[0.3,0.4,0.5,0.6,0.7,0.8,0.9],
              "tfidf__min_df":[1,2,3,5], # or percentages
              "tfidf__analyzer":["char_wb"], # n-grams
              "clf__C":[1,10,100,1000,10000]
             }

clf_lr = GridSearchCV(pipe,
                      param_grid,
                      cv=5,
                      n_jobs=-1,
                      verbose=2,
                      scoring="f1_macro")

clf_lr.fit(train_clean, y_train)
print(clf_lr.best_score_)
clf_lr.best_params_

Fitting 5 folds for each of 980 candidates, totalling 4900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 4900 out of 4900 | elapsed: 16.7min finished


0.3877597477137727


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'clf__C': 100,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.8,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (3, 6)}

In [7]:
# SVC

pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", SVC())
])
    
param_grid = {"tfidf__ngram_range" : [(1,2),(1,3),(2,3),(3,4),(3,5),(3,6),(4,5)],
              "tfidf__max_df":[0.3,0.4,0.5,0.6,0.7,0.8,0.9],
              "tfidf__min_df":[1,2,3,5], # or percentages
              "tfidf__analyzer":["char_wb"], # n-grams
              "clf__kernel":['linear', 'rbf'],
              "clf__C":[1,10,100,1000,10000]
             }

clf_svc = GridSearchCV(pipe,
                       param_grid,
                       cv=5,
                       n_jobs=-1,
                       verbose=2,
                       scoring="f1_macro")

clf_svc.fit(train_clean, y_train)
print(clf_svc.best_score_)
clf_svc.best_params_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 1960 candidates, totalling 9800 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 23.0min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 31.5min
[Parallel(n_jobs=-1)]: Done 9097 tasks      | elapsed: 36.1min
[Parallel(n_jobs=-1)]: Done 9800 out of 9800 | elapsed: 39.1

0.3848031342070236


{'clf__C': 10,
 'clf__kernel': 'linear',
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.8,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (4, 5)}

In [8]:
# RandomForestClassifier

pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", RandomForestClassifier())
])
    
param_grid = {"tfidf__ngram_range" : [(1,2),(1,3),(2,3),(3,4),(3,5),(3,6),(4,5)],
              "tfidf__max_df":[0.3,0.4,0.5,0.6,0.7,0.8,0.9],
              "tfidf__min_df":[1,2,3,5], # or percentages
              "tfidf__analyzer":["char_wb"], # n-grams
              "clf__n_estimators":[50,100,150,200,300]
             }

clf_rfc = GridSearchCV(pipe,
                       param_grid,
                       cv=5,
                       n_jobs=-1,
                       verbose=2,
                       scoring="f1_macro")

clf_rfc.fit(train_clean, y_train)
print(clf_rfc.best_score_)
clf_rfc.best_params_

Fitting 5 folds for each of 980 candidates, totalling 4900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 19.4min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 4900 out of 4900 | elapsed: 28.2min finished


0.31993195546535513


{'clf__n_estimators': 50,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.5,
 'tfidf__min_df': 3,
 'tfidf__ngram_range': (4, 5)}

In [9]:
# KNN

pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", KNeighborsClassifier())
])
    
param_grid = {"tfidf__ngram_range" : [(1,2),(1,3),(2,3),(3,4),(3,5),(3,6),(4,5)],
              "tfidf__max_df":[0.3,0.4,0.5,0.6,0.7,0.8,0.9],
              "tfidf__min_df":[1,2,3,5], # or percentages
              "tfidf__analyzer":["char_wb"], # n-grams
              "clf__n_neighbors":[3,5,7,11,15,21,25]
             }

clf_knn = GridSearchCV(pipe,
                       param_grid,
                       cv=5,
                       n_jobs=-1,
                       verbose=2,
                       scoring="f1_macro")

clf_knn.fit(train_clean, y_train)
print(clf_knn.best_score_)
clf_knn.best_params_

Fitting 5 folds for each of 1372 candidates, totalling 6860 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed:  4.8min


0.36833977444122495


[Parallel(n_jobs=-1)]: Done 6860 out of 6860 | elapsed:  4.8min finished


{'clf__n_neighbors': 5,
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.3,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (3, 4)}

### Evaluation over development

In [6]:
# LogisticRegression

pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(3,6),
                                 max_df=0.8,
                                 min_df=2,
                                 analyzer="char_wb")),
        ("clf", LogisticRegression(C=100))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(dev_clean)
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

           N       0.56      0.67      0.61       219
         NEU       0.11      0.07      0.09        69
        NONE       0.39      0.19      0.26        62
           P       0.54      0.57      0.55       156

    accuracy                           0.50       506
   macro avg       0.40      0.38      0.38       506
weighted avg       0.47      0.50      0.48       506



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
# RandomForestClassifier

pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(4,5),
                                 max_df=0.5,
                                 min_df=3,
                                 analyzer="char_wb")),
        ("clf", RandomForestClassifier(n_estimators=50))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(dev_clean)
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

           N       0.56      0.71      0.63       219
         NEU       0.07      0.01      0.02        69
        NONE       0.32      0.19      0.24        62
           P       0.55      0.63      0.59       156

    accuracy                           0.53       506
   macro avg       0.38      0.39      0.37       506
weighted avg       0.46      0.53      0.49       506



In [8]:
# KNN

pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(3,4),
                                 max_df=0.3,
                                 min_df=1,
                                 analyzer="char_wb")),
        ("clf", KNeighborsClassifier(n_neighbors=5))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(dev_clean)
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

           N       0.57      0.71      0.63       219
         NEU       0.14      0.10      0.12        69
        NONE       0.38      0.19      0.26        62
           P       0.55      0.53      0.54       156

    accuracy                           0.51       506
   macro avg       0.41      0.38      0.39       506
weighted avg       0.48      0.51      0.49       506



In [9]:
# SVC
pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(4,5),
                                 max_df=0.8,
                                 min_df=2,
                                 analyzer="char_wb")),
        ("clf", SVC(C=10, kernel="linear"))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(dev_clean)
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

           N       0.52      0.63      0.57       219
         NEU       0.15      0.12      0.13        69
        NONE       0.43      0.32      0.37        62
           P       0.55      0.49      0.52       156

    accuracy                           0.48       506
   macro avg       0.41      0.39      0.40       506
weighted avg       0.47      0.48      0.47       506



In [10]:
# SVC
pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(4,5),
                                 max_df=0.8,
                                 min_df=2,
                                 analyzer="char_wb")),
        ("clf", SVC(C=10, kernel="rbf"))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(dev_clean)
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

           N       0.53      0.83      0.65       219
         NEU       0.10      0.01      0.03        69
        NONE       0.70      0.11      0.19        62
           P       0.59      0.55      0.57       156

    accuracy                           0.55       506
   macro avg       0.48      0.38      0.36       506
weighted avg       0.51      0.55      0.48       506



In [11]:
# SVC
pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(3,5),
                                 max_df=0.8,
                                 min_df=1,
                                 analyzer="char_wb")),
        ("clf", SVC(C=100, kernel="linear"))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(dev_clean)
print(classification_report(y_dev, predictions))

              precision    recall  f1-score   support

           N       0.58      0.73      0.65       219
         NEU       0.17      0.12      0.14        69
        NONE       0.51      0.31      0.38        62
           P       0.57      0.54      0.55       156

    accuracy                           0.53       506
   macro avg       0.46      0.42      0.43       506
weighted avg       0.51      0.53      0.52       506



### Final Classifier

In [None]:
# BEST PARAMETERS
{'clf__C': 100,
 'clf__kernel': 'linear',
 'tfidf__analyzer': 'char_wb',
 'tfidf__max_df': 0.8,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (3, 5)}

In [12]:
# FINAL

pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(3,5),
                                 max_df=0.8,
                                 min_df=1,
                                 analyzer="char_wb")),
        ("clf", SVC(C=100, kernel="linear"))
])

pipe.fit(train_clean, y_train)
predictions = pipe.predict(test_clean)

In [13]:
with open("results.txt", "w") as f:
    for id_ts, pred in zip(id_test , predictions):
        f.write("{}\t{}\n".format(id_ts, pred))