In [1]:
!conda install -c intel scikit-learn

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
import string

import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [3]:
stop_words = set(stopwords.words('spanish'))

In [4]:
def tokenizer(text):
    tt = TweetTokenizer()
    return tt.tokenize(text)

### Loading labeled tweets

In [5]:
# Dataset loaded from: https://docs.google.com/spreadsheets/d/1I3qXro6Hy4UiTmC68axmuC3Qqrj_9YEKSzFILizyl3A/edit?usp=sharing
tweets_df = pd.read_csv('./data/medellin_tweets_labeled.csv', sep = ',')

In [6]:
tweets_df.head()

Unnamed: 0,full_text,sentiment
0,@hugouribev: @isamg6 @MOinternationa @petrogus...,0.0
1,"@maridelas18: Fueraaa, Petro de Medellín",0.0
2,@MunozEnith: Medellín se lució hoy. El repudio...,0.0
3,@isamg6: La historia contará que en Medellín s...,0.0
4,@funurbiano: Los del asalto a la fundidora de ...,0.0


In [7]:
tweets_df['sentiment'].value_counts(dropna = False)

0.0    3685
1.0    2337
NaN      14
Name: sentiment, dtype: int64

In [8]:
tweets_df['sentiment'].value_counts(dropna = False, normalize = True)

0.0    0.610504
1.0    0.387177
NaN    0.002319
Name: sentiment, dtype: float64

### Leaving out unlabeled texts, this data is not useful for training or validating a supervised model

In [9]:
tweets_labeled_df = tweets_df.loc[tweets_df['sentiment'].notnull()]

In [10]:
tweets_labeled_df.shape

(6022, 2)

In [11]:
tweets_nolabeled_df = tweets_df.loc[tweets_df['sentiment'].isnull()]

In [12]:
tweets_nolabeled_df.shape

(14, 2)

### Splitting train and test datasets

In [40]:
X_train, X_test, y_train, y_test = train_test_split(tweets_labeled_df['full_text'], tweets_labeled_df['sentiment'], test_size = 0.5, stratify = tweets_labeled_df['sentiment'], random_state = 1)

In [41]:
X_train.shape

(3011,)

In [42]:
pd.Series(y_train).value_counts(normalize = True)

0.0    0.612089
1.0    0.387911
Name: sentiment, dtype: float64

In [43]:
X_test.shape

(3011,)

In [44]:
pd.Series(y_test).value_counts(normalize = True)

0.0    0.611757
1.0    0.388243
Name: sentiment, dtype: float64

### Vectorizing texts

In [45]:
bow = CountVectorizer(tokenizer = tokenizer, stop_words = stop_words)

In [46]:
tfidf = TfidfVectorizer(tokenizer = tokenizer, stop_words = stop_words)

In [47]:
X_bow = bow.fit_transform(X_train)

In [48]:
X_tfidf = tfidf.fit_transform(X_train)

### Training and evaluating a model using BOW

In [49]:
logistic_model = LogisticRegression(random_state = 2)

In [50]:
logistic_model.fit(X_bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=2)

In [51]:
y_train_bow_predict = logistic_model.predict(X_bow)
y_test_bow_predict = logistic_model.predict(bow.transform(X_test))

In [52]:
confusion_matrix(y_train, y_train_bow_predict)

array([[1838,    5],
       [  79, 1089]], dtype=int64)

In [53]:
confusion_matrix(y_test, y_test_bow_predict)

array([[1389,  453],
       [ 833,  336]], dtype=int64)

In [54]:
print('Precision:', precision_score(y_test, y_test_bow_predict))
print('Recall:', recall_score(y_test, y_test_bow_predict))
print('F1:', f1_score(y_test, y_test_bow_predict))

Precision: 0.42585551330798477
Recall: 0.2874251497005988
F1: 0.3432073544433095


### Training and evaluating a model using TF-IDF

In [55]:
logistic_model = LogisticRegression(random_state = 2)

In [56]:
logistic_model.fit(X_tfidf, y_train)

LogisticRegression(random_state=2)

In [57]:
y_train_tfidf_predict = logistic_model.predict(X_tfidf)
y_test_tfidf_predict = logistic_model.predict(bow.transform(X_test))

In [58]:
confusion_matrix(y_train, y_train_tfidf_predict)

array([[1839,    4],
       [ 643,  525]], dtype=int64)

In [59]:
confusion_matrix(y_test, y_test_tfidf_predict)

array([[1566,  276],
       [ 952,  217]], dtype=int64)

In [60]:
print('Precision:', precision_score(y_test, y_test_tfidf_predict))
print('Recall:', recall_score(y_test, y_test_tfidf_predict))
print('F1:', f1_score(y_test, y_test_tfidf_predict))

Precision: 0.44016227180527384
Recall: 0.18562874251497005
F1: 0.2611311672683514
