# Identifying the type of opinions in Spanish Wikipedia discussions

In this analysis, we automatically identify the kind of opinion expressed by authors in the discussions on the talk pages of the Spanish Wikipedia.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from wdds_tokenizer import tokenize
import wdds_tokenizer
import pandas as pd
import numpy as np
import os
import gc
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#matplotlib.style.use('seaborn-ticks')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['axes.titlesize'] = 18
sns.set_style('ticks')

Load our labelled dataset, which contains the opinions from Wikipedia talk pages, segmented into sentences assuming normal punctuation. This dataset includes the initial 1000 edits of the talk pages of political leaders in America.

In [2]:
# Read the labelled dataset from disk and display its (rows, columns) shape.
ds = pd.read_csv(filepath_or_buffer='data/wdds.csv')
ds.shape

(2097, 20)

In [3]:
# Keep only rows that have a `type` annotation and whose `subtype` is not one
# of the excluded codes. Rows with a missing `subtype` are kept, exactly as
# with the original chained `!=` comparisons.
excluded_subtypes = ['INVALID', 'SIGN', 'OLAN']
keep_mask = ds['type'].notnull() & ~ds['subtype'].isin(excluded_subtypes)

# `.assign` returns a new frame, so the derived columns are never written onto
# a filtered slice of the raw data (no SettingWithCopyWarning).
# `target` is the label to predict; `opinion` is the cleaned text fed to the models.
ds = ds[keep_mask].assign(
    target=lambda frame: frame['type'],
    opinion=lambda frame: frame['clean_opinion'],
)
ds.shape

(1583, 21)

In [4]:
# One stratified shuffle split (60% train / 40% test), stratified on `target`
# and seeded so the partition is reproducible.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=0)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(ds.opinion, ds.target))

train_rows, test_rows = ds.iloc[train_index], ds.iloc[test_index]
X_train, X_test = train_rows.opinion, test_rows.opinion
y_train, y_test = train_rows.target, test_rows.target

print(f'train size: {X_train.shape[0]}')
print(f'test size: {X_test.shape[0]}')

train size: 949
test size: 634


In [5]:
def build_text_steps():
    """Return a fresh, unfitted list of the text feature-extraction steps.

    The steps are a bag-of-words ``CountVectorizer`` (project tokenizer and
    stop-word set, accents stripped, terms limited by ``min_df``/``max_df``)
    followed by a TF-IDF re-weighting with sublinear term frequency.

    A new pair of transformer objects is created on every call. A ``Pipeline``
    fits the step objects it is given in place, so pipelines built from one
    shared list would silently share (and overwrite) each other's fitted
    vocabulary and IDF weights.
    """
    return [
        ('vect', CountVectorizer(strip_accents='ascii',
                                 min_df=3, max_df=0.8,
                                 stop_words=wdds_tokenizer.stopset,
                                 tokenizer=tokenize)),
        ('tfidf', TfidfTransformer(use_idf=True, sublinear_tf=True)),
    ]


def build_text_pipeline(classifier):
    """Return a pipeline of fresh text steps followed by ``classifier`` as 'clf'."""
    return Pipeline(build_text_steps() + [('clf', classifier)])


# Kept so that any code still referring to the original step list keeps working.
text_vectorizer = build_text_steps()

# Candidate models.
multinb_clf = build_text_pipeline(MultinomialNB())
lsvc_clf = build_text_pipeline(LinearSVC())

# Chance-level baselines (seeded for reproducibility).
mf_clf = build_text_pipeline(DummyClassifier(strategy='most_frequent', random_state=0))
uniform_clf = build_text_pipeline(DummyClassifier(strategy='uniform', random_state=0))
strat_clf = build_text_pipeline(DummyClassifier(strategy='stratified', random_state=0))

# NOTE(review): not read anywhere in the visible notebook; kept as-is in case
# other code relies on it -- TODO confirm and remove if unused.
use_stemmer = True

## Model selection and evaluation of the performance on the test set

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


def report_cv_accuracy(name, estimator, X, y):
    """Cross-validate ``estimator`` on (X, y) and print its mean score.

    Uses ``cross_val_score`` with its default folds and scoring.

    Returns:
        Twice the standard deviation of the fold scores, which this notebook
        reports as the half-width of an approximate 95% interval.
    """
    fold_scores = cross_val_score(estimator, X, y)
    half_width = fold_scores.std() * 2
    print("%s Accuracy: %0.2f (+/- %0.2f)" % (name, fold_scores.mean(), half_width))
    return half_width


def tune_and_report(name, pipeline, param_grid, X, y):
    """Grid-search ``pipeline`` over ``param_grid`` and report the outcome.

    The fitted search is then cross-validated itself (nested cross-validation),
    so the printed accuracy also accounts for the hyper-parameter selection.

    Returns:
        ``(fitted_search, half_width)`` where ``half_width`` is the value
        returned by ``report_cv_accuracy``.
    """
    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, y)
    print(f"{name} best score: {search.best_score_}")
    print(f"{name} best params: {search.best_params_}")
    return search, report_cv_accuracy(name, search, X, y)


# Last row of the final report table: a label, an empty "Support" cell, then
# three cells per model (the interval under the precision column, two blanks).
# Raw string: the backslash is part of the label (LaTeX-escaped percent sign)
# and '\%' is not a valid Python escape sequence.
summary_scores = [r'CI (95\%)', '']

# Random baseline. It is also fitted on the whole training set because the
# test-set evaluation later calls `uniform_clf.predict`.
uniform_clf.fit(X_train, y_train)
summary_scores.extend([report_cv_accuracy('RND', uniform_clf, X_train, y_train), '', ''])

# Linear SVM.
lsvc_grid = {
    'clf__C': [0.1, 1, 10],
    'clf__loss': ('hinge', 'squared_hinge'),
    'clf__multi_class': ('ovr', 'crammer_singer'),
    'clf__class_weight': (None, 'balanced'),
    'clf__tol': [1e-3, 1e-4],
}
lsvc_cv, lsvc_half_width = tune_and_report('LSVC', lsvc_clf, lsvc_grid, X_train, y_train)
summary_scores.extend([lsvc_half_width, '', ''])

# Multinomial naive Bayes.
multinb_grid = {
    'clf__alpha': [0.1, 1.0, 10.0],
    'clf__fit_prior': [True, False],
}
multinb_cv, multinb_half_width = tune_and_report('MNB', multinb_clf, multinb_grid, X_train, y_train)
summary_scores.extend([multinb_half_width, '', ''])



RND Accuracy: 0.30 (+/- 0.01)
LSVC best score: 0.6164383561643836
LSVC best params: {'clf__C': 1, 'clf__class_weight': None, 'clf__loss': 'squared_hinge', 'clf__multi_class': 'ovr', 'clf__tol': 0.001}
LSVC Accuracy: 0.62 (+/- 0.02)
MNB best score: 0.5879873551106428
MNB best params: {'clf__alpha': 0.1, 'clf__fit_prior': True}
MNB Accuracy: 0.58 (+/- 0.03)


In [7]:
from sklearn.metrics import precision_recall_fscore_support

docs_test = X_test

# Sorted class labels: used both as the table's "Label" column and as the
# explicit `labels=` order for the metrics, so every per-class value is
# guaranteed to line up with its label.
labels = np.sort(y_test.unique())
rds = pd.DataFrame({'Label': labels})
macro_results = ['Macro', len(y_test)]

# (column prefix, fitted model); the order defines the column order of the table.
evaluated_models = [
    ('BL', uniform_clf),
    ('LSVC', lsvc_cv),
    ('MNB', multinb_cv),
]

for prefix, model in evaluated_models:
    predicted = model.predict(docs_test)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_test, predicted, labels=labels)

    # Support depends only on y_test, so it is stored once, ahead of the
    # first model's columns.
    if 'Support' not in rds.columns:
        rds['Support'] = support
    rds[f'{prefix}-P'] = precision
    rds[f'{prefix}-R'] = recall
    rds[f'{prefix}-F1'] = f1

    # Macro averages are the unweighted means of the per-class values.
    macro_results.extend([precision.mean(), recall.mean(), f1.mean()])

# Append the macro-average row and the cross-validation interval row.
rds.loc[len(rds)] = macro_results
rds.loc[len(rds)] = summary_scores

# Make sure the target directory exists before writing the report.
os.makedirs('output', exist_ok=True)
rds.to_csv('output/classif_report.csv', index=False)
rds

Unnamed: 0,Label,Support,BL-P,BL-R,BL-F1,LSVC-P,LSVC-R,LSVC-F1,MNB-P,MNB-R,MNB-F1
0,ARGUMENTATIVE,309.0,0.493088,0.346278,0.406844,0.657343,0.608414,0.631933,0.594444,0.692557,0.639761
1,INTERPERSONAL,60.0,0.101852,0.366667,0.15942,0.928571,0.65,0.764706,0.95,0.633333,0.76
2,PERFORMATIVE,265.0,0.452736,0.343396,0.390558,0.565359,0.65283,0.605954,0.551282,0.486792,0.517034
3,Macro,634.0,0.349225,0.352114,0.318941,0.717091,0.637081,0.667531,0.698575,0.604227,0.638932
4,CI (95\%),,0.007185,,,0.023118,,,0.028611,,


In [8]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the tuned multinomial naive Bayes model on the test set
# (rows: true class, columns: predicted class, classes in sorted order).
# The predictions are computed here rather than taken from whatever an earlier
# cell last stored in `predicted`.
confusion_matrix(y_test, multinb_cv.predict(X_test))

array([[214,   1,  94],
       [ 11,  38,  11],
       [135,   1, 129]])

## Model usage

In [9]:
# Take the first two held-out sentences (by position) and their true labels.
# Dedicated names are used so the class-label array and the predictions from
# the evaluation cells are not overwritten.
sample = X_test.iloc[:2]
sample_labels = y_test.iloc[:2]

docs_new = sample
sample_predictions = lsvc_cv.predict(docs_new)

# Print: text => true label, predicted label
for doc, label, pred in zip(docs_new, sample_labels, sample_predictions):
    print('%r => %s, %s' % (doc, label, pred))


'tengo una cuestión: en el recuadro en el que aparece el gabinete del actual presidente de la república no aparece el nombre del actual secretario de gobernación, aunque en el código de la página sí se incluyó su nombre, su nombramiento y según el código aún permanece en el cargo, pero ello no es visible en la página.' => ARGUMENTATIVE, ARGUMENTATIVE
'hola a todos, hice una modificacion de una linea en la parte de "tensiones con colombia" y el problema es que no se como poner el link de la fuente..' => PERFORMATIVE, PERFORMATIVE
