# Identifying type of opinions in spanish wikipedia discussions 

In this analysis, we are going to automatically identify the kind of opinion of authors in the discussions on talk pages of spanish wikipedia.

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from wdds_tokenizer import tokenize
import wdds_tokenizer
import pandas as pd
import numpy as np
import os
import gc
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#matplotlib.style.use('seaborn-ticks')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['ytick.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 14
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['axes.titlesize'] = 18
sns.set_style('ticks')

Load our labelled dataset, containing the opinions in the talk pages of wikipedia segmented by sentences assuming normal punctuation. This dataset includes the initial 1000 edits of talk pages of political leaders in America.

In [60]:
ds = pd.read_csv('data/wdds.csv')
ds.shape

(2097, 18)

In [61]:
ds = ds[~ds.type.isnull()]
ds = ds[ds.subtype!='INVALID']
ds = ds[ds.subtype!='SIGN']
ds = ds[ds.subtype!='OLAN']

ds['target'] = ds['type']
ds['opinion'] = ds['clean_opinion']
ds.shape

(1579, 19)

In [62]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.35, random_state=0)
sss.get_n_splits(ds.opinion, ds.target)

for train_index, test_index in sss.split(ds.opinion, ds.target):
   X_train, X_test = ds.iloc[train_index].opinion, ds.iloc[test_index].opinion
   y_train, y_test = ds.iloc[train_index].target, ds.iloc[test_index].target

print(f'train size: {X_train.shape[0]}')
print(f'test size: {X_test.shape[0]}')

train size: 1026
test size: 553


In [63]:
text_vectorizer = [
    ('vect', CountVectorizer(strip_accents='ascii', 
                             min_df=3, max_df=0.8,
                             stop_words=wdds_tokenizer.stopset, 
                             tokenizer=tokenize)),
    ('tfidf', TfidfTransformer(use_idf=True, sublinear_tf=True))
]

multinb_clf = Pipeline(text_vectorizer+[('clf', MultinomialNB())])
lsvc_clf = Pipeline(text_vectorizer+[('clf', LinearSVC())])
mf_clf = Pipeline(text_vectorizer+[('clf', DummyClassifier(strategy='most_frequent', random_state=0))])
uniform_clf = Pipeline(text_vectorizer+[('clf', DummyClassifier(strategy='uniform', random_state=0))])
strat_clf = Pipeline(text_vectorizer+[('clf', DummyClassifier(strategy='stratified', random_state=0))])
use_stemmer = True

## Evaluation of the performance on the test set

In [64]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

summary_scores = ['CI (95\%)', '']

uniform_clf.fit(X_train, y_train)
scores = cross_val_score(uniform_clf, X_train, y_train)
confidence_interval = scores.std() * 2
print("RND Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), confidence_interval))
summary_scores.extend([confidence_interval, '', ''])

#parameters = {'clf__C':[0.1, 1, 10]}
parameters = {'clf__C':[1]}
parameters['clf__loss']=('hinge','squared_hinge')
parameters['clf__multi_class']= ('ovr', 'crammer_singer')
parameters['clf__class_weight'] = (None, 'balanced')
parameters['clf__tol'] = [1e-3, 1e-4]
lsvc_cv = GridSearchCV(lsvc_clf, parameters)
lsvc_cv.fit(X_train, y_train)
scores = cross_val_score(lsvc_cv, X_train, y_train)
confidence_interval = scores.std() * 2
print(f"LSVC best score: {lsvc_cv.best_score_}")
print(f"LSVC best params: {lsvc_cv.best_params_}")
print("LSVC Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), confidence_interval))
summary_scores.extend([confidence_interval, '',''])

parameters = {'clf__alpha':[ 0.1, 1.0, 10.0]}
parameters['clf__fit_prior']= [True, False]
multinb_cv = GridSearchCV(multinb_clf, parameters)
multinb_cv.fit(X_train, y_train)
scores =  cross_val_score(multinb_cv, X_train, y_train)
confidence_interval = scores.std() * 2
print(f"MNB best score: {multinb_cv.best_score_}")
print(f"MNB best params: {multinb_cv.best_params_}")
print("MNB Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), confidence_interval))
summary_scores.extend([confidence_interval, '',''])



RND Accuracy: 0.33 (+/- 0.05)
LSVC best score: 0.621832358674464
LSVC best params: {'clf__C': 1, 'clf__tol': 0.001}
LSVC Accuracy: 0.62 (+/- 0.06)
MNB best score: 0.6023391812865497
MNB best params: {'clf__alpha': 1.0, 'clf__fit_prior': True}
MNB Accuracy: 0.60 (+/- 0.02)


In [65]:
from sklearn.metrics import precision_recall_fscore_support

docs_test = X_test
labels = y_test.unique()
labels.sort()
rds = pd.DataFrame({'Label': labels})
macro_results = ['Macro', len(y_test)]

predicted = uniform_clf.predict(docs_test)
results = precision_recall_fscore_support(y_test, predicted)
macro_results.extend(precision_recall_fscore_support(y_test, predicted, average='macro')[:3])
rds['Support'] = results[3]
rds['BL-P'] = results[0]
rds['BL-R'] = results[1]
rds['BL-F1'] = results[2]

predicted = lsvc_cv.predict(docs_test)
results = precision_recall_fscore_support(y_test, predicted)
macro_results.extend(precision_recall_fscore_support(y_test, predicted, average='macro')[:3])
rds['LSVC-P'] = results[0]
rds['LSVC-R'] = results[1]
rds['LSVC-F1'] = results[2]

predicted = multinb_cv.predict(docs_test) 
results = precision_recall_fscore_support(y_test, predicted)
macro_results.extend(precision_recall_fscore_support(y_test, predicted, average='macro')[:3])
rds['MNB-P'] = results[0]
rds['MNB-R'] = results[1]
rds['MNB-F1'] = results[2]

rds.loc[len(rds)]=macro_results
rds.loc[len(rds)]=summary_scores

rds.to_csv('output/classif_report.csv', index=False)
rds

Unnamed: 0,Label,Support,BL-P,BL-R,BL-F1,LSVC-P,LSVC-R,LSVC-F1,MNB-P,MNB-R,MNB-F1
0,ARGUMENTATIVE,284.0,0.484536,0.330986,0.393305,0.71134,0.728873,0.72,0.631579,0.84507,0.722892
1,INTERPERSONAL,52.0,0.090909,0.326923,0.142259,0.860465,0.711538,0.778947,1.0,0.653846,0.790698
2,PERFORMATIVE,217.0,0.337209,0.267281,0.298201,0.639269,0.645161,0.642202,0.647482,0.414747,0.505618
3,Macro,553.0,0.304218,0.308397,0.277922,0.737025,0.695191,0.713716,0.759687,0.637888,0.673069
4,CI (95\%),,0.052739,,,0.057428,,,0.017915,,


In [66]:
from sklearn.metrics import confusion_matrix
predicted = lsvc_cv.predict(docs_test)
confusion_matrix(y_test, predicted)

array([[207,   3,  74],
       [ 10,  37,   5],
       [ 74,   3, 140]])

# model usage

In [67]:
sample = X_test[:2]
labels = y_test[:2]

docs_new = sample
predicted = lsvc_cv.predict(docs_new)

for doc, label, pred in zip(docs_new, labels, predicted):
    print('%r => %s, %s' % (doc, label, pred))


'y existen páginas específicas, foros por ejemplo, para exponer las opiniones sin más.' => ARGUMENTATIVE, PERFORMATIVE
'realmente no coincido contigo en tu argumento, pues creo que en aquel caso los motivos eran también evidentes, pero no vamos a retomar esa discusión.' => ARGUMENTATIVE, ARGUMENTATIVE
