In [3]:
import pandas as pd
import random
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC

# Loading dataset
# Each row holds the named-entity frequencies extracted for one tweet with the
# spaCy library, together with the extracted sentiment classification.
dataset = pd.read_csv('df_features.csv')

# Show the dataset information: per-column dtypes and non-null counts, the
# total number of missing cells, and the (rows, columns) shape.
dataset.info()
# A bare `dataset.isnull()` only builds a boolean frame that is discarded in
# this position, so aggregate it to make the missing-value check visible.
print("Missing values:", int(dataset.isnull().sum().sum()))
print(dataset.shape)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 872 entries, 0 to 871
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   EVENT   872 non-null    int64 
 1   FAC     872 non-null    int64 
 2   GPE     872 non-null    int64 
 3   LOC     872 non-null    int64 
 4   NORP    872 non-null    int64 
 5   ORG     872 non-null    int64 
 6   PERSON  872 non-null    int64 
 7   OTHER   872 non-null    int64 
 8   SENT    872 non-null    int64 
 9   CLASS   872 non-null    object
dtypes: int64(9), object(1)
memory usage: 68.2+ KB
(872, 10)


In [4]:
# Preview the first five rows of the loaded frame
print(dataset.head(n=5))


   EVENT  FAC  GPE  LOC  NORP  ORG  PERSON  OTHER  SENT CLASS
0      0    0    0    0     1    1       1      0     3   NON
1      0    0    0    0     0    0       0      0     3   NON
2      0    0    1    0     2    0       0      0     3   NON
3      0    0    0    0     3    0       0      2     3   NON
4      0    0    0    0     2    0       0      0     3   NON


In [5]:
# Summary statistics for the numeric columns.
# The display options are widened only inside this block so that pandas prints
# every column instead of eliding the middle ones with "...".
with pd.option_context('display.max_columns', None, 'display.width', 200):
    print(dataset.describe())

            EVENT         FAC         GPE  ...      PERSON       OTHER        SENT
count  872.000000  872.000000  872.000000  ...  872.000000  872.000000  872.000000
mean     0.005734    0.012615    0.338303  ...    0.446101    0.491972    2.243119
std      0.075549    0.139135    0.750966  ...    0.878018    0.947499    0.892923
min      0.000000    0.000000    0.000000  ...    0.000000    0.000000    1.000000
25%      0.000000    0.000000    0.000000  ...    0.000000    0.000000    1.000000
50%      0.000000    0.000000    0.000000  ...    0.000000    0.000000    2.000000
75%      0.000000    0.000000    0.000000  ...    1.000000    1.000000    3.000000
max      1.000000    3.000000    6.000000  ...    7.000000    7.000000    5.000000

[8 rows x 9 columns]


In [6]:
# Split the frame by position: every column except the last is a feature,
# the last column is the class label.
X, Y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Emit both parts, each preceded by its banner line, in a single print call.
print(
    '---------------------------(features)-------------------------------',
    X,
    '---------------------------(class)----------------------------------',
    Y,
    sep='\n',
)

---------------------------(features)-------------------------------
     EVENT  FAC  GPE  LOC  NORP  ORG  PERSON  OTHER  SENT
0        0    0    0    0     1    1       1      0     3
1        0    0    0    0     0    0       0      0     3
2        0    0    1    0     2    0       0      0     3
3        0    0    0    0     3    0       0      2     3
4        0    0    0    0     2    0       0      0     3
..     ...  ...  ...  ...   ...  ...     ...    ...   ...
867      0    0    3    0     0    0       0      0     1
868      0    0    0    0     0    0       0      0     2
869      0    0    0    0     0    0       0      0     2
870      0    0    0    0     0    0       0      0     2
871      1    0    3    0     1    1       0      3     1

[872 rows x 9 columns]
---------------------------(class)----------------------------------
0       NON
1       NON
2       NON
3       NON
4       NON
       ... 
867    ANTI
868    ANTI
869    ANTI
870    ANTI
871    ANTI
Name: CLAS

In [8]:
# Separating test and training data: hold out `validation_size` of the rows
# for validation, with a fixed seed so the split is reproducible.
validation_size = 0.10
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)
scoring = 'accuracy'

# Candidate classifiers as (label, estimator) pairs, all built with their
# default hyper-parameters.
models = [
    # ('LR', LogisticRegression()),  # left disabled
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Cross-validation
# The splitter does not depend on the model being evaluated, so it is built
# once and shared by every candidate instead of being re-created per iteration.
kfold = model_selection.KFold(n_splits=10, random_state=None)
results = []  # per-model arrays of fold scores
names = []    # labels, in the same order as `results`
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean fold score (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LDA: 0.975738 (0.021733)
KNN: 0.973207 (0.019325)
CART: 0.974489 (0.023518)
NB: 0.942486 (0.072826)
SVM: 0.978302 (0.016220)


In [13]:
# SVM classifier: fit on the training split, then predict the validation split
SVMclassifier = SVC().fit(X_train, Y_train)
predictions = SVMclassifier.predict(X_validation)

# Scores: overall accuracy first, then the per-class report
validation_accuracy = accuracy_score(Y_validation, predictions)
print("Score do classificador: %.2f" % validation_accuracy)
validation_report = classification_report(Y_validation, predictions)
print(validation_report)

Score do classificador: 0.99
              precision    recall  f1-score   support

        ANTI       0.98      1.00      0.99        44
         NON       1.00      0.98      0.99        44

    accuracy                           0.99        88
   macro avg       0.99      0.99      0.99        88
weighted avg       0.99      0.99      0.99        88



In [21]:
import pickle

# save the model to disk
filename = 'finalized_model.sav'
# A context manager guarantees the file handle is flushed and closed as soon
# as the model has been written, instead of leaking an open handle.
with open(filename, 'wb') as model_file:
    pickle.dump(SVMclassifier, model_file)

# load the dataset with new samples
# NOTE: this rebinds `dataset`, which up to this point referred to the
# training frame loaded from 'df_features.csv'.
dataset = pd.read_csv('df_features_test.csv')
dataset.info()
# A bare `dataset.isnull()` would be discarded here; aggregate it so the
# missing-value check is actually shown.
print("Missing values:", int(dataset.isnull().sum().sum()))
print(dataset.shape)

# split by position: all columns but the last are features, the last is the class
X_test = dataset.iloc[:, :-1]
Y_test = dataset.iloc[:, -1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754 entries, 0 to 753
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   EVENT   754 non-null    int64 
 1   FAC     754 non-null    int64 
 2   GPE     754 non-null    int64 
 3   LOC     754 non-null    int64 
 4   NORP    754 non-null    int64 
 5   ORG     754 non-null    int64 
 6   PERSON  754 non-null    int64 
 7   OTHER   754 non-null    int64 
 8   SENT    754 non-null    int64 
 9   CLASS   754 non-null    object
dtypes: int64(9), object(1)
memory usage: 59.0+ KB
(754, 10)


In [24]:
# load and use the proposed model from disk
# NOTE: unpickling can execute arbitrary code, so only load files that this
# notebook wrote itself. The context manager closes the file handle once the
# model has been read instead of leaving it open.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the restored model on the new samples and show the result.
# NOTE(review): the recorded run printed ~0.50 here versus 0.99 on the
# validation split - worth investigating before relying on this model.
result = loaded_model.score(X_test, Y_test)
print(result)

0.4960212201591512


In [10]:
#MLP classifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

def model_tester(inputs, target, hidden_layer_sizes=(5,), cv=10, random_state=0, max_iter=10000):
    """Cross-validate an MLP classifier and print its mean score.

    Args:
        inputs: Feature matrix passed to ``cross_val_score``.
        target: Class labels aligned with ``inputs``.
        hidden_layer_sizes: Forwarded to ``MLPClassifier``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        random_state: Seed forwarded to ``MLPClassifier``.
        max_iter: Iteration limit forwarded to ``MLPClassifier``.

    Returns:
        The array of per-fold scores produced by ``cross_val_score``, so the
        caller can reuse them instead of only reading the printed line.
    """
    clf = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        random_state=random_state,
        max_iter=max_iter,
    )
    scores = cross_val_score(clf, inputs, target, cv=cv)
    # The mean is multiplied by 100, so the printed figure is a percentage.
    print("Score do classificador: %.2f" % (scores.mean()*100))
    return scores

# Score
# The result is bound to a name so the notebook does not echo the returned
# array below the printed score.
# NOTE(review): this evaluates on the full X / Y, whereas the other candidates
# were cross-validated on X_train / Y_train - confirm this is intended.
mlp_scores = model_tester(X, Y)

Score do classificador: 97.71
