In [None]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, hamming_loss
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('spanish'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
import seaborn as sns
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset




from skmultilearn.model_selection import iterative_train_test_split



### Helper Functions

- `model_performance_plot`: This helper function displays how accuracy, F1 score and Hamming loss vary with the prediction threshold. It only works with scikit-learn estimators that implement the `predict_proba` method.

In [None]:
def model_performance_plot(y_true=None, y_proba=None,
                           title="Model performance by prediction threshold"):
    '''
    Plot how accuracy, micro-averaged F1 score and Hamming loss vary with
    the prediction threshold.

    This function only works with models that expose ``predict_proba``.

    Parameters
    ----------
    y_true : optional
        Ground-truth label matrix. When omitted, the notebook-level
        variable ``y_test`` is used, so existing zero-argument calls keep
        working.
    y_proba : optional
        Per-label probabilities, as returned by ``predict_proba``. When
        omitted, the notebook-level variable ``predictions_proba`` is used.
    title : str, optional
        Title of the figure.

    Raises
    ------
    NameError
        If an argument is omitted and the matching notebook-level variable
        has not been defined yet.
    __________________________________________________________________________
    '''
    # Backward compatibility: fall back on the notebook globals that the
    # original zero-argument version read implicitly.
    if y_true is None:
        y_true = y_test
    if y_proba is None:
        y_proba = predictions_proba

    # Thresholds are swept in percent (5% .. 59%) and converted to
    # probabilities when binarising the predictions.
    thresholds = list(range(5, 60))
    f1_scores = []
    hamming_losses = []
    accuracies = []

    for t in thresholds:
        y_pred = (y_proba >= t / 100).astype(int)
        accuracies.append(accuracy_score(y_true, y_pred))
        f1_scores.append(f1_score(y_true, y_pred, average="micro"))
        hamming_losses.append(hamming_loss(y_true, y_pred))

    with plt.style.context('ggplot'):
        # Size the figure explicitly instead of mutating the global rcParams.
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(thresholds, f1_scores, label='F1')
        ax.plot(thresholds, hamming_losses, label='Hamming loss')
        ax.plot(thresholds, accuracies, label='Accuracy')
        ax.legend(loc='center left', fontsize=14)
        ax.set_ylabel("metrics", fontsize=14)
        ax.set_xlabel("threshold (%)", fontsize=14)
        ax.set_title(title, fontsize=18)
    plt.show()
    
    
def group_labels_in_column(df):
    '''
    Stack all the target columns into a single column.

    The first column of ``df`` is taken as the text; every remaining column
    is treated as a binary label indicator. For each (row, label) pair whose
    indicator equals 1, one output record is produced, so a text carrying
    several labels appears once per label. Records follow the original row
    order, then the column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the text and whose other columns
        hold 0/1 label indicators.

    Returns
    -------
    X : pandas.Series
        The text of each (text, label) record.
    y : pandas.Series
        The label (column name) of each record.
    '''
    text = []
    category = []

    for _, row in df.iterrows():
        row_text = row.iloc[0]  # positional access; label-based row[0] is deprecated
        # `Series.iteritems` was removed in pandas 2.0; `items` works on all versions.
        for label, flag in row.iloc[1:].items():
            if flag == 1:
                category.append(label)
                text.append(row_text)

    tweets_new = pd.DataFrame({'text': text, 'category': category})

    X = tweets_new.iloc[:, 0]
    y = tweets_new.iloc[:, 1]
    # The original built X and y but never returned them.
    return X, y


### Reading Data


In [None]:
# Make the project's data-loading module importable. The path is relative,
# so it is resolved against the notebook's current working directory.
import sys
sys.path.append(r'../src/c4v/data')

from data_loader import BratDataLoader

# Load the dataset from the processed data directory.
dataset = BratDataLoader(['../data/processed/brat/sampled_58_30'])

# Create the X and y variables.
# NOTE(review): presumably X holds the raw texts and y a multi-label
# indicator matrix — confirm against BratDataLoader.
X = dataset.X
y = dataset.y

# Train/test split: hold out 30% of the samples, with a fixed seed for
# reproducibility.
# NOTE(review): `iterative_train_test_split` is imported at the top of the
# notebook but a plain random split is used here — confirm this is intended.
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21)



## Models

### Naive Bayes

In [None]:
# Multi-label Naive Bayes: BinaryRelevance trains one MultinomialNB per label.
# `stop_words` is passed as a list because recent scikit-learn versions reject
# a set for this parameter; older versions accept a list as well.
# NOTE(review): TfidfVectorizer already applies TF-IDF weighting, so the extra
# TfidfTransformer step re-weights the features a second time — confirm this
# is intended.
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('tfidf_trans', TfidfTransformer()),
                ('clf', BinaryRelevance(MultinomialNB(
                    fit_prior=True, class_prior=None))),
                ])

NB_pipeline.fit(X_train, y_train)

# Hard predictions feed the printed metrics; the probabilities are read by
# model_performance_plot() through the notebook-level `predictions_proba`.
prediction = NB_pipeline.predict(X_test)
predictions_proba = NB_pipeline.predict_proba(X_test)

print(
f'''
    Accuracy: {accuracy_score(y_test, prediction)}
    Recall: {recall_score(y_test, prediction, average = 'weighted')}
    F1: {f1_score(y_test, prediction, average = 'weighted')}
'''
)

model_performance_plot()

### Support vector classifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Multi-label linear SVM: LabelPowerset treats every distinct label
# combination as one class of a single multi-class problem.
# `stop_words` is passed as a list because recent scikit-learn versions reject
# a set for this parameter; older versions accept a list as well.
# NOTE(review): TfidfVectorizer already applies TF-IDF weighting, so the extra
# TfidfTransformer step re-weights the features a second time — confirm this
# is intended.
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('tfidf_trans', TfidfTransformer()),
                ('clf', LabelPowerset(LinearSVC(random_state = 21))),
            ])

SVC_pipeline.fit(X_train, y_train)
# LinearSVC has no predict_proba, so no threshold plot is drawn for this model.
prediction = SVC_pipeline.predict(X_test)


print(
f'''
    Accuracy: {accuracy_score(y_test, prediction)}
    Recall: {recall_score(y_test, prediction, average = 'weighted')}
    F1: {f1_score(y_test, prediction, average = 'weighted')}
'''
)



### Logistic Regression

In [None]:
# Multi-label logistic regression: LabelPowerset treats every distinct label
# combination as one class of a single multi-class problem.
# `stop_words` is passed as a list because recent scikit-learn versions reject
# a set for this parameter; older versions accept a list as well.
# NOTE(review): TfidfVectorizer already applies TF-IDF weighting, so the extra
# TfidfTransformer step re-weights the features a second time — confirm this
# is intended.
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('tfidf_trans', TfidfTransformer()),
                ('clf', LabelPowerset(LogisticRegression(solver='sag', random_state = 21))),
            ])

LogReg_pipeline.fit(X_train, y_train)

# Hard predictions feed the printed metrics; the probabilities are read by
# model_performance_plot() through the notebook-level `predictions_proba`.
prediction = LogReg_pipeline.predict(X_test)
predictions_proba = LogReg_pipeline.predict_proba(X_test)

print(
f'''
    Accuracy: {accuracy_score(y_test, prediction)}
    Recall: {recall_score(y_test, prediction, average = 'weighted')}
    F1: {f1_score(y_test, prediction, average = 'weighted')}
'''
)

model_performance_plot()

### Stochastic Gradient Descent Classifier

In [None]:
# The pipeline gets its own name. Binding it to `SGDClassifier` would overwrite
# the imported estimator class, so re-running this cell would fail with
# "'Pipeline' object is not callable".
# `stop_words` is passed as a list because recent scikit-learn versions reject
# a set for this parameter; older versions accept a list as well.
# NOTE(review): TfidfVectorizer already applies TF-IDF weighting, so the extra
# TfidfTransformer step re-weights the features a second time — confirm this
# is intended.
# NOTE(review): loss="log" was renamed to "log_loss" in newer scikit-learn
# releases — update it to match the installed version.
SGD_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('tfidf_trans', TfidfTransformer()),
                ('clf', LabelPowerset(SGDClassifier(random_state = 21, loss="log", penalty="elasticnet"))),
            ])

SGD_pipeline.fit(X_train, y_train)

# Hard predictions feed the printed metrics; the probabilities are read by
# model_performance_plot() through the notebook-level `predictions_proba`.
prediction = SGD_pipeline.predict(X_test)
predictions_proba = SGD_pipeline.predict_proba(X_test)

print(
f'''
    Accuracy: {accuracy_score(y_test, prediction)}
    Recall: {recall_score(y_test, prediction, average = 'weighted')}
    F1: {f1_score(y_test, prediction, average = 'weighted')}
'''
)

model_performance_plot()