# Final Results

In [0]:
import numpy as np
import pandas as pd
import pickle
from joblib import dump, load
pd.set_option('max_colwidth', 160)
import warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings(action='once')

# Read the three CSV files used throughout the notebook, in the order
# train / validation / final test.
data_train, data_validation, data_finaltest = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Train_data_compeition.csv',
        'Validation_data_competition.csv',
        'testdata_gold_labels.csv',
    )
)

In [0]:
# feature engineering
# Add a `total_words` column to every split: the number of whitespace-separated
# tokens in `tweet_content`. The vectorised `.str` accessor replaces the row-wise
# `apply(..., axis=1)`; a missing tweet now yields NaN instead of raising.
for split_frame in (data_train, data_validation, data_finaltest):
    split_frame['total_words'] = split_frame['tweet_content'].str.split().str.len()

In [0]:
# text pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin
import re
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report, confusion_matrix, roc_auc_score, recall_score

class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one field from its input.

    ``transform`` indexes the input with the bare key (``X[field]``).
    """

    def __init__(self, field):
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[self.field]``."""
        column_key = self.field
        return X[column_key]
      
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one field, keeping it two-dimensional.

    ``transform`` indexes the input with a one-element list (``X[[field]]``).
    """

    def __init__(self, field):
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[[self.field]]``."""
        column_keys = [self.field]
        return X[column_keys]


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def Tokenizer__(str_input):
    """Split a string into lower-cased, Porter-stemmed tokens.

    Every character other than ASCII letters, digits and '-' is replaced by a
    space before the text is lower-cased and split on whitespace.

    Args:
        str_input: the raw text to tokenise.

    Returns:
        list: the stemmed tokens, in order of appearance.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    stemmer = nltk.PorterStemmer()
    return list(map(stemmer.stem, tokens))
  

# Alternative kept for reference: the same words as a set.
#stop_words = set(stopwords.words('english'))
# English stop-word list from NLTK; passed as `stop_words` to the TF-IDF vectorisers below.
stop_words = stopwords.words('english')
# Disabled on purpose: list.extend() returns None, so assigning its result would
# replace the list with None.
#stop_words = stop_words.extend(['RT'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
def print_results(y_test, preds):
  """Print accuracy, precision, F1 and ROC AUC for a set of predictions.

  The unused classification report and confusion matrix are no longer computed.
  ROC AUC is computed from the labels passed in `preds`, not from scores.

  Args:
    y_test: the true labels.
    preds: the predicted labels, aligned with `y_test`.
  """
  metric_lines = [
      "Accuracy: " + str(accuracy_score(y_test, preds)),
      "Precision: " + str(precision_score(y_test, preds)),
      "F1 Score: " + str(f1_score(y_test, preds)),
      "ROC AUC:" + str(roc_auc_score(y_test, preds)),
  ]
  print('\n'.join(metric_lines), '\n')

def print_cv_results(cv_results):
  """Print the mean of every entry of a cross-validation result mapping.

  Args:
    cv_results: mapping from a score name (str) to a collection of numbers.
  """
  for score, fold_values in cv_results.items():
    print(score + ": ", np.mean(fold_values))

## Task A

In [0]:
# The Task A models are fit on train + validation combined.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and likewise keeps each frame's own index labels.
data_all = pd.concat([data_train, data_validation])
X = data_all[['tweet_content','total_words']]
Y = data_all['harassment']

### Approach 1: RF (Random Forest)

In [0]:
# Text branch: TF-IDF over 1-3 grams of stemmed tokens, reduced to 300 SVD components.
text_branch = Pipeline([
    ('colext', TextSelector('tweet_content')),
    ('tfidf', TfidfVectorizer(tokenizer=Tokenizer__, stop_words=stop_words,
                              min_df=5, max_df=0.9, ngram_range=(1, 3))),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
])

# Numeric branch: standardised word count of the tweet.
length_branch = Pipeline([
    ('wordext', NumberSelector('total_words')),
    ('wscaler', StandardScaler()),
])

# NOTE: neither the SVD nor the forest is seeded, so results vary between runs.
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', text_branch),
        ('words', length_branch),
    ])),
    ('clf', RandomForestClassifier()),
])

classifier.fit(X, Y)
dump(classifier, 'taskA_RF.joblib')

['taskA_RF.joblib']

### Approach 2: XGBoost

In [0]:
# Text branch: TF-IDF over 1-3 grams of stemmed tokens, reduced to 300 SVD components.
text_branch = Pipeline([
    ('colext', TextSelector('tweet_content')),
    ('tfidf', TfidfVectorizer(tokenizer=Tokenizer__, stop_words=stop_words,
                              min_df=5, max_df=0.9, ngram_range=(1, 3))),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
])

# Numeric branch: standardised word count of the tweet.
length_branch = Pipeline([
    ('wordext', NumberSelector('total_words')),
    ('wscaler', StandardScaler()),
])

# Shallow boosted trees: depth 2, 300 rounds, learning rate 0.1.
booster = XGBClassifier(max_depth=2, n_estimators=300, learning_rate=0.1)

classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', text_branch),
        ('words', length_branch),
    ])),
    ('clf', booster),
])
classifier.fit(X, Y)
dump(classifier, 'taskA_XGBoost.joblib')

['taskA_XGBoost.joblib']

### Approach 3: LSTM

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping

# Tokenizer / network sizes: vocabulary cap, padded sequence length, embedding width.
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 15
EMBEDDING_DIM = 100

# The backslash in the filter set is written as '\\]': a bare '\]' is an invalid
# escape sequence that newer Python versions warn about. The resulting string is unchanged.
tokenizerLSTM = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizerLSTM.fit_on_texts(data_all['tweet_content'].values)
word_index = tokenizerLSTM.word_index

# Training tensors: padded index sequences and one-hot labels.
X = tokenizerLSTM.texts_to_sequences(data_all['tweet_content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
Y = pd.get_dummies(data_all['harassment']).values

# Tensors used for early stopping.
# NOTE(review): data_all is built from data_train + data_validation, so these rows
# are also part of the training data and val_loss is not a held-out estimate.
X_val = tokenizerLSTM.texts_to_sequences(data_validation['tweet_content'].values)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)
Y_val = pd.get_dummies(data_validation['harassment']).values

# Embedding -> spatial dropout -> LSTM(200) -> 2-way softmax.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 50
batch_size = 64

# Training stops once val_loss has not improved by at least 1e-4 for 3 epochs.
model.fit(X, Y, epochs=epochs, batch_size=batch_size,validation_data=(X_val, Y_val),callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
# NOTE(review): the scoring cells reload this file with joblib.load, so joblib is
# kept here; confirm that pickling a Keras model works with the installed version.
dump(model, 'taskA_LSTM.joblib')

Train on 8499 samples, validate on 2125 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


['taskA_LSTM.joblib']

### Scores

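#### Validation

Note: the Task A models above are fit on `data_all` (train + validation), so the validation scores below are in-sample and overstate generalisation; compare them with the test-final scores.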

In [0]:
# Inputs and labels for scoring the sklearn pipelines on the validation split.
X = data_validation[['tweet_content','total_words']]
Y = data_validation['harassment']

In [0]:
# Approach 1: RF
# Reload the persisted pipeline and score it on the current X / Y.
approach = 'taskA_RF'
print(approach, 'validation')
print('============')
model = load(f'{approach}.joblib')
preds = model.predict(X)
print_results(Y, preds)

taskA_RF validation
Accuracy: 0.9854117647058823
Precision: 0.993431855500821
F1 Score: 0.9750201450443191
ROC AUC:0.977299655777595 



In [0]:
# Approach 2: XGBoost
# Reload the persisted pipeline and score it on the current X / Y.
approach = 'taskA_XGBoost'
print(approach, 'validation')
print('============')
model = load(f'{approach}.joblib')
preds = model.predict(X)
print_results(Y, preds)

taskA_XGBoost validation
Accuracy: 0.9228235294117647
Precision: 0.9020618556701031
F1 Score: 0.8649093904448105
ROC AUC:0.896259018881362 



In [0]:
# Approach 3: LSTM
approach = 'taskA_LSTM'
print(approach, 'validation')
print('============')
model = load(f'{approach}.joblib')
# Encode the validation tweets with the tokenizer fitted during training.
X = pad_sequences(
    tokenizerLSTM.texts_to_sequences(data_validation['tweet_content'].values),
    maxlen=MAX_SEQUENCE_LENGTH)

# Index of the highest softmax output per tweet.
preds = np.argmax(model.predict(X, batch_size=64, verbose=1), axis=1)
preds_final = list(preds)
# One-hot encode the labels, then take the label of the winning column per row.
Y = pd.get_dummies(data_validation['harassment']).idxmax(axis=1)
Y_final = list(Y)
print_results(Y_final, preds_final)

taskA_LSTM validation
Accuracy: 0.9943529411764706
Precision: 0.990506329113924
F1 Score: 0.990506329113924
ROC AUC:0.9932437874638609 



#### Test Final

In [0]:
# Inputs and labels for scoring the sklearn pipelines on the final test split.
X = data_finaltest[['tweet_content','total_words']]
Y = data_finaltest['harassment']

In [0]:
# Approach 1: RF
# Reload the persisted pipeline and score it on the current X / Y.
approach = 'taskA_RF'
print(approach, 'test final')
print('============')
model = load(f'{approach}.joblib')
preds = model.predict(X)
print_results(Y, preds)

taskA_RF test final
Accuracy: 0.7974564295807819
Precision: 0.7986798679867987
F1 Score: 0.5295404814004376
ROC AUC:0.6778640488746872 



In [0]:
# Approach 2: XGBoost
# Reload the persisted pipeline and score it on the current X / Y.
approach = 'taskA_XGBoost'
print(approach, 'test final')
print('============')
model = load(f'{approach}.joblib')
preds = model.predict(X)
print_results(Y, preds)

taskA_XGBoost test final
Accuracy: 0.8191238813000471
Precision: 0.814404432132964
F1 Score: 0.6049382716049382
ROC AUC:0.7184331133799219 



In [0]:
# Approach 3: LSTM
approach = 'taskA_LSTM'
print(approach, 'test final')
print('============')
model = load(f'{approach}.joblib')
# Encode the test tweets with the tokenizer fitted during training.
X = pad_sequences(
    tokenizerLSTM.texts_to_sequences(data_finaltest['tweet_content'].values),
    maxlen=MAX_SEQUENCE_LENGTH)

# Index of the highest softmax output per tweet.
preds = np.argmax(model.predict(X, batch_size=64, verbose=1), axis=1)
preds_final = list(preds)
# One-hot encode the labels, then take the label of the winning column per row.
Y = pd.get_dummies(data_finaltest['harassment']).idxmax(axis=1)
Y_final = list(Y)
print_results(Y_final, preds_final)

taskA_LSTM test final
Accuracy: 0.7644842204427696
Precision: 0.6305882352941177
F1 Score: 0.5173745173745173
ROC AUC:0.6673946128733362 



In [0]:
# Approach 2.submitted: XGBoost
def Tokenizer(str_input):
    """Split a string into lower-cased, Porter-stemmed tokens.

    Same logic as ``Tokenizer__``. This definition rebinds the name ``Tokenizer``,
    which the LSTM cell imported from Keras.
    NOTE(review): presumably the externally trained model loaded below was pickled
    with a tokenizer of this name; confirm before removing the duplicate.

    Args:
        str_input: the raw text to tokenise.

    Returns:
        list: the stemmed tokens, in order of appearance.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    stemmer = nltk.PorterStemmer()
    return list(map(stemmer.stem, tokens))
  
# Score the previously submitted model file on the final test split.
X = data_finaltest[['tweet_content','total_words']]
Y = data_finaltest['harassment']
approach = 'harass-xgboost-train'
print(approach, 'test final')
print('============')
# NOTE: joblib.load unpickles arbitrary objects; only load files from a trusted source.
model = load(f'{approach}.joblib')
preds = model.predict(X)
print_results(Y, preds)

harass-xgboost-train test final
Accuracy: 0.8082901554404145
Precision: 0.788135593220339
F1 Score: 0.5782383419689119
ROC AUC:0.7035126516509494 



## Task B

In [0]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
def get_type(x):
  """Collapse three flag fields of a row into a single integer code.

  The row is read by position: fields 3, 4 and 5 (0-based) map to codes 1, 2 and 3.
  The first field equal to 1 wins; 0 is returned when none is set.

  `.iloc` makes the positional lookup explicit. Integer keys on a label-indexed
  Series (`x[3]`) are deprecated in pandas and may be treated as labels.

  NOTE(review): the result depends on the column order of the frame, not on
  column names; confirm that every CSV shares the same layout.

  Args:
    x: a DataFrame row (Series) or a plain sequence with at least six fields.

  Returns:
    int: 1, 2 or 3 for the first flagged position, otherwise 0.
  """
  cells = x.iloc if hasattr(x, 'iloc') else x
  for code, position in enumerate((3, 4, 5), start=1):
    if cells[position] == 1:
      return code
  return 0

# Derive the single multi-class Task B target column on every split.
data_train['harassment-type'] = data_train.apply(get_type, axis=1)
data_validation['harassment-type'] = data_validation.apply(get_type, axis=1)
data_finaltest['harassment-type'] = data_finaltest.apply(get_type, axis=1)

# list.extend() mutates in place and returns None, so assigning its result left
# stop_words as None and the Task B vectorisers ran without stop-word filtering.
# Concatenation builds the extended list instead. The extra token is lower-case
# because Tokenizer__ lower-cases the text before stop words are compared.
stop_words = stopwords.words('english') + ['rt']

def print_results_multiclass(y_test, preds):
  """Print the labels followed by accuracy and macro-averaged precision, recall and F1.

  The unused classification report and confusion matrix are no longer computed.

  Args:
    y_test: the true class labels.
    preds: the predicted class labels, aligned with `y_test`.
  """
  print('True labels:', y_test)
  print('Pred labels:', preds)
  metric_lines = [
      "Accuracy: " + str(accuracy_score(y_test, preds)),
      "Precision: " + str(precision_score(y_test, preds, average='macro')),
      "Recall: " + str(recall_score(y_test, preds, average='macro')),
      "F1 Macro-avg:" + str(f1_score(y_test, preds, average='macro')),
  ]
  print('\n'.join(metric_lines), '\n')

In [0]:
# Task B is trained only on tweets flagged as harassment, taken from train + validation.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and likewise keeps each frame's own index labels.
data_all = pd.concat([data_train, data_validation])
data_train_harassment = data_all[data_all['harassment'] == 1]

X = data_train_harassment[['tweet_content','total_words']]
Y = data_train_harassment['harassment-type']

### Approach 1: OORF (One-vs-One Random Forest)

In [0]:
# Text branch: TF-IDF over 1-3 grams of stemmed tokens, reduced to 300 SVD components.
text_branch = Pipeline([
    ('colext', TextSelector('tweet_content')),
    ('tfidf', TfidfVectorizer(tokenizer=Tokenizer__, stop_words=stop_words,
                              min_df=.0025, max_df=0.25, ngram_range=(1, 3))),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
])

# Numeric branch: standardised word count of the tweet.
length_branch = Pipeline([
    ('wordext', NumberSelector('total_words')),
    ('wscaler', StandardScaler()),
])

# One random forest per pair of classes.
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', text_branch),
        ('words', length_branch),
    ])),
    ('clf', OneVsOneClassifier(RandomForestClassifier())),
])
classifier.fit(X, Y)
dump(classifier, 'taskB_OORF.joblib')

['taskB_OORF.joblib']

### Approach 2: OCRF (Output-Code Random Forest)

In [0]:
# Text branch: TF-IDF over 1-3 grams of stemmed tokens, reduced to 300 SVD components.
text_branch = Pipeline([
    ('colext', TextSelector('tweet_content')),
    ('tfidf', TfidfVectorizer(tokenizer=Tokenizer__, stop_words=stop_words,
                              min_df=.0025, max_df=0.25, ngram_range=(1, 3))),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
])

# Numeric branch: standardised word count of the tweet.
length_branch = Pipeline([
    ('wordext', NumberSelector('total_words')),
    ('wscaler', StandardScaler()),
])

# Output-code wrapper around a random forest.
output_code_forest = OutputCodeClassifier(RandomForestClassifier(), code_size=2, random_state=0)

classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', text_branch),
        ('words', length_branch),
    ])),
    ('clf', output_code_forest),
])
classifier.fit(X, Y)
dump(classifier, 'taskB_OCRF.joblib')

['taskB_OCRF.joblib']

### Approach 3: OCGB (Output-Code Gradient Boosting)

In [0]:
def _tfidf_svd_branch(n_components):
    """Build a TF-IDF (1-3 grams, stemmed tokens) -> TruncatedSVD text branch."""
    return Pipeline([
        ('colext', TextSelector('tweet_content')),
        ('tfidf', TfidfVectorizer(tokenizer=Tokenizer__, stop_words=stop_words,
                                  min_df=5, max_df=0.9, ngram_range=(1, 3))),
        ('svd', TruncatedSVD(algorithm='randomized', n_components=n_components)),
    ])


# Two text branches that differ only in the SVD size (250 and 300 components).
# The word-count branch used by the other approaches is left out here.
# Earlier experiments tried ExtraTrees, AdaBoost, GradientBoosting and XGBoost
# inside a OneVsOneClassifier.
boosted_output_code = OutputCodeClassifier(
    GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=7, random_state=14),
    code_size=15, random_state=0)

classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', _tfidf_svd_branch(250)),
        ('text2', _tfidf_svd_branch(300)),
    ])),
    ('clf', boosted_output_code),
])
classifier.fit(X, Y)
dump(classifier, 'taskB_OCGB.joblib')

['taskB_OCGB.joblib']

### Approach 4: OOCB (One-vs-One CatBoost)

In [0]:
!pip install catboost
from catboost import CatBoostClassifier, Pool

classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('tweet_content')),
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer__, stop_words=stop_words,
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('total_words')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', OneVsOneClassifier(CatBoostClassifier(iterations=5,
                           depth=10,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True))),
    ])
classifier.fit(X, Y)
dump(classifier, 'taskB_OOCB.joblib')

0:	learn: 0.4003370	total: 815ms	remaining: 3.26s
1:	learn: 0.2165603	total: 1.64s	remaining: 2.46s
2:	learn: 0.1409314	total: 2.45s	remaining: 1.63s
3:	learn: 0.0994242	total: 3.26s	remaining: 816ms
4:	learn: 0.0788926	total: 4.08s	remaining: 0us
0:	learn: 0.1506976	total: 906ms	remaining: 3.62s
1:	learn: 0.1134568	total: 1.81s	remaining: 2.71s
2:	learn: 0.0957433	total: 2.71s	remaining: 1.81s
3:	learn: 0.0808850	total: 3.62s	remaining: 904ms
4:	learn: 0.0642638	total: 4.53s	remaining: 0us
0:	learn: 0.1513744	total: 910ms	remaining: 3.64s
1:	learn: 0.1253973	total: 1.82s	remaining: 2.72s
2:	learn: 0.1091040	total: 2.73s	remaining: 1.82s
3:	learn: 0.0947542	total: 3.63s	remaining: 908ms
4:	learn: 0.0825960	total: 4.54s	remaining: 0us


['taskB_OOCB.joblib']

### Approach 5: LSTM

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping

# Tokenizer / network sizes: vocabulary cap, padded sequence length, embedding width.
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 15
EMBEDDING_DIM = 100

# The backslash in the filter set is written as '\\]': a bare '\]' is an invalid
# escape sequence that newer Python versions warn about. The resulting string is unchanged.
tokenizerLSTM = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizerLSTM.fit_on_texts(data_train_harassment['tweet_content'].values)
word_index = tokenizerLSTM.word_index

# Training tensors: padded index sequences and one-hot harassment-type labels.
X = tokenizerLSTM.texts_to_sequences(data_train_harassment['tweet_content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
Y = pd.get_dummies(data_train_harassment['harassment-type']).values

# Defined here (not only in the scoring cell further down) so this cell runs on a
# fresh kernel without raising NameError.
data_validation_harassment = data_validation[data_validation['harassment'] == 1]
# NOTE(review): data_train_harassment is drawn from train + validation, so these
# early-stopping rows are also part of the training data.
X_val = tokenizerLSTM.texts_to_sequences(data_validation_harassment['tweet_content'].values)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)
Y_val = pd.get_dummies(data_validation_harassment['harassment-type']).values

# Embedding -> spatial dropout -> LSTM(200) -> 3-way softmax.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 50
batch_size = 64

# Training stops once val_loss has not improved by at least 1e-4 for 3 epochs.
model.fit(X, Y, epochs=epochs, batch_size=batch_size,validation_data=(X_val, Y_val),callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
# NOTE(review): the scoring cells reload this file with joblib.load, so joblib is
# kept here; confirm that pickling a Keras model works with the installed version.
dump(model, 'taskB_LSTM.joblib')

Train on 3345 samples, validate on 632 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


['taskB_LSTM.joblib']

### Scores

In [0]:
# Base names (without the '.joblib' suffix) of the persisted sklearn pipelines to score.
# The last entry is not written by any cell of this notebook.
# NOTE(review): presumably it is an earlier, externally trained model; confirm the file is available.
approaches = ['taskB_OORF', 'taskB_OCRF', 'taskB_OCGB', 'taskB_OOCB', 'multi-outputcode-gradientboost-all']

def Tokenizer(str_input):
    """Split a string into lower-cased, Porter-stemmed tokens.

    Same logic as ``Tokenizer__``. This definition rebinds the name ``Tokenizer``,
    which the LSTM cell imported from Keras.
    NOTE(review): presumably an externally trained model in ``approaches`` was
    pickled with a tokenizer of this name; confirm before removing the duplicate.

    Args:
        str_input: the raw text to tokenise.

    Returns:
        list: the stemmed tokens, in order of appearance.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    stemmer = nltk.PorterStemmer()
    return list(map(stemmer.stem, tokens))

#### Validation

In [0]:
# Task B is scored only on validation tweets flagged as harassment.
data_validation_harassment = data_validation[data_validation['harassment'] == 1]
X = data_validation_harassment[['tweet_content','total_words']]
Y = data_validation_harassment['harassment-type']

# The true labels are the same for every sklearn pipeline.
Y_final = list(Y)
for approach in approaches:
  print(approach, 'validation')
  print('============')
  model = load(f'{approach}.joblib')
  preds = model.predict(X)
  preds_final = list(preds)
  print_results_multiclass(Y_final, preds_final)
  print()


print('taskB_LSTM', 'validation')
print('============')

data_validation_harassment = data_validation[data_validation['harassment'] == 1]
# Encode the tweets with the tokenizer fitted for the Task B LSTM.
X = pad_sequences(
    tokenizerLSTM.texts_to_sequences(data_validation_harassment['tweet_content'].values),
    maxlen=MAX_SEQUENCE_LENGTH)
model = load('taskB_LSTM' + '.joblib')
preds = np.argmax(model.predict(X, batch_size=64, verbose=1), axis=1)
# argmax gives 0-based output positions; +1 shifts them onto the label codes.
# NOTE(review): presumably the three softmax outputs correspond to codes 1, 2, 3 in order; confirm.
preds_final = [pred + 1 for pred in preds]
Y = pd.get_dummies(data_validation_harassment['harassment-type']).idxmax(axis=1)
Y_final = list(Y)
print_results_multiclass(Y_final, preds_final)
print()

taskB_OORF validation
True labels: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3

#### Test Final

In [0]:
# Task B is scored only on final-test tweets flagged as harassment.
data_finaltest_harassment = data_finaltest[data_finaltest['harassment'] == 1]
X = data_finaltest_harassment[['tweet_content','total_words']]
Y = data_finaltest_harassment['harassment-type']

# The true labels are the same for every sklearn pipeline.
Y_final = list(Y)
for approach in approaches:
  print(approach, 'test final')
  print('============')
  model = load(f'{approach}.joblib')
  preds = model.predict(X)
  # NOTE(review): predictions are shifted by -1 here but not in the validation cell.
  # The printed true labels are 0/1/2 for the test set and 1/2/3 for validation,
  # presumably because get_type reads columns by position and the test CSV has a
  # different layout; confirm.
  preds_final = [pred - 1 for pred in preds]
  print_results_multiclass(Y_final, preds_final)
  print()


print('taskB_LSTM', 'test final')
print('============')

data_finaltest_harassment = data_finaltest[data_finaltest['harassment'] == 1]
# Encode the tweets with the tokenizer fitted for the Task B LSTM.
X = pad_sequences(
    tokenizerLSTM.texts_to_sequences(data_finaltest_harassment['tweet_content'].values),
    maxlen=MAX_SEQUENCE_LENGTH)
model = load('taskB_LSTM' + '.joblib')
preds = np.argmax(model.predict(X, batch_size=64, verbose=1), axis=1)
# The 0-based argmax positions are used as labels without any offset here.
preds_final = list(preds)
Y = pd.get_dummies(data_finaltest_harassment['harassment-type']).idxmax(axis=1)
Y_final = list(Y)
print_results_multiclass(Y_final, preds_final)
print()

taskB_OORF test final
True labels: [2, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 2, 2, 2, 0, 1, 1, 2, 2, 0, 2, 1, 2, 1, 0, 2, 1, 1, 0, 0, 0, 2, 0, 0, 1, 1, 2, 2, 0, 2, 1, 2, 2, 0, 2, 0, 1, 0, 0, 1, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 1, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 1, 2, 0, 2, 2, 2, 0, 2, 1, 1, 0, 1, 0, 2, 0, 0, 2, 0, 2, 2, 1, 2, 0, 2, 1, 1, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 1, 2, 2, 2, 1, 0, 2, 2, 0, 1, 2, 1, 0, 2, 1, 2, 0, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 1, 1, 2, 2, 0, 2, 2, 2, 1, 1, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 0, 2, 0, 1, 2, 2