In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('./datasets/LIAR/train.tsv', sep='\t', header=None)
category_mapping = {
    'pants-fire': 0,
    'false': 1,
    'barely-true': 2,
    'half-true': 3,
    'mostly-true': 4,
    'true': 5
}
data[1] = data[1].map(category_mapping)
X = data[2]
y = data[1]

In [None]:
X

0        Says the Annies List political group supports ...
1        When did the decline of coal start? It started...
2        Hillary Clinton agrees with John McCain "by vo...
3        Health care reform legislation is likely to ma...
4        The economic turnaround started at the end of ...
                               ...                        
10235    There are a larger number of shark attacks in ...
10236    Democrats have now become the party of the [At...
10237    Says an alternative to Social Security that op...
10238    On lifting the U.S. Cuban embargo and allowing...
10239    The Department of Veterans Affairs has a manua...
Name: 2, Length: 10240, dtype: object

In [None]:
#using word2vec
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import numpy as np

word2vec_model = api.load("word2vec-google-news-300")

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cicic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def vectorize_sentence(sentence, model): 
    words = word_tokenize(sentence.lower())
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [None]:
X_vectorized = np.array([vectorize_sentence(sentence, word2vec_model) for sentence in X])
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2)

In [None]:
pipeline = Pipeline([
    ('clf', XGBClassifier(objective='multi:softprob'))
])

In [None]:
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [3, 4, 5],
    'clf__min_child_weight': [1, 3, 5],
    'clf__gamma': [0, 0.1, 0.2],
}

In [9]:
gs_pipeline = GridSearchCV(pipeline, param_grid, cv=2, verbose=2, n_jobs=4)

In [11]:
gs_pipeline.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
gs_pipeline.fit(X_train, y_train)

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


In [78]:
y_pred = clf.predict(X_test)

In [80]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.33      0.11      0.16       147
           1       0.24      0.24      0.24       425
           2       0.19      0.18      0.18       326
           3       0.20      0.27      0.23       423
           4       0.20      0.23      0.21       378
           5       0.24      0.17      0.20       349

    accuracy                           0.22      2048
   macro avg       0.23      0.20      0.21      2048
weighted avg       0.22      0.22      0.21      2048



In [39]:
param_grid = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # Add more parameters here if needed
}

In [40]:
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
gs_pipeline = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
gs_pipeline.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [47]:
gs_pipeline.best_params_

{'tfidf__max_df': 0.5, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}

In [52]:
gs_pipeline.best_score_

0.2377921854431922

In [48]:
y_pred = gs_pipeline.predict(X_test)

In [49]:
y_pred

array([4, 5, 3, ..., 1, 4, 2], dtype=int64)

In [50]:
y_pred_proba = gs_pipeline.predict_proba(X_test)
y_pred_proba

array([[0.04420721, 0.14974801, 0.16409853, 0.21723942, 0.23683108,
        0.18787576],
       [0.02670329, 0.24970888, 0.09089257, 0.23065847, 0.15185286,
        0.25018394],
       [0.1155571 , 0.20512004, 0.16140322, 0.21397603, 0.17675519,
        0.12718837],
       ...,
       [0.08210164, 0.272085  , 0.15803835, 0.19221787, 0.18316413,
        0.11239298],
       [0.04672801, 0.17075713, 0.18198013, 0.21192686, 0.22092421,
        0.16768368],
       [0.03987504, 0.17378919, 0.34705627, 0.19084123, 0.14095783,
        0.10748047]], dtype=float32)

In [51]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.21      0.09      0.13       170
           1       0.25      0.32      0.28       404
           2       0.25      0.16      0.20       351
           3       0.23      0.26      0.24       424
           4       0.23      0.28      0.25       368
           5       0.27      0.23      0.25       331

    accuracy                           0.24      2048
   macro avg       0.24      0.23      0.23      2048
weighted avg       0.24      0.24      0.24      2048

