In [None]:
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the third-party packages imported further down (gensim, nltk).
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones that produced the saved outputs.
!pip install gensim
!pip install nltk



In [None]:
# Fetch NLTK's 'word2vec_sample' package; its pruned.word2vec.txt file is
# located with nltk.data.find and loaded with gensim in a later cell.
import nltk
nltk.download('word2vec_sample')

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


True

In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np


# Register tqdm's pandas integration; this is what makes `progress_apply`
# available on the Series used further down.
tqdm.pandas()


# Read the dataset from the mounted Drive and preview its first rows.
df = pd.read_csv('/content/drive/MyDrive/Samsung/PROY/NewsBias.csv')
df.head()

Unnamed: 0,id,text,label
0,0,what got us through for many it was hobbies re...,leaning-left
1,1,new year s eve gatherings could accelerate ent...,leaning-left
2,2,entity traditions that you can still participa...,center
3,3,here is how countries around the world are rin...,leaning-right
4,4,police in entity said on thursday evening that...,left


In [None]:
import gensim
from nltk.data import find

# Resolve the on-disk path of the word2vec sample shipped through NLTK and
# load it with gensim as KeyedVectors (text format, hence binary=False).
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model_english = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [None]:
def embed(clean, modelo):
    """Return the averaged word-vector representation of a text.

    The text is split on whitespace. The vectors of the tokens found in
    ``modelo`` are summed and the sum is divided by the total number of
    tokens, so out-of-vocabulary tokens still count towards the divisor.

    Args:
        clean: Text to embed; tokens are obtained with ``str.split()``.
        modelo: Embedding lookup exposing ``vector_size``, membership tests
            (``token in modelo``) and item access (``modelo[token]``).

    Returns:
        A 1-D numpy array of length ``modelo.vector_size``. A text with no
        tokens (empty or whitespace-only) yields the all-zeros vector.
    """
    tokens = clean.split()
    vec = np.zeros(modelo.vector_size)

    # Without tokens the average would be 0/0 and produce a NaN vector,
    # which downstream estimators cannot consume.
    if not tokens:
        return vec

    for t in tokens:
        if t in modelo:
            vec += modelo[t]

    return vec / len(tokens)

def vectorizar(vectorizador, df):
    """Fit ``vectorizador`` on ``df['text']`` and return the transformed data.

    Args:
        vectorizador: Object exposing ``fit_transform``.
        df: Container indexable by ``'text'`` that yields the documents.

    Returns:
        Whatever ``vectorizador.fit_transform`` returns for those documents.
    """
    textos = df['text']
    return vectorizador.fit_transform(textos)

In [None]:
# Embed every text with the English word2vec model and store the resulting
# vector in a new 'embedding' column; progress_apply shows a tqdm bar.
df['embedding'] = df['text'].progress_apply(lambda x: embed(x, model_english))

100%|██████████| 10754/10754 [00:21<00:00, 497.69it/s]


In [None]:
# Candidate bag-of-words representations, keyed by the name used in the
# results table. Insertion order is the evaluation order.
vectorizaciones = dict([
    # Presence/absence of each term.
    ('binary', CountVectorizer(binary=True)),
    # Raw term counts.
    ('TF', CountVectorizer(binary=False)),
    # TF-IDF with L1- and L2-normalised rows.
    ('TF-IDF-l1', TfidfVectorizer(norm='l1')),
    ('TF-IDF-l2', TfidfVectorizer(norm='l2')),
    # Counts of bigrams only / trigrams only.
    ('bigrams', CountVectorizer(ngram_range=(2, 2), binary=False)),
    ('trigram', CountVectorizer(ngram_range=(3, 3), binary=False)),
])

In [None]:
# One feature matrix per bag-of-words representation, keyed by its name.
# NOTE(review): each vectorizer is fitted on every row of df, while the
# train/test split happens later inside action2, so vocabulary/IDF statistics
# also see the held-out rows.
DATA = {
    nombre: vectorizar(vectorizador_i, df)
    for nombre, vectorizador_i in tqdm(vectorizaciones.items(), desc="Vectorizando")
}

# The averaged word2vec vectors join the comparison as a plain list with one
# vector per row.
DATA['embedding'] = df['embedding'].to_list()

Vectorizando: 100%|██████████| 6/6 [01:09<00:00, 11.64s/it]


In [None]:
# Integer-encode the label column: the position of each label in the tuple
# below (left ... right) becomes its class id, 0 through 4.
Y = df['label'].map({
    etiqueta: codigo
    for codigo, etiqueta in enumerate(
        ('left', 'leaning-left', 'center', 'leaning-right', 'right')
    )
})

In [None]:
from sklearn.metrics import f1_score, classification_report, make_scorer
from sklearn.model_selection import train_test_split

def show_subtask2(ref, pred):
    """Print the evaluation summary for a set of predictions.

    Prints a "SUBTASK 2" header, the macro-averaged F1 and the per-class
    classification report (4 decimal digits). Undefined metrics are reported
    as 0 (``zero_division=0``).

    Args:
        ref: Ground-truth labels.
        pred: Predicted labels.

    Returns:
        The macro-averaged F1 score.
    """
    macro_f1 = f1_score(ref, pred, average='macro', zero_division=0)
    detalle = classification_report(ref, pred, digits=4, zero_division=0)

    # A single print with a newline separator emits the same three lines.
    print("\nSUBTASK 2", f"F1-score (Macro-Averaged): {macro_f1}", detalle, sep="\n")
    return macro_f1

def action2(X, y, modelo, name_model):
    """Train ``modelo`` on a hold-out split and report its macro F1.

    The data is split 80/20 with a fixed seed (``random_state=1234``), the
    model is fitted in place on the training part, and its predictions on
    the test part are scored and printed through ``show_subtask2``.

    Args:
        X: Feature matrix (anything ``train_test_split`` accepts).
        y: Target labels aligned with ``X``.
        modelo: Estimator exposing ``fit`` and ``predict``.
        name_model: Display name printed before the report.

    Returns:
        The macro-averaged F1 score on the test part.
    """
    print(f"\nModelo: {name_model}\n")

    particion = train_test_split(X, y, test_size = 0.2, random_state = 1234)
    X_entreno, X_prueba, y_entreno, y_prueba = particion

    modelo.fit(X_entreno, y_entreno)
    y_estimada = modelo.predict(X_prueba)

    return show_subtask2(y_prueba, y_estimada)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC


# Base classifiers compared for every representation.
model1 = LogisticRegression()
# The tree is seeded so that its scores (and the ensemble's) are reproducible
# across runs; 1234 matches the seed used for the train/test split.
model2 = DecisionTreeClassifier(random_state=1234)
model3 = LinearSVC(dual=False,max_iter=100)

# Majority-vote ensemble over the three base classifiers.
ensemble1 = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('svm', model3)], voting='hard')

# Display name -> estimator; the names become the columns of the results table.
modelos = {
    'Logistic Regression': model1,
    'Decision Tree': model2,
    'SVC': model3,
    'Ensemble Voting': ensemble1
}

In [None]:
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any raised while fitting or scoring the models below.
warnings.filterwarnings("ignore")

# results[representation][model name] -> macro F1, plus a 'mean' entry holding
# the average over all models for that representation.
results = {}

for vectorizador, X in tqdm(DATA.items(), desc="Evaluando vectorizadores"):
    print('*' * 50)
    print(f"\nVectorizador: {vectorizador}\n")

    puntajes = {}
    results[vectorizador] = puntajes

    for name, model in modelos.items():
        puntajes[name] = action2(X, Y, model, name)

    # Evaluated before the 'mean' key exists, so only model scores are summed.
    puntajes['mean'] = sum(puntajes.values()) / len(modelos)

    print('*' * 50)


Evaluando vectorizadores:   0%|          | 0/7 [00:00<?, ?it/s]

**************************************************

Vectorizador: binary


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.5585764600992802

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.5040219382058747

Modelo: SVC

F1-score (Macro-Averaged): 0.5434509070260617

Modelo: Ensemble Voting



Evaluando vectorizadores:  14%|█▍        | 1/7 [01:54<11:27, 114.65s/it]

F1-score (Macro-Averaged): 0.5627314258312939
**************************************************
**************************************************

Vectorizador: TF


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.5050499130242285

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.5226041168706667

Modelo: SVC

F1-score (Macro-Averaged): 0.5340250224866923

Modelo: Ensemble Voting



Evaluando vectorizadores:  29%|██▊       | 2/7 [04:27<11:24, 136.98s/it]

F1-score (Macro-Averaged): 0.5739166886426892
**************************************************
**************************************************

Vectorizador: TF-IDF-l1


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.1258587368285353

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.493642292537878

Modelo: SVC

F1-score (Macro-Averaged): 0.26203218704950115

Modelo: Ensemble Voting



Evaluando vectorizadores:  43%|████▎     | 3/7 [05:20<06:34, 98.75s/it] 

F1-score (Macro-Averaged): 0.3003216633660423
**************************************************
**************************************************

Vectorizador: TF-IDF-l2


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.5106656306517283

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.4937204540429171

Modelo: SVC

F1-score (Macro-Averaged): 0.564261832205512

Modelo: Ensemble Voting



Evaluando vectorizadores:  57%|█████▋    | 4/7 [06:41<04:34, 91.55s/it]

F1-score (Macro-Averaged): 0.5654143139173187
**************************************************
**************************************************

Vectorizador: bigrams


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.5927470179004437

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.5009250455290124

Modelo: SVC

F1-score (Macro-Averaged): 0.5785721828782484

Modelo: Ensemble Voting



Evaluando vectorizadores:  71%|███████▏  | 5/7 [23:37<14:10, 425.06s/it]

F1-score (Macro-Averaged): 0.5973299656171402
**************************************************
**************************************************

Vectorizador: trigram


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.581241749600862

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.48149887152584636

Modelo: SVC

F1-score (Macro-Averaged): 0.5749020601875718

Modelo: Ensemble Voting



Evaluando vectorizadores:  86%|████████▌ | 6/7 [1:05:01<18:45, 1125.24s/it]

F1-score (Macro-Averaged): 0.5893655633052128
**************************************************
**************************************************

Vectorizador: embedding


Modelo: Logistic Regression

F1-score (Macro-Averaged): 0.19849532279352317

Modelo: Decision Tree

F1-score (Macro-Averaged): 0.26439933111050484

Modelo: SVC

F1-score (Macro-Averaged): 0.29577881639291353

Modelo: Ensemble Voting



Evaluando vectorizadores: 100%|██████████| 7/7 [1:05:17<00:00, 559.67s/it]

F1-score (Macro-Averaged): 0.26094617711392043
**************************************************





In [None]:
# Turn the nested results dict into a table: one row per representation,
# with its name moved from the index into a 'Vectorizador' column.
results_df = (
    pd.DataFrame.from_dict(results, orient='index')
    .reset_index()
    .rename(columns={'index': 'Vectorizador'})
)

results_df

Unnamed: 0,Vectorizador,Logistic Regression,Decision Tree,SVC,Ensemble Voting,mean
0,binary,0.558576,0.504022,0.543451,0.562731,0.542195
1,TF,0.50505,0.522604,0.534025,0.573917,0.533899
2,TF-IDF-l1,0.125859,0.493642,0.262032,0.300322,0.295464
3,TF-IDF-l2,0.510666,0.49372,0.564262,0.565414,0.533516
4,bigrams,0.592747,0.500925,0.578572,0.59733,0.567394
5,trigram,0.581242,0.481499,0.574902,0.589366,0.556752
6,embedding,0.198495,0.264399,0.295779,0.260946,0.254905


In [None]:
# Pick the representation whose average score across models is highest.
fila_mejor = results_df['mean'].idxmax()
best_vectorizer = results_df.loc[fila_mejor, 'Vectorizador']
print(f"\nMejor vectorizador: {best_vectorizer}\n")


Mejor vectorizador: bigrams



In [None]:
# Features of the winning representation, split with the same test_size and
# random_state that action2 uses.
# NOTE(review): the representation was chosen from scores measured on this
# same hold-out part, so the final test score is likely optimistic.
X = DATA[best_vectorizer]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)

In [None]:
from sklearn.model_selection import GridSearchCV

# Model-selection metric: macro-averaged F1, the same metric show_subtask2
# reports. The averaging mode must be given explicitly because f1_score
# defaults to average='binary', which is not defined for the 5-class target
# Y; a binary scorer fails on every fold, leaving the search with no valid
# scores to rank the candidates by.
f1_scorer = make_scorer(f1_score, average='macro', zero_division=0)

# Hyper-parameter grid for logistic regression; only C varies (2 candidates).
param_grid_logreg = {
    'penalty': ['l2'],
    'C': [0.5, 1.0],
    'solver': ['lbfgs'],
    'class_weight': [None],
    'max_iter': [100],
}

# 10-fold cross-validated search on the training split, using all cores.
model1 = GridSearchCV(
    LogisticRegression(),
    param_grid=param_grid_logreg,
    scoring=f1_scorer,
    cv=10,
    n_jobs=-1,
    verbose=1
)

model1.fit(X_train, y_train)

Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [None]:
# GridSearchCV was built with its default refit=True, so best_estimator_ is
# already fitted on the whole training split with the best parameters found;
# fitting it again on the same data would only repeat that work.
mejor = model1.best_estimator_
predicted = mejor.predict(X_test)

# Quick side-by-side look at the first ten predictions and true labels.
print("PREDICCIONES:\n", predicted[:10],'\n')
print("REALES:\n",y_test[:10],'\n')

# Full report (macro F1 + per-class metrics) on the held-out split.
f1 = show_subtask2(y_test, predicted)

PREDICCIONES:
 [3 1 1 3 3 3 4 3 1 0] 

REALES:
 8848     0
10223    1
6126     1
6533     3
1469     4
5293     4
3515     1
9088     1
2819     1
2072     0
Name: label, dtype: int64 


SUBTASK 2
F1-score (Macro-Averaged): 0.5943664492207648
              precision    recall  f1-score   support

           0     0.4671    0.4084    0.4358       382
           1     0.5419    0.6277    0.5817       556
           2     0.6615    0.5165    0.5801       333
           3     0.6772    0.6121    0.6430       281
           4     0.6980    0.7679    0.7313       599

    accuracy                         0.6086      2151
   macro avg     0.6091    0.5865    0.5944      2151
weighted avg     0.6083    0.6086    0.6052      2151

