In [None]:
import numpy as np
import pandas as pd
import random

import nltk
from nltk.corpus import names, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.metrics import classification_report

from xgboost import XGBClassifier

2023-05-31 13:52:24.361556: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-31 13:52:24.396276: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load data

In [None]:
# Read the article table from the Excel workbook located in the working directory.
data = pd.read_excel(io='Media_Data.xlsx')

In [None]:
# Outlets that receive label 1 in the labelling step.
con_media = [
    'Fox',
    'WSJ',
    'Forbes',
    'Breitbart',
]

# Outlets that receive label 0 in the labelling step.
pro_media = [
    'CNN',
    'MSNBC',
    'NPR',
    'NYT',
    'TIME',
    'The Guardian',
    'The Washington Post',
    'PBS',
    'Politico',
    'Vox',
]

In [None]:
# Assign a binary label per article based on its outlet:
#   1 -> outlet listed in con_media
#   0 -> outlet listed in pro_media
#   None -> outlet in neither list (left unlabelled)
#
# The writes go through `.loc` with a boolean mask so that they are guaranteed
# to land in `data` itself. The chained form `data['label'][i] = ...` may write
# to a temporary copy (and is a silent no-op under pandas Copy-on-Write).
# No blanket try/except: a missing 'media' column now fails loudly instead of
# silently leaving every row unlabelled.
data['label'] = None
data.loc[data['media'].isin(pro_media), 'label'] = 0
# con_media is written last so it wins if an outlet ever appears in both lists,
# matching the precedence of the original if/elif.
data.loc[data['media'].isin(con_media), 'label'] = 1

In [None]:
# Keep only the rows that actually have article text.
has_article = data['article'].notna()
data = data[has_article]

# Undersampling

In [None]:
# Balance the two classes by randomly down-sampling whichever class is larger
# to the size of the smaller one.
#
# The sample size and the sampled class are derived from the data (no
# hard-coded row count), and the draw is seeded so that a fresh
# Restart & Run All reproduces the same subset.
filt_1 = data['label'] == 1
filt_0 = data['label'] == 0

if not filt_1.any() or not filt_0.any():
    raise ValueError("Undersampling needs at least one row of each label (0 and 1).")

# value_counts() ignores unlabelled (None) rows, so this is the smaller of the
# two labelled classes.
minority_num = data['label'].value_counts().min()

data_1 = data[filt_1]
data_0 = data[filt_0]

# Only the class that exceeds the minority size is sampled; the other is kept whole.
if len(data_1) > minority_num:
    data_1 = data_1.sample(n=minority_num, random_state=34)
if len(data_0) > minority_num:
    data_0 = data_0.sample(n=minority_num, random_state=34)

# Unlabelled rows are dropped here because only the two labelled subsets are kept.
data = pd.concat([data_1, data_0])

data['label'].value_counts()

1    3855
0    3855
Name: label, dtype: int64

# Preprocessing

In [None]:
# Fetch the NLTK resources this notebook relies on (tokenizer models, WordNet,
# the names corpus and the stop-word lists), in a fixed order.
for package in ('punkt', 'wordnet', 'names', 'stopwords'):
    nltk.download(package)

# English stop words (as returned by NLTK) and the set of corpus names used by
# the text-cleaning step.
all_stopwords = stopwords.words('english')
all_names = set(names.words())

[nltk_data] Downloading package punkt to /home/dxlab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dxlab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to /home/dxlab/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dxlab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Text Preprocessing
def letters_only(word):
    """Return True when `word` is non-empty and made up solely of ASCII letters.

    The string is encoded to bytes first, so any non-ASCII character (accented
    letters included), digit, punctuation mark or whitespace yields False.
    """
    encoded = word.encode()
    return encoded.isalpha()

# Shared WordNet lemmatizer instance; clean_text() calls its .lemmatize() on every kept token.
lemmatizer = WordNetLemmatizer()

def clean_text(doc):
    """Normalise one document into a space-joined string of lemmatised tokens.

    A whitespace-separated token is kept only if it
      * is not a first name from the NLTK names corpus,
      * consists solely of ASCII letters (drops numbers and tokens carrying
        punctuation),
      * is longer than two characters, and
      * is not an English stop word.
    Kept tokens are lower-cased and lemmatised.

    Args:
        doc: The raw document text (str).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    # Set gives O(1) membership instead of scanning the stop-word list per token.
    stop_words = set(all_stopwords)

    cleaned_doc = []
    # split() with no argument breaks on any whitespace run, so words separated
    # by newlines or tabs are no longer glued into one (rejected) token.
    for token in doc.split():
        word = token.lower()  # ABD -> abd
        # The names corpus lists names capitalised, so the token must be checked
        # in its original casing; testing only the lower-cased form never matches.
        if token in all_names or word in all_names:
            continue
        if letters_only(word) and len(word) > 2 and word not in stop_words:
            cleaned_doc.append(lemmatizer.lemmatize(word))

    return ' '.join(cleaned_doc)

# Run the cleaning step over every article, keeping the row order of `data`.
cleaned_docs = list(map(clean_text, data['article']))

# TF-IDF

In [None]:
# Fit a TF-IDF vocabulary on the cleaned articles and materialise the
# document-term matrix as a dense array (one row per document).
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(cleaned_docs)
vectors = tfidf_matrix.toarray()

In [None]:
# Stratified 80/20 train/validation split with a fixed seed.
# The features are wrapped in a DataFrame so that the split keeps track of each
# row's position in `vectors` through the frame index.
label_series = data['label']
x_train, x_val, y_train, y_val = train_test_split(
    pd.DataFrame(vectors),
    label_series,
    test_size=0.2,
    shuffle=True,
    stratify=label_series,
    random_state=34,
)

# Row positions (into `vectors` / `cleaned_docs`) of the documents in each split.
train_doc_num = list(x_train.index)
valid_doc_num = list(x_val.index)

# Hand plain NumPy arrays to the model; labels are cast to float.
x_train = np.array(x_train)
x_val = np.array(x_val)
y_train = np.array(y_train.astype('float'))
y_val = np.array(y_val.astype('float'))

# Training

In [None]:
# Gradient-boosted tree classifier: fit on the training split, then report
# 5-fold cross-validated accuracy on that same training split.
xgb_clf = XGBClassifier(
    colsample_bytree=1,
    learning_rate=0.1,
    n_estimators=900,
)
xgb_clf.fit(x_train, y_train)
cross_val_score(estimator=xgb_clf, X=x_train, y=y_train, scoring='accuracy', cv=5)

array([0.88573744, 0.89546191, 0.88573744, 0.88969992, 0.91159773])

In [None]:
# Score the held-out validation split and show per-class precision/recall/F1.
pred = xgb_clf.predict(x_val)
classification_report(y_true=y_val, y_pred=pred, digits=4, output_dict=True)

Unnamed: 0,precision,recall,f1-score,support
0.0,0.889026,0.935149,0.911504,771.0
1.0,0.931601,0.883268,0.906791,771.0
accuracy,0.909209,0.909209,0.909209,0.909209
macro avg,0.910313,0.909209,0.909148,1542.0
weighted avg,0.910313,0.909209,0.909148,1542.0
