### Imports

In [1]:
import pandas as pd
import numpy as np
import sys
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from terminaltables import AsciiTable

from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

cmap = plt.get_cmap('viridis')

os.makedirs("data", exist_ok=True)

### Load dataset

In [2]:
article_dataset = pd.read_pickle("data/articles")

### Extract article texts and targets (news or blog)

In [3]:
articles = []
targets = []

target_encoding = {
    "News": 0,
    "Blog": 1
}

idxs = np.random.choice(range(article_dataset.shape[0]), size=6000, replace=False)

for i in tqdm(idxs, desc="Extracting articles"):
    text = article_dataset.iloc[i]["content"]
    articles.append(text)
    
    target_name = article_dataset.iloc[i]["media-type"]
    target = target_encoding[target_name]
    targets.append(target)
    
targets = np.array(targets)

Extracting articles: 100%|██████████| 6000/6000 [00:03<00:00, 1973.28it/s]


### Get token counts

In [4]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(articles)
X_counts.shape

(6000, 82443)

### Get features

In [5]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
X_tf = tf_transformer.transform(X_counts)
X = X_tf.toarray()
print (X.shape)

(6000, 82443)


### Train / test split

In [6]:
y = targets

# Split the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Classification

In [7]:
# Define classifier
clf = RandomForestClassifier(n_estimators=100)

# Fit - Train
clf.fit(X_train, y_train)

# Make a prediction on test set
y_pred = clf.predict(X_test)

# Measure the performance of the model
print ("Test set accuracy: %.2f%%\n" %  float(100*accuracy_score(y_test, y_pred)))

print (classification_report(y_test, y_pred, target_names=["News", "Blog"]))


Test set accuracy: 76.25%

             precision    recall  f1-score   support

       News       0.78      0.75      0.76       611
       Blog       0.75      0.77      0.76       589

avg / total       0.76      0.76      0.76      1200

