In [1]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LinearRegression
from keras.layers import *
from keras.models import Sequential
import numpy as np

Using TensorFlow backend.


## Load Dataset

In [2]:
### Labels
# Regression target assigned to every article of a given publication.
# Scores range from -0.85 (Vox) to 1 (Breitbart), with Reuters at 0.
# NOTE(review): presumably negative = left-leaning and positive = right-leaning;
# the provenance of these numbers is not recorded here -- confirm with the author.
labels = dict([
    ('New York Times', -0.45),
    ('Breitbart', 1),
    ('CNN', -0.4),
    ('Business Insider', -0.3),
    ('Atlantic', -0.6),
    ('Fox News', 0.8),
    ('Talking Points Memo', -0.6),
    ('Buzzfeed News', -0.7),
    ('National Review', 0.8),
    ('New York Post', 0.5),
    ('Guardian', -0.5),
    ('NPR', -0.5),
    ('Reuters', 0),
    ('Vox', -0.85),
    ('Washington Post', -0.7),
])

In [3]:
# Show which files are available in the local cache directory.
# os.listdir() returns entries in arbitrary order, so sort for a stable listing.
sorted(os.listdir('local_cache'))

['articles1.csv', 'articles2.csv', 'articles3.csv']

In [4]:
# Read every CSV in local_cache and stack them into one DataFrame
# (the first CSV column is used as the index).
# os.listdir() returns names in arbitrary order, so they are sorted to make the
# concatenated row order repeatable; non-CSV entries are skipped so a stray file
# in the directory does not break read_csv.
csv_names = sorted(x for x in os.listdir('local_cache') if x.endswith('.csv'))
df = pd.concat([pd.read_csv(os.path.join('local_cache', x), index_col=0) for x in csv_names])

In [5]:
# Keep only the four columns used below; all other columns are dropped.
df = df.loc[:, ['title', 'publication', 'author', 'content']]

In [6]:
# Attach each article's publication score as the regression target.
# Series.map performs the dict lookup in one vectorised pass instead of calling
# a Python lambda once per row.
df['labels'] = df['publication'].map(labels)
# map() yields NaN for a publication that is missing from `labels`; fail loudly
# (KeyError, like a plain dict lookup) rather than carry NaN targets forward.
unknown_outlets = df.loc[df['labels'].isnull().values, 'publication'].unique()
if len(unknown_outlets):
    raise KeyError('No label for publication(s): %s' % list(unknown_outlets))

In [7]:
# Work on a random half of the corpus (rows are shuffled as a side effect).
# A fixed random_state makes the subsample identical on every run.
# NOTE: this rebinds `df`, so re-running the cell halves the data again.
df = df.sample(frac=.5, random_state=42)

In [8]:
# 60% Training
# 20% Test
# 20% Validation
# The proportions are approximate: each row is assigned by an independent
# uniform draw rather than by exact counts.
# A dedicated seeded generator (instead of the global np.random state) makes
# the split identical on every run.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(df)) < 0.6
train = df[msk]
tmp = df[~msk]  # the remaining ~40%, divided evenly between test and validation
msk = split_rng.rand(len(tmp)) < 0.5
test = tmp[msk]
val = tmp[~msk]

## Bag of Words Features

In [16]:
# Build the bag-of-words vocabulary from the training articles only:
# 1- to 5-grams, English stop words removed, accents folded to ASCII, and the
# vocabulary capped at 40000 entries. tqdm only wraps the iterable to show progress.
count_vect = CountVectorizer(
    max_features=40000,
    ngram_range=(1, 5),
    stop_words='english',
    strip_accents='ascii',
)
X_train_counts = count_vect.fit_transform(tqdm(train['content']))


  0%|          | 0/42572 [00:00<?, ?it/s][A
  0%|          | 51/42572 [00:00<01:24, 500.58it/s][A
  0%|          | 104/42572 [00:00<01:22, 513.91it/s][A
  0%|          | 143/42572 [00:00<01:31, 465.18it/s][A
  0%|          | 179/42572 [00:00<01:36, 437.85it/s][A
  1%|          | 228/42572 [00:00<01:34, 447.71it/s][A
  1%|          | 265/42572 [00:00<01:39, 424.66it/s][A
  1%|          | 312/42572 [00:00<01:38, 429.53it/s][A
  1%|          | 364/42572 [00:00<01:35, 439.98it/s][A
  1%|          | 409/42572 [00:00<01:35, 440.96it/s][A
  1%|          | 454/42572 [00:01<01:36, 437.08it/s][A
  1%|          | 497/42572 [00:01<01:39, 424.17it/s][A
  1%|▏         | 542/42572 [00:01<01:38, 425.83it/s][A
  1%|▏         | 588/42572 [00:01<01:38, 428.32it/s][A
  1%|▏         | 638/42572 [00:01<01:36, 432.54it/s][A
  2%|▏         | 699/42572 [00:01<01:34, 442.48it/s][A
  2%|▏         | 748/42572 [00:01<01:34, 443.80it/s][A
  2%|▏         | 797/42572 [00:01<01:33, 446.05it/s][A
  2

 16%|█▌        | 6707/42572 [00:15<01:25, 421.91it/s][A
 16%|█▌        | 6751/42572 [00:15<01:24, 421.98it/s][A
 16%|█▌        | 6795/42572 [00:16<01:24, 421.93it/s][A
 16%|█▌        | 6838/42572 [00:16<01:24, 421.92it/s][A
 16%|█▌        | 6881/42572 [00:16<01:24, 421.93it/s][A
 16%|█▋        | 6924/42572 [00:16<01:24, 421.80it/s][A
 16%|█▋        | 6966/42572 [00:16<01:24, 421.77it/s][A
 16%|█▋        | 7008/42572 [00:16<01:24, 421.71it/s][A
 17%|█▋        | 7050/42572 [00:16<01:24, 421.55it/s][A
 17%|█▋        | 7092/42572 [00:16<01:24, 421.54it/s][A
 17%|█▋        | 7143/42572 [00:16<01:24, 421.71it/s][A
 17%|█▋        | 7191/42572 [00:17<01:23, 422.02it/s][A
 17%|█▋        | 7242/42572 [00:17<01:23, 422.43it/s][A
 17%|█▋        | 7288/42572 [00:17<01:23, 422.19it/s][A
 17%|█▋        | 7332/42572 [00:17<01:23, 422.25it/s][A
 17%|█▋        | 7376/42572 [00:17<01:23, 422.35it/s][A
 17%|█▋        | 7420/42572 [00:17<01:23, 422.03it/s][A
 18%|█▊        | 7465/42572 [00

 31%|███▏      | 13402/42572 [00:31<01:08, 423.22it/s][A
 32%|███▏      | 13450/42572 [00:31<01:08, 423.31it/s][A
 32%|███▏      | 13503/42572 [00:31<01:08, 423.64it/s][A
 32%|███▏      | 13552/42572 [00:31<01:08, 423.70it/s][A
 32%|███▏      | 13600/42572 [00:32<01:08, 423.44it/s][A
 32%|███▏      | 13644/42572 [00:32<01:08, 423.30it/s][A
 32%|███▏      | 13687/42572 [00:32<01:08, 423.07it/s][A
 32%|███▏      | 13728/42572 [00:32<01:08, 422.51it/s][A
 32%|███▏      | 13766/42572 [00:32<01:08, 421.77it/s][A
 32%|███▏      | 13808/42572 [00:32<01:08, 421.75it/s][A
 33%|███▎      | 13853/42572 [00:32<01:08, 421.82it/s][A
 33%|███▎      | 13892/42572 [00:32<01:08, 421.68it/s][A
 33%|███▎      | 13932/42572 [00:33<01:07, 421.60it/s][A
 33%|███▎      | 13983/42572 [00:33<01:07, 421.84it/s][A
 33%|███▎      | 14026/42572 [00:33<01:07, 421.57it/s][A
 33%|███▎      | 14069/42572 [00:33<01:07, 421.55it/s][A
 33%|███▎      | 14112/42572 [00:33<01:07, 421.57it/s][A
 33%|███▎     

 46%|████▋     | 19731/42572 [00:47<00:55, 411.66it/s][A
 46%|████▋     | 19774/42572 [00:48<00:55, 411.63it/s][A
 47%|████▋     | 19815/42572 [00:48<00:55, 411.60it/s][A
 47%|████▋     | 19856/42572 [00:48<00:55, 411.59it/s][A
 47%|████▋     | 19902/42572 [00:48<00:55, 411.67it/s][A
 47%|████▋     | 19946/42572 [00:48<00:54, 411.71it/s][A
 47%|████▋     | 19991/42572 [00:48<00:54, 411.79it/s][A
 47%|████▋     | 20041/42572 [00:48<00:54, 411.95it/s][A
 47%|████▋     | 20092/42572 [00:48<00:54, 412.13it/s][A
 47%|████▋     | 20139/42572 [00:48<00:54, 411.78it/s][A
 47%|████▋     | 20182/42572 [00:49<00:54, 411.82it/s][A
 48%|████▊     | 20224/42572 [00:49<00:54, 411.82it/s][A
 48%|████▊     | 20266/42572 [00:49<00:54, 411.83it/s][A
 48%|████▊     | 20308/42572 [00:49<00:54, 411.69it/s][A
 48%|████▊     | 20356/42572 [00:49<00:53, 411.81it/s][A
 48%|████▊     | 20402/42572 [00:49<00:53, 411.89it/s][A
 48%|████▊     | 20450/42572 [00:49<00:53, 412.01it/s][A
 48%|████▊    

 62%|██████▏   | 26193/42572 [01:03<00:39, 414.42it/s][A
 62%|██████▏   | 26235/42572 [01:03<00:39, 414.41it/s][A
 62%|██████▏   | 26276/42572 [01:03<00:39, 414.35it/s][A
 62%|██████▏   | 26324/42572 [01:03<00:39, 414.46it/s][A
 62%|██████▏   | 26373/42572 [01:03<00:39, 414.55it/s][A
 62%|██████▏   | 26422/42572 [01:03<00:38, 414.66it/s][A
 62%|██████▏   | 26468/42572 [01:03<00:38, 414.56it/s][A
 62%|██████▏   | 26511/42572 [01:03<00:38, 414.56it/s][A
 62%|██████▏   | 26557/42572 [01:04<00:38, 414.61it/s][A
 62%|██████▏   | 26600/42572 [01:04<00:38, 414.30it/s][A
 63%|██████▎   | 26639/42572 [01:04<00:38, 414.21it/s][A
 63%|██████▎   | 26687/42572 [01:04<00:38, 414.30it/s][A
 63%|██████▎   | 26730/42572 [01:04<00:38, 414.31it/s][A
 63%|██████▎   | 26776/42572 [01:04<00:38, 414.34it/s][A
 63%|██████▎   | 26818/42572 [01:04<00:38, 414.32it/s][A
 63%|██████▎   | 26867/42572 [01:04<00:37, 414.43it/s][A
 63%|██████▎   | 26911/42572 [01:04<00:37, 414.29it/s][A
 63%|██████▎  

 77%|███████▋  | 32616/42572 [01:18<00:23, 415.95it/s][A
 77%|███████▋  | 32656/42572 [01:18<00:23, 415.92it/s][A
 77%|███████▋  | 32708/42572 [01:18<00:23, 416.03it/s][A
 77%|███████▋  | 32757/42572 [01:18<00:23, 416.10it/s][A
 77%|███████▋  | 32801/42572 [01:18<00:23, 416.06it/s][A
 77%|███████▋  | 32843/42572 [01:18<00:23, 415.97it/s][A
 77%|███████▋  | 32893/42572 [01:19<00:23, 416.07it/s][A
 77%|███████▋  | 32937/42572 [01:19<00:23, 416.10it/s][A
 77%|███████▋  | 32981/42572 [01:19<00:23, 415.83it/s][A
 78%|███████▊  | 33023/42572 [01:19<00:22, 415.83it/s][A
 78%|███████▊  | 33070/42572 [01:19<00:22, 415.84it/s][A
 78%|███████▊  | 33111/42572 [01:19<00:22, 415.78it/s][A
 78%|███████▊  | 33151/42572 [01:19<00:22, 415.74it/s][A
 78%|███████▊  | 33195/42572 [01:19<00:22, 415.76it/s][A
 78%|███████▊  | 33249/42572 [01:19<00:22, 415.91it/s][A
 78%|███████▊  | 33295/42572 [01:20<00:22, 415.95it/s][A
 78%|███████▊  | 33340/42572 [01:20<00:22, 415.95it/s][A
 78%|███████▊ 

 92%|█████████▏| 39358/42572 [01:35<00:07, 412.51it/s][A
 93%|█████████▎| 39404/42572 [01:35<00:07, 412.52it/s][A
 93%|█████████▎| 39449/42572 [01:35<00:07, 412.53it/s][A
 93%|█████████▎| 39495/42572 [01:35<00:07, 412.58it/s][A
 93%|█████████▎| 39540/42572 [01:35<00:07, 412.54it/s][A
 93%|█████████▎| 39595/42572 [01:35<00:07, 412.68it/s][A
 93%|█████████▎| 39642/42572 [01:36<00:07, 412.68it/s][A
 93%|█████████▎| 39687/42572 [01:36<00:06, 412.68it/s][A
 93%|█████████▎| 39736/42572 [01:36<00:06, 412.76it/s][A
 93%|█████████▎| 39786/42572 [01:36<00:06, 412.84it/s][A
 94%|█████████▎| 39833/42572 [01:36<00:06, 412.89it/s][A
 94%|█████████▎| 39886/42572 [01:36<00:06, 413.01it/s][A
 94%|█████████▍| 39935/42572 [01:36<00:06, 412.82it/s][A
 94%|█████████▍| 39985/42572 [01:36<00:06, 412.90it/s][A
 94%|█████████▍| 40032/42572 [01:36<00:06, 412.85it/s][A
 94%|█████████▍| 40076/42572 [01:37<00:06, 412.87it/s][A
 94%|█████████▍| 40126/42572 [01:37<00:05, 412.96it/s][A
 94%|█████████

In [17]:
# Fit a TF-IDF transformer on the training count matrix.
# NOTE(review): `tf_transformer` is not referenced by any later cell visible in
# this notebook (the following cell fits its own `tfidf_transformer`) -- it
# looks like a leftover; confirm before removing.
tf_transformer = TfidfTransformer()
tf_transformer = tf_transformer.fit(X_train_counts)

In [18]:
# Learn the IDF weights from the training counts, then weight the training matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Validation articles go through the already-fitted vocabulary and IDF weights
# (transform only, nothing is re-fitted on validation data).
X_val_counts = count_vect.transform(tqdm(val['content']))
X_val_tfidf = tfidf_transformer.transform(X_val_counts)


  0%|          | 0/14225 [00:00<?, ?it/s][A
  0%|          | 51/14225 [00:00<00:29, 482.68it/s][A
  1%|          | 92/14225 [00:00<00:31, 444.74it/s][A
  1%|          | 136/14225 [00:00<00:32, 431.57it/s][A
  1%|          | 176/14225 [00:00<00:33, 423.14it/s][A
  2%|▏         | 226/14225 [00:00<00:32, 437.37it/s][A
  2%|▏         | 274/14225 [00:00<00:31, 443.03it/s][A
  2%|▏         | 318/14225 [00:00<00:31, 441.89it/s][A
  3%|▎         | 374/14225 [00:00<00:30, 454.98it/s][A
  3%|▎         | 420/14225 [00:00<00:30, 449.18it/s][A
  3%|▎         | 473/14225 [00:01<00:30, 457.00it/s][A
  4%|▎         | 520/14225 [00:01<00:30, 442.33it/s][A
  4%|▍         | 567/14225 [00:01<00:30, 443.75it/s][A
  4%|▍         | 613/14225 [00:01<00:30, 443.70it/s][A
  5%|▍         | 658/14225 [00:01<00:30, 443.83it/s][A
  5%|▍         | 703/14225 [00:01<00:31, 435.24it/s][A
  5%|▌         | 752/14225 [00:01<00:30, 436.70it/s][A
  6%|▌         | 799/14225 [00:01<00:30, 438.13it/s][A
  6%

 50%|████▉     | 7075/14225 [00:15<00:15, 454.02it/s][A
 50%|█████     | 7119/14225 [00:15<00:15, 453.88it/s][A
 50%|█████     | 7167/14225 [00:15<00:15, 454.01it/s][A
 51%|█████     | 7217/14225 [00:15<00:15, 454.29it/s][A
 51%|█████     | 7265/14225 [00:15<00:15, 454.42it/s][A
 51%|█████▏    | 7318/14225 [00:16<00:15, 454.85it/s][A
 52%|█████▏    | 7373/14225 [00:16<00:15, 455.43it/s][A
 52%|█████▏    | 7424/14225 [00:16<00:14, 455.41it/s][A
 53%|█████▎    | 7474/14225 [00:16<00:14, 455.61it/s][A
 53%|█████▎    | 7523/14225 [00:16<00:14, 455.55it/s][A
 53%|█████▎    | 7571/14225 [00:16<00:14, 454.73it/s][A
 54%|█████▎    | 7615/14225 [00:16<00:14, 454.41it/s][A
 54%|█████▍    | 7661/14225 [00:16<00:14, 454.36it/s][A
 54%|█████▍    | 7705/14225 [00:16<00:14, 453.70it/s][A
 55%|█████▍    | 7758/14225 [00:17<00:14, 454.09it/s][A
 55%|█████▍    | 7807/14225 [00:17<00:14, 454.28it/s][A
 55%|█████▌    | 7853/14225 [00:17<00:14, 452.34it/s][A
 56%|█████▌    | 7899/14225 [00

 99%|█████████▉| 14131/14225 [00:30<00:00, 456.37it/s][A
100%|█████████▉| 14183/14225 [00:31<00:00, 456.38it/s][A
100%|██████████| 14225/14225 [00:31<00:00, 456.08it/s][A

In [19]:
# Feed-forward regressor over the TF-IDF features: three hidden layers
# (2455 -> 1000 -> 100 units) and a single-unit output, trained with SGD on
# mean absolute error.
# Dense() applies no activation unless one is given, and a stack of purely
# linear layers is equivalent to one linear layer. ReLU on the hidden layers
# makes the depth meaningful; the output layer stays linear because the target
# is a real-valued score.
model = Sequential()
model.add(Dense(2455, activation='relu', input_shape=(X_train_tfidf.shape[1], )))
model.add(Dense(1000, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(1))
model.compile('SGD', 'MAE')

In [20]:
# Peek at the first five training rows to sanity-check columns and labels.
train.head(n=5)

Unnamed: 0,title,publication,author,content,labels
23056,"Brazil Nears 100,000 Zika Cases Less Than 100 ...",Breitbart,Frances Martel,Brazil confirms this week it has documented a ...,1.0
20419,Breitbart News CEO &amp President Responds to ...,Breitbart,John Nolte,Michelle Fields is scheduled to be on “Good Mo...,1.0
73666,Donald Trump Lies That He Opposed Iraq War Fr...,Buzzfeed News,Kyle Blaine,During NBC News’ presidential forum on Wednes...,-0.7
82804,Judge’s blocking of travel ban sets stage for ...,New York Post,Joe Tacopino and Daniel Halper,A federal judge in Seattle Friday struck down ...,0.5
114928,"Americans Don’t Like Caucuses, But Replacing T...",NPR,Megan Verlee,"This week, as part of the Nation Engaged proje...",-0.5


In [21]:
# Train the network on the TF-IDF features. Targets are reshaped to a column
# of shape (n_samples, 1) to match the single-unit output layer.
y_train = train['labels'].values.reshape(-1, 1)
y_val = val['labels'].values.reshape(-1, 1)
# Pass the held-out validation features/targets so Keras evaluates on them
# after each epoch; an empty tuple here means no validation is performed.
model.fit(X_train_tfidf, y_train,
          validation_data=(X_val_tfidf, y_val))

Epoch 1/1


<keras.callbacks.History at 0x7f627e2ff5d0>

In [None]:
# Baseline: ordinary least-squares regression on the same TF-IDF features.
# The targets are passed as a 2-D column of shape (n_samples, 1).
clf = LinearRegression()
clf = clf.fit(X_train_tfidf,
              np.asarray(train['labels'].tolist()).reshape((-1, 1)))

## Test data

In [None]:
# Turn the held-out test articles into TF-IDF features using the vocabulary
# and IDF weights already fitted on the training split (transform only).
docs_new = test.loc[:, 'content']
X_new_counts = count_vect.transform(tqdm(docs_new))
X_new_tfidf = tfidf_transformer.transform(X=X_new_counts)

In [None]:
# Score the test articles with the linear-regression baseline.
predicted = clf.predict(X=X_new_tfidf)

In [None]:
# Display the true test targets as a column vector of shape (n_samples, 1).
np.asarray(test['labels'].tolist()).reshape((-1, 1))

In [None]:
# Mean absolute error of the baseline on the test split.
# A plain mean of the signed residuals lets over- and under-predictions cancel,
# so it measures bias rather than accuracy; taking |residual| first also matches
# the 'MAE' loss the network is compiled with.
y_test = np.array(test['labels'].tolist()).reshape(-1, 1)
np.abs(predicted - y_test).mean()