## Language Processing for Different Mags

### Importing Libs & Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import scipy
import sklearn
import spacy
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [3]:
article = pd.read_csv('articles1.csv')
article2 = pd.read_csv('articles2.csv')
article3 = pd.read_csv('articles3.csv')

In [4]:
article.drop(['url'],axis=1,inplace=True)

In [5]:
article.dropna(inplace=True)

In [6]:
a = article[article['author'] == 'Breitbart News'][['author','content']][0:500]

## Functions

In [7]:
nlp = spacy.load('en')

In [8]:
def body_of_work(content):
    body = []
    for work in content:
        body.append(work)
    return(body)

In [9]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [10]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)

In [169]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(250)]

In [12]:
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

## Testing Functions 500 Articles by Single Author

In [47]:
a = body_of_work(a['content'])

In [48]:
a_500 = str(a)

In [49]:
a_500 = text_cleaner(a_500)

In [51]:
a_500_doc = nlp(a_500)

In [72]:
Breitbart_words = word_frequencies(a_500_doc,include_stop=False).most_common(50)

In [75]:
Breitbart_words[0:5]

[('’s', 1173), ('Trump', 1119), ('says', 987), ('2016', 740), ('He', 484)]

In [53]:
Breitbart_people = [entity.text for entity in list(a_500_doc.ents) if entity.label_ == "PERSON"]

In [76]:
Counter(Breitbart_people).most_common(5)

[('Clinton', 290),
 ('Cruz', 214),
 ('Trump', 132),
 ('Donald Trump', 121),
 ('Obama', 120)]

In [60]:
article['publication'].value_counts()

Breitbart           23781
New York Times       7767
CNN                  7025
Business Insider     4950
Atlantic              171
Name: publication, dtype: int64

In [64]:
article2['publication'].value_counts()

New York Post          17493
Atlantic                7008
National Review         6203
Talking Points Memo     5214
Guardian                4873
Buzzfeed News           4854
Fox News                4354
Name: publication, dtype: int64

In [65]:
article3['publication'].value_counts()

NPR                11992
Washington Post    11114
Reuters            10710
Vox                 4947
Guardian            3808
Name: publication, dtype: int64

## Applying Functions to Top Ten Publishers
### Extracting Author and Content

In [294]:
Breitbart_doc = article[article['publication'] == 'Breitbart'][['author','content']][0:1000]

In [73]:
New_York_Times_doc = article[article['publication'] == 'New York Times'][['author','content']][0:100]

In [74]:
CNN_doc = article[article['publication'] == 'CNN'][['author','content']][0:100]

In [75]:
New_York_Post_doc = article2[article2['publication'] == 'New York Post'][['author','content']][0:100]

In [123]:
Atlantic_doc = article2[article2['publication'] == 'Atlantic'][['author','content']][0:75]

In [77]:
National_Review_doc = article2[article2['publication'] == 'National Review'][['author','content']][0:100]

In [78]:
Talking_Points_Memo_doc = article2[article2['publication'] == 'Talking Points Memo'][['author','content']][0:100]

In [79]:
NPR_doc = article3[article3['publication'] == 'NPR'][['author','content']][0:100]

In [131]:
Washington_Post_doc = article3[article3['publication'] == 'Washington Post'][['author','content']][0:83]

In [81]:
Reuters_doc = article3[article3['publication'] == 'Reuters'][['author','content']][0:100]

### Creating Single Document

In [287]:
Breitbart_doc = body_of_work(Breitbart_doc['content'])

In [83]:
New_York_Times_doc = body_of_work(New_York_Times_doc['content'])

In [84]:
CNN_doc = body_of_work(CNN_doc['content'])

In [85]:
New_York_Post_doc = body_of_work(New_York_Post_doc['content'])

In [124]:
Atlantic_doc = body_of_work(Atlantic_doc['content'])

In [87]:
National_Review_doc = body_of_work(National_Review_doc['content'])

In [88]:
Talking_Points_Memo_doc = body_of_work(Talking_Points_Memo_doc['content'])

In [89]:
NPR_doc = body_of_work(NPR_doc['content'])

In [132]:
Washington_Post_doc = body_of_work(Washington_Post_doc['content'])

In [91]:
Reuters_doc = body_of_work(Reuters_doc['content'])

### Changing Data Type

In [92]:
Breitbart_5000 = str(Breitbart_doc)

In [93]:
New_York_Times_5000 = str(New_York_Times_doc)

In [94]:
CNN_5000 = str(CNN_doc)

In [95]:
New_York_Post_5000 = str(New_York_Post_doc)

In [125]:
Atlantic_5000 = str(Atlantic_doc)

In [97]:
National_Review_5000 = str(National_Review_doc)

In [98]:
Talking_Points_Memo_5000 = str(Talking_Points_Memo_doc)

In [99]:
NPR_5000 = str(NPR_doc)

In [133]:
Washington_Post_5000 = str(Washington_Post_doc)

In [101]:
Reuters_5000 = str(Reuters_doc)

### Cleaning Text

In [102]:
Breitbart_5000 = text_cleaner(Breitbart_5000)

In [103]:
New_York_Times_5000 = text_cleaner(New_York_Times_5000)

In [104]:
CNN_5000 = text_cleaner(CNN_5000)

In [105]:
New_York_Post_5000 = text_cleaner(New_York_Post_5000)

In [126]:
Atlantic_5000  = text_cleaner(Atlantic_5000)

In [107]:
National_Review_5000 = text_cleaner(National_Review_5000)

In [108]:
Talking_Points_Memo_5000 = text_cleaner(Talking_Points_Memo_5000)

In [109]:
NPR_5000 = text_cleaner(NPR_5000)

In [134]:
Washington_Post_5000 = text_cleaner(Washington_Post_5000)

In [111]:
Reuters_5000 = text_cleaner(Reuters_5000)

In [112]:
def word_count(text):
    count = 0
    for word in text:
        count += 1
    return(count)

In [113]:
word_count(Breitbart_5000)

558493

In [114]:
word_count(New_York_Times_5000)

512587

In [115]:
word_count(CNN_5000)

352166

In [116]:
word_count(New_York_Post_5000)

252519

In [127]:
word_count(Atlantic_5000)

521937

In [118]:
word_count(National_Review_5000)

563754

In [119]:
word_count(Talking_Points_Memo_5000)

293985

In [120]:
word_count(NPR_5000)

430998

In [135]:
word_count(Washington_Post_5000)

557560

In [122]:
word_count(Reuters_5000)

341686

### Applying Spacy

In [128]:
Breitbart_5000 = nlp(Breitbart_5000)

In [130]:
New_York_Times_5000 = nlp(New_York_Times_5000)

In [136]:
CNN_5000 = nlp(CNN_5000)

In [137]:
New_York_Post_5000 = nlp(New_York_Post_5000)

In [138]:
Atlantic_5000 = nlp(Atlantic_5000)

In [139]:
National_Review_5000 = nlp(National_Review_5000)

In [140]:
Talking_Points_Memo_5000 = nlp(Talking_Points_Memo_5000)

In [141]:
NPR_5000 = nlp(NPR_5000)

In [142]:
Washington_Post_5000 = nlp(Washington_Post_5000)

In [143]:
Reuters_5000 = nlp(Reuters_5000)

### Mapping Publisher to Sentences

In [144]:
Breitbart_sents = [[sent, "Breitbart"] for sent in Breitbart_5000.sents]

In [145]:
New_York_Times_sents = [[sent, "New York Times"] for sent in New_York_Times_5000.sents]

In [146]:
CNN_sents = [[sent, "CNN"] for sent in CNN_5000.sents]

In [147]:
New_York_Post_sents = [[sent, "New York Post"] for sent in New_York_Post_5000.sents]

In [148]:
Atlantic_sents = [[sent, "Atlantic"] for sent in Atlantic_5000.sents]

In [149]:
National_Review_sents = [[sent, "National Review"] for sent in National_Review_5000.sents]

In [150]:
Talking_Points_Memo_sents = [[sent, "Talking Points Memo"] for sent in Talking_Points_Memo_5000.sents]

In [151]:
NPR_sents = [[sent, "NPR"] for sent in NPR_5000.sents]

In [152]:
Washington_Post_sents = [[sent, "Washington Post"] for sent in Washington_Post_5000.sents]

In [153]:
Reuters_sents = [[sent, "Reuters"] for sent in Reuters_5000.sents]

In [154]:
sentences = pd.DataFrame(Breitbart_sents+New_York_Times_sents+CNN_sents+New_York_Post_sents+National_Review_sents+Talking_Points_Memo_sents+NPR_sents+Washington_Post_sents+Reuters_sents)

In [248]:
sentences = pd.DataFrame(Breitbart_sents+New_York_Times_sents)

### Word Extraction

In [170]:
Breitbart_words = bag_of_words(Breitbart_5000)

In [171]:
New_York_Times_words = bag_of_words(New_York_Times_5000)

In [172]:
CNN_words = bag_of_words(CNN_5000)

In [173]:
New_York_Post_words = bag_of_words(New_York_Post_5000)

In [174]:
Atlantic_words = bag_of_words(Atlantic_5000)

In [175]:
National_Review_words = bag_of_words(National_Review_5000)

In [176]:
Talking_Points_Memo_words = bag_of_words(Talking_Points_Memo_5000)

In [177]:
NPR_words = bag_of_words(NPR_5000)

In [178]:
Washington_Post_words = bag_of_words(Washington_Post_5000)

In [179]:
Reuters_words = bag_of_words(Reuters_5000)

In [180]:
common_words = set(Breitbart_words + New_York_Times_words + CNN_words + New_York_Post_words + National_Review_words + Talking_Points_Memo_words + NPR_words + Washington_Post_words + Reuters_words)

In [249]:
common_words = set(Breitbart_words + New_York_Times_words)

In [250]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000


Unnamed: 0,cost,million,like,national,support,political,facebook,life,program,10,...,note,need,care,want,break,raise,target,attack,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ecause, she, transitioned, from, a, man, to, ...",Breitbart
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Without, that, ,, it, ’s, hard, to, imagine, ...",Breitbart
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(”),Breitbart
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Mediaite, ))",Breitbart
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,"(Follow, Ian, Hanchett, on, Twitter, @IanHanch...",Breitbart


In [251]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0)
cross_val_score(rfc, X_test, y_test, cv=5)

array([ 0.61634103,  0.63765542,  0.68028419,  0.62210339,  0.67736185])

In [252]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)

cross_val_score(train, X_test, y_test, cv=5)

array([ 0.68738899,  0.6642984 ,  0.71580817,  0.65418895,  0.67914439])

In [105]:
# Your code here. Experiment with hidden layers to build your own model.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(3000), learning_rate='adaptive')
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=3000, learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [106]:
cross_val_score(mlp, X_test, y_test, cv=5)

array([ 0.61310782,  0.64059197,  0.63424947,  0.65466102,  0.63829787])

### Most Common People

In [204]:
Breitbart_people = [entity.text for entity in list(Breitbart_5000.ents) if entity.label_ == "PERSON"]

In [206]:
Counter(Breitbart_people).most_common(10)

[('Trump', 99),
 ('Donald Trump', 97),
 ('Obama', 54),
 ('Kelly', 32),
 ('Flynn', 32),
 ('Twitter', 24),
 ('Hillary Clinton', 20),
 ('Obamacare', 19),
 ('Virgil', 19),
 ('Barack Obama', 18)]

In [207]:
New_York_Times_people = [entity.text for entity in list(New_York_Times_5000.ents) if entity.label_ == "PERSON"]

In [208]:
Counter(New_York_Times_people).most_common(10)

[('Trump', 177),
 ('Obama', 109),
 ('Kelly', 39),
 ('Donald J. Trump', 28),
 ('Clinton', 28),
 ('Netanyahu', 25),
 ('Hacking Team', 23),
 ('Corzine', 22),
 ('Hacking Team’s', 21),
 ('Clayton', 20)]

In [232]:
def word_filter(set1, set2):
    keeper=[]
    for word in set1:
        if not word in set2:
            keeper.append(word)
        else:
            pass
        
    for word2 in set2:
        if not word2 in set1:
            keeper.append(word2)
        else:
            pass
        
    return(keeper)

In [240]:
Unique_people_nyt_v_b = word_filter(New_York_Times_people, Breitbart_people)

In [241]:
Unique_words_nyt_v_b = word_filter(Breitbart_words,New_York_Times_words)

In [242]:
ts1 =  set(Unique_people_nyt_v_b + Unique_words_nyt_v_b)

In [243]:
df = pd.DataFrame(New_York_Post_sents + Breitbart_sents)

In [244]:
word_counts = bow_features(df, ts1)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000


Unnamed: 0,Rapoports,vote,Donald Trump,join,Le Pen,Vladimir Ashurkov,Jeff Merkley,Hankins,DeVos,Davis,...,Last Man Standing,F. B. I.,Rubio,Elena,Roof,— Donald J. Trump,Barbara Lee,Compliance Counsel,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(in, the, past, tense, .)",New York Post
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, think, of, her, in, the, present, tense, ,...",New York Post
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(“),New York Post
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, ’m, angry, and, so, sad, .)",New York Post
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(”, ', ,, ', ATLANTA, —, Washington, wanted, t...",New York Post


In [246]:
rfc = ensemble.GradientBoostingClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0)
cross_val_score(rfc, X_test, y_test, cv=5)

array([ 0.67659574,  0.67234043,  0.65957447,  0.66170213,  0.66239316])

In [247]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)

cross_val_score(train, X_test, y_test, cv=5)

array([ 0.67234043,  0.68297872,  0.6787234 ,  0.66595745,  0.66025641])

In [268]:
import string

In [269]:
# importing tool
from sklearn.feature_extraction.text import CountVectorizer