## Language Processing for Different Mags

### Importing Libs & Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import scipy
import sklearn
import spacy
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [3]:
article = pd.read_csv('articles1.csv')
article2 = pd.read_csv('articles2.csv')
article3 = pd.read_csv('articles3.csv')

In [4]:
article.drop(['url'],axis=1,inplace=True)
article2.drop(['url'],axis=1,inplace=True)
article3.drop(['url'],axis=1,inplace=True)

In [5]:
article.dropna(inplace=True)
article2.dropna(inplace=True)
article3.dropna(inplace=True)

In [6]:
a = article[article['author'] == 'Breitbart News'][['author','content']][0:500]

## Functions

In [7]:
nlp = spacy.load('en')

In [8]:
def body_of_work(content):
    body = []
    for work in content:
        body.append(work)
    return(body)

In [9]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [10]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)

In [11]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(250)]

In [12]:
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [13]:
def word_filter(set1, set2):
    keeper=[]
    for word in set1:
        if not word in set2:
            keeper.append(word)
        else:
            pass
        
    for word2 in set2:
        if not word2 in set1:
            keeper.append(word2)
        else:
            pass
        
    return(keeper)

## Applying Functions to Top Ten Publishers
### Extracting Author and Content

In [14]:
Breitbart_doc = article[article['publication'] == 'Breitbart'][['author','content']][0:1000]

In [15]:
New_York_Times_doc = article[article['publication'] == 'New York Times'][['author','content']][0:100]

In [16]:
CNN_doc = article[article['publication'] == 'CNN'][['author','content']][0:100]

In [17]:
New_York_Post_doc = article2[article2['publication'] == 'New York Post'][['author','content']][0:100]

In [18]:
Atlantic_doc = article2[article2['publication'] == 'Atlantic'][['author','content']][0:75]

In [19]:
National_Review_doc = article2[article2['publication'] == 'National Review'][['author','content']][0:100]

In [20]:
Talking_Points_Memo_doc = article2[article2['publication'] == 'Talking Points Memo'][['author','content']][0:100]

In [21]:
NPR_doc = article3[article3['publication'] == 'NPR'][['author','content']][0:100]

In [22]:
Washington_Post_doc = article3[article3['publication'] == 'Washington Post'][['author','content']][0:83]

In [23]:
Reuters_doc = article3[article3['publication'] == 'Reuters'][['author','content']][0:100]

### Creating Single Document

In [24]:
Breitbart_doc_c = body_of_work(Breitbart_doc['content'])

In [25]:
New_York_Times_doc_c = body_of_work(New_York_Times_doc['content'])

In [26]:
CNN_doc_c = body_of_work(CNN_doc['content'])

In [27]:
New_York_Post_doc_c = body_of_work(New_York_Post_doc['content'])

In [28]:
Atlantic_doc_c = body_of_work(Atlantic_doc['content'])

In [29]:
National_Review_doc_c = body_of_work(National_Review_doc['content'])

In [30]:
Talking_Points_Memo_doc_c = body_of_work(Talking_Points_Memo_doc['content'])

In [31]:
NPR_doc_c = body_of_work(NPR_doc['content'])

In [32]:
Washington_Post_doc_c = body_of_work(Washington_Post_doc['content'])

In [33]:
Reuters_doc_c = body_of_work(Reuters_doc['content'])

### Changing Data Type

In [34]:
Breitbart_5000 = str(Breitbart_doc_c)

In [35]:
New_York_Times_5000 = str(New_York_Times_doc_c)

In [36]:
CNN_5000 = str(CNN_doc_c)

In [37]:
New_York_Post_5000 = str(New_York_Post_doc_c)

In [38]:
Atlantic_5000 = str(Atlantic_doc_c)

In [39]:
National_Review_5000 = str(National_Review_doc_c)

In [40]:
Talking_Points_Memo_5000 = str(Talking_Points_Memo_doc_c)

In [41]:
NPR_5000 = str(NPR_doc_c)

In [42]:
Washington_Post_5000 = str(Washington_Post_doc_c)

In [43]:
Reuters_5000 = str(Reuters_doc_c)

### Cleaning Text

In [44]:
Breitbart_5000_c = text_cleaner(Breitbart_5000)

In [45]:
New_York_Times_5000_c = text_cleaner(New_York_Times_5000)

In [46]:
CNN_5000_c = text_cleaner(CNN_5000)

In [47]:
New_York_Post_5000_c = text_cleaner(New_York_Post_5000)

In [48]:
Atlantic_5000_c  = text_cleaner(Atlantic_5000)

In [49]:
National_Review_5000_c = text_cleaner(National_Review_5000)

In [50]:
Talking_Points_Memo_5000_c = text_cleaner(Talking_Points_Memo_5000)

In [51]:
NPR_5000_c = text_cleaner(NPR_5000)

In [52]:
Washington_Post_5000_c = text_cleaner(Washington_Post_5000)

In [53]:
Reuters_5000_c = text_cleaner(Reuters_5000)

### Checking Word Count 

In [None]:
def word_count(text):
    count = 0
    for word in text:
        count += 1
    return(count)

In [113]:
word_count(Breitbart_5000)

558493

In [114]:
word_count(New_York_Times_5000)

512587

In [115]:
word_count(CNN_5000)

352166

In [116]:
word_count(New_York_Post_5000)

252519

In [127]:
word_count(Atlantic_5000)

521937

In [118]:
word_count(National_Review_5000)

563754

In [119]:
word_count(Talking_Points_Memo_5000)

293985

In [120]:
word_count(NPR_5000)

430998

In [135]:
word_count(Washington_Post_5000)

557560

In [122]:
word_count(Reuters_5000)

341686

### Applying Spacy

In [54]:
Breitbart_5000_s = nlp(Breitbart_5000_c)

In [55]:
New_York_Times_5000_s = nlp(New_York_Times_5000_c)

In [56]:
CNN_5000_s = nlp(CNN_5000_c)

In [57]:
New_York_Post_5000_s = nlp(New_York_Post_5000_c)

In [58]:
Atlantic_5000_s = nlp(Atlantic_5000_c)

In [59]:
National_Review_5000_s = nlp(National_Review_5000_c)

In [60]:
Talking_Points_Memo_5000_s = nlp(Talking_Points_Memo_5000_c)

In [61]:
NPR_5000_s = nlp(NPR_5000_c)

In [62]:
Washington_Post_5000_s = nlp(Washington_Post_5000_c)

In [63]:
Reuters_5000_s = nlp(Reuters_5000_c)

### Mapping Publisher to Sentences

In [64]:
Breitbart_sents = [[sent, "Breitbart"] for sent in Breitbart_5000_s.sents]

In [65]:
New_York_Times_sents = [[sent, "New York Times"] for sent in New_York_Times_5000_s.sents]

In [66]:
CNN_sents = [[sent, "CNN"] for sent in CNN_5000_s.sents]

In [67]:
New_York_Post_sents = [[sent, "New York Post"] for sent in New_York_Post_5000_s.sents]

In [68]:
Atlantic_sents = [[sent, "Atlantic"] for sent in Atlantic_5000_s.sents]

In [69]:
National_Review_sents = [[sent, "National Review"] for sent in National_Review_5000_s.sents]

In [70]:
Talking_Points_Memo_sents = [[sent, "Talking Points Memo"] for sent in Talking_Points_Memo_5000_s.sents]

In [71]:
NPR_sents = [[sent, "NPR"] for sent in NPR_5000_s.sents]

In [72]:
Washington_Post_sents = [[sent, "Washington Post"] for sent in Washington_Post_5000_s.sents]

In [73]:
Reuters_sents = [[sent, "Reuters"] for sent in Reuters_5000_s.sents]

In [74]:
sentences = pd.DataFrame(Breitbart_sents+New_York_Times_sents + CNN_sents + New_York_Post_sents + Atlantic_sents + National_Review_sents + Talking_Points_Memo_sents + NPR_sents + Washington_Post_sents + Reuters_sents)

In [75]:
sentences = pd.DataFrame(Breitbart_sents+New_York_Times_sents)

### Word Extraction

In [76]:
Breitbart_words = bag_of_words(Breitbart_5000_s)

In [77]:
New_York_Times_words = bag_of_words(New_York_Times_5000_s)

In [78]:
CNN_words = bag_of_words(CNN_5000_s)

In [79]:
New_York_Post_words = bag_of_words(New_York_Post_5000_s)

In [80]:
Atlantic_words = bag_of_words(Atlantic_5000_s)

In [81]:
National_Review_words = bag_of_words(National_Review_5000_s)

In [82]:
Talking_Points_Memo_words = bag_of_words(Talking_Points_Memo_5000_s)

In [83]:
NPR_words = bag_of_words(NPR_5000_s)

In [84]:
Washington_Post_words = bag_of_words(Washington_Post_5000_s)

In [85]:
Reuters_words = bag_of_words(Reuters_5000_s)

In [117]:
common_words = set(Breitbart_words + New_York_Times_words + CNN_words + New_York_Post_words+ Atlantic_words + National_Review_words + Talking_Points_Memo_words + NPR_words + Washington_Post_words + Reuters_words)

In [118]:
# common_words = set(Breitbart_words + New_York_Times_words)

In [119]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000


Unnamed: 0,appear,level,demand,investigation,pic,east,word,effect,strike,mutation,...,senior,water,death,focus,price,be,center,try,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ecause, she, transitioned, from, a, man, to, ...",Breitbart
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Without, that, ,, it, ’s, hard, to, imagine, ...",Breitbart
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(”),Breitbart
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Mediaite, ))",Breitbart
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Follow, Ian, Hanchett, on, Twitter, @IanHanch...",Breitbart


In [120]:
tmap = {'Breitbart':0,'New York Times':1,'CNN':2,'New York Post':3
        ,'Atlantic':4,'National Review':5,'Talking Points Memo':6,
        'NPR':7,'Washington Post':8,'Reuters':9}

In [121]:
word_counts['target'] = word_counts['text_source'].map(tmap)

In [122]:
word_counts['target'].unique()

array([0, 1], dtype=int64)

In [123]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [124]:
rfc = ensemble.RandomForestRegressor()
Y = word_counts['target']
X = np.array(word_counts.drop(['text_sentence','text_source','target'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0)

In [None]:
cross_val_score(rfc, X_test, y_test, cv=5)

In [97]:
from sklearn.linear_model import LogisticRegression

In [125]:
lr = LogisticRegression(solver='newton-cg',multi_class='multinomial')
train = lr.fit(X_train, y_train)
# 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'

cross_val_score(train, X_test, y_test, cv=5)

array([ 0.67850799,  0.6749556 ,  0.72291297,  0.65953654,  0.67914439])

In [126]:
from sklearn.grid_search import GridSearchCV

In [127]:
# creating parameters to test
param_grid = {'solver':['newton-cg', 'lbfgs','sag'],
              'multi_class':['multinomial']}

In [128]:
# fitting grid with setting
grid = GridSearchCV(LogisticRegression(),param_grid,verbose=3)

In [129]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] multi_class=multinomial, solver=newton-cg .......................
[CV]  multi_class=multinomial, solver=newton-cg, score=0.708645 -   1.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] multi_class=multinomial, solver=newton-cg .......................
[CV]  multi_class=multinomial, solver=newton-cg, score=0.714692 -   1.4s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s


[CV] multi_class=multinomial, solver=newton-cg .......................
[CV]  multi_class=multinomial, solver=newton-cg, score=0.709505 -   1.5s
[CV] multi_class=multinomial, solver=lbfgs ...........................
[CV] .. multi_class=multinomial, solver=lbfgs, score=0.708645 -   1.1s
[CV] multi_class=multinomial, solver=lbfgs ...........................
[CV] .. multi_class=multinomial, solver=lbfgs, score=0.714692 -   1.1s
[CV] multi_class=multinomial, solver=lbfgs ...........................
[CV] .. multi_class=multinomial, solver=lbfgs, score=0.709505 -   1.1s
[CV] multi_class=multinomial, solver=sag .............................




[CV] .... multi_class=multinomial, solver=sag, score=0.709356 -   7.3s
[CV] multi_class=multinomial, solver=sag .............................




[CV] .... multi_class=multinomial, solver=sag, score=0.714692 -   7.3s
[CV] multi_class=multinomial, solver=sag .............................
[CV] .... multi_class=multinomial, solver=sag, score=0.709505 -   4.3s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   27.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'solver': ['newton-cg', 'lbfgs', 'sag'], 'multi_class': ['multinomial']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [130]:
# best parameter
grid.best_params_

{'multi_class': 'multinomial', 'solver': 'sag'}

In [131]:
lr = LogisticRegression(solver='sag',multi_class='multinomial')
train = lr.fit(X_train, y_train)

cross_val_score(train, X_test, y_test, cv=5)



array([ 0.67850799,  0.6749556 ,  0.72291297,  0.65953654,  0.67914439])

### Most Common People

In [106]:
def get_people(people):
    return([entity.text for entity in list(people.ents) if entity.label_ == "PERSON"])
    

In [107]:
Breitbart_people = get_people(Breitbart_5000_s)

In [108]:
New_York_Times_people = get_people(New_York_Times_5000_s)

In [109]:
CNN_people = get_people(CNN_5000_s)

In [110]:
New_York_Post_people = get_people(New_York_Post_5000_s)

In [111]:
National_Review_people = get_people(National_Review_5000_s)

In [112]:
Talking_Points_Memo_people = get_people(Talking_Points_Memo_5000_s)

In [113]:
NPR_people = get_people(NPR_5000_s)

In [114]:
Washington_Post_people = get_people(Washington_Post_5000_s)

In [115]:
Reuters_people = get_people(Reuters_5000_s)

In [116]:
Atlantic_people = get_people(Atlantic_5000_s)

In [132]:
common_people = set(Breitbart_people+New_York_Times_people+CNN_people+New_York_Post_people+National_Review_people+Talking_Points_Memo_people+NPR_people+Washington_Post_people+Reuters_people+Atlantic_people)

In [133]:
# Create our data frame with features. This can take a while to run.
people_counts = bow_features(sentences, common_people)
people_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000


Unnamed: 0,Aziz Osmanoglu,John Howard,McGraw,Dean,Jim,Alexey Pushkov,James K. Polk,Debbie,Chris Zaccarelli,Pentagon,...,Rachidi,Desmond Tutu,Aunt Joan,Scott Walker,Auschwitz,Checkpoint,Karzai,Khaled Abu Toameh,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ecause, she, transitioned, from, a, man, to, ...",Breitbart
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Without, that, ,, it, ’s, hard, to, imagine, ...",Breitbart
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(”),Breitbart
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Mediaite, ))",Breitbart
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Follow, Ian, Hanchett, on, Twitter, @IanHanch...",Breitbart
