# Language Feature Analysis - News

### Author 
Stephen Lee

### Goal
Find language features of the following news articles: 
- Fox News
- Vox News
- PBS News

Features include: 
- word frequencies
- part of speech tagging and frequencies

### Date 
3.4.19

Updated: 6.13.19; 7.23.19

# Read Data

In [66]:
import pandas as pd
import numpy as np
import os

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

os.getcwd()

'/Users/stevelee/Dropbox/General/Projects/Thesis/code/analysis/features'

In [25]:
DATASET_PATH = "/Users/stevelee/Dropbox/General/Projects/Thesis/data/"
WRITE_PATH = "/Users/stevelee/Dropbox/General/Projects/Thesis/paper/figures/"

In [4]:
FILE = "cleaner_article_df.csv"

In [88]:
os.chdir(DATASET_PATH)

In [106]:
df = pd.read_csv(FILE, sep='|').drop('Unnamed: 0', axis=1)

In [107]:
df.head(2)

Unnamed: 0,article id,source,article,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,<br>\nFormer New Jersey Gov. Chris Christie sa...,2
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...","FILE--In this July 28, 2016 file photo, Sen. B...",2


In [108]:
df = df.drop('article', axis=1)

In [109]:
df.groupby('source').count()

Unnamed: 0_level_0,article id,clean_articles,targets
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fox,661,661,661
PBS,1739,1739,1739
Vox,1027,1027,1027


In [110]:
# check no null values 
df[df['clean_articles'].isnull()].head()

Unnamed: 0,article id,source,clean_articles,targets


In [111]:
# check no duplicates
df.groupby("source").describe(include='all')

Unnamed: 0_level_0,article id,article id,article id,article id,article id,article id,article id,article id,article id,article id,article id,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,clean_articles,targets,targets,targets,targets,targets,targets,targets,targets,targets,targets,targets
Unnamed: 0_level_1,count,unique,top,freq,mean,std,min,25%,50%,75%,max,count,unique,top,freq,mean,std,min,25%,50%,75%,max,count,unique,top,freq,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2
Fox,661,661,fox_politics_417,1,,,,,,,,661,661,"Michael Cohen: President Trump is a racist, co...",1,,,,,,,,661.0,,,,2.0,0.0,2.0,2.0,2.0,2.0,2.0
PBS,1739,1739,pbs_politics_60,1,,,,,,,,1739,1739,House Speaker Nancy Pelosi is officially pos...,1,,,,,,,,1739.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Vox,1027,1027,vox_politics_1877,1,,,,,,,,1027,1027,When President Trump indelibly called African ...,1,,,,,,,,1027.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


# Setup

In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go 
from plotly import tools 
from wordcloud import STOPWORDS 
from collections import defaultdict 
py.init_notebook_mode(connected=True)

In [13]:
fox = df[df['source'] == "Fox"]
vox = df[df['source'] == "Vox"]
pbs = df[df['source'] == "PBS"]

#### generate n grams or 'tokens' of various length


In [14]:
def generate_ngrams(txt, n_gram=1):
    token = [t for t in txt.lower().split(' ') if ((t not in STOPWORDS) and (t.isalpha()))]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(g) for g in ngrams]

#### make a horizontal plot

In [15]:
def hz_chart(df, color):
    trace = go.Bar(
        y = df['token'].values[::-1],
        x = df['count'].values[::-1], 
        showlegend=False,
        orientation = 'h', 
        marker=dict(color=color))
    return trace

#### prepare the plot


In [16]:
def plt_freq(txt, n_grams=1, number=15, color='blue'):
    freq = defaultdict(int)
    for q in txt:
        for ngram in generate_ngrams(q, n_grams):
            freq[ngram] += 1
    df_sorted = pd.DataFrame(sorted(freq.items(), key=lambda x: x[1])[::-1])
    df_sorted.columns = ['token', 'count']
    return hz_chart(df_sorted.head(number), color)

#### get ngram frequencies

In [17]:
def ngram_freq(txt, n_grams=1, number=15):
    freq = defaultdict(int)
    for q in txt:
        for ngram in generate_ngrams(q, n_grams):
            freq[ngram] += 1
    df_sorted = pd.DataFrame(sorted(freq.items(), key=lambda x: x[1])[::-1])
    df_sorted.columns = ['token', 'count']
    return df_sorted

#### display plot

In [18]:
def disp_freq_plot(word_freqs, title):
    title = [title]
    fig = tools.make_subplots(rows=1, cols=1, vertical_spacing=0.04, subplot_titles=title)
    fig.append_trace(word_freqs, 1, 1)
    fig['layout'].update(height=900, width=600, title='Word Counts')
    py.iplot(fig, filename='counts')

In [23]:
def save_table(df, file_name): 
    curdir = os.getcwd()
    if curdir != WRITE_PATH: 
        os.chdir(WRITE_PATH)
    
    df.to_csv(file_name)
    
    os.chdir(curdir)

## Frequent Words (ngram = 1)

In [19]:
# fox_counts = plt_freq(fox['article'], n_grams=1, number=20)
# vox_counts = plt_freq(vox['article'], n_grams=1, number=20)
# pbs_counts = plt_freq(pbs['article'], n_grams=1, number=20)

# disp_freq_plot(fox_counts, "Fox News")
# disp_freq_plot(vox_counts, "Vox News")
# disp_freq_plot(pbs_counts, "PBS News")

In [26]:
fox_ngrams = ngram_freq(fox['clean_articles'], n_grams=1, number=20)
vox_ngrams = ngram_freq(vox['clean_articles'], n_grams=1, number=20)
pbs_ngrams = ngram_freq(pbs['clean_articles'], n_grams=1, number=20)

save_table(fox_ngrams, 'fox_word_freq.csv')
save_table(vox_ngrams, 'vox_word_freq.csv')
save_table(pbs_ngrams, 'pbs_word_freq.csv')

In [27]:
fox_ngrams.head(10)

Unnamed: 0,token,count
0,trump,2510
1,said,2009
2,president,1730
3,house,1656
4,new,1569
5,will,1414
6,democratic,1137
7,democrats,954
8,told,862
9,border,790


In [28]:
vox_ngrams.head(10)

Unnamed: 0,token,count
0,trump,5446
1,tax,5206
2,will,4098
3,people,4013
4,health,4003
5,bill,3138
6,republicans,2655
7,one,2573
8,new,2566
9,care,2565


In [29]:
pbs_ngrams.head(10)

Unnamed: 0,token,count
0,trump,7811
1,said,7383
2,president,3997
3,house,3495
4,will,3079
5,new,2277
6,white,2184
7,senate,2002
8,democrats,1963
9,campaign,1933


## Frequent Phrases (ngram = 2)

In [30]:
fox_ngrams = ngram_freq(fox['clean_articles'], n_grams=2, number=20)
vox_ngrams = ngram_freq(vox['clean_articles'], n_grams=2, number=20)
pbs_ngrams = ngram_freq(pbs['clean_articles'], n_grams=2, number=20)

save_table(fox_ngrams, 'fox_2gram_freq.csv')
save_table(vox_ngrams, 'vox_2gram_freq.csv')
save_table(pbs_ngrams, 'pbs_2gram_freq.csv')

In [31]:
fox_ngrams.head()

Unnamed: 0,token,count
0,white house,556
1,new york,359
2,president trump,318
3,green new,256
4,health care,160


In [32]:
vox_ngrams.head()

Unnamed: 0,token,count
0,health care,1654
1,white house,743
2,trump administration,672
3,donald trump,598
4,tax cuts,479


In [33]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,white house,1683
1,president donald,1297
2,donald trump,1035
3,special counsel,613
4,supreme court,584


## Frequent Phrases (ngram = 3)

In [34]:
fox_ngrams = ngram_freq(fox['clean_articles'], n_grams=3, number=20)
vox_ngrams = ngram_freq(vox['clean_articles'], n_grams=3, number=20)
pbs_ngrams = ngram_freq(pbs['clean_articles'], n_grams=3, number=20)

save_table(fox_ngrams, 'fox_3gram_freq.csv')
save_table(vox_ngrams, 'vox_3gram_freq.csv')
save_table(pbs_ngrams, 'pbs_3gram_freq.csv')

In [35]:
fox_ngrams.head()

Unnamed: 0,token,count
0,green new deal,143
1,house speaker nancy,81
2,special counsel robert,72
3,partial government shutdown,63
4,speaker nancy pelosi,58


In [36]:
vox_ngrams.head()

Unnamed: 0,token,count
0,affordable care act,222
1,president donald trump,157
2,congressional budget office,127
3,health care bill,121
4,new york times,115


In [37]:
pbs_ngrams.head()

Unnamed: 0,token,count
0,president donald trump,785
1,special counsel robert,396
2,majority leader mitch,179
3,attorney general jeff,139
4,senate judiciary committee,137


# Find Avg Number of Words Per Sentence

In [77]:
def article_to_sentences(article): 
    return article.split('.')

def avg_word_count(lst_of_sentences): 
    
    # separate sentences into words [ [["hi"], ["bob"]], ..., [last_sentence]]
    sentences_by_word = [s.strip().split(" ") for s in lst_of_sentences] 
    
    # count how many words are in each sentence
    words_per_sentence = [len(w) for w in sentences_by_word]
    
    return np.mean(words_per_sentence)

def get_avg_wd_ct(article): 
    return avg_word_count(article_to_sentences(article))

# Test case 

tst_article = "Sentence one has 5 words. Sentence two has 6 words, also?"
avg_words = 5.5 
calc_words = get_avg_wd_ct(tst_article)
print("The sentence '{s}' has an average of {avg} words per sentence.".format(s=tst_article, avg=calc_words))

if avg_words == calc_words: 
    print("PASSED")
else: 
    print("FAILED. DO NOT CONTINUE!")

The sentence 'Sentence one has 5 words. Sentence two has 6 words, also?' has an average of 5.5 words per sentence.
PASSED


In [82]:
df_wps = df.copy()
df_wps.head(2)

Unnamed: 0,article id,source,clean_articles,targets
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,2
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",2


In [83]:
avg_ct = df_wps['clean_articles'].apply(get_avg_wd_ct)
df_wps['avg_wd_ct'] = avg_ct
df_wps = df_wps[['article id', 'source', 'clean_articles', 'avg_wd_ct']]
df_wps.head(2)

Unnamed: 0,article id,source,clean_articles,avg_wd_ct
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,16.55
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",16.9


In [84]:
save_table(df_wps, 'avg_word_cts.csv')

In [128]:
df_wrd_ct_table = df_wps[['source', 'avg_wd_ct']].groupby('source').describe()
df_wrd_ct_table

Unnamed: 0_level_0,avg_wd_ct,avg_wd_ct,avg_wd_ct,avg_wd_ct,avg_wd_ct,avg_wd_ct,avg_wd_ct,avg_wd_ct
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Fox,661.0,20.082732,66.360679,1.0,15.272727,17.354839,19.708333,1721.0
PBS,1739.0,17.991615,3.700427,4.333333,15.460499,17.9,20.278889,34.88
Vox,1027.0,21.316836,3.428075,6.0,19.142997,21.259259,23.471008,33.954545


In [129]:
save_table(df_wrd_ct_table, 'sum_table_avg_word_cts.csv')

# Parts of Speech

In [38]:
from nltk import word_tokenize, pos_tag

In [39]:
def remove_punct(tagged_doc):
    '''
       in... a list of pos tags -> [(This,DT),...,(.,.)]
       out.. a list of pos tags without punctuation
    '''
    return [i for i in tagged_doc if i[0].isalpha()]

def get_word_ct(tagged_doc):
    '''
       in... a list of pos tags -> [(This,DT),...,(.,.)]
       out.. integer count of total words
    '''
    return len(remove_punct(tagged_doc))

def get_pos_ct(tagged_doc, *args):
    '''
       in... a list of pos tags -> [(This,DT),(is,VBZ)...]
             a list of pos tags to find -> ['DT', 'JJ', ...]
       out.. an integer count of total words with that pos
    '''
    pos_to_find = list(args)
    return len([i for i in tagged_doc if i[1] in pos_to_find])

def get_pos_ratio(tagged_doc, *args):
    total    = float(get_word_ct(tagged_doc))
    if total == 0:
        return 0
    else:
        relevant = float(get_pos_ct(tagged_doc, *args))
        return relevant / total

In [112]:
# tokenize and tag words
df['pos'] = df['clean_articles'].apply(word_tokenize).apply(pos_tag).apply(remove_punct)
df.head(2)

Unnamed: 0,article id,source,clean_articles,targets,pos
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,2,"[(br, NN), (Former, NNP), (New, NNP), (Jersey,..."
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",2,"[(FILE, NNP), (In, IN), (this, DT), (July, NNP..."


In [113]:
# find basic adjectives
args = ['JJ']
df['adjectives'] = df['pos'].apply(get_pos_ct, args=args)

# find comparative adjectives
args = ['JJR']
df['comparative'] = df['pos'].apply(get_pos_ct, args=args)

# find superlative adjectives
args = ['JJS']
df['superlative'] = df['pos'].apply(get_pos_ct, args=args)

df.head(2)

Unnamed: 0,article id,source,clean_articles,targets,pos,adjectives,comparative,superlative
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,2,"[(br, NN), (Former, NNP), (New, NNP), (Jersey,...",16,1,0
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",2,"[(FILE, NNP), (In, IN), (this, DT), (July, NNP...",45,0,1


In [114]:
df['total_words'] = df['pos'].apply(get_word_ct)

# find basic adjectives
args = ['JJ']
df['adjectives'] = df['pos'].apply(get_pos_ct, args=args)

# find comparative adjectives
args = ['JJR']
df['comparative'] = df['pos'].apply(get_pos_ct, args=args)

# find superlative adjectives
args = ['JJS']
df['superlative'] = df['pos'].apply(get_pos_ct, args=args)

df.head(2)

Unnamed: 0,article id,source,clean_articles,targets,pos,adjectives,comparative,superlative,total_words
0,fox_politics_166,Fox,<br>\nFormer New Jersey Gov. Chris Christie sa...,2,"[(br, NN), (Former, NNP), (New, NNP), (Jersey,...",16,1,0,325
1,fox_politics_390,Fox,"FILE--In this July 28, 2016 file photo, Sen. B...",2,"[(FILE, NNP), (In, IN), (this, DT), (July, NNP...",45,0,1,640


In [115]:
df['superlative_adj_ratio'] = df['superlative'] / df['total_words']
df['comparative_adj_ratio'] = df['comparative'] / df['total_words']
df['adj_ratio'] = df['adjectives'] / df['total_words']

In [116]:
# find basic adverbs
args = ['RB']
df['adverbs'] = df['pos'].apply(get_pos_ct, args=args)

# find comparative adverbs
args = ['RBR']
df['comparative_adverb'] = df['pos'].apply(get_pos_ct, args=args)

# find superlative adverbs
args = ['RBS']
df['superlative_adverb'] = df['pos'].apply(get_pos_ct, args=args)

In [117]:
df['superlative_adv_ratio'] = df['superlative_adverb'] / df['total_words']
df['comparative_adv_ratio'] = df['comparative_adverb'] / df['total_words']
df['adv_ratio'] = df['adverbs'] / df['total_words']

In [118]:
# descriptive ratio
# i.e. using any form of an adverb or adjective
descriptive_pos = ['JJ','JJR','JJS','RB','RBR','RBS']
df['descriptive_ratio'] = df['pos'].apply(get_pos_ratio, args=descriptive_pos)

# colorful ratio
# i.e. using superlative adjectives or adverbs
colorful_pos = ['JJS','RBS']
df['colorful_ratio'] = df['pos'].apply(get_pos_ratio, args=colorful_pos)

In [119]:
df[df['source'] == 'Fox'].iloc[3:5]

Unnamed: 0,article id,source,clean_articles,targets,pos,adjectives,comparative,superlative,total_words,superlative_adj_ratio,comparative_adj_ratio,adj_ratio,adverbs,comparative_adverb,superlative_adverb,superlative_adv_ratio,comparative_adv_ratio,adv_ratio,descriptive_ratio,colorful_ratio
3,fox_politics_102,Fox,Student Union: Make UC Berkeley a sanctuary ca...,2,"[(Student, NN), (Union, NNP), (Make, NNP), (UC...",92,9,2,953,0.002099,0.009444,0.096537,43,1,2,0.002099,0.001049,0.045121,0.156348,0.004197
4,fox_politics_492,Fox,President Trump’s health care executive order:...,2,"[(President, NNP), (Trump, NNP), (s, VBZ), (he...",40,5,1,678,0.001475,0.007375,0.058997,17,0,0,0.0,0.0,0.025074,0.09292,0.001475


In [120]:
df[df['source'] == 'PBS'].iloc[3:5]

Unnamed: 0,article id,source,clean_articles,targets,pos,adjectives,comparative,superlative,total_words,superlative_adj_ratio,comparative_adj_ratio,adj_ratio,adverbs,comparative_adverb,superlative_adverb,superlative_adv_ratio,comparative_adv_ratio,adv_ratio,descriptive_ratio,colorful_ratio
1691,pbs_politics_1198,PBS,\nPresident Donald Trump says newly confirmed ...,0,"[(President, NNP), (Donald, NNP), (Trump, NNP)...",9,1,0,103,0.0,0.009709,0.087379,4,0,0,0.0,0.0,0.038835,0.135922,0.0
1692,pbs_politics_682,PBS,President Donald Trump is adding a new lawyer ...,0,"[(President, NNP), (Donald, NNP), (Trump, NNP)...",7,0,0,127,0.0,0.0,0.055118,3,0,0,0.0,0.0,0.023622,0.07874,0.0


In [121]:
df_sum = df[['source','total_words','superlative_adj_ratio','comparative_adj_ratio','adj_ratio','superlative_adv_ratio','comparative_adv_ratio','adv_ratio','colorful_ratio','descriptive_ratio']]
df_summary_table = df_sum.groupby('source').describe()
df_summary_table

Unnamed: 0_level_0,adj_ratio,adj_ratio,adj_ratio,adj_ratio,adj_ratio,adj_ratio,adj_ratio,adj_ratio,adv_ratio,adv_ratio,adv_ratio,adv_ratio,adv_ratio,adv_ratio,adv_ratio,adv_ratio,colorful_ratio,colorful_ratio,colorful_ratio,colorful_ratio,colorful_ratio,colorful_ratio,colorful_ratio,colorful_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adj_ratio,comparative_adv_ratio,comparative_adv_ratio,comparative_adv_ratio,comparative_adv_ratio,comparative_adv_ratio,comparative_adv_ratio,comparative_adv_ratio,comparative_adv_ratio,descriptive_ratio,descriptive_ratio,descriptive_ratio,descriptive_ratio,descriptive_ratio,descriptive_ratio,descriptive_ratio,descriptive_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adj_ratio,superlative_adv_ratio,superlative_adv_ratio,superlative_adv_ratio,superlative_adv_ratio,superlative_adv_ratio,superlative_adv_ratio,superlative_adv_ratio,superlative_adv_ratio,total_words,total_words,total_words,total_words,total_words,total_words,total_words,total_words
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2
Fox,660.0,0.065738,0.014907,0.0,0.055032,0.065353,0.075273,0.133333,660.0,0.033793,0.010338,0.0,0.027073,0.033643,0.040406,0.072289,661.0,0.00208,0.00228,0.0,0.0,0.001664,0.003145,0.015717,660.0,0.002414,0.002588,0.0,0.0,0.00198,0.003518,0.015152,660.0,0.001263,0.001631,0.0,0.0,0.000748,0.002078,0.009259,661.0,0.105132,0.01991,0.0,0.091797,0.104126,0.118098,0.183838,660.0,0.001653,0.002004,0.0,0.0,0.001179,0.002674,0.015707,660.0,0.00043,0.000982,0.0,0.0,0.0,0.0,0.009823,661.0,686.181543,403.614886,0.0,421.0,590.0,864.0,5081.0
PBS,1739.0,0.065924,0.016326,0.0,0.056235,0.064815,0.075245,0.141844,1739.0,0.031578,0.011357,0.0,0.024737,0.031509,0.038669,0.082474,1739.0,0.001948,0.002501,0.0,0.0,0.001215,0.00299,0.019802,1739.0,0.003502,0.003745,0.0,0.0,0.002749,0.005096,0.045593,1739.0,0.001292,0.001871,0.0,0.0,0.0,0.002062,0.015385,1739.0,0.104244,0.021439,0.0,0.092292,0.104592,0.116595,0.18845,1739.0,0.001606,0.002289,0.0,0.0,0.000939,0.002437,0.019802,1739.0,0.000342,0.000918,0.0,0.0,0.0,0.0,0.010638,1739.0,654.259344,492.754624,48.0,249.0,614.0,906.5,5060.0
Vox,1027.0,0.07259,0.014473,0.0,0.062801,0.071429,0.081087,0.1384,1027.0,0.045803,0.010446,0.0,0.039511,0.045296,0.05198,0.090604,1027.0,0.003155,0.002184,0.0,0.001629,0.002926,0.004194,0.014545,1027.0,0.005473,0.004124,0.0,0.002597,0.004566,0.007463,0.031835,1027.0,0.002592,0.001975,0.0,0.001155,0.002315,0.003749,0.011976,1027.0,0.129613,0.021951,0.0,0.115581,0.128638,0.142483,0.211618,1027.0,0.002422,0.001862,0.0,0.00112,0.002284,0.003259,0.012024,1027.0,0.000733,0.001045,0.0,0.0,0.000456,0.001159,0.014545,1027.0,1332.795521,779.304697,6.0,791.5,1211.0,1673.5,6126.0


In [122]:
df_sum.groupby('source').mean()

Unnamed: 0_level_0,total_words,superlative_adj_ratio,comparative_adj_ratio,adj_ratio,superlative_adv_ratio,comparative_adv_ratio,adv_ratio,colorful_ratio,descriptive_ratio
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fox,686.181543,0.001653,0.002414,0.065738,0.00043,0.001263,0.033793,0.00208,0.105132
PBS,654.259344,0.001606,0.003502,0.065924,0.000342,0.001292,0.031578,0.001948,0.104244
Vox,1332.795521,0.002422,0.005473,0.07259,0.000733,0.002592,0.045803,0.003155,0.129613


In [123]:
df_sum.groupby('source').std()

Unnamed: 0_level_0,total_words,superlative_adj_ratio,comparative_adj_ratio,adj_ratio,superlative_adv_ratio,comparative_adv_ratio,adv_ratio,colorful_ratio,descriptive_ratio
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fox,403.614886,0.002004,0.002588,0.014907,0.000982,0.001631,0.010338,0.00228,0.01991
PBS,492.754624,0.002289,0.003745,0.016326,0.000918,0.001871,0.011357,0.002501,0.021439
Vox,779.304697,0.001862,0.004124,0.014473,0.001045,0.001975,0.010446,0.002184,0.021951


In [126]:
df_sum.groupby('source').count()

Unnamed: 0_level_0,total_words,superlative_adj_ratio,comparative_adj_ratio,adj_ratio,superlative_adv_ratio,comparative_adv_ratio,adv_ratio,colorful_ratio,descriptive_ratio
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fox,661,660,660,660,660,660,660,661,661
PBS,1739,1739,1739,1739,1739,1739,1739,1739,1739
Vox,1027,1027,1027,1027,1027,1027,1027,1027,1027


In [53]:
save_table(df_sum, 'analytical_summary.csv')

In [127]:
save_table(df_summary_table, 'grouped_summary.csv')