### Library import

In [20]:
import pandas as pd
import glob

from string import punctuation
from nltk.corpus import stopwords
from collections import defaultdict, Counter

### Data import

In [2]:
csv_files = glob.glob("TelevisionNews/*.csv")
df = []
for file in csv_files:
    csv = pd.read_csv(file, header=None, names=['URL', 'MatchDateTime', 'Station', 'Show', 'IAShowID','IAPreviewThumb','Snippet'])
    csv = csv.iloc[1:] #drow first row containing column names 
    df.append(csv)
df = pd.concat(df)

In [3]:
df= df.drop(['URL','Show','IAShowID', 'IAPreviewThumb'], axis=1)
df

Unnamed: 0,MatchDateTime,Station,Snippet
1,3/15/2011 15:11:06,FOXNEWS,forward. greg: i suppose worth pointing out th...
2,3/31/2011 13:16:41,FOXNEWS,threaten a government shutdown. that's what's ...
3,3/30/2011 17:29:04,FOXNEWS,"and less likely. in any case, president obama ..."
4,3/14/2011 22:09:55,FOXNEWS,environmental catastrophe in another part of t...
5,3/15/2011 8:09:55,FOXNEWS,environmental catastrophe in another part of t...
...,...,...,...
32,2/24/2012 14:24:07,CNN,will galvanize them to do stuff and understand...
33,2/23/2012 9:20:57,CNN,"ballot. still to come on wbt, china, the u.s.,..."
34,2/10/2012 15:57:13,CNN,or newt gingrich all that much at all. he was ...
35,2/29/2012 13:56:13,CNN,endless primary with hillary clinton. there we...


In [4]:
# change dtypes 
df['MatchDateTime'] = pd.to_datetime(df['MatchDateTime'])
df['Station'] = df['Station'].astype('string')
df['Snippet'] = df['Snippet'].astype('string')

df.dtypes

MatchDateTime    datetime64[ns]
Station          string[python]
Snippet          string[python]
dtype: object

### Descriptive statistics

In [22]:
# define functions

punctuation = set(punctuation) # speeds up comparison
sw = stopwords.words('english')

def remove_stopwords(tokens) :
    return [token for token in tokens if token not in sw]
    return(tokens)

def remove_punctuation(text) :
    return "".join(ch for ch in text if ch not in punctuation)

def tokenize(text) :
    tokens = text.split()
    return(tokens)

def descriptive_stats(tokens, verbose=True) :
    num_tokens=len(tokens)
    num_unique_tokens = len(set(tokens))
    lexical_diversity = num_unique_tokens/num_tokens
    num_characters = sum(len(token) for token in tokens)

    if verbose :
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")

        # print the five most common tokens
        counter = Counter(tokens)
        top_5_tokens = counter.most_common(5)
        print("Top 5 most common tokens:")
        for token, count in top_5_tokens:
            print(f"{token}: {count} occurrences")

    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])

In [6]:
print("Number of news snippets:", len(df))
print("News stations:", df['Station'].unique())
print("\nCount of snippets for each news station: \n",df['Station'].value_counts())

Number of news snippets: 94858
News stations: <StringArray>
['FOXNEWS', 'MSNBC', 'BBCNEWS', 'CNN']
Length: 4, dtype: string

Count of snippets for each news station: 
 Station
MSNBC      26429
FOXNEWS    25865
BBCNEWS    23260
CNN        19304
Name: count, dtype: Int64


In [17]:
# data preprocessing
df['Tokens'] = df['Snippet'].str.lower()
df['Tokens'] = df['Tokens'].apply(remove_punctuation)
df['Tokens'] = tokenize(df['Tokens'].str)
df['Tokens'] = df['Tokens'].apply(remove_stopwords)
df

Unnamed: 0,MatchDateTime,Station,Snippet,Tokens
1,2011-03-15 15:11:06,FOXNEWS,forward. greg: i suppose worth pointing out th...,"[forward, greg, suppose, worth, pointing, gene..."
2,2011-03-31 13:16:41,FOXNEWS,threaten a government shutdown. that's what's ...,"[threaten, government, shutdown, thats, whats,..."
3,2011-03-30 17:29:04,FOXNEWS,"and less likely. in any case, president obama ...","[less, likely, case, president, obama, would, ..."
4,2011-03-14 22:09:55,FOXNEWS,environmental catastrophe in another part of t...,"[environmental, catastrophe, another, part, wo..."
5,2011-03-15 08:09:55,FOXNEWS,environmental catastrophe in another part of t...,"[environmental, catastrophe, another, part, wo..."
...,...,...,...,...
32,2012-02-24 14:24:07,CNN,will galvanize them to do stuff and understand...,"[galvanize, stuff, understand, climate, change..."
33,2012-02-23 09:20:57,CNN,"ballot. still to come on wbt, china, the u.s.,...","[ballot, still, come, wbt, china, us, russia, ..."
34,2012-02-10 15:57:13,CNN,or newt gingrich all that much at all. he was ...,"[newt, gingrich, much, really, trying, say, fo..."
35,2012-02-29 13:56:13,CNN,endless primary with hillary clinton. there we...,"[endless, primary, hillary, clinton, policy, i..."


In [21]:
combined_tokens=[token for sublist in df['Tokens'] for token in sublist]
descriptive_stats(combined_tokens)

There are 2106506 tokens in the data.
There are 46276 unique tokens in the data.
There are 12859252 characters in the data.
The lexical diversity is 0.022 in the data.
Top 5 most common tokens:
climate: 85155 occurrences
change: 77697 occurrences
global: 24371 occurrences
warming: 21077 occurrences
president: 13685 occurrences


[2106506, 46276, 0.02196813111379697, 12859252]