### Library import

In [1]:
import pandas as pd
import glob

from string import punctuation
from nltk.corpus import stopwords
from collections import defaultdict, Counter

### Data import

In [2]:
# Collect every CSV export in the data folder. glob returns paths in arbitrary
# order, so sort them to make the concatenated row order reproducible.
csv_files = sorted(glob.glob("data/TelevisionNews/*.csv"))
if not csv_files:
    # Fail early with a clear message; pd.concat on an empty list would only
    # report that there is nothing to concatenate.
    raise FileNotFoundError("No CSV files found in data/TelevisionNews/")

column_names = ['URL', 'MatchDateTime', 'Station', 'Show', 'IAShowID', 'IAPreviewThumb', 'Snippet']
frames = []
for file in csv_files:
    # Read with explicit column names, then drop the first row, which holds the
    # file's own column names.
    frame = pd.read_csv(file, header=None, names=column_names)
    frames.append(frame.iloc[1:])

# Each file keeps its own row labels, so index values repeat across files.
df = pd.concat(frames)

In [3]:
# Keep only the columns used downstream: timestamp, station and snippet text.
unused_columns = ['URL', 'Show', 'IAShowID', 'IAPreviewThumb']
df = df.drop(columns=unused_columns)
df

Unnamed: 0,MatchDateTime,Station,Snippet
1,1/31/2017 5:53:28,BBCNEWS,beena part to do. the airline industry has not...
2,1/18/2017 19:21:01,BBCNEWS,"it's beaten it by about 0.1, 0.12 degrees cels..."
3,1/5/2017 21:48:46,BBCNEWS,"contact more than expected, how. your co nta c..."
4,1/5/2017 21:13:33,BBCNEWS,"where every time a marketplace is closed down,..."
5,1/11/2017 3:11:51,BBCNEWS,"applause climate change, a controversial issue..."
...,...,...,...
198,1/7/2020 10:27:25,MSNBC,they could be facing. the climate change has m...
199,1/10/2020 1:57:54,MSNBC,fact that they've had a number of decisions th...
200,1/15/2020 5:53:44,MSNBC,i think there were missed opportunities to tal...
201,1/5/2020 3:55:58,MSNBC,potentially reshape the democratic primaries. ...


In [4]:
# Convert columns to explicit dtypes: timestamps to datetime, text to pandas strings.
df['MatchDateTime'] = pd.to_datetime(df['MatchDateTime'])
for text_column in ('Station', 'Snippet'):
    df[text_column] = df[text_column].astype('string')

# Show the resulting dtypes as the cell output.
df.dtypes

MatchDateTime    datetime64[ns]
Station          string[python]
Snippet          string[python]
dtype: object

### Descriptive statistics

In [5]:
# define functions

# English stop-word list from NLTK, consulted by remove_stopwords().
sw = stopwords.words('english')
# Rebind the imported punctuation string to a set of its characters so the
# per-character membership tests in remove_punctuation() are fast.
punctuation = set(punctuation)

def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``sw`` list.

    Returns
    -------
    list of str
        A new list containing the tokens that are not in ``stop_words``.
    """
    if stop_words is None:
        stop_words = sw
    # Test membership against a set so each lookup is constant-time instead of
    # a scan over the whole stop-word list.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return [token for token in tokens if token not in stop_words]

def remove_punctuation(text):
    """Return ``text`` with every character found in ``punctuation`` removed."""
    kept_characters = []
    for character in text:
        if character in punctuation:
            continue
        kept_characters.append(character)
    return "".join(kept_characters)

def tokenize(text):
    """Split ``text`` into tokens by calling its ``split()`` method with no arguments.

    For a plain string this splits on runs of whitespace. The argument only
    needs to expose ``split()``; the preprocessing cell passes a pandas
    ``.str`` accessor.
    """
    return text.split()

def descriptive_stats(tokens, verbose=True):
    """Compute basic descriptive statistics for a sequence of tokens.

    Parameters
    ----------
    tokens : sequence of str
        The tokens to summarise. May be empty.
    verbose : bool, default True
        When true, print the statistics and the five most common tokens.

    Returns
    -------
    list
        ``[num_tokens, num_unique_tokens, lexical_diversity, num_characters]``
        where lexical diversity is unique tokens divided by total tokens
        (``0.0`` for empty input) and characters is the summed token length.
    """
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))
    # Guard against empty input, which would otherwise divide by zero.
    lexical_diversity = num_unique_tokens / num_tokens if num_tokens else 0.0
    num_characters = sum(len(token) for token in tokens)

    if verbose:
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")

        # print the five most common tokens
        print("Top 5 most common tokens:")
        for token, count in Counter(tokens).most_common(5):
            print(f"{token}: {count} occurrences")

    return [num_tokens, num_unique_tokens, lexical_diversity, num_characters]

In [6]:
# Quick overview: total row count, distinct stations, and rows per station.
station_column = df['Station']
print("Number of news snippets:", df.shape[0])
print("News stations:", station_column.unique())
print("\nCount of snippets for each news station: \n", station_column.value_counts())

Number of news snippets: 94858
News stations: <StringArray>
['BBCNEWS', 'CNN', 'FOXNEWS', 'MSNBC']
Length: 4, dtype: string

Count of snippets for each news station: 
 Station
MSNBC      26429
FOXNEWS    25865
BBCNEWS    23260
CNN        19304
Name: count, dtype: Int64


In [7]:
# data preprocessing: lowercase -> strip punctuation -> whitespace split -> drop stop words
df['Tokens'] = (
    df['Snippet']
    .str.lower()
    .apply(remove_punctuation)
    .str.split()  # what tokenize() did when handed the .str accessor
    .apply(remove_stopwords)
)
df

Unnamed: 0,MatchDateTime,Station,Snippet,Tokens
1,2017-01-31 05:53:28,BBCNEWS,beena part to do. the airline industry has not...,"[beena, part, airline, industry, part, move, r..."
2,2017-01-18 19:21:01,BBCNEWS,"it's beaten it by about 0.1, 0.12 degrees cels...","[beaten, 01, 012, degrees, celsius, doesnt, se..."
3,2017-01-05 21:48:46,BBCNEWS,"contact more than expected, how. your co nta c...","[contact, expected, co, nta, ct, le, ns, expec..."
4,2017-01-05 21:13:33,BBCNEWS,"where every time a marketplace is closed down,...","[every, time, marketplace, closed, another, ap..."
5,2017-01-11 03:11:51,BBCNEWS,"applause climate change, a controversial issue...","[applause, climate, change, controversial, iss..."
...,...,...,...,...
198,2020-01-07 10:27:25,MSNBC,they could be facing. the climate change has m...,"[could, facing, climate, change, made, worse, ..."
199,2020-01-10 01:57:54,MSNBC,fact that they've had a number of decisions th...,"[fact, theyve, number, decisions, thrown, back..."
200,2020-01-15 05:53:44,MSNBC,i think there were missed opportunities to tal...,"[think, missed, opportunities, talk, things, l..."
201,2020-01-05 03:55:58,MSNBC,potentially reshape the democratic primaries. ...,"[potentially, reshape, democratic, primaries, ..."


In [10]:
# Persist the cleaned frame as CSV without the row index.
# NOTE(review): the list-valued Tokens column is presumably written as its
# string representation and will need parsing when read back - confirm.
df.to_csv(path_or_buf='data/news_cleaned.csv', index=False)

In [9]:
# Flatten the per-snippet token lists into one corpus-wide list, then summarise it.
combined_tokens = []
for snippet_tokens in df['Tokens']:
    combined_tokens.extend(snippet_tokens)
descriptive_stats(combined_tokens)

There are 2106506 tokens in the data.
There are 46276 unique tokens in the data.
There are 12859252 characters in the data.
The lexical diversity is 0.022 in the data.
Top 5 most common tokens:
climate: 85155 occurrences
change: 77697 occurrences
global: 24371 occurrences
warming: 21077 occurrences
president: 13685 occurrences


[2106506, 46276, 0.02196813111379697, 12859252]