# DaNewsRoom: Samfundsartikler @ TV2
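This notebook documents how real news articles from the DaNewsRoom dataset were pre-processed: articles from the "samfund" section of nyhederne.tv2.dk are selected, cleaned, split into a sub-header and body text, shortened to roughly 120 words, and saved as CSV files.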

## SETUP: Packages & Data

In [1]:
import nltk
# Fetch NLTK's "popular" resource collection; quiet=True suppresses the download log.
# NOTE(review): presumably this supplies the tokenizer models used by sent_tokenize /
# word_tokenize further down - confirm.
nltk.download("popular", quiet=True)

True

In [2]:
import gzip 
import pandas as pd 
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer

In [4]:
# Mount Google Drive, but only when the notebook actually runs on Google Colab.
# The google.colab package is importable only inside Colab, so an unconditional
# import would stop a "Restart & Run All" at this cell everywhere else.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: the Drive paths used below must be reachable some other way
else:
    drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Read the gzip-compressed JSON Lines file into a DataFrame (lines=True: one JSON record per line).
danewsroom_path = "/content/drive/MyDrive/000 bachelor-project/danewsroom/danewsroom.jsonl.gz"
with gzip.open(danewsroom_path) as f:
    raw_data = pd.read_json(f, lines=True)

In [6]:
# Work on a copy so that raw_data itself is left untouched by the filtering below.
data = raw_data.copy()

In [7]:
# Keep only the rows whose "site" is nyhederne.tv2.dk and renumber them from 0.
tv2_mask = data["site"] == "nyhederne.tv2.dk"
data = data.loc[tv2_mask].reset_index(drop=True)

In [8]:
# URL column of the remaining rows; scanned in the next section for the "samfund" section name.
urls = data["url"]

### Filter only SAMFUND articles

In [9]:
import re

# Collect, in their original order, all URLs that contain "samfund" as a whole word.
samfund_pattern = re.compile(r"\bsamfund\b")
samfund_urls = [url for url in urls if samfund_pattern.search(url) is not None]

In [12]:
# Wrap the matching URLs in a one-column DataFrame so they can be joined on "url".
samfund_urls_df = pd.DataFrame(samfund_urls, columns = ["url"])

In [13]:
# Keep the rows of `data` whose URL was collected in samfund_urls.
# A membership filter is used instead of an inner merge: a URL that occurs more than
# once in `data` is also repeated in samfund_urls, so a merge would pair each of its
# rows with every repetition and return the article multiplied. isin() keeps every
# original row exactly once, in the original order.
samf_data = data[data["url"].isin(samfund_urls)].reset_index(drop=True)

### Cleaning data (NB)

In [14]:
# keep only the URL, the title and the article text
kept_columns = ["url", "title", "text"]
samf_data = samf_data.loc[:, kept_columns]

In [15]:
# Rename the "title" column to "header"; rename() returns a new frame, which is bound back to samf_data.
samf_data = samf_data.rename(columns = {'title':'header'})

In [16]:
# Display the frame to check the selected and renamed columns (pandas elides the middle rows).
samf_data

Unnamed: 0,url,header,text
0,http://nyhederne.tv2.dk/samfund/2014-10-19-eti...,Etisk Råd: Gentest kan give sort sky over hovedet,Der er sket en fordobling i antallet af danske...
1,http://nyhederne.tv2.dk/samfund/2014-09-03-fra...,Fra nedtur til optur: Danmark bedre til konkur...,Danmark rykker frem på rangliste over de mest ...
2,http://nyhederne.tv2.dk/samfund/2015-03-10-adv...,Advarsel: Drikkevandet skal koges i Vrads,Borgerne i Vrads syd for Silkeborg skal koge v...
3,http://nyhederne.tv2.dk/samfund/2015-11-28-asy...,Asylsøgere må vente 10 måneder på samtale,"Der bør kun gå ""kort tid"", før asylansøgere, d..."
4,http://nyhederne.tv2.dk/samfund/2014-01-05-eft...,Efter søns selvmord: Melder læge til politiet,Den 20-årige Danilo Terrida tog sit eget liv v...
...,...,...,...
4448,http://nyhederne.tv2.dk/samfund/2014-02-08-lyk...,Lykketoft om Israel: - Der er ingen diplomatis...,Folketingets formand Mogens Lykketoft (S) er b...
4449,http://nyhederne.tv2.dk/samfund/2014-08-27-ove...,Overborgmesteren vil tv-overvåge mere af Køben...,Der skal gives mere kommunal støtte til TV-ove...
4450,http://nyhederne.tv2.dk/samfund/2014-09-11-nyt...,Nyttejob overtager kommunalt ansattes arbejde,"Nyttejob-ordningen, der sender kontanthjælpsmo..."
4451,http://nyhederne.tv2.dk/samfund/2015-10-05-fla...,Flaskesamler vil ikke være i Danmark: Hjælp mi...,I 2010 satte nigerianske Kingsley Okoro livet ...


In [17]:
# Replace paragraph breaks (double newlines) in every article with a single space.
# The whole column is assigned at once: writing through samf_data["text"][i] is chained
# assignment, which raises SettingWithCopyWarning and is not guaranteed to modify
# samf_data (it never does when pandas copy-on-write is enabled).
samf_data["text"] = samf_data["text"].str.replace("\n\n", " ", regex=False)

### Subheaders & Cleaning Text Column

In [18]:
# Use the first sentence of every article as its sub-header.
sub_headers = []

for text in samf_data["text"]: # iterate over the values, so no 0..n-1 index is required
  sentences = nltk.sent_tokenize(text) #tokenize but in sentences
  # a text without any sentence (e.g. an empty string) gets an empty sub-header
  # instead of raising IndexError on sentences[0]
  sub_headers.append(sentences[0] if sentences else "")

# create sub_header col
samf_data["sub_header"] = sub_headers

In [19]:
# Drop the sub-header from the start of each text: skip len(sub_header) characters
# plus one more for the character that follows it.
short_text = [
  samf_data["text"][row][len(samf_data["sub_header"][row]) + 1:]
  for row in range(len(samf_data["text"]))
]

# store the remaining body text in its own column
samf_data["short_text"] = short_text

## Shorten Article Function

In [24]:
def shorten_article(article: str, max_words: int = 120, language: str = "english") -> str:
    '''
    Function to shorten an article to roughly max_words words using nltk.
    The article is cut at the first sentence-ending token (".", "!" or "?") that appears
    once more than max_words words have been collected, so it never ends mid-sentence.
    If no such token follows, the whole article is returned.

    Parameters
    article (text to be shortened): str
    max_words (number of words that must be exceeded before the article may be cut, default 120): int
    language (language passed on to nltk's word_tokenize, default "english" as before): str

    Returns
    the shortened, detokenized article: str
    '''

    tokens = word_tokenize(article, language=language) #use nltk to tokenize the text
    punctuations = {".", "!", "?", ","} # tokens that are kept but not counted as words
    stops = {".", "!", "?"} # tokens that may end the shortened article

    word_count = 0
    short_article = []

    for token in tokens:
        short_article.append(token) # every token up to and including the final stop is kept
        if token in punctuations:
            # stop only at a sentence end, and only once the word limit is exceeded
            if word_count > max_words and token in stops:
                break
        else:
            word_count += 1 # count the number of words (not punctuations)

    detokenize_article = TreebankWordDetokenizer().detokenize(short_article) # use nltk to detokenize the text
    detokenize_article = detokenize_article.replace(" .", ".") # fix detokenization errors

    return detokenize_article

def shorten_all_articles(text_column:pd.Series)-> list:
    '''
    Run shorten_article on each article of a text column and collect the results

    Parameters
    text_column (column with articles to be shortened): pd.Series

    Returns
    list with one shortened article per entry of text_column, in the same order
    '''
    return [shorten_article(article) for article in text_column]

## Shortening Articles

In [25]:
# Shorten every article body; the "short_text" column is overwritten with the shortened versions.
samf_data["short_text"] = shorten_all_articles(samf_data["short_text"])

## Save CSV

In [40]:
# First 428 articles (row positions 0-427).
# NOTE(review): judging by the name, this subset is intended for the GPT-3 condition - confirm.
samf_data_gpt3 = samf_data.iloc[:428]

In [41]:
# Next 428 articles (row positions 428-855).
# NOTE(review): judging by the name, this subset is intended for the human condition - confirm.
samf_data_human = samf_data.iloc[428:856]

In [42]:
# Write the human subset to Drive. With to_csv defaults the row index is written as an
# unnamed first column.
samf_data_human.to_csv("/content/drive/MyDrive/000 bachelor-project/data/danewsroom_articles_human.csv")

In [43]:
# Write the GPT-3 subset to Drive. With to_csv defaults the row index is written as an
# unnamed first column.
samf_data_gpt3.to_csv("/content/drive/MyDrive/000 bachelor-project/data/danewsroom_articles_gpt3.csv")