In [14]:
import pandas as pd
import numpy as np
import re
from cleantext import clean
import matplotlib.pyplot as plt
import nltk
from nltk.stem.snowball import SnowballStemmer

nltk.download('punkt')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
news_data = pd.read_csv('data.csv')

In [9]:
#Saving 'content' column into variable
content_column = news_data['content']

date = re.compile(r"(([0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|[3[01])|[a-z]{,9} [0-9]{1,2}, [0-9]{2,4})|\b(\w+\s)(\d{2})(th)?,?(\s\d{4})\b)")
#tjek om regex virker
#datoer?
url = re.compile(r'(https?://)?(www\.)?\w+-?\.?\w+\.\w{2,3}/?')
content_column = content_column.apply(lambda x: re.sub(date,"_DATE_",x))
# content_column  = content_column.apply(lambda x: re.sub(url,"<URL>", x))
#^fanger flere urls end funktionen fra biblioteket (men får ikke nødvendigvis ender med )

#husk også at fjerne |-tegn

#Calling the clean method with appropriate arguments and assigning the result back to 'content_column'
content_column_cleaned = content_column.apply(lambda x : clean(x,
    fix_unicode=False,               # fix various unicode errors
    to_ascii=False,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,                 # remove punctuations
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="_URL_",
    replace_with_email="_EMAIL_",
    replace_with_phone_number="_PHONE_",
    replace_with_number="_NUMBER_",
    replace_with_digit="0",
    replace_with_currency_symbol="_CUR_",
    lang="en"                       # set to 'de' for German special handling
))
print(content_column_cleaned)
#række med indeks 1: der er en url som ikke bliver fanget



0      sometimes the power of christmas will make you do wild and wonderful things you do not need to believe in the holy trinity to believe in the posit...
1      awakening of number strands of dna reconnecting with you movie of readers think this story is fact add your two cents headline bitcoin blockchain ...
2      never hike alone a friday the 00th fan film usa | number | number min a fan tribute to friday the 00th never hike alone follows an adventure never...
3      when a rare shark was caught scientists were left blundering for answers this shark has a unique feature and its not that it can survive at extrem...
4      donald trump has the unnerving ability to ability to create his own reality and convince millions of americans that what he says it is true the pr...
5                    republicans and democrats alike are willing to turn over government coffers to bezos and his ilk and the rights of the people be damned
6      could you imagine waking up in the morgue i for one

In [10]:
#Tokenize
content_column_tokenized = content_column_cleaned.apply(lambda x: nltk.word_tokenize(x))

print(len(content_column_tokenized[1]))


180


In [13]:
#Remove stopwords
# from nltk.corpus import stopwords
# stopwords = nltk.corpus.stopwords.words('english')
stopwordsDF = pd.read_csv('stopwords.csv')  #571 stopord - den fra nltk har færre, men kan også bruge
stopwordsSeries = stopwordsDF.squeeze()
stopwords = set(stopwordsSeries)
# print(stopwords)
# print(content_column_tokenized.isin(stopwords))

def remove_stopwords(list):
     return [word for word in list if word not in stopwords]

content_column_no_stopwords = content_column_tokenized.apply(lambda x: remove_stopwords(x))
#print('length: ', len(content_column_no_stopwords[1]))
#print(content_column_no_stopwords[1])

def vocabulary_size(series):
     count=0
     words={}
     for field in series:
          for word in field:
               if word not in words:
                    words[word]=1
                    count+=1
     return count

vocabulary_size_with_stopwords = vocabulary_size(content_column_tokenized)
vocabulary_size_no_stopwords = vocabulary_size(content_column_no_stopwords)
print(vocabulary_size_with_stopwords)
print(vocabulary_size_no_stopwords)
# print(content_column_tokenized.isin(stopwords))

reduction_rate=vocabulary_size_no_stopwords/vocabulary_size_with_stopwords
print("Reduction rate of vocabulary size after removing stopwords:", reduction_rate)
#måske noget med at ordene er meget frekvente, så man kunne også kigge på reduktionen af antal ord.


16654
16168
Reduction rate of vocabulary size after removing stopwords: 0.9708178215443737


In [24]:
#Stemming
stemmer = SnowballStemmer("english")

def list_stemmer (wordlist): #stemmer hvert ord i en liste
    stemmed_list = []
    for word in wordlist:
        stemmed_list.append(stemmer.stem(word))
    return stemmed_list

content_column_stemmed = content_column_no_stopwords.apply(lambda x:list_stemmer(x))
print(content_column_stemmed)


vocabulary_size_no_stopwords = vocabulary_size(content_column_no_stopwords)
vocabulary_size_stemmed = vocabulary_size(content_column_stemmed)
print(vocabulary_size_stemmed)
print(vocabulary_size_no_stopwords)


0      [power, christma, make, wild, wonder, thing, holi, triniti, posit, power, good, simpl, act, give, receiv, lost, day, worri, money, success, hold, ...
1      [awaken, number, strand, dna, reconnect, movi, reader, stori, fact, add, cent, headlin, bitcoin, blockchain, search, exceed, trump, blockchain, st...
2      [hike, a, friday, 00th, fan, film, usa, |, number, |, number, min, a, fan, tribut, friday, 00th, hike, adventur, hike, a, friday, 00th, fan, film,...
3      [a, rare, shark, caught, scientist, left, blunder, answer, shark, a, uniqu, featur, surviv, extrem, depth, live, extend, period, time, shark, uniq...
4      [donald, trump, unnerv, abil, abil, creat, realiti, convinc, million, american, true, problem, presid, lie, believ, lie, a, poll, show, countri, d...
5                                                                          [republican, democrat, alik, turn, govern, coffer, bezo, ilk, right, peopl, damn]
6                             [imagin, wake, morgu, trauma

In [12]:
#from collections import Counter
#word_frequency_dict = content_column_no_stopwords.apply(lambda x: Counter(x))
#print(word_frequency_dict) 

0      {'power': 3, 'christmas': 4, 'make': 2, 'wild': 1, 'wonderful': 1, 'things': 1, 'holy': 3, 'trinity': 1, 'positive': 1, 'good': 3, 'simple': 3, 'a...
1      {'awakening': 4, 'number': 7, 'strands': 6, 'dna': 4, 'reconnecting': 3, 'movie': 1, 'readers': 1, 'story': 1, 'fact': 1, 'add': 1, 'cents': 1, 'h...
2      {'hike': 5, 'a': 7, 'friday': 7, '00th': 7, 'fan': 5, 'film': 5, 'usa': 2, '|': 4, 'number': 7, 'min': 2, 'tribute': 2, 'adventure': 2, 'blogger':...
3      {'a': 13, 'rare': 1, 'shark': 13, 'caught': 1, 'scientists': 2, 'left': 1, 'blundering': 1, 'answers': 1, 'unique': 2, 'feature': 2, 'survive': 1,...
4      {'donald': 1, 'trump': 4, 'unnerving': 1, 'ability': 3, 'create': 1, 'reality': 1, 'convince': 1, 'millions': 1, 'americans': 2, 'true': 1, 'probl...
5      {'republicans': 1, 'democrats': 1, 'alike': 1, 'turn': 1, 'government': 1, 'coffers': 1, 'bezos': 1, 'ilk': 1, 'rights': 1, 'people': 1, 'damned': 1}
6      {'imagine': 1, 'waking': 1, 'morgue': 1, 'traumatiz