In [50]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg, sparse
import matplotlib.pyplot as plt
import pandas as pd
import os
import re

In [16]:
def read_and_clean_df(file_name):
    with open(file_name, 'r') as f:
        x = f.read()
    lines = x.replace('\n', ' ')
    lines = x.replace('\n', ' ')
    patt = r'\d+/\d+/\d+.*?(?=\s*\b\d+/\d+/\d+|)(?=\s*\|)+'
    patt2 = r'(\d+/\d+/\d+.*?(?=\s*\b\d+/\d+/\d+|)(?=\s*\|)+)'
    lines2 = re.split(patt, lines)[1:]
    full_lines = [re.findall(patt2, lines), lines2]
    full_lines = [''.join(i).strip() for i in zip(*full_lines)]
    df = pd.DataFrame([l.split("||") for l in full_lines], columns=['date', 'type', 'name', 'post', 'delete'])
    df = df.drop('delete', axis=1)
    return df

In [17]:
politicians = ['jacindaardern', 'KelvinDavisLabour', 'simonjbridges', 'paulabennettUH']

In [18]:
full_df = pd.DataFrame()
for poli in politicians:
    df = read_and_clean_df('./data/'+poli+'/' + poli+'_posts.txt')
    df['politician'] = poli
    full_df = full_df.append(df, ignore_index=True)

In [19]:
#remove blank posts
full_df = full_df.dropna()

In [20]:
full_df.head()

Unnamed: 0,date,type,name,post,politician
0,"07/11/2019, 16:27",others,Jacinda Ardern,A pretty historic moment - today we pass our ...,jacindaardern
1,"28/11/2019, 23:03",others,Jacinda Ardern,Forty years ago today 257 people lost their l...,jacindaardern
2,"27/11/2019, 17:57",others,Jacinda Ardern,Mixed emotions today as we opened the new Sui...,jacindaardern
3,"21/11/2019, 17:04",others,Jacinda Ardern,Despite coming from a police family (my dad w...,jacindaardern
4,"18/11/2019, 16:02",others,Jacinda Ardern was live.,#LIVE: Post-Cabinet Press Conference 18 Novem...,jacindaardern


In [21]:
# # np.savetxt('input.txt', full_df['post'].values)
# with open('input.txt', 'a') as f:
#     f.write(full_df['post'].to_string(header = False, index = False))
# full_df['post'].to_string(header = False, index = False)

# SVD for topic modelling

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk import stem

In [23]:
vectorizer = CountVectorizer(stop_words='english')

In [24]:
vectors = vectorizer.fit_transform(full_df['post']).todense()
vectors.shape

(2379, 6855)

In [25]:
vocab = np.array(vectorizer.get_feature_names())

In [26]:
vocab[:10]

array(['000', '01st', '035', '10', '100', '1000', '1000s', '1000th',
       '100kg', '100m'], dtype='<U26')

In [27]:
vectors

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
U, s, Vh = linalg.svd(vectors, full_matrices=False)

In [29]:
def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [30]:
num_topics, num_top_words = 3, 5

In [31]:
show_topics(Vh[:5])

['cray whuuu downwiththeyouth dodgy hoaket',
 'new government tax kiwis zealand',
 'government today tax national labour',
 'tourism government nz tax infrastructure',
 'today great national nz zealand']

In [34]:
def vectorize_and_topic_model_svd(politicians=[]):
    df = full_df[full_df['politician'].isin(politicians)]
    vectors = vectorizer.fit_transform(df['post']).todense()
    U, s, Vh = linalg.svd(vectors, full_matrices=False)
    return show_topics(Vh[:5])


In [35]:
vectorize_and_topic_model_svd(['jacindaardern'])

['500 brilliant bad coastlines atatu',
 'chair diverse dealt declares bosses',
 'chair alternativefacts dominion 31kg 8billion',
 'alternativefacts declares 8billion biennial caution',
 'declares diverse debt dogs dealt']

In [130]:
vectorize_and_topic_model_svd(['paulabennettUH'])

['abused hoopin break beeram asset',
 'intervention 7pm installation face kimmel',
 'intervention inn installation commerce chooks',
 'creatures ladies chooks commerce accomplishment',
 'inn 7pm chooks experienced bilateral']

In [131]:
vectorize_and_topic_model_svd(['simonjbridges'])

['gotta 39 controlled diversifying 1m',
 'flood 330km daughter crowd check',
 'foreign 330km gloooooowing decrease crowd',
 '330km daughter folau cleared crowd',
 'foreign flood dealt cleared cookies']

In [132]:
vectorize_and_topic_model_svd(['simonjbridges', 'paulabennettUH'])

['affairs buyers me clean pastoral',
 'ngarimu parental home needed achievements',
 'achievements energy heretaunga handling perimeter',
 'ngarimu energy damn natural perimeter',
 'energy needed perimeter multi developed']

In [36]:
vectorize_and_topic_model_svd(['jacindaardern', 'KelvinDavisLabour'])

['mountain guardianship passionately handwritten mess',
 'mess exhibitors gun meadowood fabulous',
 'broadwood money homes papanui handwritten',
 'mountain handwritten broadwood golf defending',
 'capped money mountain particularly homes']

# NMF

In [37]:
vectorizer_tfidf = TfidfVectorizer(stop_words='english')
vectors_tfidf = vectorizer_tfidf.fit_transform(full_df['post']) # (documents, vocab)

In [38]:
num_topics = 7
clf = decomposition.NMF(n_components=num_topics)

In [39]:
W1 = clf.fit_transform(vectors_tfidf)
H1 = clf.components_

In [40]:
show_topics(H1)

['new national zealand party document',
 'live press conference post cabinet',
 'great today day mp people',
 'crown holding minister awesome designing',
 'te reo hui tai tokerau',
 'tourism nz infrastructure industry new',
 'tax government labour kiwis car']

In [43]:
def vectorize_and_topic_model_nmf(politicians=[]):
    df = full_df[full_df['politician'].isin(politicians)]
    vectorizer_tfidf = TfidfVectorizer(stop_words='english')
    vectors_tfidf = vectorizer_tfidf.fit_transform(df['post']) # (documents, vocab)
    clf = decomposition.NMF(n_components=num_topics)
    W1 = clf.fit_transform(vectors_tfidf)
    H1 = clf.components_
    return show_topics(H1)


In [44]:
vectorize_and_topic_model_nmf(['jacindaardern'])

['9m club coincidental ago built',
 'chair dog circumstances dominion dollar',
 'commission depths disappointed documentary brings',
 '35 deal announced 10s cpnz',
 '2250 declares 60th buildings 3lb',
 '8billion cracks alternativefacts dinners 550',
 '136 built coincidental ago 32m']

In [45]:
vectorize_and_topic_model_nmf(['paulabennettUH'])

['colossal informal fairer dumped inclusiveness',
 'ends knocked festival inpatient ford',
 'employed extra hayes allegations impressive',
 'emails discovered eli current braved',
 'impersonate dependent colbert delivery aotea',
 'bikes kickoff culture competed details',
 'fox keyte ceo episode coffee']

# Sentiment

In [53]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [54]:
top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]

In [55]:
tokenized_text=nltk.word_tokenize(full_df['post'][2])
print(tokenized_text)
sid = SentimentIntensityAnalyzer()
pos_word_list=[]
neu_word_list=[]
neg_word_list=[]
for word in tokenized_text:
    if (sid.polarity_scores(word)['compound']) >= 0.5:
        pos_word_list.append(word)
    elif (sid.polarity_scores(word)['compound']) <= -0.5:
        neg_word_list.append(word)
    else:
        neu_word_list.append(word)                

print('Positive :',pos_word_list)        
print('Neutral :',neu_word_list)    
print('Negative :',neg_word_list) 

['Mixed', 'emotions', 'today', 'as', 'we', 'opened', 'the', 'new', 'Suicide', 'Prevention', 'Office', '.', 'There', '’', 's', 'so', 'much', 'support', 'for', 'the', 'work', 'to', 'end', 'suicide', 'in', 'New', 'Zealand', '-', 'people', 'like', 'Nehe', 'Milner-Skudder', 'who', 'is', 'working', 'in', 'mental', 'health', 'and', 'well-being', 'with', '@', 'headfirstnz', '.', 'All', 'of', 'us', 'want', 'a', 'country', 'where', 'an', 'office', 'like', 'this', 'isn', '’', 't', 'needed', ',', 'till', 'then', 'I', '’', 'm', 'grateful', 'for', 'all', 'the', 'people', 'working', 'to', 'make', 'that', 'world', 'possible', '(', 'like', 'Sir', 'Mason', 'Durie', 'who', 'has', 'kindly', 'agreed', 'to', 'chair', 'our', 'Maori', 'Advisory', 'Board', 'and', 'keep', 'sharing', 'all', 'his', 'wisdom', '.', ')']
Positive : ['wisdom']
Neutral : ['Mixed', 'emotions', 'today', 'as', 'we', 'opened', 'the', 'new', 'Prevention', 'Office', '.', 'There', '’', 's', 'so', 'much', 'support', 'for', 'the', 'work', 'to'

In [56]:
sid.polarity_scores('great')['compound']

0.6249

In [57]:
sid = SentimentIntensityAnalyzer()
def sentiment_analyser(text):
    tokenized_text = nltk.word_tokenize(text)
    sentiment_score = 0.
    for word in tokenized_text:
        sentiment_score = sentiment_score + sid.polarity_scores(word)['compound']
    return sentiment_score
sentiment_analyser(full_df['post'][2])    

2.3958

In [58]:
full_df['sentiment_score'] = full_df['post'].apply(lambda x: sentiment_analyser(x))

In [59]:
most_negative_posts = full_df.groupby('politician')['sentiment_score'].min().rename('sentiment_score').reset_index()
most_positive_posts = full_df.groupby('politician')['sentiment_score'].max().rename('sentiment_score').reset_index()

In [60]:
full_df.merge(most_negative_posts, how='inner', on=['politician','sentiment_score'])

Unnamed: 0,date,type,name,post,politician,post_clean,sentiment_score
0,"13/05/2019, 11:37",others,Jacinda Ardern,"In the wake of the 15th of March, we learned ...",jacindaardern,In wake 15th March learned happened Christchur...,-3.37
1,"20/03/2019, 21:12",others,Kelvin Davis,Last Friday this country experienced an unpre...,KelvinDavisLabour,Last Friday country experienced unprecedented ...,-3.529
2,"07/10/2019, 18:14",others,Simon Bridges,Gang numbers are up 26% under this soft-on-cr...,simonjbridges,Gang numbers 26 soft on crime Government Befor...,-4.318
3,"29/11/2019, 07:30",others,Paula Bennett,Gang patches and insignia are intimidating. W...,paulabennettUH,Gang patches insignia intimidating We believe ...,-2.8632


In [61]:
full_df.merge(most_positive_posts, how='inner', on=['politician','sentiment_score'])

Unnamed: 0,date,type,name,post,politician,post_clean,sentiment_score
0,"19/10/2019, 12:37",others,Jacinda Ardern,I can’t say I’ve ever said this before - but ...,jacindaardern,I can t say I ve ever said Nelson beautiful ai...,5.8033
1,"21/06/2019, 12:54",others,"Kelvin Davis is in Queenstown, New Zealand.",In Queenstown yesterday to help open the Wint...,KelvinDavisLabour,In Queenstown yesterday help open Winter Festi...,5.9465
2,"25/06/2019, 16:28",others,Simon Bridges,This morning my friend and colleague Amy Adam...,simonjbridges,This morning friend colleague Amy Adams MP tol...,4.6093
3,"30/08/2017, 09:53",others,Paula Bennett,Over 200 delegates are in Dunedin for the Hol...,paulabennettUH,Over 200 delegates Dunedin Holiday Parks Confe...,4.8395


In [62]:
full_df.merge(most_negative_posts, how='inner', on=['politician','sentiment_score'])

Unnamed: 0,date,type,name,post,politician,post_clean,sentiment_score
0,"13/05/2019, 11:37",others,Jacinda Ardern,"In the wake of the 15th of March, we learned ...",jacindaardern,In wake 15th March learned happened Christchur...,-3.37
1,"20/03/2019, 21:12",others,Kelvin Davis,Last Friday this country experienced an unpre...,KelvinDavisLabour,Last Friday country experienced unprecedented ...,-3.529
2,"07/10/2019, 18:14",others,Simon Bridges,Gang numbers are up 26% under this soft-on-cr...,simonjbridges,Gang numbers 26 soft on crime Government Befor...,-4.318
3,"29/11/2019, 07:30",others,Paula Bennett,Gang patches and insignia are intimidating. W...,paulabennettUH,Gang patches insignia intimidating We believe ...,-2.8632


In [64]:
# [len(word) for word in nltk.word_tokenize('dfgd dfgd')]
#     print(word)
def remove_one_letter_words(text):
    text_clean = ''
    for word in nltk.word_tokenize(text):
        if len(word)>1:
            text_clean = text_clean.join(word)
    return text_clean
remove_one_letter_words('sdfsdf sdfsd g kl')           

'kssdfsdfdsdfsdffsdfsdfssdfsdfdl'

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.tokenize import RegexpTokenizer
reg_tokenizer = RegexpTokenizer(r'\w+')
full_df['post_clean'] = full_df['post'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
full_df['post_clean'] = full_df['post_clean'].apply(lambda x: ' '.join(word for word in reg_tokenizer.tokenize(x)))

#remove one letter words
full_df['post_clean'] = full_df['post_clean'].apply(lambda x: remove_one_letter_words(x))

In [14]:
top_N = 14
#if not necessary all lower
a = full_df['post_clean'].str.lower().str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(a)
word_dist = nltk.FreqDist(words)
print (word_dist)
# <FreqDist with 17 samples and 20 outcomes>

rslt = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
print(rslt)

KeyError: 'post_clean'