# Generate a wordcloud based on sipgate's Yammer messages

## First let's import some dependencies

In [None]:
import pandas as pd
import re
import string
import nltk

from wordcloud import WordCloud
from nltk.corpus import stopwords


## Read messages from CSV and store in Pandas dataframe

In [None]:
# Load the exported messages. Both timestamp columns are parsed as datetimes
# and the creation time becomes the index of the dataframe.
df = pd.read_csv(
    "data/v2/MessagesClean.csv",
    parse_dates=["created_at", "deleted_at"],
).set_index("created_at")

# Column/dtype summary, followed by a preview of the first rows.
df.info()
df.head()

## Show the first 50 messages (oldest first)

In [None]:
# Order the messages by their creation timestamp and show the first 50.
chronological = df.sort_values(by="created_at")
chronological.head(50)

## Show top users

In [None]:
# Bar chart of the number of messages per sender, written to a PNG file
# with a white figure background.
posts_per_user = df["sender_name"].value_counts()
user_axes = posts_per_user.plot(kind="bar", figsize=(200, 16))
user_axes.get_figure().savefig("yammer-posts-by-user.png", facecolor="white")

In [None]:
# The 50 senders with the most messages.
df["sender_name"].value_counts().head(50)

## Remove bots

In [None]:
# Drop messages posted by automated accounts.
bot_accounts = [
    "Jenkins",
    "zendesk connect",
    "Satisfaction and Social Media Reporter",
]
df = df[~df["sender_name"].isin(bot_accounts)]

# Re-check the 20 most active senders after filtering.
df["sender_name"].value_counts().head(20)

In [None]:
# Bar chart of messages per sender for the current dataframe.
sender_counts = df["sender_name"].value_counts()
sender_counts.plot(kind="bar", figsize=(200, 16))

## Remove certain channels

In [None]:
# Bar chart of the number of messages per channel (group), written to a PNG
# file with a white figure background.
posts_per_channel = df["group_name"].value_counts()
channel_axes = posts_per_channel.plot(kind="bar", figsize=(200, 16))
channel_axes.get_figure().savefig("yammer-posts-by-channel.png", facecolor="white")

In [None]:
# The 50 channels with the most messages.
df["group_name"].value_counts().head(50)

In [None]:
# Exclude every message posted in the "Syslog" channel.
df = df.loc[df["group_name"] != "Syslog"]

# Re-check the 50 largest channels after filtering.
df["group_name"].value_counts().head(50)

## Show the message bodies to get an overview

In [None]:
# Display the column holding the message text.
df.body

## Generate one large message string

Note: We do not use Pandas to_string() here because it truncates the text.

In [None]:
# Concatenate every non-missing message body into one space-separated string.
bodies = df["body"].dropna()
messages = " ".join(bodies)

# Preview the first 4000 characters.
messages[:4000]

## Clean

In [None]:
# Goal: reduce the text to capitalised words (German nouns) for the word cloud.
#
# Order matters: mentions, URLs and host names are stripped BEFORE lowercase
# words are dropped. If lowercase words go first, "http", "sipgate" and "net"
# are already gone, the URL patterns below never match, and capitalised URL
# fragments end up in the cloud.
#
# All patterns are raw strings so that escapes such as \S, \w and \[ reach the
# regex engine unchanged instead of being invalid string escapes.

# Replace real newlines and literal backslash-n sequences with spaces
messages_clean = messages.replace('\n', ' ').replace('\\n', ' ')
# Remove mentions ([[user:1234567]]); the leftover "]" goes with the punctuation.
# NOTE(review): an unmatched "[" removes everything up to the next "]".
messages_clean = re.sub(r'\[.*?\]', '', messages_clean)
# Remove urls
messages_clean = re.sub(r'http\S+', '', messages_clean)
messages_clean = re.sub(r'\S*?\.sipgate\.net', '', messages_clean)
# Remove lowercase letter runs at the start of a word (plus trailing whitespace)
messages_clean = re.sub(r'\b[a-zöüäß]+\s*', '', messages_clean)
# Remove punctuation
messages_clean = re.sub(f'[{re.escape(string.punctuation)}]', ' ', messages_clean)
messages_clean = re.sub(r'[€"“„]', ' ', messages_clean)
# Remove some special stuff: no-break space, soft hyphen, zero-width joiner
messages_clean = messages_clean.translate(
    str.maketrans({'\xa0': ' ', '\xad': ' ', '\u200d': ' '})
)
# Remove numbers (any word containing a digit)
messages_clean = re.sub(r'\w*\d\w*', '', messages_clean)

# Remove single letter words. Lookarounds leave the surrounding whitespace
# unconsumed, so runs of consecutive single letters ("a b c") are all removed.
messages_clean = re.sub(r'(?<!\S)\w(?!\S)', '', messages_clean)
messages_clean[:2000]

## Download nltk stopword data

In [None]:
# Fetch NLTK's stop-word corpus, then load the German stop-word list from it.
nltk.download("stopwords")
german_stop_words = stopwords.words("german")


## Generate wordcloud

In [None]:
# Optional: add mask=sipgate_mask to the options to shape the cloud
# (sipgate_mask is not defined in the cells shown here).
# NOTE(review): the contour settings presumably only take effect when a mask
# is supplied; confirm against the wordcloud documentation.
cloud_options = dict(
    background_color="white",
    max_words=2000,
    width=800,
    height=400,
    stopwords=german_stop_words,
    contour_width=3,
    contour_color="steelblue",
)
wc = WordCloud(**cloud_options)

# Build the cloud from the cleaned text, save it as a PNG and display it.
wordcloud = wc.generate(messages_clean)
wordcloud.to_file("sipgate-nouns.png")
wordcloud.to_image()

