In [None]:
!pip install matplotlib --quiet
!pip install seaborn --quiet
!pip install wordcloud --quiet
!pip install plotly --quiet

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud
import datetime
import pandas as pd
import numpy as np

In [None]:
# Read the cleaned scrape, drop the two columns that are not used in this
# notebook and parse the 'date' column into pandas timestamps.
filename = "../raw_data/20220601_larger_data_b_scraped_clean.csv"
data = pd.read_csv(filename)
data = data.drop(columns=['Unnamed: 0', 'Content'])
data['date'] = pd.to_datetime(data['date'])
data.head(4)

**Data Exploration**

In [None]:
# Histogram of the 'date' column, split into 50 bins.
sns.histplot(data=data, x='date', bins=50)

In [None]:
# Number of rows per distinct 'dir_1' value (value_counts sorts descending).
data.loc[:, 'dir_1'].value_counts()

**Data Preprocessing**

In [None]:
# Distinct 'dir_1' values, ordered from most to least frequent.
dir_1 = data['dir_1'].value_counts(sort=True, ascending=False).index

In [None]:
# Time window for the analysis; both bounds are inclusive.
start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2021, 12, 31)

# NOTE(review): end_date is midnight of 2021-12-31, so rows carrying a later
# time on that day fall outside the window - confirm 'date' has no time part.
# To analyse the whole data set instead, use `data_subset = data`.
data_subset = data.loc[data['date'].between(start_date, end_date)]

In [None]:
# Resampling frequency: "M" for month, "Y" for year.
# (Recent pandas releases prefer the aliases "ME"/"YE"; the older ones are
# kept here so the notebook also runs on earlier versions.)
timesampling = "M"

# Count the non-null titles per (period, category) in one pass.
# The previous implementation left-merged every category onto the dates of the
# most frequent one, which dropped periods outside that category's span, and it
# resampled the remaining categories with a hard-coded 'M' instead of
# `timesampling`.
counts = (
    data_subset
    .groupby([pd.Grouper(key='date', freq=timesampling), 'dir_1'])['title']
    .count()
    .unstack(fill_value=0)
    # dir_1 is derived from the full data set: categories that do not occur
    # inside the time window still get an (all-zero) column, in dir_1 order.
    .reindex(columns=dir_1, fill_value=0)
)

# Insert periods without any publication so the time axis has no gaps.
# Skipped for an empty result, where min()/max() would be NaT.
if not counts.empty:
    timeline = pd.date_range(
        counts.index.min(), counts.index.max(), freq=timesampling, name='date'
    )
    counts = counts.reindex(index=timeline, fill_value=0)

# One row per period: a 'date' column followed by one Int64 count column per
# category. The columns-axis name is cleared so that a later
# `.sum().reset_index()` still produces a column called 'index'.
data_publications = (
    counts
    .astype('Int64')
    .rename_axis(index='date', columns=None)
    .reset_index()
)

**Data Visualization**

Comparison of Topic Frequency in Pie Chart

In [None]:
# Total publications per category over the whole window.
# reset_index() turns the summed Series into a frame with the columns
# 'index' (category name) and 0 (total), which px.pie refers to below.
topic_totals = data_publications.drop(columns='date').sum()
piedata = topic_totals.reset_index()

fig = px.pie(
    piedata,
    names='index',
    values=0,
    title='Frequency of published topics',
)
fig.show()

Line plot over time (five most frequent categories)

In [None]:
# Publications per period for the five most frequent categories.
subset = dir_1[0:5]

fig, ax = plt.subplots()
for category in subset:
    # label= feeds the legend; without it the lines cannot be told apart.
    sns.lineplot(data=data_publications, x='date', y=category, label=category, ax=ax)

# Each lineplot call overwrites the y-label with its own column name, so set
# neutral axis labels once all lines are drawn.
ax.set(xlabel='date', ylabel='publications', title='Publications over time')
ax.legend(loc='upper left')
plt.show()

Stacked Area Chart

In [None]:
# Inputs for the stacked area charts:
#   x      - the period dates
#   y      - one list of counts per category (hence the transpose)
#   labels - the category names, in the same order as y
topic_counts = data_publications.drop(columns=["date"])

x = data_publications['date'].tolist()
y = topic_counts.T.values.tolist()
# Previously `labels` was the whole DataFrame, which only worked because
# iterating a DataFrame happens to yield its column names.
labels = topic_counts.columns.tolist()

In [None]:
# matplotlib: publication counts stacked per category
fig, ax = plt.subplots(figsize=(12, 7))
ax.stackplot(x, y, labels=labels)
ax.legend(loc='upper left')
plt.show()

In [None]:
# plotly
# Pass the frame and its column names instead of bare lists of lists:
# plotly then names every trace after its category rather than generating
# anonymous "wide_variable_N" legend entries.
topic_columns = data_publications.columns.drop('date').tolist()
fig = px.area(
    data_publications,
    x='date',
    y=topic_columns,
    title='Topics over time',
    labels={'value': 'publications', 'variable': 'topic'},
)
fig.show()

Stacked Area Chart Normalized

In [None]:
# Normalize: express every count as a share of its period's total.
df = data_publications.drop(columns='date')

# Work on plain floats so that a period without any publication
# (0 / 0 -> NaN) can be filled with 0 instead of leaving a hole in the chart.
counts_as_float = df.astype(float)
period_totals = counts_as_float.sum(axis=1)
data_publications_normalized = counts_as_float.div(period_totals, axis=0).fillna(0)
y_norm = data_publications_normalized.T.values.tolist()

x_norm = data_publications['date'].tolist()
# Category names in the same order as y_norm (previously the whole DataFrame
# was passed, which only worked because iterating it yields column names).
labels = data_publications_normalized.columns.tolist()

In [None]:
# matplotlib: normalized shares stacked per category (no legend is drawn)
fig, ax = plt.subplots(figsize=(12, 7))
ax.stackplot(x_norm, y_norm, labels=labels)
plt.show()

In [None]:
# plotly
# Build a frame with named columns so every trace is labelled with its
# category instead of an anonymous "wide_variable_N" entry.
# astype(float) hands plotly plain float columns regardless of the dtype the
# normalization step produced.
share_columns = data_publications_normalized.columns.tolist()
normalized_frame = data_publications_normalized.astype(float).assign(date=x_norm)
fig = px.area(
    normalized_frame,
    x='date',
    y=share_columns,
    title='Topics over time, Normalized',
    labels={'value': 'share of publications', 'variable': 'topic'},
)
fig.show()

Wordcloud

In [None]:
# Word cloud built from the text of all 'dir_1' entries.
# Missing values are dropped and the rest cast to str first: str.join raises
# a TypeError as soon as the column contains a NaN (a float).
category_text = " ".join(data['dir_1'].dropna().astype(str))

# Small maximum font size, at most 100 words, white background.
wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(category_text)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()