In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import pickle
import networkx as nx

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import statsmodels
import warnings
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

## Load data

In [4]:
lang_codes = ["ja", "ko", "tr", "sv" , "no", "fi", "da", "sr", "it" , "ca", "nl", "fr", "de"]
ctr_codes = ["JPN", "KOR", "TUR", "SWE", "NOR", "FIN", "DEN", "SER", "ITA", "SPA", "NET", "FRA", "GER"]
code_dict = dict(zip(lang_codes, ctr_codes))
country_region = {'East Asia': ['ja', 'ko'], 'West Asia': ['tr'], 'Northern Europe': ['sv', 'no', 'fi', 'da'],
        'Southern Europe': ['sr', 'it', 'ca'], 'Western Europe':['nl', 'fr', 'de'], 'North America': ['en']}

agg = pd.read_json("data/aggregated_timeseries.json.gz")
hofstede = pd.read_csv("data/6-dimensions-for-website-2015-08-16.csv", sep=";")
hofstede = hofstede[hofstede.ctr.isin(ctr_codes)].set_index('ctr').drop('country', axis=1).astype(int)

In [5]:
dfs = []
codes = lang_codes + [x + '.m' for x in lang_codes]

dfs = []

for language in codes:
    language_data = agg[language]
    all_sum = pd.Series(agg[language]['sum'])
    if 0 in all_sum.values:
        all_sum.replace(0, 1, inplace=True)
    
    for topic, topic_data in language_data["topics"].items():
        topic_sum = pd.Series(topic_data['sum']) # number of pageviews per day for a topic
        topic_df = pd.DataFrame({
            'pageviews': topic_sum.values,
            'language': [language] * len(topic_sum),
            'date': topic_sum.index,
            'topic': [topic] * len(topic_sum),
            'proportion': np.divide(topic_sum.values, all_sum.values)
        })
        dfs.append(topic_df)
        
df = pd.concat(dfs, ignore_index=True)
df.date = pd.to_datetime(df.date)

In [6]:
df_desktop = df[df.language.apply(lambda l : l in lang_codes)]
df_mobile = df[df.language.apply(lambda l : l[-2:] == ".m")]
df_mobile.loc[:,"language"] = df_mobile.language.apply(lambda l: l[:-2])
df = pd.concat([df_desktop, df_mobile]).groupby(["language", "topic", "date"]).sum().reset_index()

In [9]:
df["log_views"] = np.log(df.pageviews)
df_simple = df[df.topic.apply(lambda t: (t[-1] != "*") & (not t.startswith("Geography")))]

## Detect outliers

In [10]:
from sklearn.ensemble import IsolationForest

def outlier_detection(x):
    clf = IsolationForest(contamination=0.05, random_state=42) 
    return clf.fit_predict(x.to_numpy().reshape((-1, 1)))

df_simple.loc[:, "valid"] = df_simple.groupby(["language", "topic"]).pageviews.transform(outlier_detection)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_simple.loc[:, "valid"] = df_simple.groupby(["language", "topic"]).pageviews.transform(outlier_detection)


In [12]:
df_simple.loc[:,"month"] = df_simple.date.to_numpy().astype('datetime64[M]')
df_simple.loc[:,'week'] = df_simple.date - df_simple.date.dt.weekday * np.timedelta64(1, 'D')
df_simple.loc[:,"normalized_log_views"] = df_simple[df_simple.valid == 1].groupby(["language", "topic"]).log_views.transform(lambda x : (x - x.mean())/x.std())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_simple.loc[:,"month"] = df_simple.date.to_numpy().astype('datetime64[M]')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_simple.loc[:,'week'] = df_simple.date - df_simple.date.dt.weekday * np.timedelta64(1, 'D')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_simple.loc[:,"normalized_log_vi

In [35]:
with open("results/corr_timeseries.pkl", "rb") as f:
    corr_dfs = pickle.load(f)
    
del corr_dfs['Culture.Biography.Women']

In [38]:
sum_df = None
for topic, df in corr_dfs.items():
    if sum_df is None:
        sum_df = df
    else:
        sum_df = sum_df + df

sum_df /= len(corr_dfs)

In [45]:
G = nx.from_numpy_array(sum_df.to_numpy())

In [82]:
com = nx.community.louvain_communities(G, resolution=0.9)

In [87]:
commu = []
for g in [list(c) for c in com]:
    cm = []
    for gi in g:
        if sum_df.index[gi] != 'sv':
            cm.append(sum_df.index[gi])
    if len(cm) > 0:
        commu.append(cm)

In [88]:
commu

[['da', 'de', 'fi', 'nl', 'no'], ['ca', 'sr', 'fr', 'it'], ['tr', 'ja', 'ko']]