In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


In [36]:
views = pd.read_csv('data/Philos_lang.csv')

In [21]:
views['timestamp'][0]

2018010100

In [24]:
views

Unnamed: 0,project,article,timestamp,views,subject,code
0,fr.wikipedia,Absurde,2018010100,10119,Absurdism,fr
1,fr.wikipedia,Absurde,2018020100,8507,Absurdism,fr
2,fr.wikipedia,Absurde,2018030100,9485,Absurdism,fr
3,fr.wikipedia,Absurde,2018040100,9772,Absurdism,fr
4,fr.wikipedia,Absurde,2018050100,11073,Absurdism,fr
...,...,...,...,...,...,...
154616,fi.wikipedia,Zurvalaisuus,2022080100,64,Naïve realism,fi
154617,fi.wikipedia,Zurvalaisuus,2022090100,51,Naïve realism,fi
154618,fi.wikipedia,Zurvalaisuus,2022100100,33,Naïve realism,fi
154619,fi.wikipedia,Zurvalaisuus,2022110100,36,Naïve realism,fi


In [59]:
# Rename rows with wrong country code 
views['code'] = views['project'].str.split('.').str[0]

# Group by country code and language, compute total page log for each group
grouped = views.groupby(['code', 'project']).size().reset_index(name='views_count_by_language')

# Compute total page log for each country code and each article
articles = views.groupby(['code','subject']).size().reset_index(name='views_count_by_article')

# Merge DataFrames on the 'language' column
merged_df = pd.merge(articles, grouped[['code','views_count_by_language']], on='code', how='left')

# Compute the fraction of views for each article within its language
merged_df['fraction_of_views'] = merged_df['views_count_by_article'] / merged_df['views_count_by_language']



In [66]:
# Retrieve the 5 article with the highest fraction of views for each language

top_articles = (merged_df.groupby('code', group_keys=False)
                .apply(lambda x: x.nlargest(4, 'fraction_of_views'))
                .reset_index(drop=True))
top_articles

Unnamed: 0,code,subject,views_count_by_article,views_count_by_language,fraction_of_views
0,da,Biosophy,122,10690,0.011413
1,da,Cartesianism,122,10690,0.011413
2,da,Free will,122,10690,0.011413
3,da,Informal logic,122,10690,0.011413
4,de,Dualistic cosmology,183,15964,0.011463
5,de,Informal logic,183,15964,0.011463
6,de,Juche,183,15964,0.011463
7,de,Western esotericism,183,15964,0.011463
8,fi,Antinatalism,183,15577,0.011748
9,fi,Afrocentrism,122,15577,0.007832


In [75]:
kmeans = KMeans(n_clusters=5, n_init= 10)
kmeans.fit(merged_df[['fraction_of_views']])

labels = kmeans.labels_

In [76]:
merged_df['cluster'] = labels

In [77]:
merged_df

Unnamed: 0,code,subject,views_count_by_article,views_count_by_language,fraction_of_views,cluster
0,da,Absurdism,61,10690,0.005706,2
1,da,Actual idealism,61,10690,0.005706,2
2,da,Actualism,40,10690,0.003742,0
3,da,Aesthetic Realism,61,10690,0.005706,2
4,da,Aesthetics,81,10690,0.007577,4
...,...,...,...,...,...,...
2232,sv,Philosophy of motion,61,13632,0.004475,0
2233,sv,Philosophy of music,61,13632,0.004475,0
2234,sv,School of Names,61,13632,0.004475,0
2235,sv,School of Naturalists,61,13632,0.004475,0
