# WikiStory 


In [1]:
import pandas as pd

### Convert to parquet

First a converesion from csv is parquet makes everything faster. If the file is in place, you don't need to run it again.

In [None]:
csv_data = pd.read_csv('data.csv', quotechar ='|')
csv_data.to_parquet('data/input.parquet', engine='pyarrow')

### Add days to parquet
We need to add days to our parquet as that's how we will do clustering of words for later rendering

In [None]:
input_data = pd.read_parquet('data/input.parquet', engine='pyarrow')

input_data['timestamp'] = pd.to_datetime(input_data['timestamp'])
input_data['day'] = input_data.timestamp.dt.date

In [None]:
print(input_data.head())

In [None]:
input_data.to_parquet('data/with_date.parquet',engine='pyarrow')

### Cleanup and Transformation

In [212]:
with_dates = pd.read_parquet('data/with_date.parquet')
# with_dates = with_dates[with_dates.page_title != 'Tartışma:Anasayfa']
grouped_by_date_and_title = with_dates.groupby(['day','page_title']).count()
grouped_by_date_and_title = grouped_by_date_and_title.reset_index()

### Filter out unnecessary words

In [226]:
words_to_filter=[
    "Vikipedia:",
    "Kullanıcı:",
    "wiki:",
    "mesaj:"
]
filters = list(map(lambda word: (~grouped_by_date_and_title["page_title"].str.contains(word)), words_to_filter))
filters

[0           True
 1           True
 2           True
 3           True
 4           True
             ... 
 13308156    True
 13308157    True
 13308158    True
 13308159    True
 13308160    True
 Name: page_title, Length: 13308161, dtype: bool,
 0            True
 1            True
 2            True
 3            True
 4           False
             ...  
 13308156     True
 13308157     True
 13308158     True
 13308159     True
 13308160     True
 Name: page_title, Length: 13308161, dtype: bool,
 0           True
 1           True
 2           True
 3           True
 4           True
             ... 
 13308156    True
 13308157    True
 13308158    True
 13308159    True
 13308160    True
 Name: page_title, Length: 13308161, dtype: bool,
 0           True
 1           True
 2           True
 3           True
 4           True
             ... 
 13308156    True
 13308157    True
 13308158    True
 13308159    True
 13308160    True
 Name: page_title, Length: 13308161, dtype: boo

### Save Results as Parquet

In [2]:
grouped_by_date_and_title.to_parquet('data/grouped.parquet', engine='pyarrow')

NameError: name 'grouped_by_date_and_title' is not defined

### Save daily CSV's

In [185]:
grouped_by_date_and_title = pd.read_parquet('data/grouped.parquet')

In [208]:
more_than_two = grouped_by_date_and_title[grouped_by_date_and_title['page_id']>1]
more_than_two = more_than_two.reset_index()
grouped_by_day = more_than_two.groupby('day')
grouped_by_day.head()

Unnamed: 0,day,page_title,page_id,page_ns,revision_id,timestamp,contributor_id,contributor_name,bytes
0,2002-12-05,Anasayfa,2,2,2,2,2,2,2
1,2002-12-13,Anasayfa,2,2,2,2,2,2,2
2,2002-12-16,Anasayfa,2,2,2,2,2,2,2
3,2003-01-25,Kullanıcı:Patrick,2,2,2,2,2,2,2
4,2003-06-25,Anasayfa,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...
2573473,2020-03-01,1 Mart,2,2,2,2,2,2,2
2573474,2020-03-01,1367,2,2,2,2,2,2,2
2573475,2020-03-01,14. İnsansız Uçak Sistemleri Üs Meydan Komutan...,3,3,3,3,3,3,3
2573476,2020-03-01,1818 Antlaşması,2,2,2,2,2,2,2


##### Cleanup data folder

In [204]:
import shutil
import pathlib

amcharts_data_path = "../wikistoryweb/public/data"
shutil.rmtree(amcharts_data_path, ignore_errors=True)
pathlib.Path(amcharts_data_path).mkdir(parents=True, exist_ok=True)


In [205]:
import concurrent.futures
import json
import numpy as np



def write_head(arg):
    index = arg[0]
    group = arg[1]
    filename = index
    # tags = group.apply(lambda x: {x.page_id})
    df = group.rename(columns={"page_id": "revisions", "page_title":"title"})
    df = df[["title", "revisions"]]
    mean = np.mean(df["revisions"])
    df = df[df["revisions"] >= mean]
    df.to_json(f'{amcharts_data_path}/{index}.json', orient='records')


    
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
   executor.map(write_head, grouped_by_day)
    
dates = list(map(lambda x: x[0].strftime("%Y-%m-%d"), grouped_by_day.day))
    
index_dict = {'dates': dates}
with open(f'{amcharts_data_path}/index.json',"w+") as outfile:
  json.dump(index_dict, outfile)

# json.dumps(index_dict)
index_dict

{'dates': ['2002-12-05',
  '2002-12-13',
  '2002-12-16',
  '2003-01-25',
  '2003-06-25',
  '2003-07-05',
  '2003-07-06',
  '2003-08-02',
  '2003-08-03',
  '2003-08-04',
  '2003-08-05',
  '2003-08-06',
  '2003-08-08',
  '2003-08-10',
  '2003-09-05',
  '2003-09-06',
  '2003-09-12',
  '2003-09-14',
  '2003-09-16',
  '2003-09-22',
  '2003-09-24',
  '2003-09-29',
  '2003-10-03',
  '2003-10-11',
  '2003-10-29',
  '2003-11-06',
  '2003-11-09',
  '2003-11-10',
  '2003-11-18',
  '2003-11-20',
  '2003-11-28',
  '2003-12-03',
  '2003-12-05',
  '2003-12-08',
  '2003-12-10',
  '2003-12-17',
  '2003-12-19',
  '2003-12-20',
  '2003-12-21',
  '2003-12-23',
  '2003-12-24',
  '2003-12-27',
  '2004-01-02',
  '2004-01-03',
  '2004-01-04',
  '2004-01-07',
  '2004-01-08',
  '2004-01-09',
  '2004-01-10',
  '2004-01-11',
  '2004-01-13',
  '2004-01-14',
  '2004-01-15',
  '2004-01-16',
  '2004-01-17',
  '2004-01-18',
  '2004-01-19',
  '2004-01-20',
  '2004-01-21',
  '2004-01-22',
  '2004-01-23',
  '2004-01-24',