# WikiStory 


### Setup

Import libraries and define constants.

In [1]:
import pandas as pd
import concurrent.futures
import json
import numpy as np
import shutil
import pathlib

### Convert to parquet

First, converting the CSV to Parquet makes everything faster. If the Parquet file is already in place, you don't need to run this step again.

In [2]:
# Dump date and wiki language code; together they select the input CSV.
backup_date = "20200301"
lang = "pl"
csv_path = f"data/{lang}/{lang}wiki-{backup_date}-stub-meta-history1.csv"

# The dump quotes fields with '|' rather than the default '"'.
# `input_data` is bound to the same DataFrame object as `csv_data` (no copy).
# Optional Parquet cache: csv_data.to_parquet('data/input.parquet', engine='pyarrow')
input_data = csv_data = pd.read_csv(csv_path, quotechar="|")

### Add days to parquet
We need to add a day column to our data, because that is how we will cluster the words (page titles) for later rendering.

In [3]:
# To resume from the Parquet cache instead of the CSV:
# input_data = pd.read_parquet('data/input.parquet', engine='pyarrow')

# Parse the raw timestamp column into datetimes, then derive a calendar-date
# column ("day") from it; both columns are written in place on input_data.
input_data["timestamp"] = pd.to_datetime(input_data.timestamp)
input_data["day"] = input_data["timestamp"].dt.date

In [4]:
# Preview the first five rows to check the parsed timestamp and the new day column.
print(input_data.head(5))

   page_id page_title  page_ns  revision_id                 timestamp  \
0        2        AWK        0            4 2001-10-09 11:11:04+00:00   
1        2        AWK        0            5 2001-10-30 22:29:16+00:00   
2        2        AWK        0        22391 2002-11-22 11:05:09+00:00   
3        2        AWK        0        22395 2002-12-08 08:46:25+00:00   
4        2        AWK        0        22396 2002-12-08 09:22:15+00:00   

  contributor_id              contributor_name  bytes         day  
0              0         imported>Paweł Jochym   3136  2001-10-09  
1              0  chello062179000014.chello.pl   3098  2001-10-30  
2              0             conversion script   3098  2002-11-22  
3             56                         LiNiO   3055  2002-12-08  
4              4                         Kpjas   3349  2002-12-08  


In [5]:
# Optional checkpoint (disabled): persist the frame including the `day` column.
# input_data.to_parquet('data/with_date.parquet',engine='pyarrow')
# `with_dates` is another name for the same DataFrame object, not a copy.
with_dates = input_data

### Cleanup and Transformation

In [None]:
# To resume from the checkpoint instead:
# with_dates = pd.read_parquet('data/with_date.parquet')
# with_dates = with_dates[with_dates.page_title != 'Tartışma:Anasayfa']

# Count rows per (day, page_title) pair. After count(), every remaining column
# holds the number of non-null values in that group; reset_index() turns the
# two group keys back into ordinary columns.
grouped_by_date_and_title = (
    with_dates
    .groupby(["day", "page_title"])
    .count()
    .reset_index()
)

### Filter out unnecessary words

In [None]:
# Title substrings to exclude, keyed by wiki language code. A page whose title
# contains any of the substrings listed for the active `lang` is dropped.
words_to_filter = {
    "tr": [
        "Vikipedi",
        "Kullanıcı:",
        "wiki:",
        "mesaj:",
        "Anasayfa",
        "Tartışma:",
        "Kategori:",
        "Şablon:",
    ],
    "en": [
        "Wikipedia:",
        "User:",
        "wiki:",
        "Talk:",
        "Message:",
        "Category:",
        "Template:",
        "Portal:",
    ],
    "de": [
        "Diskussion:",
        "Benutzer:",
    ],
    "ru": [],
    "es": [],
    "pt": [],
}

grouped_by_date_and_title["ignore"] = False

# Use .get() so a language without an entry (the notebook sets lang = "pl")
# simply filters nothing instead of raising KeyError.
for word in words_to_filter.get(lang, []):
    # regex=False: match the word as a literal substring, not a regular expression.
    # na=False: a missing title counts as "no match" so the mask stays boolean.
    grouped_by_date_and_title["ignore"] = (
        grouped_by_date_and_title["ignore"]
        | grouped_by_date_and_title.page_title.str.contains(word, regex=False, na=False)
    )

# Keep only the rows that matched none of the filter words.
grouped_by_date_and_title = grouped_by_date_and_title.loc[~grouped_by_date_and_title["ignore"]]

### Save Results as Parquet

In [None]:
# grouped_by_date_and_title.to_parquet('data/grouped.parquet', engine='pyarrow')

### Save daily JSON files

In [None]:
# grouped_by_date_and_title = pd.read_parquet('data/grouped.parquet')

In [None]:
# After the earlier count(), `page_id` holds a per-(day, title) row count.
# Keep only the pairs whose count is greater than 1, then renumber the rows
# (reset_index() keeps the old index as an "index" column).
more_than_two = grouped_by_date_and_title.loc[
    grouped_by_date_and_title.page_id > 1
].reset_index()

# One group per day; head() shows the first rows of every group.
grouped_by_day = more_than_two.groupby("day")
grouped_by_day.head()

##### Cleanup data folder

In [None]:
amcharts_data_path = "./data/export"

# Start from an empty export directory: remove whatever a previous run left
# behind (a missing directory is not an error), then recreate it.
shutil.rmtree(amcharts_data_path, ignore_errors=True)
pathlib.Path(amcharts_data_path).mkdir(exist_ok=True, parents=True)


In [None]:
def write_head(arg):
    """Write one group's above-average rows to ``<amcharts_data_path>/<key>.json``.

    ``arg`` is a ``(key, group)`` pair, such as the pairs produced by iterating
    a pandas groupby object. The group's ``page_title`` and ``page_id`` columns
    are exported as ``title`` and ``revisions``. Only rows whose ``revisions``
    value is at least the group's mean are kept, and at most the first 100 of
    those are written as a JSON list of records.
    """
    index, rows = arg[0], arg[1]

    renamed = rows.rename(columns={"page_id": "revisions", "page_title": "title"})
    tags = renamed[["title", "revisions"]]

    # Drop everything below the group's average revision count.
    threshold = np.mean(tags["revisions"])
    at_or_above_mean = tags[tags["revisions"] >= threshold]

    at_or_above_mean[:100].to_json(f'{amcharts_data_path}/{index}.json', orient='records')


    
# Write one JSON file per day group via write_head.
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved; draining the iterator makes a failed write_head call surface
    # here instead of being silently discarded.
    list(executor.map(write_head, grouped_by_day))

# Iterating the grouped `day` column yields (key, values) pairs; the key is the
# day itself, formatted here as YYYY-MM-DD for the index file.
dates = [day.strftime("%Y-%m-%d") for day, _ in grouped_by_day.day]

# index.json lists every exported day so a consumer can discover the files.
index_dict = {'dates': dates}
with open(f'{amcharts_data_path}/index.json', "w") as outfile:
    json.dump(index_dict, outfile)
