In [19]:
import wikipedia
import pandas as pd
import numpy as np
import os
import sys
from bs4 import BeautifulSoup


In [27]:
sys.path.insert(0, '../src/data')
from etl import *

def xml_to_soup(fp):
    '''
    Processes xml data using beautiful soup and
    returns list of data for each page
    '''
    content = []
    with open(fp, encoding = 'utf8') as file:

        content = file.readlines()
        content = "".join(content)
        soup = BeautifulSoup(content, "xml")
        
    pages = soup.findAll("page")
    return pages

def soup_to_df(pages):
    '''
    Converts soupified xml data for Wiki pages 
    into dataframe
    
    pages: list of xml data for each page
    '''
    data = {}
    for page in pages:
        title = page.title.text
        revisions = page.findAll("revision")

        for revision in revisions:
            r_id = revision.id.text 
            time = revision.timestamp.text
            try:
                try:
                    username = revision.contributor.username.text
                except: 
                    username = revision.contributor.ip.text
            except:
                username = 'N/A'
            text = revision.format.next_sibling.next_sibling.text
            if title in data:
                data[title].append([title, r_id, time, username, text])
            else:
                data[title] = [[title, r_id, time, username, text]]
    
    dframes = []
    for page in data:

        df = pd.DataFrame(data[page], columns = ['title', 'id', 'time', 'username', 'text'])

        hist = [] #history of text
        version = [] #edit version
        username = []
        revert = [] #0 or 1
        curr = 1 #to keep track of version

        for idx, row in df.iterrows():
            if row.text not in hist: # not a revert
                hist.append(row.text)
                version.append(curr)
                username.append(row.username)
                revert.append('0')
                curr += 1
            else: #is revert
                temp = hist.index(row.text)
                version.append(version[temp])
                username.append(row.username)

                #if self revert
                if row.username == username[version[temp]]:
                    revert.append('0')
                else:
                    revert.append('1')


        df['version'] = version
        df['revert'] = revert
        dframes.append(df)

    return dframes

def df_to_ld(dframes, outpath):
    '''
    Given a list of cleaned dataframes from xml data,
    produces light dump file into data/raw
    '''
    
    light_dump = ''
    for df in dframes:
        title = df.title[0]
        light_dump = light_dump + title + '\n'
        for idx, row in df.iterrows():
            line = '^^^_' + row.time + ' ' + row.revert + ' ' + str(row.version) + ' ' + row.username
            light_dump = light_dump + line + '\n'
    with open(outpath, 'w') as f:
        f.write(light_dump)
    repo = 'XML Converted to light dump at ' + outpath
    print(repo)
    
    return

def xml_to_light_dump(fp, outfp):
    '''
    Given an input file path and output path, 
    turns the xml file into a light dump 
    and stores it at the output file path
    '''
    #create light dump directory first
    if not os.path.isdir("../data/raw/light_dump"):
        os.mkdir("../data/raw/light_dump")
    
    #convert to light dump
    soup = xml_to_soup(fp)
    dframes = soup_to_df(soup)
    return df_to_ld(dframes, outfp)

In [120]:
fp = "../data/raw/BTS-albums.xml"
outfp = "../data/raw/light_dump/bts_albums.txt"

xml_to_light_dump(fp, outfp)


XML Converted to light dump at ../data/raw/light_dump/bts_albums.txt


### Album release dates

* Dark & Wild: August 19, 2014

In [76]:
#get range
def date_range(release_date):
    '''
    Given a string with the album release date,
    calculates the given time range to extract data from
    '''
#     date = datetime.strptime(release_date, '%B %d, %Y')
    date = pd.to_datetime(release_date)
    start = date - timedelta(days=2)
    end = date + timedelta(days = 14)
    return start, end
    

In [88]:
start, end = date_range("August 19, 2014")

In [124]:
def lightdump_read_n(fp, n = 100):
    '''
	Reads in n lightdump pages and returns a list of all titles 
    read and their corresponding data as a DataFrame
	:param fp: input filepath
	:param n: number of articles to read
	:return: list of article titles, list of corresponding article lightdump data as DataFrame
	'''
    titles = []
    dataframes = []

    with open(fp) as file:
        df = pd.DataFrame(columns = ['timestamp', 'revert', 'revision_id', 'user'])
        page = 0
        for line in file:
            if '^^^_' not in line:
                title = line.strip('\n').strip()
                titles.append(title)

                if title != titles[page]:
                    page += 1
                    
                    df['timestamp'] = pd.to_datetime(df['timestamp'])
                    
                    dataframes.append(df)
                    
                    df = pd.DataFrame(columns = ['timestamp', 'revert', 'revision_id', 'user'])

                    if page == n:
                        break
            else:
                data = line.strip("^^^_").strip('\n').split()
                row = pd.Series(dtype = 'object')

                row['timestamp'] = data[0]
                row['revert'] = int(data[1])
                row['revision_id'] = int(data[2])
                row['user'] = data[3]

                df = df.append(row, ignore_index = True)
    dataframes.append(df)

    return titles, dataframes

In [125]:
#Check bts album time series data
fp = "../data/raw/light_dump/bts_albums.txt"
titles, dfs = lightdump_read_n(fp, 5)


In [131]:
titles[0]

'Dark & Wild'

In [132]:
dfs[0]

Unnamed: 0,timestamp,revert,revision_id,user
0,2014-11-12 17:21:59+00:00,0,1,Htruc
1,2014-11-12 17:33:23+00:00,0,2,Starcheerspeaksnewslostwars
2,2014-11-12 17:33:55+00:00,0,3,Starcheerspeaksnewslostwars
3,2014-11-12 18:43:50+00:00,0,4,Karlhard
4,2014-11-12 18:46:06+00:00,0,5,Karlhard
...,...,...,...,...
280,2021-01-10 13:30:35+00:00,0,263,112.200.38.129
281,2021-01-10 13:36:43+00:00,1,234,Carlobunnie
282,2021-01-10 13:57:08+00:00,1,248,112.200.38.129
283,2021-01-10 14:01:59+00:00,1,234,Carlobunnie


In [133]:
titles[4]

'Be (BTS album)'

In [134]:
dfs[4]

Unnamed: 0,timestamp,revert,revision_id,user
0,2020-09-27T15:37:41Z,0,1,Lirim.Z
1,2020-09-27T15:40:52Z,0,2,Lirim.Z
2,2020-09-27T15:41:31Z,0,3,Lirim.Z
3,2020-09-27T15:41:46Z,0,4,Lirim.Z
4,2020-09-27T15:43:51Z,0,5,Lirim.Z
...,...,...,...,...
568,2021-01-14T02:06:29Z,0,518,Carlobunnie
569,2021-01-14T05:06:16Z,0,519,Carlobunnie
570,2021-01-14T05:18:06Z,0,520,Carlobunnie
571,2021-01-14T09:45:55Z,0,521,Carlobunnie


In [140]:
start, end = date_range("November 20, 2020")
start

Timestamp('2020-11-18 00:00:00')

In [141]:
be = dfs[4]
be.timestamp = pd.DatetimeIndex(pd.to_datetime(be.timestamp)).tz_localize(None)
be[(be.timestamp > start) & (be.timestamp < end)]


Unnamed: 0,timestamp,revert,revision_id,user
201,2020-11-18 00:05:11,0,169,Hiroctzen
202,2020-11-18 01:27:39,1,143,EN-Jungwon
203,2020-11-19 05:29:58,0,170,Lisa19980325
204,2020-11-19 12:00:34,0,171,WikiCleanerBot
205,2020-11-19 15:04:51,0,172,Cedric
...,...,...,...,...
492,2020-12-02 19:20:59,0,445,Lk95
493,2020-12-02 20:21:51,0,446,Ïvana
494,2020-12-03 02:07:20,0,447,Cornerstonepicker
495,2020-12-03 12:50:45,0,448,Dzony336


In [82]:
#check bts main page time series data
fp = "../data/raw/light_dump/Kpop_ld.txt"
titles, dfs = lightdump_read_n(fp, 0)

In [83]:
titles[0]

'BTS'

In [84]:
bts.head()

Unnamed: 0,timestamp,revert,revision_id,user
0,2013-07-04 19:45:15+00:00,0,1,Hinorisakamachi
1,2013-07-04 19:47:39+00:00,0,2,Hinorisakamachi
2,2013-07-04 19:59:17+00:00,0,3,Hinorisakamachi
3,2013-07-04 19:59:53+00:00,0,4,Hinorisakamachi
4,2013-07-04 20:19:54+00:00,0,5,39.198.179.240


In [100]:
bts = dfs[0]

In [102]:

bts.timestamp = pd.DatetimeIndex(pd.to_datetime(bts.timestamp,unit='ms')).tz_localize(None)

In [104]:
bts[(bts.timestamp > start) & (bts.timestamp < end)]

Unnamed: 0,timestamp,revert,revision_id,user
520,2014-08-18 10:03:07,0,433,2.230.170.213
521,2014-08-18 10:03:59,0,434,2.230.170.213
522,2014-08-18 10:11:11,0,435,2.230.170.213
523,2014-08-18 10:13:53,0,436,2.230.170.213
524,2014-08-18 10:14:11,0,437,2.230.170.213
...,...,...,...,...
605,2014-08-31 22:38:10,0,498,Kpopperjagger
606,2014-08-31 22:44:02,0,499,Kpopperjagger
607,2014-09-01 00:42:54,1,415,Dr.K.
608,2014-09-01 23:09:31,0,500,95.93.196.122


In [96]:
bts.timestamp[0].tz_localize(None)

Timestamp('2013-07-04 19:45:15')

In [97]:
start

Timestamp('2014-08-17 00:00:00')