In [68]:
import wikipedia
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup


### Find related articles

In [41]:
def find_related(article_name, n):
    '''
    Searches Wikipedia for a term and hands back the hits.

    article_name: article title (or any search query)
    n: number of results requested from the search

    returns: the result of wikipedia.search for that query
    '''
    related_titles = wikipedia.search(article_name, results = n)
    return related_titles

In [42]:
# Request up to 30 search results for the query "BTS"; the returned
# titles are displayed as this cell's output
find_related("BTS", 30)

['BTS',
 'Jimin (singer, born 1995)',
 'Be (BTS album)',
 'Dynamite (BTS song)',
 'Run BTS',
 'BTS albums discography',
 'J-Hope',
 'Kim Seok-jin',
 'BTS videography',
 'V (singer)',
 'List of awards and nominations received by BTS',
 'Map of the Soul: 7',
 'Wings (BTS album)',
 'Life Goes On (BTS song)',
 'List of BTS live performances',
 'Suga (rapper)',
 'BTS Skytrain',
 'BTS (disambiguation)',
 'Love Yourself World Tour',
 'Fake Love (BTS song)',
 'Jungkook',
 'Base transceiver station',
 'Idol (BTS song)',
 'DNA (BTS song)',
 'Serendipity (BTS song)',
 'Love Yourself: Tear',
 'Map of the Soul: Persona',
 'Savage Love (Laxed – Siren Beat)',
 'List of most-liked tweets',
 'BTS Group']

### Converting XML to light dump (LD)

In [61]:
def xml_to_soup(fp):
    '''
    Parses a Wikipedia xml export with beautiful soup and
    returns the first <page> element found in it.

    fp: path of the xml file (read as UTF-8)

    returns: the first <page> tag of the document; any further
             pages in the same file are ignored
    raises: ValueError if the file holds no <page> element
    '''
    with open(fp, encoding = 'utf8') as file:
        soup = BeautifulSoup(file.read(), "xml")

    # find() yields the first match or None, so a dump without pages is
    # reported explicitly instead of surfacing as a bare IndexError
    page = soup.find("page")
    if page is None:
        raise ValueError('No <page> element found in ' + fp)
    return page

In [62]:
def soup_to_df(page):
    '''
    Converts the soupified xml data of a single Wiki page
    into a dataframe with one row per revision.

    page: <page> element of a Wikipedia xml export

    returns: dataframe with the columns
        title, id, time, username, text - values read from the xml
        version - 1-based number of the distinct text a revision holds;
                  a revision that restores an earlier text reuses the
                  number of that text
        revert  - '1' if the revision restores an earlier text and the
                  first edit it undoes was made by a different user,
                  otherwise '0' (new text or self revert)
        length  - number of characters of the revision text
    '''
    def contributor_name(revision):
        # registered editors carry a <username>, anonymous ones an <ip>;
        # when neither is present the editor is reported as 'N/A'
        contributor = revision.contributor
        if contributor is not None:
            for tag in (contributor.username, contributor.ip):
                if tag is not None:
                    return tag.text
        return 'N/A'

    def revision_text(revision):
        # `revision.text` is beautiful soup's get_text() shortcut, so the
        # <text> child has to be looked up by name; doing so also avoids
        # depending on the whitespace nodes between sibling tags
        text_tag = revision.find("text")
        return text_tag.text if text_tag is not None else ''

    title = page.title.text
    data = []
    for revision in page.find_all("revision"):
        data.append([
            title,
            revision.id.text,
            revision.timestamp.text,
            contributor_name(revision),
            revision_text(revision),
        ])

    df = pd.DataFrame(data, columns = ['title', 'id', 'time', 'username', 'text'])

    authors = df['username'].tolist()
    # text -> (version number, row position of its first occurrence);
    # a dict keeps the lookup cheap and keeps version numbers (which count
    # distinct texts) apart from row positions (which count revisions)
    first_seen = {}
    version = [] #edit version
    revert = [] #'0' or '1'

    for pos, text in enumerate(df['text']):
        if text not in first_seen: # not a revert
            first_seen[text] = (len(first_seen) + 1, pos)
            version.append(len(first_seen))
            revert.append('0')
        else: #is revert
            restored_version, restored_pos = first_seen[text]
            version.append(restored_version)

            # the revision right after the restored one is the first edit
            # being undone; undoing one's own edit is a self revert.
            # restored_pos < pos, so restored_pos + 1 is always in range
            undone_author = authors[restored_pos + 1]
            revert.append('0' if authors[pos] == undone_author else '1')

    df['version'] = version
    df['revert'] = revert
    df['length'] = [len(text) for text in df['text']]
    return df

In [63]:
def df_to_ld(df, outpath):
    '''
    Given the cleaned dataframe of one page,
    writes it as a light dump file at outpath.

    The file starts with the page title on its own line, followed by
    one line per revision of the form
        ^^^_<time> <revert> <version> <length> <username>

    df: dataframe with title, time, revert, version, length and
        username columns
    outpath: path of the light dump file to write (UTF-8)

    raises: ValueError if df has no rows (there is no title to write)
    '''
    if df.empty:
        raise ValueError('Cannot write a light dump for a page without revisions')

    # positional access: the title must be found whatever the index labels are
    lines = [df['title'].iloc[0]]
    columns = zip(df['time'], df['revert'], df['version'], df['length'], df['username'])
    for time, revert, version, length, username in columns:
        lines.append('^^^_' + ' '.join([time, str(revert), str(version), str(length), username]))

    # every line, including the last one, is newline terminated
    light_dump = '\n'.join(lines) + '\n'

    # usernames may hold non-ASCII characters; write with the same encoding
    # the xml is read with instead of the platform default
    with open(outpath, 'w', encoding = 'utf8') as f:
        f.write(light_dump)
    repo = 'XML Converted to light dump at ' + outpath
    print(repo)

    return

In [64]:
def xml_to_light_dump(fp, outfp):
    '''
    Given an input file path and output path,
    turns the xml file into a light dump
    and stores it at the output file path

    fp: path of the Wikipedia xml export
    outfp: path of the light dump file to write; its parent
           directory is created when it does not exist yet
    '''
    #create the directory the light dump goes into first
    #(derived from outfp so that any destination works, including
    #nested directories; a bare file name has no directory to create)
    out_dir = os.path.dirname(outfp)
    if out_dir:
        os.makedirs(out_dir, exist_ok = True)

    #convert to light dump
    page = xml_to_soup(fp)
    df = soup_to_df(page)
    return df_to_ld(df, outfp)

In [65]:
# Convert a single xml export (the Blackpink article) into a light dump;
# `fp` is the input file, `outfp` the file that gets written
fp = "../data/raw/artists/Blackpink_wiki.xml"
outfp = "../data/raw/light_dump/Blackpink_light_dump.txt"
xml_to_light_dump(fp, outfp)

XML Converted to light dump at ../data/raw/light_dump/Blackpink_light_dump.txt


In [67]:
# Each entry pairs an xml export with the light dump file produced from it
files = [
    [
        '../data/raw/artists/Blackpink_wiki.xml',
        '../data/raw/light_dump/Blackpink_light_dump.txt',
    ],
    [
        '../data/raw/artists/BTS_wiki.xml',
        '../data/raw/light_dump/BTS_light_dump.txt',
    ],
    [
        '../data/raw/BTS_albums/Be_(BTS_album)_wiki.xml',
        '../data/raw/light_dump/BTS_be_light_dump.txt',
    ],
    [
        '../data/raw/BTS_albums/Dark_%26_Wild_wiki.xml',
        '../data/raw/light_dump/BTS_dark_wild_light_dump.txt',
    ],
    [
        '../data/raw/BTS_albums/Love_Yourself:_Tear_wiki.xml',
        '../data/raw/light_dump/BTS_love_yourself_light_dump.txt',
    ],
    [
        '../data/raw/BTS_albums/Map_of_the_Soul:_7_wiki.xml',
        '../data/raw/light_dump/BTS_mots7_light_dump.txt',
    ],
    [
        '../data/raw/BTS_albums/Wings_(BTS_album)_wiki.xml',
        '../data/raw/light_dump/BTS_wings_light_dump.txt',
    ],
    [
        '../data/raw/All_Lives_Matter_wiki.xml',
        '../data/raw/light_dump/All_Lives_Matter_light_dump.txt',
    ],
]

# Convert every pair in turn
for fp, outfp in files:
    xml_to_light_dump(fp, outfp)

XML Converted to light dump at ../data/raw/light_dump/Blackpink_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/BTS_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/BTS_be_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/BTS_dark_wild_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/BTS_love_yourself_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/BTS_mots7_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/BTS_wings_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/All_Lives_Matter_light_dump.txt


In [69]:
# Second batch of (xml export, light dump) pairs
files = [
    [
        '../data/raw/artists/Backstreet_Boys_wiki.xml',
        '../data/raw/light_dump/Backstreet_Boys_light_dump.txt',
    ],
    [
        '../data/raw/artists/Girls%27_Generation_wiki.xml',
        '../data/raw/light_dump/Girls_Generation_light_dump.txt',
    ],
    [
        '../data/raw/artists/Justin_Bieber_wiki.xml',
        '../data/raw/light_dump/Justin_Bieber_light_dump.txt',
    ],
    [
        '../data/raw/artists/Taylor_Swift_wiki.xml',
        '../data/raw/light_dump/Taylor_Swift_light_dump.txt',
    ],
    [
        '../data/raw/Black_Lives_Matter_wiki.xml',
        '../data/raw/light_dump/Black_Lives_Matter_light_dump.txt',
    ],
    [
        '../data/raw/Blue_Lives_Matter_wiki.xml',
        '../data/raw/light_dump/Blue_Lives_Matter_light_dump.txt',
    ],
]

# Convert every pair in turn
for fp, outfp in files:
    xml_to_light_dump(fp, outfp)

XML Converted to light dump at ../data/raw/light_dump/Backstreet_Boys_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/Girls_Generation_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/Justin_Bieber_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/Taylor_Swift_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/Black_Lives_Matter_light_dump.txt
XML Converted to light dump at ../data/raw/light_dump/Blue_Lives_Matter_light_dump.txt


### Store revision content in separate txt file

In [151]:
def soup_to_df_with_content(pages):
    '''
    Converts soupified xml data for Wiki pages
    into dataframes that keep the revision text

    pages: list of <page> elements; a single <page> element is
           accepted as well and treated as a one-page list

    returns: list with one dataframe per page title (pages sharing a
             title are merged, pages without revisions are left out).
             Columns:
        title, id, time, username, text - values read from the xml
        version - 1-based number of the distinct text a revision holds;
                  a revision that restores an earlier text reuses the
                  number of that text
        revert  - '1' if the revision restores an earlier text and the
                  first edit it undoes was made by a different user,
                  otherwise '0' (new text or self revert)
    '''
    # iterating over a single <page> tag would walk its child nodes
    # rather than pages, so wrap it into a list first
    if getattr(pages, 'name', None) == 'page':
        pages = [pages]

    def contributor_name(revision):
        # registered editors carry a <username>, anonymous ones an <ip>;
        # when neither is present the editor is reported as 'N/A'
        contributor = revision.contributor
        if contributor is not None:
            for tag in (contributor.username, contributor.ip):
                if tag is not None:
                    return tag.text
        return 'N/A'

    def revision_text(revision):
        # `revision.text` is beautiful soup's get_text() shortcut, so the
        # <text> child has to be looked up by name
        text_tag = revision.find("text")
        return text_tag.text if text_tag is not None else ''

    data = {}
    for page in pages:
        title = page.title.text
        for revision in page.find_all("revision"):
            data.setdefault(title, []).append([
                title,
                revision.id.text,
                revision.timestamp.text,
                contributor_name(revision),
                revision_text(revision),
            ])

    dframes = []
    for rows in data.values():

        df = pd.DataFrame(rows, columns = ['title', 'id', 'time', 'username', 'text'])

        authors = df['username'].tolist()
        # text -> (version number, row position of its first occurrence)
        first_seen = {}
        version = [] #edit version
        revert = [] #'0' or '1'

        for pos, text in enumerate(df['text']):
            if text not in first_seen: # not a revert
                first_seen[text] = (len(first_seen) + 1, pos)
                version.append(len(first_seen))
                revert.append('0')
            else: #is revert
                restored_version, restored_pos = first_seen[text]
                version.append(restored_version)

                # the revision right after the restored one is the first
                # edit being undone; undoing one's own edit is a self revert
                undone_author = authors[restored_pos + 1]
                revert.append('0' if authors[pos] == undone_author else '1')

        # the text column already holds each revision's own content and
        # must not be overwritten here
        df['version'] = version
        df['revert'] = revert
        dframes.append(df)

    return dframes

In [152]:
def df_to_content(dframes, outpath):
    '''
    Given a list of cleaned dataframes from xml data (one per page),
    writes the revision contents into a single file at outpath.

    For every page the file holds the title on its own line, followed
    for each revision by a header line
        ^^^_<version> <username>
    and the revision text.

    dframes: list of dataframes with title, version, username and
             text columns; dataframes without rows are skipped
    outpath: path of the content file to write (UTF-8)
    '''
    chunks = []
    for df in dframes:
        if df.empty:
            # no revisions means no title and nothing to write
            continue
        # positional access: works whatever the index labels are
        chunks.append(df['title'].iloc[0])
        for version, username, text in zip(df['version'], df['username'], df['text']):
            chunks.append('^^^_' + str(version) + ' ' + username + '\n' + text)

    # every chunk is newline terminated; joining once avoids rebuilding
    # an ever growing string for each revision
    content = ''.join(chunk + '\n' for chunk in chunks)

    # article text is full of non-ASCII characters; write with the same
    # encoding the xml is read with instead of the platform default
    with open(outpath, 'w', encoding = 'utf8') as f:
        f.write(content)
    repo = 'XML Converted to content at ' + outpath
    print(repo)

    return

In [153]:
def store_xml_content(fp, outfp):
    '''
    Given an input file path and output path,
    stores the revision content in the xml file
    at the output file path

    fp: path of the Wikipedia xml export
    outfp: path of the content file to write; its parent
           directory is created when it does not exist yet

    Note: xml_to_soup returns only the first page of the file, so
    only that page's revisions are stored.
    '''
    #create the directory the content file goes into first
    #(derived from outfp so that any destination works; a bare
    #file name has no directory to create)
    out_dir = os.path.dirname(outfp)
    if out_dir:
        os.makedirs(out_dir, exist_ok = True)

    #xml_to_soup hands back a single <page> element, whereas
    #soup_to_df_with_content iterates over a list of pages
    soup = xml_to_soup(fp)
    pages = soup if isinstance(soup, list) else [soup]

    #convert to content file
    dframes = soup_to_df_with_content(pages)
    return df_to_content(dframes, outfp)

In [154]:
# Store the revision content of one xml export in a text file;
# `fp` is the input file, `outfp` the file that gets written
fp = "../data/raw/bts-current-revision.xml"
outfp = "../data/raw/content/bts_content.txt"
store_xml_content(fp, outfp)

XML Converted to content at ../data/raw/content/bts_content.txt


### Try on a larger K-pop file

In [155]:
# Convert the combined dump into a light dump.
# NOTE(review): the file name suggests several articles in one export
# (TODO confirm); xml_to_soup returns only the first <page>, so at most
# one article ends up in kpop.txt
fp = "../data/raw/BTS-blackpink-girlsgen-jbieber-tswift.xml"
outfp = "../data/raw/light_dump/kpop.txt"
xml_to_light_dump(fp, outfp)

XML Converted to light dump at ../data/raw/light_dump/kpop.txt


In [156]:
# Store the revision content of the same dump; `fp` is the value
# assigned in the previous cell, so that cell has to be run first
outfp = "../data/raw/content/kpop_content.txt"
store_xml_content(fp, outfp)

XML Converted to content at ../data/raw/content/kpop_content.txt
