In [1]:
import wikipedia
import pandas as pd
import numpy as np
import os
import re
from bs4 import BeautifulSoup
from pywikibot import Site, User

In [2]:
#given a username, find all their contributions

def find_contributions(user, total=500):
    '''
    Given a username, find their contributions on English Wikipedia.

    Parameters
    ----------
    user : str
        Name of the account (or IP address string) to look up.
    total : int, optional
        Maximum number of contributions to fetch. Defaults to 500, which is
        the limit pywikibot applies when no value is given, so existing
        callers get the same result as before.

    Returns
    -------
    list of str
        One entry per edit, formatted as "<page title> ^^^ <edit comment>".
    '''
    site = Site('en', 'wikipedia')  # The site we want to run our bot on
    # Keep the pywikibot object under its own name instead of rebinding `user`.
    wiki_user = User(site, user)

    contrib = []
    # Each edit is yielded as (pywikibot.Page, oldid, pywikibot.Timestamp, comment).
    for page, _oldid, _ts, comment in wiki_user.contributions(total=total):
        # The comment may be None (e.g. when the edit summary has been hidden);
        # concatenating None would raise TypeError and lose every edit of this
        # user, so fall back to an empty comment instead.
        contrib.append(page.title() + " ^^^ " + (comment or ""))

    return contrib

In [3]:
# Spot-check find_contributions on a single known account.
# NOTE(review): the recorded result of exactly 500 looks like a fetch cap
# rather than this user's true edit count -- confirm.
user = 'Danbloch'
len(find_contributions(user))

500

In [4]:
def find_users(fp, article_name):
    '''
    Given a light dump file and an article name,
    find the unique list of users who contributed.

    The file is read as a sequence of article sections: a line that does not
    start with "^^^_" is an article title, and every following line that does
    start with "^^^_" is one revision of that article. The username is taken
    from the fourth space-separated field of a revision line.

    Parameters
    ----------
    fp : str
        Path to the light dump text file.
    article_name : str
        Title of the article whose contributors are wanted.

    Returns
    -------
    list of str
        Usernames in order of first appearance, without duplicates.

    Raises
    ------
    KeyError
        If `article_name` does not occur as a title in the file.
    ValueError
        If a revision line appears before any title line.
    '''
    users = {}
    title = None

    # Stream the file line by line instead of loading it all into memory.
    with open(fp) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line is neither a title nor a revision; treating it
                # as a title would attach later revisions to "".
                continue

            if line[:4] != "^^^_":  # line is title
                title = line
                users[title] = []
            else:  # line is revision
                if title is None:
                    raise ValueError(
                        "revision line found before any article title in " + str(fp)
                    )
                users[title].append(line.split(" ")[3])

    # dict.fromkeys keeps first-seen order and removes duplicates in O(n).
    return list(dict.fromkeys(users[article_name]))

In [5]:
# Path to the light dump file to parse; list the distinct contributors
# recorded in it for the "BTS" article.
fp = "../data/raw/light_dump/Kpop_ld.txt"
find_users(fp, "BTS")

['Hinorisakamachi',
 '39.198.179.240',
 'FrescoBot',
 'Melonkelon',
 '99.236.74.160',
 '112.208.67.181',
 '31.22.98.132',
 '61.4.77.89',
 '217.118.64.55',
 '122.3.141.104',
 '178.149.154.220',
 '86.152.144.157',
 'DiaClass',
 '72.179.180.171',
 '174.113.19.57',
 '206.248.163.65',
 '2602:306:CF4D:AE20:6C52:E105:AAE6:3300',
 'Jiminswife',
 '65.75.85.52',
 '76.103.174.142',
 'Dr.K.',
 'Tsirip',
 '60.48.108.60',
 '175.140.141.26',
 'T1r510',
 'XLinkBot',
 'Dianatran2828',
 'Ishaarontrinh',
 '84.135.92.88',
 '187.14.15.211',
 '99.120.227.246',
 '98.219.107.37',
 '76.93.136.241',
 'Lowwerhigherguy8754',
 '219.74.29.181',
 '38.121.245.98',
 '166.48.168.221',
 '112.205.218.188',
 '88.187.120.159',
 '151.231.106.187',
 '75.128.55.71',
 'Shidomi',
 '175.38.207.106',
 '84.94.187.39',
 'Kmjnmyn',
 '68.7.82.97',
 '24.79.164.203',
 '69.140.163.147',
 '173.178.203.54',
 '84.135.92.4',
 'Bangtan',
 '216.58.78.42',
 'PotatoeBacon',
 '207.210.56.212',
 'Evanswag22',
 'Kanghuitari',
 'AvicBot',
 '37.208.

In [6]:
# Check whether the literal name 'N/A' occurs among the BTS contributors
# (the recorded output is True).
# NOTE(review): 'N/A' is presumably a placeholder for a missing username in
# the dump rather than a real account -- confirm before treating it as a user.
users = find_users(fp, "BTS")
'N/A' in users

True

In [7]:
def user_contributions(fp, article_name):
    '''
    Given a light dump file and an article name,
    collect the contributions of every user who edited that article.

    Lookups are best-effort: a user whose contributions cannot be fetched is
    skipped and left out of the result, with a warning.

    Parameters
    ----------
    fp : str
        Path to the light dump text file.
    article_name : str
        Title of the article whose contributors are looked up.

    Returns
    -------
    dict
        Maps each username to the list returned by find_contributions.
    '''
    import warnings

    contributions = {}
    for user in find_users(fp, article_name):
        try:
            contributions[user] = find_contributions(user)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit can still stop a long run, and
            # report the skip so missing users are not silently lost.
            warnings.warn(
                "Skipping user {!r}: could not fetch contributions ({!r})".format(user, exc)
            )
    return contributions

In [8]:
# Build {username: [contribution strings]} for every BTS contributor.
# This makes one find_contributions call per user, so it is presumably slow
# and network-bound -- consider caching the result.
data = user_contributions(fp, "BTS")



In [9]:
def build_df(data):
    '''
    Given user contribution data, build a user contribution dataframe.

    Parameters
    ----------
    data : dict
        Maps a username to a list of strings of the form
        "<page title> ^^^ <edit comment>".

    Returns
    -------
    pandas.DataFrame
        One row per contribution with the columns
        'users' (username), 'total_edits' (number of contributions listed
        for that user in `data`) and 'article_name' (the page title).
    '''
    rows = []
    for user, contribs in data.items():
        total_contrib = len(contribs)
        for entry in contribs:
            # Keep everything before the first " ^^^ " delimiter. Splitting on
            # the full delimiter keeps titles that contain a caret intact.
            page = entry.partition(" ^^^ ")[0]
            rows.append([user, total_contrib, page])

    return pd.DataFrame(rows, columns=['users', 'total_edits', 'article_name'])

In [10]:
# Flatten the per-user contribution lists into one row per edit, then show
# the frame's dimensions and its first rows.
df = build_df(data)
print(df.shape)
df.head()

(37617, 3)


Unnamed: 0,users,total_edits,article_name
0,Hinorisakamachi,500,Draft:P Nation
1,Hinorisakamachi,500,Asian Television Awards
2,Hinorisakamachi,500,Asian Television Awards
3,Hinorisakamachi,500,Asian Television Awards
4,Hinorisakamachi,500,Asian Television Awards


In [11]:
# Count one user's edits per article. After .count(), the 'total_edits'
# column holds the number of rows in each (user, article) group -- i.e. edits
# to that article -- not the user's overall total it held in df.
u = df[df.users == 'Hinorisakamachi'].groupby(['users', 'article_name']).count()
# Most-edited articles first.
tmp = u.sort_values(by='total_edits', ascending = False)
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,total_edits
users,article_name,Unnamed: 2_level_1
Hinorisakamachi,Speed (South Korean band),31
Hinorisakamachi,Asian Television Awards,15
Hinorisakamachi,BTS,14
Hinorisakamachi,Gene Shinozaki,14
Hinorisakamachi,Draft:Bigman (beatboxer),12
Hinorisakamachi,...,...
Hinorisakamachi,Talk:100% Ver.,1
Hinorisakamachi,Talk:G.O (singer),1
Hinorisakamachi,Talk:Heo Ga-yoon,1
Hinorisakamachi,Produce 101,1


In [12]:
# Pull the article titles out of the (user, article_name) MultiIndex,
# keeping the most-edited-first order of tmp.
articles = [article_name for (user, article_name) in tmp.index]
articles

['Speed (South Korean band)',
 'Asian Television Awards',
 'BTS',
 'Gene Shinozaki',
 'Draft:Bigman (beatboxer)',
 'Superior Speed',
 'Mir (singer)',
 'Speed Circus',
 'MBLAQ',
 'Rocket Girls 101',
 'Thunder (singer)',
 'List of South Korean idol groups (2010s)',
 'Reeps One',
 '100% Ver.',
 'Produce X 101',
 'Bigman (beatboxer)',
 'List of Produce 101 (Chinese TV series) contestants',
 'Speed of Light (Speed song)',
 'High School Rapper',
 'MBK Entertainment',
 'FNC Entertainment',
 'Park Sung-hoon (singer)',
 'Lee Joon',
 'Seung Ho',
 'Template:MBK Entertainment',
 'U-KISS',
 "The King's Avatar (2019 TV series)",
 'Grand Beatbox Battle',
 'Beatbox House',
 'User talk:Lullabying',
 'Arthdal Chronicles',
 'High School Rapper (season 1)',
 'User talk:Hinorisakamachi',
 'JYP Entertainment',
 'Template:Produce 101',
 'Nguyen Cuong',
 'Big Hit Entertainment',
 'Play the Siren',
 'R1SE',
 'Stone Music Entertainment',
 'Oui Entertainment',
 'Sunye',
 'Weird Genius',
 'Kwon So-hyun',
 'Kim Yo

### Determine whether a page is K-pop related

In [13]:
def find_related(article_name, n):
    '''
    Collect article titles related to an article name.

    Runs a Wikipedia search for `article_name`, requesting `n` results,
    and returns a list that starts with `article_name` itself followed by
    the search results in the order the search returned them.
    '''
    hits = wikipedia.search(article_name, results = n)
    return [article_name, *hits]

In [14]:
# Build the reference list of K-pop related titles from two seed searches.
# The two lists are simply concatenated, so a title can appear more than once.
kpop_related = find_related("K-pop", n=500)
bts_related = find_related("BTS", n=500)
related = kpop_related + bts_related

In [15]:
# Peek at the first few related titles; the seed query comes first, followed
# by the search results.
related[:5]

['K-pop', 'K-pop', 'List of male K-pop artists', 'K-pop Star', 'Simply K-Pop']

In [19]:
# Articles edited by the sample user (see `articles`) that also appear in the
# K-pop related reference list.
[x for x in articles if x in related]

['BTS',
 'FNC Entertainment',
 'Lee Joon',
 'U-KISS',
 'JYP Entertainment',
 'Big Hit Entertainment',
 'Hyuna',
 'YG Entertainment',
 'Wanna One',
 'X1 (band)',
 'Exo (group)',
 'List of South Korean idol groups (2000s)',
 'Kim Yu-bin (musician)',
 'IKon',
 'SM Entertainment',
 'Iz*One',
 'Produce 101']

In [29]:
def kpop_fan(user, df, related=None, threshold=0.1):
    '''
    Given a username and dataframe of user contributions,
    determines whether a user is a kpop fan or not.

    A user counts as a fan when the share of the distinct articles they
    edited that appear in `related` is strictly greater than `threshold`.

    Parameters
    ----------
    user : str
        Username to classify.
    df : pandas.DataFrame
        Contribution frame with at least 'users' and 'article_name' columns.
    related : iterable of str, optional
        Titles considered kpop related. When omitted, the notebook-level
        variable `related` is used, as before.
    threshold : float, optional
        Minimum share (exclusive) of related articles. Defaults to 0.1.

    Returns
    -------
    int
        1 if the user is classified as a fan, otherwise 0. A user with no
        rows in `df` is classified as 0.
    '''
    if related is None:
        # Backward compatible fallback to the global list built in the notebook.
        try:
            related = globals()['related']
        except KeyError:
            raise NameError("name 'related' is not defined") from None
    # Set membership avoids scanning the whole list for every article.
    related_titles = set(related)

    # Distinct articles that this user contributed to.
    total_articles = df.loc[df.users == user, 'article_name'].dropna().unique()
    if len(total_articles) == 0:
        # No contributions recorded: avoid dividing by zero.
        return 0

    related_count = sum(1 for title in total_articles if title in related_titles)
    # Share of the user's distinct articles that are kpop related.
    if related_count / len(total_articles) > threshold:
        return 1
    else:
        return 0

In [42]:
#given article, determine if users are kpop fans or not
article_name = 'BTS'
fp = "../data/raw/light_dump/Kpop_ld.txt"

#transform data into dataframe format
data = user_contributions(fp, article_name)
df = build_df(data)

#get all related article titles
kpop_related = find_related("K-pop", n=500)
bts_related = find_related("BTS", n=500)
related = kpop_related + bts_related

#analyze each user
unique_users = df.users.unique()
fans = [user for user in unique_users if kpop_fan(user,df) == 1 ]
df['fan'] = df.users.apply(lambda x: 1 if x in fans else 0)

#get percentage of fans
print(len(fans) / len(unique_users))



0.8340336134453782


In [46]:
def percentage_fans(fp, article_name, search_related):
    '''
    Given an article revision history,
    gets the fraction of fans among its users.

    A user is a fan when more than 10% of the distinct articles they edited
    appear in the related-title list built from `search_related` (the same
    rule and threshold as kpop_fan).

    Parameters
    ----------
    fp : str
        Path to the light dump text file.
    article_name : str
        Title of the article whose contributors are analysed.
    search_related : iterable of str
        Search queries; the results of all queries together form the list of
        related titles.

    Returns
    -------
    float
        Fans divided by distinct users with at least one recorded
        contribution; 0.0 when there are no such users.
    '''
    #transform data into dataframe format
    data = user_contributions(fp, article_name)
    df = build_df(data)

    #get all related article titles
    related = set()
    for query in search_related:
        related.update(find_related(query, n=500))

    # One row per distinct (user, article) pair.
    pairs = df[['users', 'article_name']].dropna().drop_duplicates()
    if pairs.empty:
        # No contributions at all: avoid dividing by zero.
        return 0.0

    # Classify against the `related` set built above. Calling kpop_fan(user, df)
    # would read the notebook-level `related` instead, ignoring `search_related`.
    is_related = pairs['article_name'].isin(related).astype(float)
    related_share = is_related.groupby(pairs['users']).mean()
    fan_count = int((related_share > 0.1).sum())

    #get percentage of fans
    return fan_count / len(related_share)
    

In [47]:
# Run the packaged analysis with the same article, dump file and seed
# queries as the step-by-step cell; the recorded result matches it.
article_name = 'BTS'
fp = "../data/raw/light_dump/Kpop_ld.txt"
search_related = ['K-pop', 'BTS']

percentage_fans(fp, article_name, search_related)



0.8340336134453782