In [1]:
import pandas as pd
from datetime import datetime
from civilservant.wikipedia.utils import make_cached_df

In [2]:
images = pd.read_csv('output/all-images-20200104.csv')

In [3]:
import mwapi
mw = mwapi.Session(host='https://commons.wikimedia.org')

Sending requests with default User-Agent.  Set 'user_agent' on mwapi.Session to quiet this message.


In [4]:
images.head()

Unnamed: 0,user_name,user_id,img_name,year
0,Islahaddow,1499491,Wiki_Loves_Africa.pdf,2014
1,D Malesi,4163374,Home_Chef_-_African_food_and_cuisine.jpg,2014
2,TOUMOU,3391181,سوق_بمدينة_المنستير_التونسية_1.JPG,2014
3,TOUMOU,3391181,سوق_بمدينة_المنستير_التونسية_2.JPG,2014
4,TOUMOU,3391181,سوق_بمدينة_المنستير_التونسية_3.JPG,2014


In [30]:
@make_cached_df('imageinfo')
def get_original_uploader(img_name):
    img_name_f = f'File:{img_name}'
    # example https://commons.wikimedia.org/w/api.php?action=query&titles=File:Musa%20with%20wheelbarrow.jpg&prop=imageinfo&iilimit=max
    # example action=query&titles=File:Musa%20with%20wheelbarrow.jpg&prop=imageinfo&iilimit=max
    ret  = mw.get(action='query', titles=img_name_f, prop='imageinfo', iilimit='max')
    imgs = ret['query']['pages']
    img_key = list(iter(imgs.keys()))[0]
    if img_key == '-1':
        return pd.DataFrame([float('nan')])
    versions = imgs[img_key]['imageinfo']
    for version in versions:
        version['timestamp'] = datetime.strptime(version['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    sorted_versions = sorted(versions, key=lambda d: d['timestamp'])
    original_uploader = sorted_versions[0]['user']
    original_uploader_df = pd.DataFrame([original_uploader])
    return original_uploader_df

def get_original_uploader_wrapper(img_name):
    return get_original_uploader(img_name).iloc[0][0]

In [31]:
images.shape

(47063, 4)

In [32]:
images.head(1)

Unnamed: 0,user_name,user_id,img_name,year
0,Islahaddow,1499491,Wiki_Loves_Africa.pdf,2014


In [33]:
images_sm = images[:1000]

In [34]:
%time
images_sm['original_uploader_df'] = images_sm['img_name'].apply(get_original_uploader_wrapper)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.3 µs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
images['original_uploader'] = images['img_name'].apply(get_original_uploader_wrapper)

In [41]:
images['misattribution_error'] = images['user_name'] != images['original_uploader']

In [44]:
images['misattribution_error'].value_counts()

False    45765
True      1298
Name: misattribution_error, dtype: int64

In [45]:
misattributed = images[images['misattribution_error']]

In [47]:
misattributed.head()

Unnamed: 0,user_name,user_id,img_name,year,original_uploader_df,original_uploader,misattribution_error
50,SteinsplitterBot,3714013,"Fried_Rice,_Jollof_rice_and_salad,_served_with...",2014,Justdifference,Justdifference,True
51,SteinsplitterBot,3714013,"Jamaican_Sauce_served_with_Boiled_Yam,_fried_r...",2014,Justdifference,Justdifference,True
92,Reda Kerbouche,983191,Rabbit_Grilled_from_Algeria.JPG,2014,Nasimanilsen25,Nasimanilsen25,True
146,Ji-Elle,30885,Dried_Beans.jpg,2014,Gushadumi,Gushadumi,True
148,Ji-Elle,30885,Dried_Vegetables_in_Zambia.jpg,2014,Gushadumi,Gushadumi,True


In [60]:
all_users = set(images['user_name'])
got_attrib = set(misattributed['user_name'])
un_attrib = set(misattributed['original_uploader'])

In [59]:
len(un_attrib)

377

In [62]:
len(un_attrib - all_users)

82

In [68]:
## who didn't get attributed for some reason but still got an invite for another reason
len(un_attrib.intersection(all_users))

295

In [70]:
len(got_attrib)

53

In [69]:
# who got an invite for stealing an attribution but shouldnt have
len(got_attrib - all_users)

0

In [71]:
len(got_attrib.intersection(all_users))

53

In [66]:
images[images['original_uploader']=='ABDOULWAHABDIAK']

Unnamed: 0,user_name,user_id,img_name,year,original_uploader_df,original_uploader,misattribution_error
32988,ABDOULWAHABDIAK,6917496,ABDOULWAHABDIAK_(1).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,False
32989,ABDOULWAHABDIAK,6917496,ABDOULWAHABDIAK_(2).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,False
32990,SteinsplitterBot,3714013,ABDOULWAHABDIAK_(3).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,True
32991,SteinsplitterBot,3714013,ABDOULWAHABDIAK_(5).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,True
32992,SteinsplitterBot,3714013,ABDOULWAHABDIAK_(4).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,True
32993,ABDOULWAHABDIAK,6917496,ABDOULWAHABDIAK_(6).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,False
32994,ABDOULWAHABDIAK,6917496,ABDOULWAHABDIAK_(7).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,False
32995,ABDOULWAHABDIAK,6917496,ABDOULWAHABDIAK_(8).jpg,2017,ABDOULWAHABDIAK,ABDOULWAHABDIAK,False


In [73]:
images[images['original_uploader']=='Embedded Data Bot']

Unnamed: 0,user_name,user_id,img_name,year,original_uploader_df,original_uploader,misattribution_error


In [72]:
misattributed.groupby('user_name').size().sort_values(ascending=False).head(10)

user_name
Embedded Data Bot    762
SteinsplitterBot     266
Ji-Elle               83
Moumou82              77
Jdx                   22
FlickreviewR 2        18
Dyolf77                8
TOUMOU                 8
Reda Kerbouche         4
GTaliska               3
dtype: int64