diff --git a/3. Using WikiWho to analyze an editor in the context of a page.ipynb b/3. Using WikiWho to analyze an editor in the context of a page.ipynb index 8e0645c..4948e7f 100644 --- a/3. Using WikiWho to analyze an editor in the context of a page.ipynb +++ b/3. Using WikiWho to analyze an editor in the context of a page.ipynb @@ -304,8 +304,8 @@ "outputs": [], "source": [ "from visualization.owned_listener import OwnedListener\n", - "owned = calculator.all_actions\n", - "listener = OwnedListener(owned, editor_inputname)\n", + "all_actions = calculator.all_actions\n", + "listener = OwnedListener(all_actions, editor_inputname)\n", "\n", "traces = ['Tokens Owned', 'Tokens Owned (%)']\n", "\n", @@ -315,9 +315,24 @@ "from ipywidgets.widgets import Dropdown\n", "\n", "interact(listener.listen,\n", - " _range = get_date_slider_from_datetime(owned['rev_time']),\n", + " _range = get_date_slider_from_datetime(all_actions['rev_time']),\n", " granularity=Dropdown(options=['Yearly', 'Monthly', 'Daily'], value='Monthly'),\n", - " trace=Dropdown(options=traces, value='Tokens Owned (%)'))\n" + " trace=Dropdown(options=traces, value='Tokens Owned (%)'))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = all_actions\n", + "df = df.sort_values(['token_id', 'rev_time'], ascending=True).set_index('token_id')\n", + "last_action = df.groupby('token_id').last()\n", + "surv = last_action[last_action['action'] != 'out']\n", + "#sum(surv['o_editor'] == editor_inputname)\n", + "surv\n" ] }, { @@ -354,7 +369,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/develop.py b/develop.py new file mode 100644 index 0000000..d06f866 --- /dev/null +++ b/develop.py @@ -0,0 +1,51 @@ +from external.wikipedia import WikipediaDV, WikipediaAPI +wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org')) 
+the_page = wikipedia_dv.get_page('The Camp of the Saints') + +from wikiwho_wrapper import WikiWho + + + +wikiwho = WikiWho(lng='en') +agg_actions = wikiwho.dv.edit_persistence(the_page.page_id) + + +editors = wikipedia_dv.get_editors(agg_actions['editor_id'].unique()).rename(columns = { + 'userid': 'editor_id'}) + +# Merge the names of the editors to the aggregate actions dataframe +agg_actions = agg_actions.merge(editors[['editor_id', 'name']], on='editor_id') +agg_actions.insert(3, 'editor', agg_actions['name']) +agg_actions = agg_actions.drop(columns=['name']) +agg_actions['editor'] = agg_actions['editor'].fillna("Unregistered") + +all_content = wikiwho.dv.all_content(the_page['page_id']) + + + +revisions = wikiwho.dv.rev_ids_of_article(the_page['page_id']) + +from metrics.conflict import ConflictManager +calculator = ConflictManager(all_content, revisions) +calculator.calculate() + +editors_conflicts = calculator.get_conflict_score_per_editor() + +editors['editor_id'] = editors['editor_id'].astype(str) +if len(editors_conflicts) > 0: + editors_conflicts = editors[['editor_id', 'name']].merge(editors_conflicts, + right_index=True, left_on='editor_id').set_index('editor_id') + + + +from visualization.owned_listener import OwnedListener +owned = calculator.all_actions +listener = OwnedListener(owned, '28921814') + + +listener.listen( + _range = (owned['rev_time'].dt.date.min(), owned['rev_time'].dt.date.max()), + granularity='Monthly', + trace='Tokens Owned (%)') + +import ipdb; ipdb.set_trace() # breakpoint b86e2bcc // diff --git a/visualization/owned_listener.py b/visualization/owned_listener.py index ff2aeba..b1dc4a3 100644 --- a/visualization/owned_listener.py +++ b/visualization/owned_listener.py @@ -1,27 +1,67 @@ import pandas as pd import plotly from plotly import graph_objs - +import datetime class OwnedListener(): def __init__(self, df, editor): self.df = df.sort_values(['token_id', 'rev_time'], ascending=True).set_index('token_id') self.editor = editor +
self.days = days = pd.Series(df.loc[df['o_editor'] == editor, 'rev_time'].dt.to_period('D').unique()).sort_values(ascending=False) + + days = self.days.dt.to_timestamp('D') + pd.DateOffset(1) + + _all = [] + _abs = [] + df = self.df + for rev_time in days: + df = df[df['rev_time'] <= rev_time] + last_action = df.groupby('token_id').last() + surv = last_action[last_action['action'] != 'out'] + _abs.append(sum(surv['o_editor'] == self.editor)) + _all.append(len(surv)) + + self.summ = pd.DataFrame({ + 'day': days, + 'abs': _abs, + 'all': _all + }) + self.summ['res'] = 100 * self.summ['abs'] / self.summ['all'] + self.df_plotted = None def listen(self, _range, granularity, trace): df = self.df df = df[(df.rev_time.dt.date >= _range[0]) & - (df.rev_time.dt.date <= _range[1])] - - self.doi = df.loc[df['editor'] == self.editor, 'rev_time'].dt.to_period( - granularity[0]).dt.to_timestamp(granularity[0]).sort_values(ascending=False).unique() + (df.rev_time.dt.date <= _range[1] + datetime.timedelta(days=1))] + self.doi = pd.Series(self.days.dt.to_timestamp(granularity[0]).unique()) + pd.DateOffset(1) self.traces = [] self.is_norm_scale = True - df = self.__add_trace(df, trace, 'rgba(0, 0, 0, 1)') + + if trace == 'Tokens Owned': + self.is_norm_scale = False + _df = self.summ + _df['time'] = _df['day'].dt.to_period(granularity[0]).dt.to_timestamp(granularity[0]) + _df = _df[~_df.duplicated(subset='time', keep='first')] + _y = _df['abs'] + + elif trace == 'Tokens Owned (%)': + _df = self.summ + _df['time'] = _df['day'].dt.to_period(granularity[0]).dt.to_timestamp(granularity[0]) + _df = _df[~_df.duplicated(subset='time', keep='first')] + _y = _df['res'] + + self.traces.append( + graph_objs.Scatter( + x=_df['time'], y=_y, + name=trace, + marker=dict(color='rgba(255, 0, 0, .5)')) + ) + + self.__add_trace(df, trace, 'rgba(0,0,255, .5)') _range = None if self.is_norm_scale: @@ -61,7 +101,7 @@ def __add_trace(self, df, trace, color): self.traces.append( graph_objs.Scatter( - 
x=pd.Series(self.doi), y=_y, + x=self.doi, y=_y, name=trace, marker=dict(color=color)) )