# Univariate analysis of the price
We do that because it's a good way to get closer to the truth.

The problem we are simplifying can be stated as follows:
At any point in time, the buybox can be won by an arbitrary number of non-prime listings,
an arbitrary number of prime listings, or any combination of both.

So instead of distinguishing listings by sellerid, we compute the mean price and the count of the buybox-winning listings, split into prime and non-prime listings.

The result will be a correctly forward-filled time series.

In [None]:
from mlrepricer import match, setup, helper
import altair as alt
import numpy as np
import warnings
# Keep Altair's default data transformer, but allow up to 1,000,000 rows
# per chart dataset so the scatter plot further down can be rendered.
alt.data_transformers.enable('default', max_rows=1000000)

In [None]:
# Load the dataset registered under the name 'alldata' via the project helper.
# NOTE(review): the cells below expect the columns isbuyboxwinner, isprime,
# price, time_changed and asin to be present - confirm against the helper.
df = helper.load_dataframe('alldata')

In [None]:
# Keep only the listings that hold the buybox. The explicit .copy() makes the
# filtered frame independent of the loaded data, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[df['isbuyboxwinner'] == 1].copy()

# Split the price into one column per listing type. Rows of the other type
# are set to NaN so that they drop out of the per-type mean and count
# computed by the aggregation that follows.
df['prime_price'] = np.where(df['isprime'] == 1, df['price'], np.nan)
df['nonprime_price'] = np.where(df['isprime'] == 0, df['price'], np.nan)

In [None]:
# Aggregate per (time_changed, asin): mean price and number of non-NaN prices,
# computed separately for the prime and the non-prime price column.
with warnings.catch_warnings():
    # RuntimeWarnings emitted during the aggregation are deliberately silenced.
    # NOTE(review): presumably the "Mean of empty slice" warning for groups in
    # which one of the two price columns is entirely NaN - confirm.
    warnings.simplefilter("ignore", category=RuntimeWarning)
    per_column_stats = {
        price_col: [np.nanmean, 'count']
        for price_col in ('prime_price', 'nonprime_price')
    }
    result = (
        df.groupby(['time_changed', 'asin'])
        .agg(per_column_stats)
        .sort_index()
    )

In [None]:
# Preview the first rows of the aggregated frame (two-level column index:
# price column on the first level, statistic on the second).
result.head()

In [None]:
# Compare the prime and non-prime mean prices only for groups in which the
# prime count equals the non-prime count; every other group gets NaN.
prime_mean = result[('prime_price', 'nanmean')]
nonprime_mean = result[('nonprime_price', 'nanmean')]
same_count = result[('prime_price', 'count')] == result[('nonprime_price', 'count')]

# Absolute price gap (prime minus non-prime).
result['diff'] = np.where(same_count, prime_mean - nonprime_mean, np.nan)
# Relative price gap (prime divided by non-prime).
result['quotient'] = np.where(same_count, prime_mean / nonprime_mean, np.nan)

In [None]:
m.head()

In [None]:
import statsmodels.api as sm

X = m['prime_price nanmean']
y = m['nonprime_price nanmean']
X = sm.add_constant(X)

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

In [None]:
alt.Chart(m).mark_point().encode(
    x=alt.X('prime_price nanmean'),
    y=alt.Y('nonprime_price nanmean'),
    tooltip=('asin'),
    color='asin:N').interactive()

In [None]:
m = result[result['diff'].notnull()][[('prime_price', 'nanmean'), ('nonprime_price', 'nanmean')]]

In [None]:
m.columns.get_level_values(1)

In [None]:
m.columns = [' '.join(col).strip() for col in m.columns.values]

In [None]:
m.reset_index(inplace=True)