# Select random reviews for Coding

We want to select a random sample of reviews for a first round of coding.

## Review Counts

In [91]:
import pandas as pd

# Load the review data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
reviews = pd.read_pickle('data/combined.pkl')

In [92]:
# Parse the review timestamps into timezone-aware (UTC) datetimes.
reviews['date'] = pd.to_datetime(reviews['date'], utc=True)

# Split the reviews by store; 'source' holds the store name.
by_source = reviews.groupby('source')
appS, playS = (
    by_source.get_group(store) for store in ('AppStore', 'PlayStore')
)

# Bucket each store's reviews by the calendar day they were posted on.
appS_by_day = appS.groupby(appS['date'].dt.date)
playS_by_day = playS.groupby(playS['date'].dt.date)

In [93]:
import plotly.graph_objects as go
# Reviews per calendar day for each store. Each Series is computed once and
# reused for both axes, instead of re-counting every column of the grouped
# frame for x and again for y.
appS_daily = appS_by_day['date'].count()
playS_daily = playS_by_day['date'].count()

# One line per store: x = day, y = number of reviews posted on that day.
fig = go.Figure(
    data=[
        go.Scatter(x=appS_daily.index.values, y=appS_daily, name='AppStore'),
        go.Scatter(x=playS_daily.index.values, y=playS_daily, name='PlayStore'),
    ],
    layout_title_text="New Daily Reviews"
)
fig.show()

Reminder: Review counts drop quickly after the first few days. We could set April 26th as the last day of our review period, since after that the combined review count drops below 100/day.

In [94]:
import plotly.graph_objects as go

# Reviews grouped by app version: per store and for both stores combined.
by_version_appS = appS.groupby('version')
by_version_playS = playS.groupby('version')
by_version = reviews.groupby('version')


def _version_count_bar(name, grouped):
    """Build one bar trace showing the number of reviews per app version.

    ``grouped`` is a DataFrameGroupBy keyed by version. The bar height is the
    number of non-null scores per version and the bar label is the group
    size. The x values come from the same count Series, so all traces derive
    their categories in the same way.
    """
    scores = grouped['score']
    counts = scores.count()
    return go.Bar(name=name, x=counts.index, y=counts, text=scores.size())


fig = go.Figure(data=[
    _version_count_bar('PlayStore', by_version_playS),
    _version_count_bar('AppStore', by_version_appS),
    _version_count_bar('Combined', by_version),
], layout_title_text="Review count by app version and store")
# Draw the three bars of each version side by side instead of stacked.
fig.update_layout(barmode='group')
fig.show()

Versions 1.0.1 and 1.0.5 would need to be excluded if we wanted to split the reviews 50/50 between stores, since they were released for only one of the stores.

In [95]:
import plotly.graph_objects as go
# NOTE(review): appS_by_day is not used in this cell; this repeats the
# per-day grouping of the AppStore reviews and looks like a removal candidate.
appS_by_day = appS.groupby(appS['date'].dt.date)

# Build one line per app version: x = day, y = reviews posted that day.
scatters = []
for version, version_reviews in by_version:
    # Count once per version and reuse the Series for both axes.
    daily_counts = (
        version_reviews.groupby(version_reviews['date'].dt.date)['date'].count()
    )
    scatters.append(
        go.Scatter(x=daily_counts.index.values, y=daily_counts, name=version)
    )

fig = go.Figure(
    data=scatters,
    layout_title_text="New Daily Reviews by Version"
)
fig.show()

Here we see which version was reviewed at which point in time. In general, version overlaps are rather small. That means we have to expect confounding between versions and external events (e.g. lockdown).

## How to sample reviews

One selection method could be
- Exclude reviews after April 26
- For each version released until then, select 100 reviews
    - 50 App Store / 50 Play Store
    - 10 reviews for every possible rating (1-5)

In [96]:
# remove reviews after April 26
# Remove reviews after April 26: keep everything strictly before
# 2020-04-27 00:00 UTC. The 'date' column was parsed with utc=True, so the
# cutoff is spelled out as a timezone-aware Timestamp rather than relying on
# a bare date string being parsed and localized implicitly.
cutoff = pd.Timestamp('2020-04-27', tz='UTC')
mask = reviews['date'] < cutoff
reviews_filtered_date = reviews.loc[mask]

In [97]:
# Maximum number of reviews drawn per (version, store, rating) combination.
SAMPLE_SIZE = 10

# Collect the per-group samples in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# copied the accumulated frame on every iteration.
# NOTE(review): sample() is unseeded, so each run draws a different sample;
# pass random_state=... if the selection needs to be reproducible.
samples = []
for version, version_reviews in reviews_filtered_date.groupby('version'):
    print('Version {}: {}'.format(version, len(version_reviews)))
    for store, store_reviews in version_reviews.groupby('source'):
        print('  {}: {}'.format(store, len(store_reviews)))
        for score, score_reviews in store_reviews.groupby('score'):
            n_available = len(score_reviews)
            if n_available < SAMPLE_SIZE:
                # Flag groups that are too small to yield a full sample.
                print('!!!', end=" ")
            print('     {}*: {}'.format(score, n_available))
            if n_available > SAMPLE_SIZE:
                samples.append(score_reviews.sample(SAMPLE_SIZE))
            else:
                # At most SAMPLE_SIZE reviews available: take all of them.
                samples.append(score_reviews)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# filtered data contained no reviews at all.
sample_set = pd.concat(samples) if samples else pd.DataFrame()

Version 1.0.1: 3789
  PlayStore: 3789
     1*: 1773
     2*: 626
     3*: 633
     4*: 227
     5*: 530
Version 1.0.2: 2332
  AppStore: 147
     1*: 76
     2*: 17
     3*: 30
     4*: 14
     5*: 10
  PlayStore: 2185
     1*: 1074
     2*: 312
     3*: 286
     4*: 168
     5*: 345
Version 1.0.3: 815
  AppStore: 549
     1*: 229
     2*: 86
     3*: 96
     4*: 50
     5*: 88
  PlayStore: 266
     1*: 100
     2*: 21
     3*: 33
     4*: 33
     5*: 79
Version 1.0.4: 414
  AppStore: 102
     1*: 13
!!!      2*: 6
!!!      3*: 7
     4*: 15
     5*: 61
  PlayStore: 312
     1*: 129
     2*: 48
     3*: 33
     4*: 36
     5*: 66
Version 1.0.5: 80
  AppStore: 80
     1*: 18
!!!      2*: 6
     3*: 11
     4*: 13
     5*: 32
Version 1.0.6: 781
  AppStore: 182
     1*: 49
     2*: 18
     3*: 32
     4*: 18
     5*: 65
  PlayStore: 599
     1*: 267
     2*: 70
     3*: 75
     4*: 67
     5*: 120
Version 1.0.7: 66
  AppStore: 22
     1*: 13
!!!      3*: 3
!!!      5*: 6
  PlayStore: 44
  

In [98]:
# Write the sampled reviews to 'sample_set.csv' in the working directory.
# With the default arguments the DataFrame index is written as the first column.
sample_set.to_csv('sample_set.csv')