In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import re
import statistics as s
from calendar import monthrange

# PREPROCESSING
* Remove columns where values are predominantly NaN
* Remove rows where reviews are not of type str
* Remove rows where reviews are less than 4 words
* Remove non-ASCII reviews
* Change representation of 'replyDate' column from date of reply to a boolean (was replied to)

# DATA

We've selected 4 applications from Google Play (play.google.com):
* Viber
* Soundcloud
* Reddit
* Snapchat

### IDs
Each application on Google Play (play.google.com) has its own unique identifier; we store these in a dictionary:

In [2]:
# Google Play identifier of every application under study, keyed by a short name.
# Only Viber is enabled; the remaining applications are kept commented out.
# Each entry ends with a trailing comma so a commented line can be re-enabled as-is.
app_attrs = {
    'viber': {'id': 'com.viber.voip'},
    # 'soundcloud': {'id': 'com.soundcloud.android'},
    # 'reddit': {'id': 'com.reddit.frontpage'},
    # 'snapchat': {'id': 'com.snapchat.android'},
}

### STRUCTURE

In [3]:
# Registry of imported data: maps an application key (e.g. 'viber') to its DataFrame.
dfs = dict()

### IMPORT

In [4]:
# Year the analysis focuses on. NOTE: it is only defined here; this cell does
# not filter the imported rows by year.
year = '2019'

# Read one CSV per configured application, using the 'timestamp' column as a
# parsed datetime index, and store the frame under the application's key.
for k, v in app_attrs.items():
    app_id = v['id']
    print(f"Importing {app_id}")
    csv_path = "data/gplay_reviews_er/" + app_id + ".csv"
    dfs[k] = pd.read_csv(csv_path, index_col="timestamp", parse_dates=True)
dfs['viber'].head()

Importing com.viber.voip


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,username,rating,title,version,review,likes,replyDate,replyText
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-05-22 09:40:49.735000+00:00,Ayman Diab,3,,,good but there is way better.,0,,
2020-05-22 09:11:35.185000+00:00,Mar Peralta,5,,,Wonderful,0,,
2020-05-22 09:03:35.591000+00:00,Khum Kumari Grg,5,,,I can contact my relatives,0,,
2020-05-22 08:58:10.984000+00:00,Ana Peichl,1,,,An update sent all my contacts message that I ...,0,,
2020-05-22 08:48:01.516000+00:00,Aye Chan Myat,5,,,application is very useful,0,,


In [5]:
dfs['viber'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 918107 entries, 2020-05-22 09:40:49.735000+00:00 to 2011-07-20 07:05:25.238000+00:00
Data columns (total 8 columns):
username     918106 non-null object
rating       918107 non-null int64
title        0 non-null float64
version      0 non-null float64
review       902353 non-null object
likes        918107 non-null int64
replyDate    8460 non-null object
replyText    0 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 63.0+ MB


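The Viber dataset contains roughly 920K reviews in total (918,107 rows, dated from 2011-07-20 to 2020-05-22). The analysis focuses on 2019, but no year filter has been applied at this point, so the figures above describe the complete dataset.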

From the `info()` output we see that the data contains NaN values; the affected rows and columns must be removed.

- 15,754 reviews are NaN (918,107 rows − 902,353 non-null reviews; remove these rows)
- All `replyText`, `title`, and `version` values are NaN (drop these columns)

In [6]:
# remove columns that are NaN
def drop_nan_columns(df):
    """Return ``df`` without the columns that contain no data at all.

    A column is removed when every one of its values is missing (NaN/None).
    Column labels are used as they are, so frames whose labels are not
    strings (e.g. integers) are handled as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns with at least one non-missing value.
    """
    # how='all' drops a column only if *all* of its entries are missing.
    return df.dropna(axis=1, how='all')

In [7]:
# Replace the Viber frame with the result of drop_nan_columns, then preview it.
dfs['viber'] = drop_nan_columns(dfs['viber'])
dfs['viber'].head()

Unnamed: 0_level_0,username,rating,review,likes,replyDate
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-22 09:40:49.735000+00:00,Ayman Diab,3,good but there is way better.,0,
2020-05-22 09:11:35.185000+00:00,Mar Peralta,5,Wonderful,0,
2020-05-22 09:03:35.591000+00:00,Khum Kumari Grg,5,I can contact my relatives,0,
2020-05-22 08:58:10.984000+00:00,Ana Peichl,1,An update sent all my contacts message that I ...,0,
2020-05-22 08:48:01.516000+00:00,Aye Chan Myat,5,application is very useful,0,


In [8]:
# remove rows with NaN reviews
def drop_nan_string_rows(df, col):
    """Return ``df`` without the rows whose ``col`` value is not a non-empty string.

    Rows are selected by position with a boolean mask. Rows that only share
    an index label (e.g. an identical timestamp) with a rejected row are
    therefore kept, and the whole frame is filtered in a single pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : label
        Column that must hold a non-empty ``str`` for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``col`` value is a non-empty string, in
        their original order.
    """
    # NaN is a float, so the isinstance check rejects missing values as well.
    keep = np.array(
        [isinstance(value, str) and value != "" for value in df[col]],
        dtype=bool,
    )
    return df.loc[keep]

In [None]:
# Keep the result of drop_nan_string_rows for the 'review' column, then print the frame summary.
dfs['viber'] = drop_nan_string_rows(dfs['viber'], 'review')
dfs['viber'].info()

In [None]:
def drop_short_reviews(df, col, min_words):
    """Return ``df`` without the rows whose ``col`` text has fewer than ``min_words`` words.

    Words are counted by splitting on runs of whitespace, so repeated or
    trailing spaces do not count as extra words. Values that are not strings
    (e.g. NaN) contain no words and are dropped as well. Rows are selected by
    position, so rows sharing an index label with a rejected row are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : label
        Column holding the review text.
    min_words : int
        Minimum number of words a review needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with at least ``min_words`` words in ``col``, in
        their original order.
    """
    # str.split() without arguments ignores leading, trailing and repeated whitespace.
    keep = np.array(
        [isinstance(text, str) and len(text.split()) >= min_words for text in df[col]],
        dtype=bool,
    )
    return df.loc[keep]

In [None]:
# Apply drop_short_reviews to 'review' with min_words=4, then print the frame summary.
dfs['viber'] = drop_short_reviews(dfs['viber'], 'review', 4)
dfs['viber'].info()

In [None]:
# Show the first 20 rows of the Viber frame.
dfs['viber'].head(20)

In [None]:
# if NaN set to 0, else 1
def replydate_to_bool(df):
    """Return a copy of ``df`` whose 'replyDate' column is a 0/1 "was replied to" flag.

    A missing reply date (NaN/None) becomes 0; any other value becomes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'replyDate' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which 'replyDate' holds the integers 0 or 1.
    """
    flagged = df.copy()
    # notna() is False for missing values and True for everything else;
    # casting to int yields the 0/1 encoding.
    flagged['replyDate'] = df['replyDate'].notna().astype(int)
    return flagged

In [None]:
# Number of rows selected by the index label '2019-02-01'.
len(dfs['viber'].loc['2019-02-01'])

In [None]:
# Same lookup as in the previous cell, repeated.
len(dfs['viber'].loc['2019-02-01'])

In [None]:
def pie_chart(sizes, labels, title):
    """Draw and show a pie chart of ``sizes``.

    Parameters
    ----------
    sizes : sequence of numbers
        Wedge sizes; each wedge is annotated with its percentage share.
    labels : sequence
        One label per wedge, in the same order as ``sizes``.
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Select the style first so it is in effect when the figure is created.
    plt.style.use('fast')
    # Create the figure and its axes in one call so that the size, dpi and
    # colour settings apply to the figure that actually holds the pie. A
    # separate plt.figure() call would open a second, empty figure.
    fig, ax = plt.subplots(figsize=(14, 10), dpi=80, facecolor='w', edgecolor='w')
    ax.pie(sizes,
           labels=labels,
           autopct='%1.1f%%',
           shadow=True,
           startangle=90)

    # Equal scaling on both axes keeps the pie circular.
    ax.axis('equal')
    ax.set(aspect="equal", title=title)
    plt.show()

In [None]:
# Years shown in the chart; sizes[i] is the number of rows selected from the
# Viber frame by the year label labels[i].
labels = [2020, 2019, 2018, 2017]
sizes = [len(dfs['viber'].loc[str(label_year)]) for label_year in labels]

pie_chart(sizes, labels, 'review count distribution - viber')