# 对App review的分析

## 0. 引用

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import analyze_review
from analyze_review import red
import sample_size
import keywords_search
from pprint import pprint

## 1. 对UI-related和UI-unrelated review进行采样，并人工总结出false positive

## 目录

### 1.1 读取整体的数据集（在服务器上运行）

### 1.2 读取UI-related数据集 （在服务器上运行）

### 1.3 从整体的去除UI-related的数据，获得UI-unrelated的数据 （在服务器上运行）

### 1.4 整理好两部分的数据集后，固定随机种子，并采样（在服务器上运行）

### 1.5 读取采样的数据（可以在本地运行）

### 1.1 读取整体的数据集(在服务器上运行)

数据量非常大，需要在服务器上面跑

#### Free review的数量

In [2]:
# 测试加载时间
# %%timeit
# free = pd.read_csv("/home/qiuyuanchen/data/UI_data/not_used/Free_Apps_Reviews.csv.gz", compression="gzip")
# 3min 4s ± 1.82 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

3min 4s ± 1.82 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
free = pd.read_csv("/home/qiuyuanchen/data/UI_data/not_used/Free_Apps_Reviews.csv.gz", compression="gzip")
len(free)

75422963

#### Non-free review的数量

In [4]:
# %%timeit
# non_free = pd.read_csv("/home/qiuyuanchen/data/UI_data/not_used/Non_Free_Apps_Reviews.csv.gz", compression="gzip")
# 11.6 s ± 35.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

11.6 s ± 35.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
non_free = pd.read_csv("/home/qiuyuanchen/data/UI_data/not_used/Non_Free_Apps_Reviews.csv.gz", compression="gzip")
len(non_free)

3333370

#### 所有review的数量

In [6]:
# all=pd.concat([free, non_free])
# len(all)
# assert len(all) == (len(free) + len(non_free))

#### 直接读取所有的review，并check

In [8]:
check_all = pd.read_csv("/home/qiuyuanchen/data/UI_data/ALL_APP_REVIEW.csv.gz", compression="gzip")
assert len(all) == len(check_all)
len(check_all)

78756333

### 1.2 读取所有UI的数据

#### 所有UI的数量，以及free ui和non-free ui的数量

In [10]:
ui = pd.read_csv("/home/qiuyuanchen/data/UI_data/ALL_UI.csv")
red("UI review的数量")
print(len(ui))
red("Free UI review的数量")
free_ui = ui[ui["IS_FREE"] == 1]
print(len(free_ui))
red("Non free UI review的数量")
non_free_ui = ui[ui["IS_FREE"] == 0]
print(len(non_free_ui))

[31mUI review的数量[0m
3355145
[31mFree UI review的数量[0m
3035518
[31mNon free UI review的数量[0m
319627


### 1.3 从整体的去除UI-related的数据，获得UI-unrelated的数据 （需要在服务器上跑）

In [11]:
non_ui = all[~all["REVIEW_ID"].isin(ui["REVIEW_ID"])]

In [12]:
red("所有review的数量")
print(len(all))
red("UI review的数量")
print(len(ui))
red("UI-unrelated review的数量")

[31m所有review的数量[0m
78756333
[31mUI review的数量[0m
3355145
[31mUI-unrelated review的数量[0m
75401188


#### 直接读取处理好的UI-unrelated review 数据

In [27]:
check_non_ui = pd.read_csv("/home/qiuyuanchen/data/UI_data/ALL_NON_UI.csv.gz", compression="gzip")
assert len(check_non_ui) == len(non_ui)
print(len(check_non_ui))

75401188


#### 1.4 整理好两部分的数据集后，固定随机种子，并采样

In [13]:
red("UI review 采样的数量")
ui_size = sample_size.calculate_size(2.58, 0.03, len(ui))
print(ui_size)
red("UI-unrelated review 采样的数量")
non_ui_size = sample_size.calculate_size(2.58, 0.02, len(non_ui))
print(non_ui_size)

[31mUI review 采样的数量[0m
1848
[31mUI-unrelated review 采样的数量[0m
4160


In [35]:
RANDOM_SEED = 2333
sample_ui = ui.sample(ui_size, random_state=RANDOM_SEED).reset_index()
sample_non_ui = non_ui.sample(non_ui_size, random_state=RANDOM_SEED).reset_index()

In [82]:
pprint(sample_ui["REVIEW_TEXT"][0])
pprint(sample_non_ui["REVIEW_TEXT"][0])

("I've tried so many apps  this is the best for auto selection ( similar to "
 'magic wand function in Photoshop)  highly recommended')
'Very good'


#### 1.5 读取采样的数据（可以在本地运行）
输出成为html方便读

In [15]:
keywords = keywords_search.get_keywords("keywords.txt")

In [41]:
sample_ui = pd.read_csv("sample_ui.csv")
sample_non_ui = pd.read_csv("sample_non_ui.csv")
sample_ui['content'] = sample_ui["REVIEW_TITLE"].fillna("No title") + "\n" + sample_ui["REVIEW_TEXT"].fillna("No title")
sample_non_ui['content'] = sample_non_ui["REVIEW_TITLE"].fillna("No title") + "\n" + sample_non_ui["REVIEW_TEXT"].fillna("No title")

In [42]:
pprint(sample_ui['content'][0])
pprint(sample_non_ui['content'][0])

('No title\n'
 "I've tried so many apps  this is the best for auto selection ( similar to "
 'magic wand function in Photoshop)  highly recommended')
'No title\nVery good'


In [25]:
keywords_search.color_html(sample_ui['content'], keywords, "sample_ui.html")
keywords_search.color_html(sample_non_ui['content'], keywords, "sample_non_ui.html")

#### 分析采样的数据

In [80]:
red("在UI review的采样中，是UI review的数量以及比例")
label_ui = pd.read_csv("label_ui_false_positive.txt", header=None, squeeze=True, names=["is_ui"])
pprint(label_ui.value_counts())
pprint(label_ui.value_counts(normalize=True))
red("在UI-unrelated review的采样中，不是UI review的数量以及比例")
label_non_ui = pd.read_csv("label_non_ui_false_positive.txt", header=None, squeeze=True, names=["is_not_ui"])
pprint(label_non_ui.value_counts())
pprint(label_non_ui.value_counts(normalize=True))

[31m在UI review的采样中，是UI review的数量以及比例[0m
1    1812
0      36
Name: is_ui, dtype: int64
1    0.980519
0    0.019481
Name: is_ui, dtype: float64
[31m在UI-unrelated review的采样中，不是UI review的数量以及比例[0m
1    4133
0      27
Name: is_not_ui, dtype: int64
1    0.99351
0    0.00649
Name: is_not_ui, dtype: float64


## 2. 分析实验中采样的数据

### 2.1 UI review的采样数据及统计

In [83]:
all = pd.read_csv("label_app_review.csv", encoding="utf_8_sig")
# print(all["CATEGORY"].value_counts())
# print(all["SUBCATEGORY"].value_counts())
all["SUBCATEGORY"] = all["SUBCATEGORY"].fillna("False")

def count_category():
    print("统计各个UI category （包括多标签）")
    category_dict = {
        "appearance": 0,
        "interaction": 0,
        "experience": 0,
        "others": 0,
        "False": 0,
    }
    for review in all["CATEGORY"]:
        for c in str(review).split("/"):
            category_dict[c] += 1
    return category_dict


def count_subcategory():
    print("统计各个UI types（包括多标签）")
    subcategory_dict = {
        "layout": 0,
        "color": 0,
        "typography": 0,
        "iconography": 0,
        "image": 0,
        "navigation": 0,
        "notification": 0,
        "motion": 0,
        "gesture": 0,
        "accessibility": 0,
        "redundancy": 0,
        "customization limitation": 0,
        "advertisement": 0,
        "feedback": 0,
        "generic evaluation": 0,
        "comparative review": 0,
        "material": 0,
        "False": 0,
    }
    total = 0
    for review in all["SUBCATEGORY"]:
        for c in str(review).split("/"):
            subcategory_dict[c] += 1
            total += 1
    print(subcategory_dict)
    print("False positive的数量以及比例")
    print(subcategory_dict["False"])
    # 总共有1447
    print(subcategory_dict["False"] / (subcategory_dict["False"] + 1447))
    # return subcategory_dict
# count_category()
count_subcategory()

统计各个UI types（包括多标签）
{'layout': 62, 'color': 50, 'typography': 73, 'iconography': 73, 'image': 73, 'navigation': 75, 'notification': 117, 'motion': 35, 'gesture': 70, 'accessibility': 22, 'redundancy': 112, 'customization limitation': 108, 'advertisement': 53, 'feedback': 94, 'generic evaluation': 232, 'comparative review': 142, 'material': 95, 'False': 22}
False positive的数量以及比例
22
0.014976174268209666


In [81]:
from glob import glob
files = sorted(glob("review_reply_*.xlsx"))
data_list = [pd.read_excel(f) for f in files]
all_review_reply = pd.concat(data_list)
all_review_reply.dropna(subset=["REPLY", "CATEGORY"], inplace=True)

In [84]:
print(all_review_reply["CATEGORY"].value_counts())
print(len(all_review_reply))
def count_category(data):
    print("统计各个UI category （包括多标签）")
    category_dict = {
        "appearance": 0,
        "interaction": 0,
        "experience": 0,
        "others": 0,
        "False": 0,
    }
    for review in data["CATEGORY"]:
        for c in str(review).split("/"):
            category_dict[c] += 1
    return category_dict

def count_reply(data):
    print("统计各个pattern（包括多标签）")
    print(all_review_reply["REPLY"].value_counts())
    reply_dict = {
        "apology or appreciation": 0,
        "advice": 0,
        "information request": 0,
        "justify": 0,
        "promise": 0,
        "inform": 0,
        "unspecify": 0,
        "False": 0
    }
    total = 0
    for review in data["REPLY"]:
        for c in str(review).split("/"):
            reply_dict[c] += 1
            total += 1
    pprint(reply_dict)
    print("False positive的数量")
    print(reply_dict["False"])
    # print(len(all_review_reply))
    # 总共有764个数据
    print(reply_dict["False"] / (764 + reply_dict["False"]))

count_category(all_review_reply)
count_reply(all_review_reply)

appearance                182
experience                165
others                    157
interaction               148
False                      12
appearance/interaction      8
experience/appearance       4
interaction/experience      3
interaction/appearance      3
appearance/experience       2
others/appearance           1
experience/interaction      1
Name: CATEGORY, dtype: int64
686
统计各个UI category （包括多标签）
统计各个pattern（包括多标签）
apology or appreciation                        220
information request                            106
advice                                          84
inform                                          59
promise                                         57
justify                                         55
unspecify                                       34
apology or appreciation/information request     23
False                                           12
justify/advice                                   7
advice/information request                       6
apo