In [None]:
from pathlib import Path

import google_play_scraper
import matplotlib.pyplot as plt
import pandas as pd
import play_scraper
from matplotlib.pyplot import style
from nltk.corpus import stopwords
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# German stop-word list, loaded once here and reused by the later cells
# (vectoriser, word cloud and the stop-word removal helper).
# NOTE(review): presumably this needs the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download("stopwords")) — confirm.
german_stop_words = stopwords.words('german')

In [None]:
# Country code and language code handed to the scraper calls further down;
# both use the same value here.
country = lang = "de"

# Name of the app to search for (e.g. Xing).
app_name = "Xing"

# Display the autocompleted query suggestions for a sample search term.
suggestion_query = "car"
play_scraper.suggestions(suggestion_query)

In [None]:
# Fetch a list of applications matching the search query.
search_query = app_name
search_results = pd.DataFrame(play_scraper.search(search_query, gl=country, hl=lang))

# Collect the app ids. An empty search result produces a DataFrame without an
# "app_id" column, so fall back to an empty list instead of raising KeyError.
app_list = search_results["app_id"].tolist() if "app_id" in search_results.columns else []

# Raw review records of every matching app, gathered one star rating at a time.
review = []

for app_id in app_list:  # `app_id` rather than `id`, which would shadow the builtin
    for score in range(1, 6):
        # The second return value is not needed here. It is bound to a named
        # throwaway because assigning to `_` would overwrite IPython's
        # "last output" variable in the notebook.
        result, _unused = google_play_scraper.reviews(
            app_id,
            lang=lang,  # defaults to 'en'
            country=country,  # defaults to 'us'
            sort=google_play_scraper.Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
            count=5000,  # defaults to 100
            filter_score_with=score,  # defaults to None (means all scores)
        )
        review.extend(result)

# Build the frame once from all records. DataFrame.append was removed in
# pandas 2.0, and appending to an empty frame added nothing anyway.
reviews = pd.DataFrame(review)

# Drop columns not needed. errors="ignore" keeps the cell runnable when no
# reviews were fetched (the frame then has no columns at all).
total_reviews = reviews.drop(
    columns=[
        "userName",
        "userImage",
        "reviewCreatedVersion",
        "replyContent",
        "repliedAt",
        "at",
    ],
    errors="ignore",
)

total_reviews.head(2)

In [None]:
# Keep only the rows that have no missing value in any column, and work on an
# explicit copy so later column assignments target an independent frame.
df = total_reviews.dropna(axis=0, how="any").copy()

In [None]:
# Split every review on single spaces and pool all tokens into one flat list.
raw_words = [word for content in df["content"] for word in content.split(" ")]

# Number of unique words in raw reviews
raw_word_count = len(set(raw_words))

# Preprocessing: Remove stopwords
# NOTE(review): CountVectorizer presumably also applies its own tokenisation
# and lower-casing, so the two counts differ by more than stop-word removal
# alone — confirm before interpreting the chart.
vectorizer = CountVectorizer(stop_words=german_stop_words)
X = vectorizer.fit_transform(df["content"])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and only fall back to the old accessor on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    pro_words = vectorizer.get_feature_names_out().tolist()
else:
    pro_words = vectorizer.get_feature_names()

# Number of unique words in preprocessed reviews
pro_word_count = len(set(pro_words))

In [None]:
# The bundled seaborn styles were renamed to "seaborn-v0_8*" in matplotlib 3.6
# and the old names were dropped in later releases. Use whichever name this
# installation provides so the cell runs on both old and new versions.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in style.available else "seaborn"
style.use(seaborn_style)

# Bar positions and heights: unique-word counts before and after preprocessing.
x = (0, 1)
wc = [raw_word_count, pro_word_count]

plt.bar(x, wc)
plt.xticks(x, ('Raw Reviews', 'Preprocessed Reviews'), fontsize=14)
plt.ylabel("Word Count", fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Plot word cloud of the raw reviews (no custom stop-word list is passed).
# All reviews are concatenated into one space-separated string.
text = " ".join(df["content"])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15,30))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Create the output folder first so saving does not fail on a fresh checkout
# where ./img does not exist yet.
Path("./img").mkdir(parents=True, exist_ok=True)
wordcloud.to_file("./img/raw_review_wordcloud.png")

In [None]:
# Plot word cloud of the reviews with stop words filtered out.
## Create stopword list: the wordcloud defaults plus the German stop words.
# Bound to its own name so the `stopwords` import from nltk.corpus is not
# overwritten, which would break any later use of that module.
wordcloud_stopwords = set(STOPWORDS).union(german_stop_words)

# All reviews are concatenated into one space-separated string.
text = " ".join(df["content"])
wordcloud = WordCloud(stopwords=wordcloud_stopwords, background_color="white").generate(text)
plt.figure(figsize=(15,30))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Create the output folder first so saving does not fail on a fresh checkout
# where ./img does not exist yet.
Path("./img").mkdir(parents=True, exist_ok=True)
wordcloud.to_file("./img/preprocessed_review_wordcloud.png")

In [None]:
def remove_stopwords(x, stop_words=None):
    """Return ``x`` with every stop word removed.

    The text is split on single spaces, each token is compared
    case-insensitively against the stop words, and the surviving tokens are
    joined back with single spaces. Punctuation is not stripped, so a token
    such as ``"und,"`` is kept.

    Parameters
    ----------
    x : str
        Text to clean.
    stop_words : iterable of str, optional
        Lower-case stop words to remove. Defaults to the notebook-level
        ``german_stop_words`` list.

    Returns
    -------
    str
        The text without the stop-word tokens.
    """
    if stop_words is None:
        stop_words = german_stop_words
    # One set per call gives constant-time lookups instead of scanning the
    # whole stop-word list for every token.
    stop_word_set = frozenset(stop_words)
    return " ".join(w for w in x.split(" ") if w.lower() not in stop_word_set)
    
# Store a stop-word-free version of every review next to the original text.
cleaned_content = df["content"].apply(remove_stopwords)
df["preprocessContent"] = cleaned_content

In [None]:
# Create the export folder first so writing the workbook does not fail on a
# fresh checkout where ./data does not exist yet.
Path("./data").mkdir(parents=True, exist_ok=True)
df.to_excel("./data/reviews_german.xlsx")
df.head(1)