In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

**To scrape the data we will use BeautifulSoup**

In [None]:
# --- Scraping configuration -------------------------------------------------
BASE_URL = 'https://www.androidrank.org'
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection

# Only the categories that come AFTER this one in the navigation bar are scraped.
# Change it to restart an interrupted run from the desired category.
START_AFTER_CATEGORY = "Comics" #1

# Columns of the resulting dataframe (URL is our unique ID).
# Ev_30 and Ev_60 are the growth of the number of ratings over the last 30 and 60
# (presumably days) respectively.
new_columns = ["URL", 'Rank','Name', 'Nb_Rating', 'Installs', 'Avg_Rating', "Ev_30", "Ev_60", "Price", "Category"]


def fetch_soup(url):
    """Download `url` and return its HTML parsed with BeautifulSoup.

    Raises requests.HTTPError on a 4xx/5xx answer so that throttling shows up
    as an explicit error instead of a missing table further down.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def parse_ranklist(page, category):
    """Return one record per app row of the page's `ranklist` table.

    Each record is a list ordered like `new_columns`; all values are kept as
    the raw strings found in the page.
    """
    table = page.find("table", id="ranklist")
    records = []
    # The first <tr> is skipped (it is not an app row).
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        link = cells[1].find('a', href=True)
        records.append([
            link["href"],    # URL
            cells[0].text,   # Rank
            link.text,       # Name
            cells[3].text,   # Nb_Rating
            cells[4].text,   # Installs
            cells[5].text,   # Avg_Rating
            cells[6].text,   # Ev_30
            cells[7].text,   # Ev_60
            cells[8].text,   # Price
            category,
        ])
    return records


def find_next_page(page):
    """Return the href of the "Next >" link of `page`, or None when there is none."""
    smalls = page.find_all('small')
    if len(smalls) < 2:
        return None
    link = smalls[1].find("a", text="Next >")
    if link is None:
        return None
    return link.get('href') or None


# Let's start on the main page of the website
soup = fetch_soup('https://www.androidrank.org/android-most-popular-google-play-apps')

# We go through all the categories and take all the listed apps and their ranking in that category
nav = soup.find("nav")
start = nav.find("b", text=START_AFTER_CATEGORY)

records = []
try:
    for b in start.find_next_siblings("b"):
        a = b.find("a")
        category = a.get_text()

        # Walk through every page of the category until there is no "Next >" link
        page_url = BASE_URL + '/' + a["href"]
        while page_url:
            soup = fetch_soup(page_url)
            records.extend(parse_ranklist(soup, category))
            next_href = find_next_page(soup)
            page_url = BASE_URL + next_href if next_href else None
finally:
    # Build the dataframe once, from all collected rows. This also runs when the
    # scraping is interrupted, so the rows gathered so far are still available.
    app_pd = pd.DataFrame(records, columns=new_columns)

**Because of the excess of requests, we had to re-run the previous cell several times, each time modifying the line marked `#1` to restart from the desired category.**

In [None]:
# The scraped data is split across numbered CSV files: data/1.csv ... data/8.csv
N_FILES = 8

# Read every file, then concatenate them in a single call (concatenating inside
# a loop would copy the accumulated frame at every iteration).
# join='inner' keeps only the columns that are present in all the files.
frames = [pd.read_csv(f'data/{file_no}.csv', index_col=[0]) for file_no in range(1, N_FILES + 1)]
df_app = pd.concat(frames, ignore_index=True, join='inner')

# Dropping duplicates since the parsing was sometimes stopped during the analysis of one category of apps
df_app = df_app.drop_duplicates()

**At the end we have the following dataset**

In [None]:
# Preview the first ten rows of the merged dataset
df_app.head(10)

In [None]:
# For each scraped column, print whether it contains at least one null value
for column in ("URL", "Rank", "Name", "Nb_Rating", "Installs",
               "Avg_Rating", "Ev_30", "Ev_60", "Price", "Category"):
    has_null = df_app[column].isnull().values.any()
    print(f"{column}: {has_null}")

**The only column with null values is Category. These rows come from the global ranking of all apps, where the category was not mentioned. Since these apps already appear in their own category (if they are in the global top 500, they are also in the top 500 of their own category), they are duplicates. So let's drop them.**

In [None]:
# Remove every row that has a missing value, then confirm that none is left
df_app = df_app.dropna()
df_app.isna().values.any()

**Isn't it weird that we don't have 24 500 rows since we have a top 500 in each category and 49 categories?**
**Let's have a closer look**

In [None]:
# The ten categories with the fewest apps (value_counts sorts in descending order)
df_app["Category"].value_counts().tail(10)

**Looking at the website, we see that this is not a scraping problem: some categories simply have fewer apps in their top list. This already highlights the difference in diversity between categories.**

**A little bit of cleaning: let's convert the numeric columns from strings back to numbers.**

In [None]:
# Number of ratings: remove the thousands separators before casting to int
ratings_text = df_app["Nb_Rating"].str.replace(',', '')
df_app["Nb_Rating"] = ratings_text.astype(int)

# Installs: a number optionally followed by a k / M suffix.
# Split it into its numeric part and its multiplier, then multiply both.
installs_text = df_app["Installs"].str.replace(' ', '')
installs_number = installs_text.str.replace(r'[kM]+$', '', regex=True).astype(float)
installs_multiplier = (
    installs_text.str.extract(r'[\d\.]+([kM]+)', expand=False)
    .fillna(1)  # no suffix -> multiply by 1
    .replace(['k','M'], [10**3, 10**6])
    .astype(int)
)
df_app["Installs"] = installs_number * installs_multiplier

# Average rating and the two growth columns (percent sign removed) become floats
df_app["Avg_Rating"] = df_app["Avg_Rating"].astype(float)
for growth_column in ("Ev_30", "Ev_60"):
    df_app[growth_column] = df_app[growth_column].str.replace('%', '').astype(float)

df_app.head(5)

**We save the cleaned dataset**

In [None]:
# Checkpoint of the cleaned dataset. Note: the last cell of this notebook writes
# the enriched dataframe to this same path, which overwrites this file.
df_app.to_csv("data/final_dataset.csv")

**We found a new API that gives us even more information about each app, such as the developer and whether the app contains ads or not, so let's use it.**

In [None]:
from google_play_scraper import app

# Example call: fetch the details of one app from its package name.
# `lang` defaults to 'en' and `country` defaults to 'us' when omitted.
demo_package = 'com.facebook.katana'
result = app(demo_package, lang='fr', country='fr')
print(result)

**We add new columns to our datasets**

In [None]:
def empty_object_column():
    """Return an all-NaN column of object dtype aligned on df_app's index.

    Object dtype (instead of the float64 produced by a bare np.nan) lets the
    column later receive strings / booleans through .loc without the
    incompatible-dtype upcast that recent pandas versions deprecate.
    """
    return pd.Series(np.nan, index=df_app.index, dtype=object)


# List-valued columns start with one independent empty list per row
df_app["histogram"] = [[] for _ in range(len(df_app))]

# Scalar columns to be filled from the API, created in the desired column order
for column in ("size", "androidVersion", "developer", "developerId",
               "containsAds", "released", "updated", "version"):
    df_app[column] = empty_object_column()

df_app["comments"] = [[] for _ in range(len(df_app))]
df_app["icon"] = empty_object_column()

**First, let's see which apps are not in the US store, to avoid crashes in the API due to 404 Not Found errors.**

In [None]:
#The variable allows us to have a countdown and also to start back from where we stoped
# `i` is both the progress counter and the resume point: set it to the number of
# URLs already checked to start back from where a previous run stopped.
i = 0
tot_page = len(df_app)

# Create the flag column explicitly (object dtype: holds True / False / NaN).
# It is left untouched if it already exists, so a resumed run keeps earlier results.
if "Parsable" not in df_app.columns:
    df_app["Parsable"] = pd.Series(np.nan, index=df_app.index, dtype=object)

for url in df_app["URL"].iloc[i:]:
    i += 1
    # "\r" rewrites the same output line, giving an in-place progress percentage
    print("\r%f %%" % (i / tot_page * 100), end="", flush=True)

    # The package name is the last path component of the scraped URL
    codename = url.split('/')[-1]
    url_search = 'https://play.google.com/store/apps/details?id='+codename+ '&hl=en&gl=us'
    r = requests.get(url_search, timeout=30)

    # Only a 404 answer marks the app as absent from the US store
    df_app.loc[df_app["URL"] == url, "Parsable"] = r.status_code != 404


# `== False` is intentional: rows that were never checked (NaN) must be kept
indexs = df_app[df_app['Parsable'] == False].index

# Delete the rows of the apps that are not in the US store, then renumber the index
df_app = df_app.drop(indexs).reset_index(drop=True)

**Now we add all the desired information, except for the comments, which will be scraped only for the most-rated apps.**

In [None]:
# Fields of the API answer that are copied as-is into the column of the same name
API_FIELDS = ("size", "androidVersion", "developer", "developerId",
              "containsAds", "released", "updated", "version", "icon")

i = 0
tot_page = len(df_app)
for url in df_app["URL"]:
    i += 1
    # "\r" rewrites the same output line, giving an in-place progress percentage
    print("\r%f %%" % (i / tot_page * 100), end="", flush=True)

    # Rows describing this app (computed once and reused for every assignment)
    same_url = df_app["URL"] == url
    if df_app.loc[same_url, "Parsable"].values[0]:
        # The package name is the last path component of the scraped URL
        codename = url.split('/')[-1]
        result = app(
            codename,
            lang='en',
            country='us'
        )
        # The histogram is a list, so it is stored with .at in a single cell
        # (the first row matching this URL)
        df_app.at[df_app.index[same_url].tolist()[0], "histogram"] = result["histogram"]
        for field in API_FIELDS:
            df_app.loc[same_url, field] = result[field]

In [3]:
# One boolean per column: True when the column contains at least one missing value
df_app.isna().any()

URL               False
Rank              False
Name              False
Nb_Rating         False
Installs          False
Avg_Rating        False
Ev_30             False
Ev_60             False
Price             False
Category          False
Parsable          False
histogram          True
size              False
androidVersion     True
developer         False
developerId       False
containsAds        True
released           True
updated           False
version            True
comments           True
icon              False
dtype: bool

**We still have some NaN values; let's have a look at them.**

In [9]:
# (rows, columns) of the subset where histogram is missing
df_app.loc[df_app["histogram"].isna()].shape

(1, 22)

In [11]:
# (rows, columns) of the subset where androidVersion is missing
df_app.loc[df_app["androidVersion"].isna()].shape

(11, 22)

In [12]:
# (rows, columns) of the subset where containsAds is missing
df_app.loc[df_app["containsAds"].isna()].shape

(7030, 22)

In [13]:
# (rows, columns) of the subset where released is missing
df_app.loc[df_app["released"].isna()].shape

(1689, 22)

In [15]:
# (rows, columns) of the subset where version is missing
df_app.loc[df_app["version"].isna()].shape

(2, 22)

**We see that the only significant one that could be due to an error is containsAds. Let's have a look at it.**

In [20]:
# Count each non-missing value of containsAds (value_counts leaves NaN out by default)
df_app["containsAds"].value_counts()

True    15716
Name: containsAds, dtype: int64

**We see that the NaN values correspond to False; let's change this.**

In [22]:
# Rows where containsAds is missing are set to False
ads_missing = df_app["containsAds"].isna()
df_app.loc[ads_missing, "containsAds"] = False

In [23]:
# Check the result: containsAds should now only hold True / False
df_app["containsAds"].value_counts()

True     15716
False     7030
Name: containsAds, dtype: int64

In [None]:
# Save the final, enriched dataset. This writes to the same path as the earlier
# save of the cleaned dataset and therefore overwrites it.
df_app.to_csv("data/final_dataset.csv")