<a href="https://colab.research.google.com/github/ethamCS/Bert-Medical-Apps/blob/main/AppProcessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install google_play_scraper



In [24]:
!pip install transformers



In [None]:
import os
import subprocess
import json
import time
import concurrent.futures
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from google_play_scraper import app

In [75]:
class AppProcessor:
    """
    A class for processing and classifing mobile health apps (mhealth apps).
    It includes methods for building a dataset by retrieving app descriptions
    and identifying mhealth apps using a fine-tuned deberta model.

    Attributes:
    - app_id_list: List of unique app IDs to be processed.
    - dataset_df: DataFrame containing app details ('app_id' and 'description').
    - num_cores: Number of CPU cores available on the system.
    - max_workers: Maximum number of worker threads for parallel processing.
    - current_app_file_path: Path to file storing current app IDs.

    Methods:
    - build_dataset(): Builds a dataset by fetching app details in parallel and saving the result to a CSV file.
    - read_app_ids(file_path): Reads a set of app IDs from a file.
    - write_app_ids(file_path, app_ids): Writes a set of app IDs to a file.
    - parse(): Parses app names from a CSV file, removing duplicates and updating the app_id_list.
    - parse_apps(): Uses a pre-trained machine learning model to identify and categorize health-related apps,
    storing the results in the mhealth_apps list.
    """
    def __init__(self):
        self.app_id_list = None
        self.dataset_df = None
        self.num_cores = os.cpu_count()
        self.max_workers = self.num_cores if self.num_cores else 1
        self.current_app_file_path = '/content/current_apps.txt'

    def build_dataset(self):
        apps_not_found = []
        apps_added_count = 0
        apps_not_found_count = 0
        total_apps = len(self.app_id_list)

        df = pd.DataFrame(columns=['app_id', 'description'])
        print(f'Adding apps...')

        def fetch_app_details(app_name):
            nonlocal apps_added_count, apps_not_found_count
            # if apps_added_count > 50:
            #   return None
            try:
                result_app_details = app(
                    app_name,
                    lang='en',
                    country='it' # Italy
                )
            except:
                apps_not_found.append(app_name)
                apps_not_found_count += 1
                return None

            apps_added_count += 1
            if apps_added_count % 10 == 0:
                print(f'{apps_added_count}/{total_apps} apps processed ({apps_added_count / total_apps * 100}% complete)')

            description = result_app_details['description']
            url = result_app_details['url']

            return [app_name, description]

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(executor.map(fetch_app_details, self.app_id_list))

        for result in results:
            if result is not None:
                df.loc[len(df.index)] = result

        self.dataset_df = df
        self.dataset_df.to_csv('apps_df.csv')

    def read_app_ids(self,file_path):
            with open(file_path, 'r') as file:
                app_ids = file.read().splitlines()
            return set(app_ids)

    def write_app_ids(self,file_path, app_ids):
            with open(file_path, 'w') as file:
                file.write('\n'.join(app_ids))

    def parse(self):
        df = pd.read_csv("/content/androzoo_nov_25_cleaned.csv")
        app_id_list = df['app_name'].tolist()
        total_apps = len(app_id_list)
        unique_apps = len(set(app_id_list))
        print(f'Removed {total_apps - unique_apps} duplicate app names ({(unique_apps / total_apps)* 100:.3}% unique)')

        self.app_id_list = list(set(app_id_list))
        print(f'Apps to parse: {len(self.app_id_list)}')

        file_path_2 = self.current_app_file_path

        app_ids_set_1 = set(self.app_id_list)
        app_ids_set_2 = self.read_app_ids(file_path_2)

        common_app_ids = app_ids_set_1.intersection(app_ids_set_2)
        all_app_ids = app_ids_set_1
        self.app_id_list = all_app_ids - common_app_ids
        self.app_id_list = list(self.app_id_list)

    def parse_apps(self):
        print('Parsing Apps...')

        self.mhealth_apps = []
        id2label = {0: "NON-MHEALTH", 1: "MHEALTH"}

        if torch.cuda.is_available():
            device = 'cuda'
            print('Using GPU.')
        else:
            device = 'cpu'
            print('GPU not available. Using CPU.')

        tokenizer = AutoTokenizer.from_pretrained('etham13/MHealth_app_classifier')

        model = AutoModelForSequenceClassification.from_pretrained('etham13/MHealth_app_classifier')
        model.to(device)  # use gpu if available

        for index, row in self.dataset_df.iterrows():

            if 'description' in row and pd.notna(row['description']):
                encoding = tokenizer(row['description'], return_tensors="pt", max_length=512, truncation=True, padding=True)
                encoding = {k: v.to(device) for k, v in encoding.items()}

                outputs = model(**encoding)
                predictions = outputs.logits.squeeze().cpu()

                sigmoid = torch.nn.Sigmoid()
                probs = sigmoid(torch.Tensor(predictions))
                predictions[np.where(probs >= 0.5)] = 1
                predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
                if predicted_labels and predicted_labels[0] == 'MHEALTH':
                    self.mhealth_apps.append(row['app_id'])

                    if len(self.mhealth_apps) > 1 and len(self.mhealth_apps) % 100 == 0:
                        print(f'{len(self.mhealth_apps)} / {index} mhealth apps processed ({len(self.mhealth_apps) / index * 100:.3}% of proccessed apps)')
                        print(f'{index / len(self.dataset_df)* 100:.3}% complete')


In [76]:
processor = AppProcessor()
processor.parse()
processor.build_dataset()

Removed 2184217 duplicate app names (56.3% unique)
Apps to parse: 2815783
Adding apps...
10/2806625 apps processed (0.0003562998263038347% complete)
20/2806625 apps processed (0.0007125996526076694% complete)
30/2806625 apps processed (0.001068899478911504% complete)
40/2806625 apps processed (0.0014251993052153387% complete)
50/2806625 apps processed (0.0017814991315191733% complete)


In [77]:
print(processor.dataset_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   app_id       51 non-null     object
 1   description  51 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB
None


In [78]:
# remove unnecessary columns
processor.dataset_df = processor.dataset_df[['app_id', 'description']]
processor.parse_apps()

Parsing Apps...
GPU not available. Using CPU.


In [80]:
processor.mhealth_apps

['com.marcoparedesnunez.displasiatactil',
 'com.trainerize.bfitpt',
 'arproductions.andrew.headachelog']

In [81]:
df_new = processor.dataset_df[processor.dataset_df['app_id'].isin(processor.mhealth_apps)][['app_id', 'description']]
df_new

Unnamed: 0,app_id,description
2,com.marcoparedesnunez.displasiatactil,"The application ""Touch Displasia"" is designed ..."
7,com.trainerize.bfitpt,"With this fitness app, you can start tracking ..."
8,arproductions.andrew.headachelog,Headache Log makes it <b>quick</b> and <b>easy...


In [82]:
processor.write_app_ids('/content/new_mhealth_apps.txt', processor.mhealth_apps)