In [15]:
import pandas as pd

# Per-app, per-rating review exports that need duplicate rows removed
files = [
    "/Users/goknurarican/Downloads/Babbel-rating-1-reviews.csv",
    "/Users/goknurarican/Downloads/Babbel-rating-2-reviews.csv",
    "/Users/goknurarican/Downloads/Babbel-rating-5-reviews.csv",
    "/Users/goknurarican/Downloads/Cake-rating-1-reviews.csv",
    "/Users/goknurarican/Downloads/Cake-rating-2-reviews.csv",
    "/Users/goknurarican/Downloads/Cake-rating-5-reviews.csv",
    "/Users/goknurarican/Downloads/Mondly-rating-1-reviews.csv",
    "/Users/goknurarican/Downloads/Mondly-rating-2-reviews.csv",
    "/Users/goknurarican/Downloads/Mondly-rating-5-reviews.csv"
]

# For each export: load it, keep only the first row for every distinct value
# of the 'review' column, and write the result to the same path with '.csv'
# replaced by '-deduplicated.csv'. The original file is left untouched.
for file_path in files:
    # Output path is derived from the input path
    new_file_path = file_path.replace('.csv', '-deduplicated.csv')

    # Load, then collapse repeated review texts (first occurrence wins)
    df = pd.read_csv(file_path)
    df_deduplicated = df.drop_duplicates(subset="review", keep="first")

    # Persist without the DataFrame index column
    df_deduplicated.to_csv(new_file_path, index=False)

# This script reads each file, removes duplicate reviews, and saves the deduplicated DataFrame to a new file.


In [13]:
from app_store_scraper import AppStore
import pandas as pd
from langdetect import detect
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import ssl
import random  

class SSLAdapter(HTTPAdapter):
    """HTTPAdapter subclass that hands a caller-supplied SSL context to its pool manager."""

    def __init__(self, ssl_context, **kwargs):
        """Remember ``ssl_context`` and forward all other keyword arguments to HTTPAdapter."""
        # Assigned before delegating upward so the attribute already exists
        # should the base initialiser end up calling init_poolmanager().
        self.ssl_context = ssl_context
        super(SSLAdapter, self).__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with ``ssl_context`` forced to the stored context."""
        kwargs.update(ssl_context=self.ssl_context)
        super(SSLAdapter, self).init_poolmanager(*args, **kwargs)

# Retry policy shared by both mounted adapters: up to 5 retries on
# rate-limit / server-error status codes, with a backoff factor of 2.
retry_strategy = Retry(
    total=5,  # raised total number of retries
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
    backoff_factor=2  # raised backoff factor
)

# Custom SSL context with hostname checking and certificate verification
# switched off.
# NOTE(review): this context no longer validates the server's certificate or
# hostname — confirm that is really intended before relying on this session.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# Session setup with SSLAdapter and retry strategy.
# The HTTPS adapter now receives the retry policy too; previously only the
# plain-HTTP adapter had it, so HTTPS requests were sent without
# `retry_strategy`.
http = requests.Session()
adapter = SSLAdapter(ssl_context, max_retries=retry_strategy)
http.mount("https://", adapter)
http.mount('http://', HTTPAdapter(max_retries=retry_strategy))

# Set a User-Agent to mimic a web browser request
http.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

# Your existing code for making requests follows...
# Note: Make sure to use the `http` session object for requests.



# Passed to the scraper as `how_many` when fetching reviews.
review_number = 10_000_000

# Apps to scrape: App Store numeric id, lowercase slug, and display name.
apps = [
    {"id": 987873536, "name": "mondly", "app_name": "Mondly"},
    {"id": 1350420987, "name": "cake", "app_name": "Cake"},
    {"id": 829587759, "name": "babbel", "app_name": "Babbel"},
]

# Two-letter storefront code -> human-readable country name.
# Insertion order is the order in which storefronts are scraped.
country_mapping = {
    "il": "Israel",
    "ae": "United Arab Emirates",
    "ar": "Argentina",
    "au": "Australia",
    "br": "Brazil",
    "ca": "Canada",
    "cl": "Chile",
    "cn": "China",
    "de": "Germany",
    "dk": "Denmark",
    "fr": "France",
    "gl": "Greenland",
    "hk": "Hong Kong",
    "ie": "Ireland",
    "in": "India",
    "ir": "Iran",
    "it": "Italy",
    "jp": "Japan",
    "kr": "South Korea",
    "mx": "Mexico",
    "nl": "Netherlands",
    "no": "Norway",
    "nz": "New Zealand",
    "ru": "Russia",
    "sa": "Saudi Arabia",
    "se": "Sweden",
    "tr": "Turkey",
    "uk": "United Kingdom",
    "us": "United States",
    "fi": "Finland",
    "es": "Spain",
    "et": "Ethiopia",
    "za": "South Africa",
    "jo": "Jordan",
    "lb": "Lebanon",
}

# Wider language list kept for reference (currently disabled):
# languages = ['he', 'ar', 'en', 'pt', 'es', 'de', 'da', 'fr', 'zh', 'ja', 'ko', 'nl', 'no', 'ru', 'sv', 'tr', 'fi', 'et', 'af']
languages = ["en", "tr"]

# Star ratings for which reviews are collected.
ratings_to_scrape = [1, 2, 5]

def fetch_reviews(app_id, app_name, country_code, language, rating):
    """Fetch reviews for one app in one storefront, filtered by rating and language.

    Args:
        app_id: Numeric App Store id passed to ``AppStore``.
        app_name: App name passed to ``AppStore`` and used in the error message.
        country_code: Storefront code; also written to each review's 'Country' key.
        language: Language code forwarded to ``filter_reviews_by_language``.
        rating: Only reviews whose 'rating' equals this value are kept.

    Returns:
        The list of matching review dicts, each tagged with 'Country'.
        On any exception an error line is printed, the function waits
        5 seconds, and an empty list is returned.
    """
    # Imported here because the notebook's import cell does not bring in
    # `time`; without it the error path below raised NameError on a fresh kernel.
    import time

    try:
        app_store = AppStore(country=country_code, app_id=app_id, app_name=app_name)
        # A single random delay between 1 and 3 is drawn per call and handed to the scraper
        app_store.review(how_many=review_number, after=None, sleep=random.uniform(1, 3))

        filtered_reviews = [review for review in app_store.reviews if review.get('rating') == rating]
        language_reviews = filter_reviews_by_language(filtered_reviews, language)
        for review in language_reviews:
            review['Country'] = country_code
        return language_reviews
    except Exception as e:
        # Best-effort: report the failure, pause, and let the caller move on
        print(f"Error fetching reviews for {app_name} in {country_code}: {e}")
        time.sleep(5)  # pause after an error before the next request
        return []

def filter_reviews_by_language(reviews, language, min_length=20):
    """Keep only the reviews whose text is detected as ``language``.

    Args:
        reviews: Iterable of review dicts; the text is read from the 'review' key.
        language: Language code compared against the result of ``detect``.
        min_length: Reviews whose text is not strictly longer than this many
            characters are skipped. Defaults to 20.

    Returns:
        A list of the matching review dicts. Each kept dict is modified in
        place: 'review' is set to the (possibly cleaned) text and 'Language'
        is set to ``language``.
    """
    language_reviews = []
    for review in reviews:
        content = review.get("review")
        # Skip missing, empty, or too-short texts
        if not content or len(content) <= min_length:
            continue
        try:
            detected = detect(content)
        except Exception:
            # Best-effort: a review whose language cannot be detected is
            # skipped. Unlike a bare `except`, this lets KeyboardInterrupt
            # and SystemExit propagate so a long run can still be stopped.
            continue
        if detected != language:
            continue
        if language == 'en':
            content = clean_text(content)  # clean up stray mis-encoded characters in English reviews
        review['review'] = content
        review['Language'] = language
        language_reviews.append(review)
    return language_reviews

def clean_text(text):
    """Return ``text`` with known mis-encoded character sequences repaired.

    The substitutions are applied in a fixed order: the garbled apostrophe
    sequence becomes a plain "'", then the remaining garbled sequence is removed.
    """
    return text.replace("â€™", "'").replace("â€", "")


# Scrape every configured app for every rating of interest across all
# storefronts, writing one CSV per (app, rating) pair.
# NOTE(review): fetch_reviews downloads a storefront's reviews and only then
# filters by rating, so each storefront is fetched once per rating.
for app_info in apps:
    app_id = app_info["id"]
    app_name = app_info["app_name"]
    for rating in ratings_to_scrape:
        # Start from an empty list for each rating; previously the list was
        # created once per app, so every later rating's file also contained
        # the reviews collected for the earlier ratings.
        all_reviews = []
        for country_code in country_mapping.keys():
            reviews = fetch_reviews(app_id, app_name, country_code, 'en', rating)  # 'en' is used here as an example language
            all_reviews.extend(reviews)
        # Save this rating's reviews to CSV
        reviews_df = pd.DataFrame(all_reviews)
        file_name = f'{app_name}-rating-{rating}-reviews.csv'
        reviews_df.to_csv(file_name, index=False, header=True, encoding='utf_8_sig')
        print(f'Combined reviews for {app_name} with rating {rating} saved to {file_name}')


Error fetching reviews for Mondly in il: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /il/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


2024-02-22 21:32:11,763 [INFO] Base - Initialised: AppStore('ae', 'mondly', 987873536)
2024-02-22 21:32:11,765 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ae/app/mondly/id987873536
2024-02-22 21:32:14,363 [INFO] Base - [id:987873536] Fetched 150 reviews (150 fetched in total)
2024-02-22 21:32:14,976 [INFO] Base - Initialised: AppStore('ar', 'mondly', 987873536)
2024-02-22 21:32:14,977 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ar/app/mondly/id987873536
2024-02-22 21:32:16,195 [INFO] Base - [id:987873536] Fetched 69 reviews (69 fetched in total)
2024-02-22 21:32:17,731 [INFO] Base - Initialised: AppStore('au', 'mondly', 987873536)
2024-02-22 21:32:17,732 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/au/app/mondly/id987873536
2024-02-22 21:32:22,760 [INFO] Base - [id:987873536] Fetched 320 reviews (320 fetched in total)
2024-02-22 21:32:24,289 [INFO] Base - [id:987873536] Fetched 402 reviews (402 fetched in total)
2024-02-2

Error fetching reviews for Mondly in ca: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /ca/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


2024-02-22 21:33:51,959 [INFO] Base - Initialised: AppStore('cl', 'mondly', 987873536)
2024-02-22 21:33:51,964 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/cl/app/mondly/id987873536
2024-02-22 21:33:54,867 [INFO] Base - [id:987873536] Fetched 121 reviews (121 fetched in total)
2024-02-22 21:33:56,619 [INFO] Base - Initialised: AppStore('cn', 'mondly', 987873536)
2024-02-22 21:33:56,620 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/cn/app/mondly/id987873536
2024-02-22 21:34:01,864 [INFO] Base - [id:987873536] Fetched 360 reviews (360 fetched in total)
2024-02-22 21:34:07,468 [INFO] Base - [id:987873536] Fetched 680 reviews (680 fetched in total)
2024-02-22 21:34:13,097 [INFO] Base - [id:987873536] Fetched 1020 reviews (1020 fetched in total)
2024-02-22 21:34:18,787 [INFO] Base - [id:987873536] Fetched 1280 reviews (1280 fetched in total)
2024-02-22 21:34:23,928 [INFO] Base - [id:987873536] Fetched 1567 reviews (1567 fetched in total)
2024-02-22 2

Error fetching reviews for Mondly in dk: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /dk/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


2024-02-22 21:36:04,414 [INFO] Base - Initialised: AppStore('fr', 'mondly', 987873536)
2024-02-22 21:36:04,416 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/fr/app/mondly/id987873536
2024-02-22 21:36:23,464 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/fr/apps/987873536/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2024-02-22 21:36:23,472 [INFO] Base - [id:987873536] Fetched 0 reviews (0 fetched in total)
2024-02-22 21:36:23,887 [INFO] Base - Initialised: AppStore('gl', 'mondly', 987873536)
2024-02-22 21:36:23,889 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/gl/app/mondly/id987873536
2024-02-22 21:36:24,189 [ERROR] Base - Something went wrong: 'data'
2024-02-22 21:36:24,190 [INFO] Base - [id:987873536] Fetched 0 reviews (0 fetched in total)
202

Error fetching reviews for Mondly in ie: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /ie/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


2024-02-22 21:37:34,304 [INFO] Base - Initialised: AppStore('in', 'mondly', 987873536)
2024-02-22 21:37:34,305 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/in/app/mondly/id987873536
2024-02-22 21:37:54,602 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/in/apps/987873536/reviews?l=en-GB&offset=80&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2024-02-22 21:37:54,613 [INFO] Base - [id:987873536] Fetched 80 reviews (80 fetched in total)


Error fetching reviews for Mondly in ir: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /ir/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in it: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /it/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in jp: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /jp/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


2024-02-22 21:41:13,730 [INFO] Base - Initialised: AppStore('kr', 'mondly', 987873536)
2024-02-22 21:41:13,731 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/kr/app/mondly/id987873536
2024-02-22 21:41:33,189 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/kr/apps/987873536/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2024-02-22 21:41:33,193 [INFO] Base - [id:987873536] Fetched 0 reviews (0 fetched in total)


Error fetching reviews for Mondly in mx: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /mx/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in nl: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /nl/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in no: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /no/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in nz: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /nz/app/mondly/id987873536 (Cau

2024-02-22 21:45:54,732 [INFO] Base - Initialised: AppStore('ru', 'mondly', 987873536)
2024-02-22 21:45:54,734 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ru/app/mondly/id987873536
2024-02-22 21:46:15,516 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/ru/apps/987873536/reviews?l=en-GB&offset=20&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2024-02-22 21:46:15,524 [INFO] Base - [id:987873536] Fetched 20 reviews (20 fetched in total)


Error fetching reviews for Mondly in sa: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /sa/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in se: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /se/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


2024-02-22 21:48:30,173 [INFO] Base - Initialised: AppStore('tr', 'mondly', 987873536)
2024-02-22 21:48:30,174 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/tr/app/mondly/id987873536
2024-02-22 21:48:35,389 [INFO] Base - [id:987873536] Fetched 220 reviews (220 fetched in total)
2024-02-22 21:48:41,107 [INFO] Base - [id:987873536] Fetched 420 reviews (420 fetched in total)
2024-02-22 21:48:46,828 [INFO] Base - [id:987873536] Fetched 680 reviews (680 fetched in total)
2024-02-22 21:48:55,128 [INFO] Base - [id:987873536] Fetched 860 reviews (860 fetched in total)
2024-02-22 21:49:00,824 [INFO] Base - [id:987873536] Fetched 1020 reviews (1020 fetched in total)
2024-02-22 21:49:08,817 [INFO] Base - [id:987873536] Fetched 1280 reviews (1280 fetched in total)
2024-02-22 21:49:14,306 [INFO] Base - [id:987873536] Fetched 1520 reviews (1520 fetched in total)
2024-02-22 21:49:19,924 [INFO] Base - [id:987873536] Fetched 1720 reviews (1720 fetched in total)
2024-02-22 21:49:20,3

Error fetching reviews for Mondly in uk: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /uk/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in us: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /us/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error fetching reviews for Mondly in fi: HTTPSConnectionPool(host='apps.apple.com', port=443): Max retries exceeded with url: /fi/app/mondly/id987873536 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))


KeyboardInterrupt: 