Importing the required libraries

In [409]:
import numpy as np
import pandas as pd
import openai
import os

from pymongo import MongoClient

Connecting to MongoDB

In [2]:
%%time

client = MongoClient(
    os.environ["MONGODB_URL"],
    serverSelectionTimeoutMS=300000
)
db = client["vidio"]
collection = db["google_play_store_reviews"]
df_original = pd.DataFrame(list(collection.find()))

CPU times: total: 2.08 s
Wall time: 1min 46s


Loading the data

In [4]:
df = df_original.copy()
df = df.drop_duplicates()
df = df.drop("_id", axis=1)
df = df.replace("empty", np.nan)
df = df.sort_values("at", ascending=False)
df = df.reset_index(drop=True)

In [6]:
# df[(df["topic"].notna()) & (df["score"] <= 3)].to_excel("topic_cleaning.xlsx")

In [8]:
df_sliced = df[(df["topic"].notna()) & (df["score"] <= 3)]

Creating the functions for translation and topic modeling using the GPT-3.5 Turbo model

In [11]:
openai.api_key = os.environ["OPENAI_API_KEY"]

In [229]:
def translate_to_english(text):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": f'Please translate this Indonesian text "{text}" to english in the format [EN: translation], but if there is no English translation, return [EN: Cannot be translated]. Please make sure write in the format that I requested only.'
            }
        ]
    )

    return response["choices"][0]["message"]["content"]

In [292]:
def assign_topic(text):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": f'Please assign one of the topics (Advertisement, Watching Experience, Package, Technical, Network, Others) to this text "{text}" in the format [Topic: assigned topic]. Please make sure write in the format that I requested only.'
            }
        ]
    )

    return response["choices"][0]["message"]["content"]

Translating all the reviews to English

In [78]:
%%time

english = []
for i in df_sliced["content_original"]:
    translated_text = "Translation Error"
    for j in range(5):
        try:
            translated_text = translate_to_english(i)
            break
        except:
            pass
    english.append(translated_text)

CPU times: total: 17.4 s
Wall time: 13h 7min 4s


In [250]:
for idx, val in enumerate(english):
    print(f"{idx}. {val}")

0. [EN: Subscription but weird, still cutting my main data quota and making it more wasteful, thought subscription without ads but instead flooded with sales promos... 😏😏]
1. [EN: My package was duplicated but there was no reversal.]
2. [EN: Always fail to open live EPL broadcast]
3. [EN: What you want to watch is all paid for 🤣 download as well because there's a bonus video quota, it's good for entertainment when downloading and entering the application, mostly paid for what you want to watch 🤣.]
4. [EN: Love to watch for free]
5. [EN: Why rarely release new Films, and now there are advertisements during movie screenings.]
6. [EN: Connection is not good]
7. [EN: Still trying]
8. [EN: Buffering, slow, even though the network is stable]
9. [EN: Already subscribed to the diamond package, but when logging in, it is not registered in the package. It's useless for me to pay if in the end it can't be used.]
10. [EN: The application is useless, I already downloaded it and it's running videos 

Finding the indices of the translated reviews that aren't in the correct format

In [272]:
import re

def find_invalid_indices(english):
    invalid_indices = []
    for i, text in enumerate(english):
        if not re.match(r'^\[EN: [^\[\]]+\]$', text):
            invalid_indices.append(i)
    return invalid_indices

In [283]:
invalid_indices = find_invalid_indices(english)
invalid_indices

[23,
 51,
 61,
 89,
 97,
 146,
 156,
 161,
 180,
 183,
 193,
 231,
 259,
 262,
 283,
 296,
 319,
 321,
 328,
 329,
 331,
 351,
 394,
 428,
 446,
 451,
 455,
 480,
 488,
 490,
 499,
 509,
 517,
 523,
 524,
 553,
 564,
 569,
 575,
 594,
 599,
 633,
 664,
 669,
 690,
 692,
 741,
 797,
 804,
 849,
 857,
 865,
 869,
 893,
 901,
 914,
 925,
 951,
 983,
 985,
 1011,
 1012,
 1016,
 1019,
 1039,
 1058,
 1067,
 1068,
 1069,
 1070,
 1105,
 1157,
 1164,
 1171,
 1177,
 1209,
 1223,
 1226,
 1267,
 1291,
 1292,
 1299,
 1306,
 1320,
 1345,
 1353,
 1354,
 1362,
 1373,
 1387,
 1390,
 1396,
 1439,
 1545,
 1557,
 1562,
 1590,
 1592,
 1635,
 1661,
 1677,
 1689,
 1697,
 1700,
 1724,
 1745,
 1769,
 1801,
 1802,
 1824,
 1839,
 1845,
 1888,
 1891,
 1896,
 1911,
 1927,
 1936,
 1961,
 1966,
 1978,
 1986,
 2077,
 2088,
 2106,
 2128,
 2131,
 2142,
 2163,
 2164,
 2172,
 2174,
 2182,
 2184,
 2197,
 2210,
 2250,
 2254,
 2280,
 2285,
 2288,
 2300,
 2309,
 2322,
 2339,
 2349,
 2373,
 2384,
 2418,
 2443,
 2467,
 2470,
 

In [284]:
len(invalid_indices)

1109

In [287]:
[list(df_sliced["content_original"])[i] for i in invalid_indices]

['"ada banyak tayangan yang bisa kamu saksikan secara gratis di Vidio ya. Namun jika ingin bisa menyaksikan semua tayangan Premier dan terbebas dari iklan, kami sarankan kamu untuk mengaktifkan paket berlangganan Vidio terlebih dahulu. Jangan lupa ditambahkan bintangnya ya kak" Lawakan macam apa, lebih lawakan ini daripada tim sepakbola yang ngelawak. Expecting untuk nambah bintang? No. Think yourself. Iklan boleh namun banyak dan ga bisa di skip jangan lah. Kau kira it\'s easy dapet duit? Pikirkan',
 'Kok akun gw tiba" keluar terus gak bisa login lagi,pas di coba lagi berkali" gabisa mulu,padahal gw udh bayar buat berlangganan,tolong dong',
 'ishh bintang 2 dulu laa, soalnya gatau cara liat video yang udah aku unduh harusnya tuu fitur gtuuu tuu jelas terpampang biar ga puyeng nyarinya eh udh nyari² ga ketemu terus padahal udh download banyak apa harus premium dulu biar bisa liat apa yang kita download?',
 'Kok videonya eror titiba2 ada tulisan kaloh mau di matin atau cari acara lain, 

Re-translating the reviews that aren't in the correct format

In [288]:
%%time

english_revision = []
for i in [list(df_sliced["content_original"])[i] for i in invalid_indices]:
    translated_text = "[EN: Cannot be translated]"
    for j in range(5):
        try:
            while True:
                translated_text = translate_to_english(i)
                if re.match(r'^\[EN: [^\[\]]+\]$', translated_text):
                    break
            break
        except:
            pass
    english_revision.append(translated_text)

CPU times: total: 3.12 s
Wall time: 1h 37min 35s


In [289]:
english_revision

["[EN: There are many shows that you can watch for free on Vidio. However, if you want to watch all Premier shows and be free from ads, we recommend that you activate a Vidio subscription package first. Don't forget to give it a star, okay? What kind of comedy is this? This is funnier than a football team's jokes. Expecting more stars? No. Think for yourself. Ads are okay but don't put too many and can't be skipped. Do you think it's easy to get money? Think about it.]",
 '[EN: "Why does my account keep logging out and I can\'t log in anymore? I have tried multiple times but it still doesn\'t work, even though I have paid for a subscription. Please help."]',
 "[EN: Ishh, let's start with level 2 first because I don't know how to watch the videos that I've downloaded. There should be a feature to make it clear so I won't have to strain looking for it. I've searched and searched but couldn't find it despite having downloaded many. Do I need to go premium to be able to watch what I've dow

Combining the reviews that were previously not in the correct format but have now been translated with the original translated reviews

In [333]:
english_revision[1100] = "[EN: Angel]"
english_revision[1101] = "[EN: Cannot be translated]"
english_revision[1102] = "[EN: Cannot be translated]"
english_revision[1103] = "[EN: Cannot be translated]"
english_revision[1104] = "[EN: Cannot be translated]"
english_revision[1105] = "[EN: Complicated]"
english_revision[1106] = "[EN: I'm sorry, sir. I purchased it through Gopay, but the payment process is complicated. Please help me reset my Google, Gmail, and other passwords. Even though it's smooth for other applications, it's difficult for me to purchase a subscription on Vidio. This is a payment, not a theft. It shouldn't be this hard.]"
english_revision[1107] = "[EN: There were bugs in the Southampton vs MU match from the 80th to 90th minute. If the customer service is aware, please check the comments on their Instagram. Then, can the tech team fix the bugs? There's no need to ask for screenshots, they should already be aware, right? Hopefully, all the bugs can be fixed in the next match. The problem occurred through the website.]"
english_revision[1108] = "[EN: Bad]"

In [334]:
for i, j in zip(invalid_indices, english_revision):
    print(i, j)

23 [EN: There are many shows that you can watch for free on Vidio. However, if you want to watch all Premier shows and be free from ads, we recommend that you activate a Vidio subscription package first. Don't forget to give it a star, okay? What kind of comedy is this? This is funnier than a football team's jokes. Expecting more stars? No. Think for yourself. Ads are okay but don't put too many and can't be skipped. Do you think it's easy to get money? Think about it.]
51 [EN: "Why does my account keep logging out and I can't log in anymore? I have tried multiple times but it still doesn't work, even though I have paid for a subscription. Please help."]
61 [EN: Ishh, let's start with level 2 first because I don't know how to watch the videos that I've downloaded. There should be a feature to make it clear so I won't have to strain looking for it. I've searched and searched but couldn't find it despite having downloaded many. Do I need to go premium to be able to watch what I've downlo

In [336]:
english[19233]

'[EN: translation] PLEASE RETURN THE DA VINCI CHANNEL!! DA VINCI HAS QUALITY CONTENT AND ADDS INSIGHT AND KNOWLEDGE! In fact, one of the reasons I use Vidio Platinum is to watch Da Vinci. Why is Da Vinci now missing from Vidio!!!???'

In [337]:
for i, j in zip(invalid_indices, english_revision):
    english[i] = j

In [338]:
english[19233]

'[EN: Please bring back the Da Vinci channel!! Da Vinci has quality content, and it adds insight and knowledge! In fact, one of the reasons I subscribed to Vidio Platinum was to watch Da Vinci. Why has Da Vinci disappeared from Vidio???]'

In [355]:
for idx, val in enumerate(english):
    english[idx] = val.replace("[", "").replace("EN:", "").replace("]", "").replace('"', '').strip()

In [356]:
english[19233]

'Please bring back the Da Vinci channel!! Da Vinci has quality content, and it adds insight and knowledge! In fact, one of the reasons I subscribed to Vidio Platinum was to watch Da Vinci. Why has Da Vinci disappeared from Vidio???'

In [359]:
df_sliced["content_english"] = english

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sliced["content_english"] = english


In [360]:
df_sliced.head()

Unnamed: 0,reviewId,userName,userImage,content_original,content_english,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,topic
3,d4c34358-8c30-4de9-bd6d-56460318335e,Fey Kirana,https://play-lh.googleusercontent.com/a-/ACB-R...,"Berlangganan tapi aneh,,ttp aja malah mmotong ...","Subscription but weird, still cutting my main ...",1,0,6.3.8-80a04c7878,2023-05-04 00:48:03,"Hai Sahabat Vidio , perihal apa yang dapat kam...",2023-05-04 00:51:55,Advertisement
6,1a074b76-870c-457f-b7d3-b39814159333,Syifaun Muhammad,https://play-lh.googleusercontent.com/a/AGNmyx...,paket saya terdoble tapi tdk ada reversal,My package was duplicated but there was no rev...,1,0,6.3.8-80a04c7878,2023-05-03 19:38:14,Hai kak. kami akan bantu pengecekan untuk kend...,2023-05-03 20:34:39,Package
7,045e7574-29b8-4bdc-91ea-46ff8d9a22b3,Probo Sucitro,https://play-lh.googleusercontent.com/a/AGNmyx...,Selalu gagal buka tayangan live EPL,Always fail to open live EPL broadcast,1,0,6.3.8-80a04c7878,2023-05-03 19:34:32,Hai kak. kami akan bantu pengecekan untuk kend...,2023-05-03 20:34:15,Bad Application
8,c6345d62-e852-4539-8a1a-2c8d8813a1b0,Gulbastian,https://play-lh.googleusercontent.com/a-/ACB-R...,Apa yg Mao ditonton berbayar semua🤣 download j...,What you want to watch is all paid for 🤣 downl...,1,0,6.3.8-80a04c7878,2023-05-03 19:10:57,Hai Kak. terima kasih atas reviewnya. Yuk berl...,2023-05-03 20:33:53,Bad Application
10,80d063c3-65b1-4cac-8f6a-3e7d0b0ce177,Putra Siantar,https://play-lh.googleusercontent.com/a/AGNmyx...,Kasih tontonan gratis la,Love to watch for free,3,0,,2023-05-03 17:41:18,"Hai Sahabat Vidio, terima kasih atas reviewnya...",2023-05-03 20:33:07,Bad Application


Assigning topics to each review

In [376]:
%%time

topics = []
for i in df_sliced["content_original"]:
    labeled_topic = "[Topic: Others]"
    for j in range(5):
        try:
            labeled_topic = assign_topic(i)
            break
        except:
            pass
    topics.append(labeled_topic)

CPU times: total: 10.4 s
Wall time: 9h 11min 46s


In [379]:
for idx, val in enumerate(topics):
    print(f"{idx}. {val}")

0. [Topic: Technical]
1. [Package: paket saya terdoble tapi tdk ada reversal]
2. [Technical: Selalu gagal buka tayangan live EPL]
3. [Watching Experience: assigned topic]
4. [Others: assigned topic]
5. [Others: assigned topic]
6. [Topic: Network]
7. [Others: assigned topic]
8. [Technical: Buffering, lemot, padahal jaringan stabil]
9. [Package: Sudah berlangganan paket diamond,,pas login malah tidak terdaftar di paket tersebut..percuma saya bayar kalau ujung ujungnya gx bisa dipakai.]
10. [Technical: Aplikasi ga guna udh gw donwload udh muter video malah lemot pdhl pakwt gua msih bnyak jringan bgus kgk lag..pokoknya apliksi kgk guna]
11. [Technical: Tidak bisa nonton menggunakan wifi!!]
12. [Topic: Advertisement]
13. [Others: assigned topic]
14. [Advertisement: assigned topic]
15. [Technical: Masa perangkat tidak bisa memutar resolusi 740 sih]
16. [Topic: Advertisement]
17. [Topic: Package]
18. [Topic: Technical]
19. [Others: cannot be assigned a specific topic]
20. [Topic: Advertisemen

In [386]:
cleaned_topics = [i for i in topics]

for idx, val in enumerate(cleaned_topics):
    if "Advertisement" in val:
        cleaned_topics[idx] = "Advertisement"
    elif "Watching Experience" in val:
        cleaned_topics[idx] = "Watching Experience"
    elif "Package" in val:
        cleaned_topics[idx] = "Package"
    elif "Technical" in val:
        cleaned_topics[idx] = "Technical"
    elif "Network" in val:
        cleaned_topics[idx] = "Network"
    elif "Others" in val:
        cleaned_topics[idx] = "Others"

In [388]:
for idx, val in enumerate(cleaned_topics):
    print(f"{idx}. {val}")

0. Technical
1. Package
2. Technical
3. Watching Experience
4. Others
5. Others
6. Network
7. Others
8. Technical
9. Package
10. Technical
11. Technical
12. Advertisement
13. Others
14. Advertisement
15. Technical
16. Advertisement
17. Package
18. Technical
19. Others
20. Advertisement
21. Others
22. Technical
23. Package
24. Advertisement
25. Advertisement
26. Technical
27. Others
28. Technical
29. Package
30. Technical
31. Others
32. Others
33. Advertisement
34. Advertisement
35. Others
36. Advertisement
37. Advertisement
38. Technical
39. Others
40. Advertisement
41. Technical
42. Advertisement
43. Others
44. Advertisement
45. Others
46. Others
47. Advertisement
48. Advertisement
49. Others
50. Technical
51. Technical
52. Advertisement
53. Technical
54. Advertisement
55. Package
56. Watching Experience
57. Advertisement
58. Technical
59. Technical
60. Advertisement
61. Watching Experience
62. Others
63. Technical
64. Others
65. Advertisement
66. Technical
67. Others
68. Others
69. A

In [392]:
df_sliced["topic"] = cleaned_topics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sliced["topic"] = cleaned_topics


Combining df_sliced to df

In [396]:
df["content_english"] = [np.nan] * len(df)
df["topic"] = [np.nan] * len(df)
df.head()

Unnamed: 0,reviewId,userName,userImage,content_original,content_english,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,topic
0,788e563e-ade2-48bb-851f-7f874f382a1a,Mawar Manjiri,https://play-lh.googleusercontent.com/a/AGNmyx...,"Kalau menurut saya apk ini bagus banget, karen...",,5,0,6.3.8-80a04c7878,2023-05-04 02:14:44,"Hai Sahabat Vidio, terimakasih sudah memberika...",2023-05-04 03:13:00,
1,a5aa454f-a32a-4db8-a3e0-ba7efee57e3b,Dhany Aprian,https://play-lh.googleusercontent.com/a/AGNmyx...,Fitur apk nya sih bagus tpi kenapa ya kalo apk...,,4,0,6.2.6-0cbd587cd9,2023-05-04 02:08:51,"Hai Sahabat Vidio, mohon maaf atas ketidaknyam...",2023-05-04 03:12:43,
2,e22143ed-c586-43b9-9b9e-b5b68451b053,Muhammad Ridho,https://play-lh.googleusercontent.com/a-/ACB-R...,Mntul,,5,0,6.3.8-80a04c7878,2023-05-04 00:58:56,"Hai Sahabat Vidio, thank you for your 5 stars....",2023-05-04 01:57:15,
3,d4c34358-8c30-4de9-bd6d-56460318335e,Fey Kirana,https://play-lh.googleusercontent.com/a-/ACB-R...,"Berlangganan tapi aneh,,ttp aja malah mmotong ...",,1,0,6.3.8-80a04c7878,2023-05-04 00:48:03,"Hai Sahabat Vidio , perihal apa yang dapat kam...",2023-05-04 00:51:55,
4,c85fa116-7949-4f0e-b5e8-57202d62248e,Nur Adhari Qosasih,https://play-lh.googleusercontent.com/a-/ACB-R...,Ngelag,,5,0,6.3.8-80a04c7878,2023-05-03 20:56:08,"Hai Kak Nur, Kami mohon maaf atas ketidaknyama...",2023-05-03 21:03:03,


In [407]:
df_merged = pd.merge(df.drop(["content_english", "topic"], axis=1), df_sliced[["topic"]], left_index=True, right_index=True, how="outer")
df_merged = pd.merge(df_merged, df_sliced[["content_english"]], left_index=True, right_index=True, how="outer")
df_merged = df_merged[["reviewId", "userName", "userImage", "content_original", "content_english", "score", "thumbsUpCount", "reviewCreatedVersion", "at", "replyContent", "repliedAt", "topic"]]
df_merged = df_merged.fillna("empty")
df_merged.head(10)

Unnamed: 0,reviewId,userName,userImage,content_original,content_english,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,topic
0,788e563e-ade2-48bb-851f-7f874f382a1a,Mawar Manjiri,https://play-lh.googleusercontent.com/a/AGNmyx...,"Kalau menurut saya apk ini bagus banget, karen...",empty,5,0,6.3.8-80a04c7878,2023-05-04 02:14:44,"Hai Sahabat Vidio, terimakasih sudah memberika...",2023-05-04 03:13:00,empty
1,a5aa454f-a32a-4db8-a3e0-ba7efee57e3b,Dhany Aprian,https://play-lh.googleusercontent.com/a/AGNmyx...,Fitur apk nya sih bagus tpi kenapa ya kalo apk...,empty,4,0,6.2.6-0cbd587cd9,2023-05-04 02:08:51,"Hai Sahabat Vidio, mohon maaf atas ketidaknyam...",2023-05-04 03:12:43,empty
2,e22143ed-c586-43b9-9b9e-b5b68451b053,Muhammad Ridho,https://play-lh.googleusercontent.com/a-/ACB-R...,Mntul,empty,5,0,6.3.8-80a04c7878,2023-05-04 00:58:56,"Hai Sahabat Vidio, thank you for your 5 stars....",2023-05-04 01:57:15,empty
3,d4c34358-8c30-4de9-bd6d-56460318335e,Fey Kirana,https://play-lh.googleusercontent.com/a-/ACB-R...,"Berlangganan tapi aneh,,ttp aja malah mmotong ...","Subscription but weird, still cutting my main ...",1,0,6.3.8-80a04c7878,2023-05-04 00:48:03,"Hai Sahabat Vidio , perihal apa yang dapat kam...",2023-05-04 00:51:55,Technical
4,c85fa116-7949-4f0e-b5e8-57202d62248e,Nur Adhari Qosasih,https://play-lh.googleusercontent.com/a-/ACB-R...,Ngelag,empty,5,0,6.3.8-80a04c7878,2023-05-03 20:56:08,"Hai Kak Nur, Kami mohon maaf atas ketidaknyama...",2023-05-03 21:03:03,empty
5,f2b4f187-4b24-4d6d-8fb4-44f596d1fafb,Dedi Saputra,https://play-lh.googleusercontent.com/a/AGNmyx...,Bugus,empty,5,0,6.3.8-80a04c7878,2023-05-03 20:25:49,Hai kak. Terima kasih sudah memberikan bintang...,2023-05-03 20:33:24,empty
6,1a074b76-870c-457f-b7d3-b39814159333,Syifaun Muhammad,https://play-lh.googleusercontent.com/a/AGNmyx...,paket saya terdoble tapi tdk ada reversal,My package was duplicated but there was no rev...,1,0,6.3.8-80a04c7878,2023-05-03 19:38:14,Hai kak. kami akan bantu pengecekan untuk kend...,2023-05-03 20:34:39,Package
7,045e7574-29b8-4bdc-91ea-46ff8d9a22b3,Probo Sucitro,https://play-lh.googleusercontent.com/a/AGNmyx...,Selalu gagal buka tayangan live EPL,Always fail to open live EPL broadcast,1,0,6.3.8-80a04c7878,2023-05-03 19:34:32,Hai kak. kami akan bantu pengecekan untuk kend...,2023-05-03 20:34:15,Technical
8,c6345d62-e852-4539-8a1a-2c8d8813a1b0,Gulbastian,https://play-lh.googleusercontent.com/a-/ACB-R...,Apa yg Mao ditonton berbayar semua🤣 download j...,What you want to watch is all paid for 🤣 downl...,1,0,6.3.8-80a04c7878,2023-05-03 19:10:57,Hai Kak. terima kasih atas reviewnya. Yuk berl...,2023-05-03 20:33:53,Watching Experience
9,02c3c7d6-d44c-4649-a216-2c9b4dbc280a,Aswad Getop,https://play-lh.googleusercontent.com/a/AGNmyx...,Saya suka,empty,5,0,6.3.8-80a04c7878,2023-05-03 18:53:38,Hai kak. Terima kasih sudah memberikan bintang...,2023-05-03 20:33:15,empty


Replacing existing values with new ones

In [410]:
%%time

client = MongoClient(
    os.environ["MONGODB_URL"],
    serverSelectionTimeoutMS=300000
)
db = client["vidio"]
collection = db["google_play_store_reviews"]
df_merged_dict = df_merged.to_dict("records")
collection.delete_many({})

batch_size = 1_000
num_records = len(df_merged_dict)
num_batches = num_records // batch_size

if num_records % batch_size != 0:
    num_batches += 1

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_records)
    batch = df_merged_dict[start_idx:end_idx]
    
    if batch:
        collection.insert_many(batch)

print("Data replaced successfully")

Data replaced successfully
CPU times: total: 7.83 s
Wall time: 11min 53s
