In [1]:
import datetime
import json
import os
import random
import re
import uuid
from datetime import timedelta
from string import ascii_lowercase, ascii_uppercase
from urllib.parse import quote_plus

import dotenv
import pandas as pd
from faker import Faker
from faker.providers import credit_card
from pymongo import MongoClient

# Shared Faker instance created with the 'en_IN' locale. The helper functions
# and the generation loop in the cells below read it as the global `fake`.
fake = Faker('en_IN')


In [2]:
def create_user_id(existing_user_ids:list):
    """Generate a random user id that is not present in ``existing_user_ids``.

    The id is the concatenation of a short prefix, two mixed digit/letter body
    segments and a four-digit suffix. Candidates are regenerated until one is
    found that is not already taken.

    Args:
        existing_user_ids: Ids already in use. Only membership is checked; the
            collection is not modified here (the caller appends the result).

    Returns:
        str: A freshly generated id that is not in ``existing_user_ids``.
    """
    prefixes = ['0b','0x','0z','7s','7d','7x','3b','9u','9d','3d','0d','7b','7a','9a','7a','3x','7xd','0zc','9zd','5de','3od','0zc','0sx']
    tail_letters = 'E,F,W,X,Z,D,Y,Z,Q,R,S,T,L,N,O,M'.split(',')

    # Loop instead of recursing: the earlier recursive retry dropped the
    # result of the nested call, so a collision made the function return None.
    while True:
        prefix = random.choice(prefixes)
        body1 = f"{random.randint(0,9999)}{''.join(random.choices(ascii_uppercase,k=2))}{''.join(random.choices(ascii_lowercase,k=6))}"
        body2 = f"{random.randint(0,99)}{''.join(random.choices(ascii_lowercase,k=3))}{''.join(random.choices(tail_letters,k=2))}"
        suffix = f"{random.randint(1000,9999)}"

        # Named `user_id` rather than `id` to avoid shadowing the builtin.
        user_id = prefix+body1+body2+suffix
        if user_id not in existing_user_ids:
            return user_id

def create_user_name(existing_user_names:list):
    """Return a Faker-generated user name that is not in ``existing_user_names``.

    Args:
        existing_user_names: User names already in use. Only membership is
            checked; the collection is not modified here.

    Returns:
        str: A user name from ``fake.user_name()`` that is not already taken.
    """
    # Loop instead of recursing: the earlier recursive retry did not return
    # the nested call's value, so a collision produced None.
    while True:
        name = fake.user_name()
        if name not in existing_user_names:
            return name

 
def create_enrolled_dt(dt):
    """Pick a random enrollment timestamp for a course.

    Args:
        dt: Course publish timestamp as a '%Y-%m-%d %H:%M:%S' string.

    Returns:
        str: The value returned by ``fake.date_time_between_dates`` for the
        window from the publish date to 2023-05-28, formatted as
        '%Y-%m-%d %H:%M:%S'.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    # Only the date part of the publish timestamp is used as the window start.
    window_start = datetime.datetime.strptime(dt, timestamp_format).date()
    window_end = datetime.date(2023, 5, 28)
    enrolled_at = fake.date_time_between_dates(window_start, window_end)
    return enrolled_at.strftime(timestamp_format)


def create_progress_report(status):
    """Map a course progress status to a progress value.

    Args:
        status: Progress status string.

    Returns:
        int: 1 for 'enrolled', a random integer from 5 to 90 (inclusive) for
        'in-progress', and 100 for any other status.
    """
    if status == 'enrolled':
        return 1
    if status != 'in-progress':
        # Every status other than the two above counts as fully done.
        return 100
    return random.randint(5, 90)
    

def create_certification_dt(enrolled_dt):
    """Pick a random certification timestamp for a completed course.

    Args:
        enrolled_dt: Enrollment timestamp as a '%Y-%m-%d %H:%M:%S' string.

    Returns:
        str: The value returned by ``fake.date_time_between_dates`` for the
        window from 45 days after the enrollment date to 2023-11-30,
        formatted as '%Y-%m-%d %H:%M:%S'.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    enrolled_on = datetime.datetime.strptime(enrolled_dt, timestamp_format).date()
    # The window opens 45 days after the enrollment date.
    earliest = enrolled_on + timedelta(days=45)
    latest = datetime.date(2023, 11, 30)
    certified_at = fake.date_time_between_dates(earliest, latest)
    return certified_at.strftime(timestamp_format)


def create_payment_id(existing_payment_ids):
    """Generate a payment id that is not present in ``existing_payment_ids``.

    The id is a random integer from 100 to 10000 followed by a lower-cased
    ``fake.bban()`` value and a ``fake.ean()`` value.

    Args:
        existing_payment_ids: Payment ids already in use. Only membership is
            checked; the collection is not modified here.

    Returns:
        str: A freshly generated id that is not in ``existing_payment_ids``.
    """
    # Loop instead of recursing: the earlier recursive retry discarded the
    # nested call's result, so a collision made the function return None.
    while True:
        payment_id_ = f"{random.randint(100,10000)}{fake.bban().lower()}{fake.ean()}"
        if payment_id_ not in existing_payment_ids:
            return payment_id_
    

**Indian Learner**

In [3]:
# Learner documents generated by this cell; a later cell loads them into MongoDB.
data_list = []

# Only courses with more than 1000 subscribers are used as enrollment candidates.
course_df = pd.read_csv('./data/course.csv')
course_df = course_df[course_df['num_subscribers']>1000]

# course_ids list
course_ids = course_df['course_id'].tolist()
# course published date dict
course_publish_dt = course_df.set_index('course_id')['published_date'].to_dict()

# read json file to get the user ids, user names and payment ids already handed out
with open("./existing_users.json", "r") as read_file:
    existing_users = json.load(read_file)

existing_ids = existing_users["existing_userids"]
existing_names = existing_users['existing_usernames']
existing_payment_ids = existing_users['existing_paymentids']

# Coupon codes, built once outside the loop. The earlier inline list had no
# comma between 'LONGRUNED' and 'DEVWITHED', so Python's implicit string
# concatenation merged them into one bogus code 'LONGRUNEDDEVWITHED'.
coupon_codes = [
    'DISCOUNTED', 'SALEED', 'PROMOED', 'GROWWITHED', 'DOWITHED', 'PRACTICEWITHWED',
    'LEARNWITHED', 'LIVEWITHED', 'LASTMINUTED', 'LONGRUNED',
    'DEVWITHED', 'ARTISTED', 'RISEWITHED', 'DANCEWITHED', 'BREATHWITHED', 'CALLWITHED',
]

for _ in range(100):

    # Each generated id/name is appended immediately so later iterations
    # (and later payment ids) cannot reuse it.
    user_id = create_user_id(existing_user_ids=existing_ids)
    existing_ids.append(user_id)
    user_name = create_user_name(existing_user_names=existing_names)
    existing_names.append(user_name)
    first_name = fake.first_name()
    last_name = fake.last_name()
    email_id = f"{first_name.lower()}.{last_name.lower()}{random.choice([random.randint(0,999),''])}@{random.choice(['bluevice.com', 'example.in','seesight.com','yellowstone.in', 'example.com'])}"
    mobile = "+91-" +fake.phone_number()
    # The string is reversed so that the count=1 replacements hit the LAST
    # space and the LAST hyphen of the address, then it is reversed back.
    address = fake.address().replace('\n',',')[::-1].replace(' ', ',',1).replace('-',',',1)[::-1].replace(',,',',')
    country = 'India'
    profession = random.choice(['student', 'professional', 'freelancer', 'businessman', 'homemaker', 'trainer', 'influencer', 'artist'])
    age = str(random.randint(17,50))
    field_of_interest = random.choice(['data science and data analytics', 'language studies', 'science', 'design and artistic', 'IT'])
    reason_of_enrollment = random.choice(['MOOC', 'career change', 'new hobby', 'free-lancing', 'self-improvement'])
    is_verified = random.choice(['yes','no'])
    verification_type = "adhar" if is_verified=='yes' else None
    # 11 random decimal digits, only for verified learners.
    verification_id = "".join(random.choices('0123456789',k=11)) if is_verified=='yes' else None
    joined_on = fake.date_time_between_dates(datetime.date(2021,1,1), datetime.date(2023,1,1)).strftime('%Y-%m-%d %H:%M:%S')

    # course_data
    courses = {}

    # Sample WITHOUT replacement so the same entry of course_ids is not drawn
    # twice for one learner (random.choices could repeat a course); the
    # count is capped at the number of available courses.
    enrolled_course_ids = random.sample(course_ids, k=min(random.randint(0,7), len(course_ids)))

    if len(enrolled_course_ids) > 0:
        for course_no, course_id in enumerate(enrolled_course_ids):
            enrolled_dt = create_enrolled_dt(course_publish_dt[course_id])
            coupon_ed = fake.random_element(coupon_codes)
            paid_or_not = fake.random_element(['Yes','No'])
            payment_method = fake.random_element(['UPI','credit card', 'debit card','net banking', 'promo'])
            payment_id = create_payment_id(existing_payment_ids)
            existing_payment_ids.append(payment_id) # add the payment id to list
            # 'completed' is only offered when the enrollment date is before 2023-07-01.
            progress_status = fake.random_element(['enrolled','in-progress','completed']) if datetime.datetime.strptime(enrolled_dt, '%Y-%m-%d %H:%M:%S').date() < datetime.date(2023,7,1) \
                else fake.random_element(['enrolled','in-progress'])
            progress_report = create_progress_report(progress_status)
            paid_price = random.randint(0,2000)
            # Only completed courses get a certification date.
            certification_dt = create_certification_dt(enrolled_dt=enrolled_dt) if progress_status=='completed' else None
            # Courses are keyed by their 1-based position as a string.
            courses[str(course_no+1)] = {
                "course_id": course_id,
                "enrolled_dt": enrolled_dt,
                "is_paid": paid_or_not,
                "price_paid": f"{paid_price} INR",
                "discount_applied": True,
                "coupon_code": coupon_ed,
                "payment_method": payment_method,
                "payment_id": payment_id,
                "progress_status": progress_status,
                "progress_report": progress_report,
                "certification_dt": certification_dt
            }
    else:
        # Learners with no enrollments store None rather than an empty dict.
        courses = None


    data = {
        "_id":user_id,
        "user_name":user_name,
        "email_id":email_id,
        "signin_dt":joined_on,
        "first_name": first_name,
        "last_name": last_name,
        "age":age,
        "mobile": mobile,
        "address": address,
        "country":country,
        "profession":profession,
        "field_of_interest":field_of_interest,
        "reason_of_enrollment":reason_of_enrollment,
        "is_verified":is_verified,
        "verification_type":verification_type,
        "verification_id":verification_id,
        "courses": courses
    }

    data_list.append(data)

In [4]:
# # write the data to a disk

# def write_to_disk(data):
#     pattern = re.compile(r'learners_(\d{1,2})_(in|us).csv')
#     numbers_ = [int(file.split('_')[1]) for file in os.listdir('./data/') if pattern.match(file)]
#     next_number = 1 if len(numbers_)== 0 else (max(numbers_)+1)
#     learners_file = f"./data/learners_{next_number}_us.csv" # change the country code

#     df = pd.DataFrame(data)
#     df.to_csv(learners_file,index=False)

# write_to_disk(data=data_list)

# # write internal json
# json_existing_users = {
#     "existing_userids":existing_ids,
#     "existing_usernames":existing_names,
#     "existing_paymentids": existing_payment_ids
# }

# with open("./existing_users.json",'w') as json_file:
#     json.dump(json_existing_users, json_file, indent=4)

In [5]:
# Load connection settings from a .env file / the process environment.
dotenv.load_dotenv()

USER = os.getenv("MONGO_USER")
PASSWORD = os.getenv("MONGO_PWD")
DATABASE = os.getenv("MONGO_DB")
COLLECTION = "LEARNERS"

# Fail fast with a clear message: without this check an unset variable would
# be formatted into the URI / database name as the literal string "None".
_missing_env = [
    env_name
    for env_name, env_value in (("MONGO_USER", USER), ("MONGO_PWD", PASSWORD), ("MONGO_DB", DATABASE))
    if not env_value
]
if _missing_env:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(_missing_env)}")

# Percent-escape the credentials so reserved characters such as '@', ':' or
# '/' do not corrupt the connection string. NOTE(review): this assumes the
# .env file stores the raw (not already percent-encoded) values - confirm.
ATLAS_URI = f"mongodb+srv://{quote_plus(USER)}:{quote_plus(PASSWORD)}@cluster0.0nlbhko.mongodb.net/?retryWrites=true&w=majority"
mongo_client=MongoClient(ATLAS_URI)
collection = mongo_client[DATABASE][COLLECTION]

# load data to MongoDB
result = collection.insert_many(data_list)
print('Successfully inserted data')
# print("Inserted ID:\n")
# print(result.inserted_ids)

# write internal json
# Persisted only after the insert call has returned, so the id registry on
# disk is not updated if the insert raises.
json_existing_users = {
    "existing_userids":existing_ids,
    "existing_usernames":existing_names,
    "existing_paymentids": existing_payment_ids
}

with open("./existing_users.json",'w') as json_file:
    json.dump(json_existing_users, json_file, indent=4)


Successfully inserted data


In [6]:
# # write internal json
# json_existing_users = {
#     "existing_userids":[],
#     "existing_usernames": [],
#     "existing_paymentids": []
# }

# with open("./existing_users.json",'w') as json_file:
#     json.dump(json_existing_users, json_file, indent=4)