In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
def convert_to_list(string):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Parameters
    ----------
    string : str or any
        Text such as ``'a, b,c'``. Anything that is not a ``str`` (for
        example the float NaN pandas uses for a missing cell) is treated as
        missing.

    Returns
    -------
    list[str] or None
        The items with surrounding spaces removed, or ``None`` when the input
        is not a string or contains no non-empty item.
    """
    if not isinstance(string, str):
        return None

    # Strip first and filter afterwards, so that tokens consisting only of
    # spaces (e.g. the middle token of 'a,  ,b') are dropped instead of
    # surviving as empty strings.
    stripped = (token.strip(' ') for token in string.split(','))
    items = [item for item in stripped if item]

    return items or None

In [3]:
def handle_x_in_str(ins, x='Instructor: '):
    """Remove every occurrence of ``x`` from ``ins`` and split the rest on commas.

    Parameters
    ----------
    ins : str or any
        Raw cell text. Non-string values (e.g. NaN for a missing cell) are
        treated as missing.
    x : str, default 'Instructor: '
        Substring to remove before splitting.

    Returns
    -------
    list[str] or None
        Result of ``convert_to_list`` on the cleaned text, or ``None`` when
        either argument is not a string.
    """
    # An explicit type guard replaces the former bare ``except:``, which hid
    # every error (including KeyboardInterrupt) behind a ``None`` result.
    if not isinstance(ins, str) or not isinstance(x, str):
        return None

    # ``str.replace`` is already a no-op when ``x`` is absent, so no
    # membership test is needed.
    return convert_to_list(ins.replace(x, ''))

In [4]:
def convert_to_float(value):
    """Coerce a raw cell value to a number, using 0 for anything unusable.

    Parameters
    ----------
    value : int, float, numpy number, str, or any
        Numbers are passed through unchanged. Strings may contain thousands
        separators (``'1,234'``).

    Returns
    -------
    number
        The original value for numeric input, the parsed ``np.float64`` for a
        numeric string, and ``np.float64(0)`` for missing values (None, NaN,
        pd.NA), unparsable strings, and every other type.
    """
    # Numeric input: include numpy integer/floating scalars, which are not
    # instances of the built-in int/float and used to be turned into 0.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return np.float64(0) if pd.isna(value) else value

    if isinstance(value, str):
        try:
            parsed = np.float64(value.replace(',', ''))
        except ValueError:
            return np.float64(0)
        # The literal text 'nan' parses successfully; treat it as missing too.
        return np.float64(0) if np.isnan(parsed) else parsed

    # None, pd.NA, containers and any other type carry no usable number.
    return np.float64(0)

In [5]:
def equalize_lists(row):
    """Make the 'User' and 'Reviews' entries of a row the same length.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'User'`` and ``'Reviews'``; each value is
        expected to be a list or a missing value (None/NaN).

    Returns
    -------
    pd.Series
        Two elements ``[users, reviews]``:
        * both ``[]`` when either side is missing,
        * both truncated to the shorter length when they are lists/tuples of
          different lengths,
        * unchanged otherwise.
    """
    u, r = row['User'], row['Reviews']

    def _is_missing(value):
        # Only scalars can be "missing"; calling pd.isna on a list returns an
        # array whose truth value is ambiguous, which is what the previous
        # implementation relied on (via an exception) to reach its truncation
        # branch.
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    if _is_missing(u) or _is_missing(r):
        # A one-sided row cannot be paired up, so empty both sides.
        u, r = [], []
    elif isinstance(u, (list, tuple)) and isinstance(r, (list, tuple)) and len(u) != len(r):
        shortest = min(len(u), len(r))
        u, r = u[:shortest], r[:shortest]

    return pd.Series([u, r])

## *<i>Making Title IDs</i>*

In [6]:
# Load the three raw CSV files.
# Adjust the relative paths below to match your own directory structure.
course = pd.read_csv('../assets/raw_data/course.csv')
user_rating = pd.read_csv('../assets/raw_data/user_rating.csv')
courses = pd.read_csv('../assets/raw_data/courses.csv')

In [7]:
# Every distinct title that appears in any of the three raw tables.
titles = set(course['Title']) | set(courses['Title']) | set(user_rating['Title'])

# Assign ids in sorted order. Iterating a set of strings directly yields a
# different order on every kernel start (string hash randomization), which
# would give each title a different id on every run. `key=str` keeps the sort
# well-defined even if a missing title (float NaN) is present.
map_title = {
    v: x for x, v in enumerate(sorted(titles, key=str))
}

# Persist the title -> id lookup table.
title = pd.DataFrame(list(map_title.items()), columns=['Title', 'Index'])
title.to_csv('title.csv', index=False)

## *<i>course.csv</i>*

In [8]:
# Comma-separated text columns become Python lists (None when empty/missing).
course['links'] = course['links'].apply(convert_to_list)
course['Skills'] = course['Skills'].apply(convert_to_list)

# Instructor cells additionally have the 'Instructor: ' label removed.
course['Instructors'] = course['Instructors'].apply(handle_x_in_str)

# Rating counts are cleaned with convert_to_float and stored as integers.
course['Rating count'] = course['Rating count'].apply(convert_to_float).astype(np.int64)

# Replace each title with its numeric id from map_title.
course['Title'] = course['Title'].map(map_title)

In [9]:
# Give the two special columns their final names first ...
course.rename(columns={'Title': 'Title Index', 'links': 'Links'}, inplace=True)

# ... then normalise every column name to lower_snake_case.
course.columns = [name.lower().replace(" ", "_") for name in course.columns]

# Write the cleaned table next to the notebook.
course.to_csv('formated_course.csv', index=False)

## *<i>courses.csv</i>*

In [10]:
# Numeric columns: clean with convert_to_float; counts are stored as integers.
courses['Rating'] = courses['Rating'].apply(convert_to_float)
courses['Rating count'] = courses['Rating count'].apply(convert_to_float).astype(np.int64)

# Comma-separated text columns become Python lists (None when empty/missing).
courses['Skills'] = courses['Skills'].apply(convert_to_list)
courses['Courses'] = courses['Courses'].apply(convert_to_list)
courses['Courses_Link'] = courses['Courses_Link'].apply(convert_to_list)
courses['links'] = courses['links'].apply(convert_to_list)

# Instructor cells additionally have the 'Instructor: ' label removed.
courses['Instructors'] = courses['Instructors'].apply(handle_x_in_str)

# Replace each title with its numeric id from map_title.
courses['Title'] = courses['Title'].map(map_title)

In [11]:
# Give the special columns their final names first ...
courses.rename(
    columns={'Title': 'title_index', 'links': 'instructor_links', 'Courses_Link': 'courses_link'},
    inplace=True,
)

# ... then normalise every column name to lower_snake_case.
courses.columns = [name.lower().replace(" ", "_") for name in courses.columns]

# Write the cleaned table next to the notebook.
courses.to_csv('formated_courses.csv', index=False)

## *<i>user_rating.csv</i>*

In [12]:
# Parse the comma-separated user and review cells into lists.
user_rating['User'] = user_rating['User'].apply(convert_to_list)
user_rating['Reviews'] = user_rating['Reviews'].apply(convert_to_list)

# Replace each title with its numeric id from map_title.
user_rating['Title'] = user_rating['Title'].map(map_title)

# Both list columns must have matching lengths per row before exploding.
user_rating[['User', 'Reviews']] = user_rating[['User', 'Reviews']].apply(equalize_lists, axis=1)

# One row per (user, review) pair, then drop incomplete and repeated rows.
user_rating = (
    user_rating
    .explode(['User', 'Reviews'])
    .reset_index(drop=True)
    .dropna()
    .drop_duplicates()
)

## *<i>Making User IDs</i>*

In [13]:
# Every distinct user name left after cleaning.
users = set(user_rating['User'])

# Assign ids in sorted order. Iterating a set of strings directly yields a
# different order on every kernel start (string hash randomization), which
# would give each user a different id on every run.
map_user = {
    v: x for x, v in enumerate(sorted(users, key=str))
}

# Replace each user name with its numeric id.
user_rating['User'] = user_rating['User'].map(map_user)

In [14]:
# Persist the user name -> id lookup table.
user = pd.DataFrame(list(map_user.items()), columns=['user', 'index'])
user.to_csv('user.csv', index=False)

# Give the rating columns their final names ...
user_rating.rename(
    columns={'Title': 'title_index', 'User': 'user_index', 'Reviews': 'rating'},
    inplace=True,
)

# ... then normalise every column name to lower_snake_case and save.
user_rating.columns = [name.lower().replace(" ", "_") for name in user_rating.columns]
user_rating.to_csv('formatted_user_rating.csv', index=False)

## Generating Synthetic Data

The original dataset has no overlapping courses rated by multiple users, because I did not scrape that much data, so I will generate fake user data.

In [15]:
def syn_rating(ratings: list[int]) -> int:
    """Generate one random rating in the range 1..5 from existing ratings.

    With equal probability the rating is either drawn from a normal
    distribution fitted to ``ratings`` (standard deviation 1 when all ratings
    are identical) or picked uniformly from the distinct values in
    ``ratings``. The result is rounded and clipped to ``[1, 5]``.

    Parameters
    ----------
    ratings : array-like of int
        Existing ratings for one title; must not be empty.

    Returns
    -------
    int
        The synthetic rating.

    Raises
    ------
    ValueError
        If ``ratings`` is empty.
    """
    ratings = np.asarray(ratings)
    if ratings.size == 0:
        raise ValueError('syn_rating needs at least one existing rating')

    c = ['normal', 'choice']
    if np.random.choice(c) == 'normal':
        std_ = ratings.std()
        fr = np.random.normal(ratings.mean(), std_ if std_ > 0 else 1)
    else:
        # Pick among the observed values. Passing a single scalar to
        # np.random.choice (as was done before) samples from arange(scalar)
        # instead, i.e. from numbers below the smallest observed rating.
        fr = np.random.choice(np.unique(ratings))

    return int(min(max(round(fr), 1), 5))

In [16]:
def syn_name(r=120):
    """Build a fake user name of the form ``fake_u<letters>_<r + 1>``.

    One to three random bytes are read from ``os.urandom``; each byte
    contributes two letters in the range 'a'..'p' (one per 4-bit half, high
    half first), so the letter part is 2, 4 or 6 characters long.
    """
    n_bytes = np.random.randint(1, 4)

    letters = []
    for byte in os.urandom(n_bytes):
        letters.append(chr(97 + (byte >> 4)))      # high nibble
        letters.append(chr(97 + (byte & 0x0F)))    # low nibble
    ruser = ''.join(letters)

    return f"fake_u{ruser}_{r + 1}"

In [17]:
# to make new data of user if necessary
def gen_syn_data(title_indexes, user=user, user_rating=user_rating, loop_count = 400, random_gen_per_title_low = 14, random_gen_per_title_high = 40, r=None, save=True, return_=False):
    """Generate fake users and one fake rating per fake user.

    For up to ``loop_count`` titles taken from the end of ``title_indexes``,
    a random number of new users is created, each rating that title with a
    value produced by ``syn_rating`` from the title's existing ratings.

    Parameters
    ----------
    title_indexes : list
        Title ids to generate data for. NOTE: titles are removed from this
        list with ``pop()`` as they are processed, so the caller's list
        shrinks.
    user : pd.DataFrame
        Existing users with columns ``'user'`` and ``'index'``. The default is
        the notebook-level frame as it existed when this function was defined.
    user_rating : pd.DataFrame
        Existing ratings with columns ``'title_index'``, ``'user_index'`` and
        ``'rating'``. Same caveat about the default as for ``user``.
    loop_count : int
        Maximum number of titles to process.
    random_gen_per_title_low, random_gen_per_title_high : int
        Bounds for the number of fake users per title (``high`` is exclusive,
        as in ``np.random.randint``).
    r : int or None
        Last user id already in use; new ids start at ``r + 1``. When None,
        the largest id found in ``user['index']`` is used.
    save : bool
        Write the combined frames to 'syntheic_fuser_rating.csv' and
        'synthetic_user.csv'.
    return_ : bool
        Return ``(user, user_rating)`` instead of None.
    """
    if r is None:
        # Continue after the highest existing id; the former default of 1 made
        # new ids collide with ids already present in `user`.
        r = int(user['index'].max()) if len(user) else -1

    # Collect plain records and build the frames once at the end instead of
    # growing DataFrames with pd.concat on every iteration.
    rating_records = []
    user_records = []

    pbar = tqdm(range(loop_count), total=loop_count, colour='blue')
    for _ in pbar:
        # Check before popping so the last title is still processed and an
        # exhausted list never raises IndexError.
        if len(title_indexes) == 0:
            break
        choosen_title = title_indexes.pop()

        pbar.set_description(f'Title {choosen_title}')
        ratings = user_rating.loc[user_rating['title_index'] == choosen_title, 'rating'].values
        ratings = np.array([int(value) for value in ratings])
        if ratings.size == 0:
            # Nothing to base synthetic ratings on for this title.
            continue

        fake_data_count = np.random.randint(random_gen_per_title_low, random_gen_per_title_high)

        pbar.set_description('Generating.....')
        for _ in range(fake_data_count):
            r = r + 1
            rat = syn_rating(ratings)
            usr = syn_name(r)

            rating_records.append({
                'title_index': choosen_title,
                'user_index': r,
                'rating': rat
            })
            user_records.append({
                'user': usr,
                'index': r
            })

    if user_records:
        user = pd.concat([user, pd.DataFrame(user_records)])
        user_rating = pd.concat([user_rating, pd.DataFrame(rating_records)])
    user = user.reset_index(drop=True)
    user_rating = user_rating.reset_index(drop=True)

    if save:
        user_rating.to_csv('syntheic_fuser_rating.csv', index=False)
        user.to_csv('synthetic_user.csv', index=False)
    if return_:
        return user, user_rating

In [18]:
def augument_rating(user=user, user_rating=user_rating, random_gen_per_title_low = 10, random_gen_per_title_high = 40, r=None, save=True, return_=False):
    """Add synthetic ratings to each title using users that already exist.

    A single target ``desired_diff`` is drawn once per call from
    ``[random_gen_per_title_low, random_gen_per_title_high)``. For each title
    the number of extra ratings is ``desired_diff - (ratings - distinct users)``
    (never below zero); that many users are sampled without replacement and
    each receives a rating from ``syn_rating`` based on the title's existing
    ratings.

    Parameters
    ----------
    user : pd.DataFrame
        Users with an ``'index'`` column. The default is the notebook-level
        frame as it existed when this function was defined.
    user_rating : pd.DataFrame
        Ratings with columns ``'title_index'``, ``'user_index'`` and
        ``'rating'``. Same caveat about the default as for ``user``.
    random_gen_per_title_low, random_gen_per_title_high : int
        Bounds for ``desired_diff`` (``high`` is exclusive).
    r : any
        Unused; kept so existing calls keep working.
    save : bool
        Write the combined ratings to 'augmented_user_rating.csv'.
    return_ : bool
        Return the combined ratings frame instead of None.
    """
    desired_diff = int(np.random.randint(random_gen_per_title_low, random_gen_per_title_high))
    title_indexes = list(user_rating['title_index'].unique())

    # Loop-invariant: the full user population and its set form.
    all_users = user['index'].values
    all_users_set = set(all_users)

    # Collect plain records and build one frame at the end instead of growing
    # a DataFrame with pd.concat on every iteration.
    new_records = []

    pbar = tqdm(title_indexes, total=len(title_indexes), colour='blue')
    for choosen_title in pbar:
        pbar.set_description(f'Course {choosen_title}')

        course_ratings = user_rating[user_rating['title_index'] == choosen_title]
        unique_users = course_ratings['user_index'].unique()

        current_diff = len(course_ratings) - len(unique_users)
        extra_needed = max(desired_diff - current_diff, 0)
        if extra_needed == 0:
            continue

        # Prefer users who have not rated this title yet.
        eligible_users = list(all_users_set - set(unique_users))
        if len(eligible_users) < extra_needed:
            # Not enough unrated users: fall back to the whole population.
            # NOTE(review): this can add a second rating for a user who already
            # rated the title - confirm that this is intended.
            eligible_users = list(all_users)

        # Sampling without replacement cannot return more items than exist;
        # cap the request so a small user table no longer raises ValueError.
        sample_size = min(extra_needed, len(eligible_users))
        if sample_size == 0:
            continue
        chosen_users = np.random.choice(eligible_users, size=sample_size, replace=False)

        ratings = np.array([int(value) for value in course_ratings['rating'].values])

        for r_idx in chosen_users:
            # Generate synthetic rating based on existing ratings
            new_records.append({
                'title_index': choosen_title,
                'user_index': r_idx,
                'rating': syn_rating(ratings)
            })

    if new_records:
        user_rating = pd.concat([user_rating, pd.DataFrame(new_records)])
    user_rating = user_rating.reset_index(drop=True)

    if save:
        user_rating.to_csv('augmented_user_rating.csv', index=False)
    if return_:
        return user_rating

In [19]:
# Distinct title ids present in the ratings table.
title_indexes = list(user_rating['title_index'].unique())

# r = number of rows and c = number of columns of the user table, or None for
# both when `user` is not available. Only the errors that can actually occur
# here are caught, instead of a bare `except:` that would hide anything.
try:
    r, c = user.shape
except (NameError, AttributeError):
    r, c = None, None

In [20]:
# Augment the ratings with synthetic ratings from existing users and keep the
# combined frame. Because `save` defaults to True this also writes
# 'augmented_user_rating.csv'. The function's `user` / `user_rating` defaults
# were bound when the function was defined, so re-running this cell starts
# again from those frames, not from the augmented result assigned here.
user_rating = augument_rating(return_=True)

Course 48: 100%|[34m███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 548/548 [00:41<00:00, 13.11it/s][0m
