In [26]:
import numpy as np
import pandas as pd 

In [27]:
# Use the links to the datasets you download instead of the kaggle directory
edx_df = pd.read_csv('/kaggle/input/edx-courses-dataset-2021/EdX.csv')
coursera_df = pd.read_csv('/kaggle/input/coursera-courses-dataset-2021/Coursera.csv')

In [28]:
# Using the necessary features for predciting the Course from both the datasets 
edx_df = edx_df[['Name', 'University', 'Difficulty Level', 'Link', 'Course Description']]
edx_df.columns = ['Course Name', 'University', 'Difficulty Level', 'Course URL', 'Course Description']

coursera_df = coursera_df[['Course Name', 'University', 'Difficulty Level', 'Course Description', 'Skills']]
coursera_df.columns = ['Course Name', 'University', 'Difficulty Level', 'Course URL', 'Course Description']


In [29]:
#Combining both the datasets
df = edx_df.append(coursera_df, ignore_index=True)

  df = edx_df.append(coursera_df, ignore_index=True)


In [30]:
df.count()

Course Name           4242
University            4242
Difficulty Level      4242
Course URL            4242
Course Description    4242
dtype: int64

In [31]:
# the look of the dataset
df.head(5)

Unnamed: 0,Course Name,University,Difficulty Level,Course URL,Course Description
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,"Designed for those who are new to elearning, t..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,This course aims to teach everyone the basics ...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,"This is CS50x , Harvard University's introduct..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"In the last decade, the amount of data availab..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,Begin your journey in a new career in marketin...


In [32]:
#saving the df into csv for further usage
df.to_csv("combined_dataset.csv")

In [33]:
# checking the unique values to convert into numerical values
print(df['Difficulty Level'].unique())

['Beginner' 'Intermediate' 'Advanced' 'Not Calibrated' 'Conversant']


In [34]:
# importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# preprocessing the course description column 

import string

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert text to lowercase
    text = text.lower()

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

In [36]:
# The main code to find the similarities and suggest the course to the user based on three inputs
def recommend_courses(user_interests, user_skills, difficulty_level_required):
    # Validate user inputs
    if not isinstance(user_interests, list):
        raise ValueError("User interests should be provided as a list.")
    
    if not isinstance(user_skills, list):
        raise ValueError("User skills should be provided as a list.")
    
    if not isinstance(difficulty_level_required, int):
        raise ValueError("Difficulty level should be provided as an integer.")
    
    # Load the dataset
    # NOTE------------------- 
    #make sure to load the dataset obtained after combining both the dataframes.
    #----------------
    df = pd.read_csv('/kaggle/working/combined_dataset.csv')

    # Map difficulty levels to numerical values. I havent considered Not calibrated and conversant 
    # to a different values or could be taken some mean or average to fill it out but I left it to be 0.
    difficulty_mapping = {
        'Beginner': 1,
        'Intermediate': 2,
        'Advanced': 3,
        'Not Calibrated': 0,
        'Conversant': 0
    }
    df['Difficulty Level'] = df['Difficulty Level'].map(difficulty_mapping)

    # Preprocess the data using the previuously defined function 
    df['Course Description'] = df['Course Description'].apply(preprocess_text)

    # Feature extraction
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['Course Description'])

    # Calculate similarity scores between user interests and course descriptions
    user_interests_tfidf = vectorizer.transform([preprocess_text(' '.join(user_interests))])
    similarity_scores = (tfidf_matrix * user_interests_tfidf.T).toarray().flatten()

    # Filter courses based on similarity scores and difficulty level
    filtered_courses = df[similarity_scores > 0]
    filtered_courses = filtered_courses[filtered_courses['Difficulty Level'] <= difficulty_level_required]

    # Get top recommended courses based on difficulty level
    top_recommendations = filtered_courses.head(5)  # Get the top 5 recommendations

    # Suggesting additional courses based on user skills
    additional_courses = df[df['Course Name'].str.contains('|'.join(user_skills))].head(2)  # Get 2 additional courses matching user skills
    
    recommendations = pd.concat([top_recommendations, additional_courses], ignore_index=True)
    
    return recommendations




In [37]:
#FINALLY TRYING OUT THE CODE and getting the output but not as aspected as the accuracy doesnt seem to 
# be good but can be improved by further pre processing and adding new features or input.
try:
    # enter the data in List only.
    user_interests = ['data science','python']
    user_skills = ['machine learning','Mathematics']
    # enter just numerical value.
    difficulty_level_required = 2  # Intermediate level

    # Call the recommendation function with the user inputs
    recommendations = recommend_courses(user_interests, user_skills, difficulty_level_required)

    # Print the recommendations
    print('Top 5 Recommendations:')
    for index, row in recommendations.head(5).iterrows():
        print('Course Name:', row['Course Name'])
        print('Difficulty Level:', row['Difficulty Level'])
        print('Course URL:', row['Course URL'])
        print('Course Description:', row['Course Description'])
        print('************************************************************')

    print('Additional Course Suggestions:')
    for index, row in recommendations.iterrows():
        course_description = row['Course Description']
        if any(skill.lower() in course_description.lower() for skill in user_skills):
            print("Course Name:", row['Course Name'])
            print("Course Description:", course_description)
            print("Difficulty Level:", row['Difficulty Level'])
            print()

except ValueError as e:
    print("Error:", str(e))

Top 5 Recommendations:
Course Name: Programming for Everybody (Getting Started with Python)
Difficulty Level: 1
Course URL: https://www.edx.org/course/programming-for-everybody-getting-started-with-pyt
Course Description: this course aims to teach everyone the basics of programming computers using python we cover the basics of how one constructs a program from a series of simple instructions in python the course has no prerequisites and avoids all but the simplest mathematics anyone with moderate computer experience should be able to master the materials in this course this course will cover chapters 15 of the textbook python for everybody once a student completes this course they will be ready to take more advanced programming courses this course covers python 3 computer science
************************************************************
Course Name: CS50's Introduction to Computer Science
Difficulty Level: 1
Course URL: https://www.edx.org/course/cs50s-introduction-to-computer-scien