# Importing Libraries

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# For NLP chatbot
from sklearn.metrics.pairwise import cosine_similarity
import random

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data Loading and Exploration


In [43]:
df = pd.read_csv('data/edited_skill_exchange_dataset.csv')


# Display basic information about the dataset


In [44]:
print("Dataset Shape:", df.shape)
print("\nData Types:")
print(df.dtypes)
print("\nSample Data:")
print(df.head())

Dataset Shape: (10000, 6)

Data Types:
user_id            int64
joinedDate        object
joinedCourses     object
skills            object
desired_skills    object
isVerified          bool
dtype: object

Sample Data:
   user_id  joinedDate                            joinedCourses  \
0        1  2022-08-28  Machine Learning, CSS, Excel, SQL, HTML   
1        2  2023-12-04  Data Science, Excel, Python, JavaScript   
2        3  2023-04-10          JavaScript, Python, Excel, Java   
3        4  2022-01-30              AI, Machine Learning, Excel   
4        5  2022-09-07                                   Python   

                               skills  \
0                           HTML, SQL   
1   HTML, CSS, JavaScript, Excel, SQL   
2                     HTML, CSS, Java   
3  HTML, Excel, SQL, Java, Blockchain   
4                 CSS, JavaScript, AI   

                                      desired_skills  isVerified  
0  CSS, Java, Machine Learning, Blockchain, Data ...       False  

## 2. Data Preparation and Cleaning


In [45]:
# Convert joinedDate to datetime
df['joinedDate'] = pd.to_datetime(df['joinedDate'])

# Calculate membership duration (in days)
df['membershipDuration'] = (pd.Timestamp('2025-03-12') - df['joinedDate']).dt.days

# Function to clean and standardize comma-separated text fields
def clean_text_list(text):
    if isinstance(text, str):
        # Split by comma, strip whitespace, and rejoin
        items = [item.strip() for item in text.split(',')]
        return ', '.join(items)
    return text

# Apply cleaning to text columns
for col in ['joinedCourses', 'skills', 'desired_skills']:
    df[col] = df[col].apply(clean_text_list)

# Create count features
df['course_count'] = df['joinedCourses'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)
df['skills_count'] = df['skills'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)
df['desired_skills_count'] = df['desired_skills'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

# Display the cleaned data
print("\nCleaned Dataset:")
print(df.head())


Cleaned Dataset:
   user_id joinedDate                            joinedCourses  \
0        1 2022-08-28  Machine Learning, CSS, Excel, SQL, HTML   
1        2 2023-12-04  Data Science, Excel, Python, JavaScript   
2        3 2023-04-10          JavaScript, Python, Excel, Java   
3        4 2022-01-30              AI, Machine Learning, Excel   
4        5 2022-09-07                                   Python   

                               skills  \
0                           HTML, SQL   
1   HTML, CSS, JavaScript, Excel, SQL   
2                     HTML, CSS, Java   
3  HTML, Excel, SQL, Java, Blockchain   
4                 CSS, JavaScript, AI   

                                      desired_skills  isVerified  \
0  CSS, Java, Machine Learning, Blockchain, Data ...       False   
1              JavaScript, Python, Java, Node.js, AI        True   
2                                  CSS, SQL, Node.js        True   
3  SQL, Node.js, Machine Learning, Blockchain, Da...        True  


## 3. Data Understanding and Visualization

In [46]:
plt.figure(figsize=(15, 10))

# Distribution of course counts
plt.subplot(2, 2, 1)
sns.histplot(df['course_count'], kde=True)
plt.title('Distribution of Course Counts')
plt.xlabel('Number of Courses')
plt.ylabel('Frequency')

# Distribution of skill counts
plt.subplot(2, 2, 2)
sns.histplot(df['skills_count'], kde=True)
plt.title('Distribution of Skill Counts')
plt.xlabel('Number of Skills')
plt.ylabel('Frequency')

# Distribution of desired skill counts
plt.subplot(2, 2, 3)
sns.histplot(df['desired_skills_count'], kde=True)
plt.title('Distribution of Desired Skill Counts')
plt.xlabel('Number of Desired Skills')
plt.ylabel('Frequency')

# Membership duration distribution
plt.subplot(2, 2, 4)
sns.histplot(df['membershipDuration'], kde=True)
plt.title('Distribution of Membership Duration')
plt.xlabel('Days')
plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig('distributions.png')
plt.close()

# Function to extract all unique skills/courses from a column

In [47]:
def extract_unique_items(df, column_name):
    all_items = []
    for items_str in df[column_name]:
        if isinstance(items_str, str):
            items = [item.strip() for item in items_str.split(',')]
            all_items.extend(items)
    return list(set(all_items))

# Get unique items


In [48]:
all_courses = extract_unique_items(df, 'joinedCourses')
all_skills = extract_unique_items(df, 'skills')
all_desired_skills = extract_unique_items(df, 'desired_skills')
print(f"\nTotal unique courses: {len(all_courses)}")
print(f"Total unique skills: {len(all_skills)}")
print(f"Total unique desired skills: {len(all_desired_skills)}")


Total unique courses: 12
Total unique skills: 13
Total unique desired skills: 13


# Top courses visualization

In [49]:
def plot_top_items(df, column_name, title, n=10):
    all_items = []
    for items_str in df[column_name]:
        if isinstance(items_str, str):
            items = [item.strip() for item in items_str.split(',')]
            all_items.extend(items)

    item_counts = pd.Series(all_items).value_counts().head(n)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=item_counts.values, y=item_counts.index)
    plt.title(f'Top {n} {title}')
    plt.xlabel('Count')
    plt.tight_layout()
    plt.savefig(f'top_{column_name}.png')
    plt.close()

# Plot top items for each category
plot_top_items(df, 'joinedCourses', 'Courses')
plot_top_items(df, 'skills', 'Skills')
plot_top_items(df, 'desired_skills', 'Desired Skills')

## 4. Feature Engineering for Skill Matching


In [50]:
# Create binary features for skills and desired skills using CountVectorizer
def create_binary_features(df, column_name):
    vectorizer = CountVectorizer(tokenizer=lambda x: [item.strip() for item in x.split(',')])
    binary_features = vectorizer.fit_transform(df[column_name].fillna(''))
    binary_df = pd.DataFrame(binary_features.toarray(), columns=vectorizer.get_feature_names_out())
    return binary_df, vectorizer

# Create binary feature matrices
skills_binary, skills_vectorizer = create_binary_features(df, 'skills')
desired_skills_binary, desired_skills_vectorizer = create_binary_features(df, 'desired_skills')
courses_binary, courses_vectorizer = create_binary_features(df, 'joinedCourses')

# Combine features for clustering
combined_features = pd.concat([
    skills_binary,
    desired_skills_binary,
    courses_binary,
    df[['membershipDuration', 'course_count', 'skills_count', 'desired_skills_count', 'isVerified']].reset_index(drop=True)
], axis=1)


# Scale numerical features

In [51]:
scaler = StandardScaler()
numerical_features = ['membershipDuration', 'course_count', 'skills_count', 'desired_skills_count']
combined_features[numerical_features] = scaler.fit_transform(combined_features[numerical_features])

# Convert boolean to int
combined_features['isVerified'] = combined_features['isVerified'].astype(int)

print("\nFeature Engineering Complete")
print(f"Total features: {combined_features.shape[1]}")


Feature Engineering Complete
Total features: 43


## 5. Clustering with KMeans (KMM)


In [52]:
# Determine optimal number of clusters using Elbow Method
inertia = []
k_range = range(2, 8)  # For a small dataset, we'll test up to 7 clusters
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(combined_features)
    inertia.append(kmeans.inertia_)

# Plot Elbow Method
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.savefig('elbow_method.png')
plt.close()

In [53]:
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(combined_features)

# Add cluster labels to the original dataframe
df['cluster'] = cluster_labels

# Analyze clusters
cluster_stats = df.groupby('cluster').agg({
    'membershipDuration': 'mean',
    'course_count': 'mean',
    'skills_count': 'mean',
    'desired_skills_count': 'mean',
    'isVerified': 'mean',
    'user_id': 'count'
}).rename(columns={'user_id': 'count'})

print("\nCluster Statistics:")
print(cluster_stats)

# Visualize clusters
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='skills_count',
    y='desired_skills_count',
    hue='cluster',
    size='course_count',
    sizes=(50, 200),
    palette='viridis',
    data=df
)
plt.title('User Clusters by Skills and Desired Skills')
plt.xlabel('Number of Current Skills')
plt.ylabel('Number of Desired Skills')
plt.savefig('user_clusters.png')
plt.close()


Cluster Statistics:
         membershipDuration  course_count  skills_count  desired_skills_count  \
cluster                                                                         
0                799.872926      2.066775      4.916633              4.631728   
1                799.262735      1.855228      2.215626              4.676369   
2                800.730166      3.832565      3.327030              6.885148   
3                795.425455      4.309818      3.277091              3.734545   

         isVerified  count  
cluster                     
0          0.506273   2471  
1          0.502872   2611  
2          0.519373   2168  
3          0.498182   2750  


## 6. Skill Matching System using SVM

In [54]:
# Prepare data for SVM - we'll predict if a user would be interested in "Machine Learning" as an example
target_skill = "Machine Learning"

# Check if the target skill is in the user's desired skills
df['wants_' + target_skill.replace(' ', '_')] = df['desired_skills'].apply(
    lambda x: 1 if target_skill in x else 0
)

# Feature matrix for prediction
X_svm = combined_features
y_svm = df['wants_' + target_skill.replace(' ', '_')]

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_svm, y_svm, test_size=0.3, random_state=42)

# Train SVM model
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate model
y_pred = svm_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Interested', 'Interested'],
            yticklabels=['Not Interested', 'Interested'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix for {target_skill} Interest Prediction')
plt.savefig('confusion_matrix.png')
plt.close()

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1556
           1       1.00      1.00      1.00      1444

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



## 7. User Matching for Peer Learning


In [55]:
# Create a function to find matching users for peer learning
def find_learning_partners(user_id, df, n_recommendations=3):
    # Get the user's data
    user = df[df['user_id'] == user_id].iloc[0]

    # Get the user's desired skills
    user_desired_skills = [skill.strip() for skill in user['desired_skills'].split(',')]

    # Get the user's current skills
    user_current_skills = [skill.strip() for skill in user['skills'].split(',')]

    # Find potential matches (users who have skills that our user wants to learn)
    potential_matches = []

    for _, other_user in df[df['user_id'] != user_id].iterrows():
        other_skills = [skill.strip() for skill in other_user['skills'].split(',')]
        other_desired_skills = [skill.strip() for skill in other_user['desired_skills'].split(',')]

        # Calculate skill match score (how many of user's desired skills the other user has)
        skill_match_score = sum(1 for skill in user_desired_skills if skill in other_skills)

        # Calculate reciprocal match score (how many of other's desired skills the user has)
        reciprocal_match_score = sum(1 for skill in other_desired_skills if skill in user_current_skills)

        # Calculate total match score - we can weight these differently if needed
        total_match_score = skill_match_score + 0.5 * reciprocal_match_score

        if skill_match_score > 0:  # Only consider if there's at least one skill match
            potential_matches.append({
                'user_id': other_user['user_id'],
                'skills': other_user['skills'],
                'matching_skills': [skill for skill in user_desired_skills if skill in other_skills],
                'can_learn_from_you': [skill for skill in other_desired_skills if skill in user_current_skills],
                'match_score': total_match_score
            })

    # Sort by match score (higher is better)
    sorted_matches = sorted(potential_matches, key=lambda x: x['match_score'], reverse=True)

    # Return top N recommendations
    return sorted_matches[:n_recommendations]


In [56]:
# Test the matching function
test_user_id = 1
matches = find_learning_partners(test_user_id, df, 3)

print(f"\nLearning Partner Recommendations for User {test_user_id}:")
for i, match in enumerate(matches, 1):
    print(f"\nMatch {i}:")
    print(f"User ID: {match['user_id']}")
    print(f"Skills: {match['skills']}")
    print(f"Can teach you: {', '.join(match['matching_skills'])}")
    print(f"Can learn from you: {', '.join(match['can_learn_from_you'])}")
    print(f"Match score: {match['match_score']}")


Learning Partner Recommendations for User 1:

Match 1:
User ID: 3643
Skills: CSS, JavaScript, Excel, Java, Machine Learning, Blockchain
Can teach you: CSS, Java, Machine Learning, Blockchain
Can learn from you: HTML, SQL
Match score: 5.0

Match 2:
User ID: 5313
Skills: HTML, CSS, Excel, Java, Node.js, Machine Learning, Data Science
Can teach you: CSS, Java, Machine Learning, Data Science
Can learn from you: HTML, SQL
Match score: 5.0

Match 3:
User ID: 5674
Skills: HTML, CSS, Python, Node.js, Machine Learning, Blockchain, Data Science
Can teach you: CSS, Machine Learning, Blockchain, Data Science
Can learn from you: HTML, SQL
Match score: 5.0


## 8. NLP Chatbot for Skill Guidance

In [57]:
# First, let's create a dataset of common questions and their responses for our skill guidance chatbot
skill_conversation_data = [
    {
        'question': 'What courses should I take to learn Machine Learning?',
        'answer': 'To learn Machine Learning, start with Python, Statistics, and Linear Algebra basics, then take courses on ML algorithms, neural networks, and practical ML projects.'
    },
    {
        'question': 'How do I learn Data Science?',
        'answer': 'To learn Data Science, I recommend courses in Python or R, statistics, data visualization, machine learning, and big data technologies. Start with the fundamentals and then work on real-world projects.'
    },
    {
        'question': 'What skills do I need for web development?',
        'answer': 'For web development, core skills include HTML, CSS, and JavaScript. For frontend, learn frameworks like React, Vue, or Angular. For backend, consider Node.js, Django, or Ruby on Rails, along with database skills.'
    },
    {
        'question': 'How long does it take to learn Python?',
        'answer': 'Learning Python basics takes 2-4 weeks, becoming proficient takes 3-6 months, and mastery requires ongoing practice. The timeline depends on your prior programming experience and study consistency.'
    },
    {
        'question': 'Which is better to learn first, Java or Python?',
        'answer': 'Python is often recommended for beginners due to its simpler syntax and readability. Java has a steeper learning curve but is valuable for enterprise applications. Choose based on your career goals.'
    },
    {
        'question': 'What should I learn after HTML and CSS?',
        'answer': 'After HTML and CSS, learn JavaScript to add interactivity to websites. Then consider a frontend framework like React, Vue, or Angular, and basic backend concepts for full-stack development.'
    },
    {
        'question': 'How to start learning AI?',
        'answer': 'Start learning AI with Python programming, statistics, and linear algebra. Then progress to machine learning fundamentals, neural networks, and specialized areas like NLP or computer vision.'
    },
    {
        'question': 'What are the best resources to learn SQL?',
        'answer': 'Great SQL learning resources include interactive platforms like SQLZoo and Mode Analytics, courses on Coursera and DataCamp, and practice through real database projects.'
    },
    {
        'question': 'How do I find a study partner?',
        'answer': 'You can find a study partner by using our matching system, joining relevant online communities, participating in forums related to your interests, or attending virtual meetups and hackathons.'
    },
    {
        'question': 'What skills are in demand right now?',
        'answer': 'Currently in-demand skills include Machine Learning, Data Science, Cloud Computing (AWS/Azure), DevOps, Full-Stack Development, Cybersecurity, and Blockchain development.'
    }
]


# Preprocess text function (tokenization, removing stopwords, lemmatization)

In [58]:
nltk.download('punkt_tab')
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters
    text = re.sub(r'\W', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Preprocess the conversation data
processed_questions = [preprocess_text(item['question']) for item in skill_conversation_data]

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_questions)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [59]:
# Function to get the most similar response
def get_chatbot_response(user_input, threshold=0.3):
    # Preprocess the user input
    processed_input = preprocess_text(user_input)

    # Vectorize the user input
    user_vector = tfidf_vectorizer.transform([processed_input])

    # Calculate similarity scores
    similarity_scores = cosine_similarity(user_vector, tfidf_matrix)[0]

    # Get the index of the most similar question
    max_similarity_index = np.argmax(similarity_scores)
    max_similarity = similarity_scores[max_similarity_index]

    # If similarity is above threshold, return the corresponding answer
    if max_similarity >= threshold:
        return skill_conversation_data[max_similarity_index]['answer']
    else:
        # Generate a generic response for questions not in our dataset
        return "I'm not sure about that specific topic. However, I can help you find courses or study partners for various skills. Could you tell me what skills you're interested in learning?"



## Testing the NLP

In [60]:
# Test the chatbot with sample questions
sample_questions = [
    "What should I learn to become a data scientist?",
    "I want to learn web development, what should I study?",
    "How can I find someone to study programming with?",
    "What programming language should I learn first?",
    "I'm interested in artificial intelligence"
]

print("\nChatbot Response Examples:")
for question in sample_questions:
    print(f"\nQ: {question}")
    print(f"A: {get_chatbot_response(question)}")

## 9. Building a complete skill guidance system

def skill_guidance_system(user_id, user_query, df):
    """
    Combined skill guidance system that leverages all components:
    1. Chatbot for general skill guidance
    2. User matching for peer learning
    3. Course recommendations based on desired skills
    """
    response = {}

    # Get chatbot response
    chatbot_answer = get_chatbot_response(user_query)
    response['guidance'] = chatbot_answer

    # If user_id is provided, get personalized recommendations
    if user_id and user_id in df['user_id'].values:
        # Get matching learning partners
        learning_partners = find_learning_partners(user_id, df, n_recommendations=2)
        response['learning_partners'] = learning_partners

        # Get user's desired skills
        user_desired_skills = df[df['user_id'] == user_id]['desired_skills'].iloc[0].split(', ')

        # Recommend courses based on desired skills (simple frequency-based recommendation)
        recommended_courses = []
        for skill in user_desired_skills:
            skill = skill.strip()
            # Find users who have this skill
            users_with_skill = df[df['skills'].apply(lambda x: skill in x)]
            # Get courses they took
            if not users_with_skill.empty:
                courses = []
                for course_list in users_with_skill['joinedCourses']:
                    courses.extend([c.strip() for c in course_list.split(',')])

                # Get most common courses
                if courses:
                    course_counts = pd.Series(courses).value_counts()
                    top_courses = course_counts.head(2).index.tolist()
                    recommended_courses.append({
                        'skill': skill,
                        'recommended_courses': top_courses
                    })

        response['course_recommendations'] = recommended_courses

    return response



Chatbot Response Examples:

Q: What should I learn to become a data scientist?
A: To learn Data Science, I recommend courses in Python or R, statistics, data visualization, machine learning, and big data technologies. Start with the fundamentals and then work on real-world projects.

Q: I want to learn web development, what should I study?
A: For web development, core skills include HTML, CSS, and JavaScript. For frontend, learn frameworks like React, Vue, or Angular. For backend, consider Node.js, Django, or Ruby on Rails, along with database skills.

Q: How can I find someone to study programming with?
A: You can find a study partner by using our matching system, joining relevant online communities, participating in forums related to your interests, or attending virtual meetups and hackathons.

Q: What programming language should I learn first?
A: Python is often recommended for beginners due to its simpler syntax and readability. Java has a steeper learning curve but is valuable fo

## Whole System Testing

In [61]:
test_user_id = 1
test_query = "I want to learn mACHINE LEARNING AND I DONT KNOW WHERE TO START"

guidance_result = skill_guidance_system(test_user_id, test_query, df)

print("\nComplete Skill Guidance System Output:")
print("\nChatbot Guidance:")
print(guidance_result['guidance'])

print("\nRecommended Learning Partners:")
for partner in guidance_result['learning_partners']:
    print(f"User {partner['user_id']} - Can teach you: {', '.join(partner['matching_skills'])}")

print("\nCourse Recommendations:")
for rec in guidance_result['course_recommendations']:
    print(f"For {rec['skill']}: {', '.join(rec['recommended_courses'])}")



Complete Skill Guidance System Output:

Chatbot Guidance:
Start learning AI with Python programming, statistics, and linear algebra. Then progress to machine learning fundamentals, neural networks, and specialized areas like NLP or computer vision.

Recommended Learning Partners:
User 3643 - Can teach you: CSS, Java, Machine Learning, Blockchain
User 5313 - Can teach you: CSS, Java, Machine Learning, Data Science

Course Recommendations:
For CSS: Excel, HTML
For Java: CSS, Data Science
For Machine Learning: AI, Java
For Blockchain: Data Science, HTML
For Data Science: CSS, AI


Conclusion and Next Steps:

In this notebook, we've built a comprehensive skill matching and guidance system that includes:

1. Data preparation and cleaning
2. Exploratory data analysis with visualizations
3. Feature engineering for skills and courses
4. User clustering with K-Means (KMM)
5. Skill interest prediction with SVM
6. User matching for peer learning
7. NLP-based chatbot for skill guidance
8. Integra

ADD SVM IMPLM

In [62]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


# Prepare data for association rule mining (convert skills to transaction format)
def prepare_transactions(df, column_name):
    transactions = []
    for skills in df[column_name]:
        if isinstance(skills, str):
            transactions.append([skill.strip() for skill in skills.split(',')])
    return transactions


# Get transactions from skills and desired skills
skill_transactions = prepare_transactions(df, 'skills')
desired_skill_transactions = prepare_transactions(df, 'desired_skills')

# Convert to binary format for apriori algorithm
te = TransactionEncoder()
te_ary = te.fit_transform(skill_transactions)
skill_df = pd.DataFrame(te_ary, columns=te.columns_)

# Find frequent itemsets with Apriori algorithm
min_support = 0.01  # Minimum support threshold
frequent_itemsets = apriori(skill_df, min_support=min_support, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

# Display top rules by lift
print("\nTop Association Rules (by lift):")
if not rules.empty:
    top_rules = rules.sort_values('lift', ascending=False).head(10)
    for idx, rule in top_rules.iterrows():
        antecedents = ', '.join(list(rule['antecedents']))
        consequents = ', '.join(list(rule['consequents']))
        print(
            f"{antecedents} → {consequents} (Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f})")
else:
    print("No rules found with the current thresholds")

# Visualize top rules
if not rules.empty:
    plt.figure(figsize=(10, 6))
    plt.scatter(rules['support'], rules['confidence'], alpha=0.5, s=rules['lift'] * 20)
    plt.xlabel('Support')
    plt.ylabel('Confidence')
    plt.title('Association Rules - Support vs Confidence')

    # Annotate top rules
    for idx, rule in top_rules.iterrows():
        ant = ', '.join(list(rule['antecedents']))
        con = ', '.join(list(rule['consequents']))
        label = f"{ant} → {con}"
        plt.annotate(label,
                     (rule['support'], rule['confidence']),
                     xytext=(7, -5),
                     textcoords='offset points',
                     fontsize=8)

    plt.tight_layout()
    plt.savefig('association_rules.png')
    plt.close()

target_skills = ["Machine Learning", "Python", "JavaScript", "Data Science", "AI"]
svm_results = {}

for skill in target_skills:
    # Create target variable - does the user want this skill?
    df['wants_' + skill.replace(' ', '_')] = df['desired_skills'].apply(
        lambda x: 1 if skill in x else 0
    )

    # Feature matrix for prediction
    X_svm = combined_features
    y_svm = df['wants_' + skill.replace(' ', '_')]

    # Split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_svm, y_svm, test_size=0.3, random_state=42)

    # Train SVM model
    svm_model = SVC(kernel='linear', probability=True, random_state=42)
    svm_model.fit(X_train, y_train)

    # Evaluate model
    y_pred = svm_model.predict(X_test)

    # Calculate performance metrics
    accuracy = (y_pred == y_test).mean()
    precision = sum((y_pred == 1) & (y_test == 1)) / sum(y_pred == 1) if sum(y_pred == 1) > 0 else 0
    recall = sum((y_pred == 1) & (y_test == 1)) / sum(y_test == 1) if sum(y_test == 1) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    svm_results[skill] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'model': svm_model,
        'class_report': classification_report(y_test, y_pred)
    }

# Display SVM performance for each skill
print("\nSVM Performance for Skill Prediction:")
for skill, metrics in svm_results.items():
    print(f"\n{skill}:")
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1-Score: {metrics['f1']:.3f}")
    print("Classification Report:")
    print(metrics['class_report'])

# Visualize SVM performance comparison
plt.figure(figsize=(12, 6))
metrics = ['accuracy', 'precision', 'recall', 'f1']
skill_names = list(svm_results.keys())

# Create a dataframe for easier plotting
svm_metrics_df = pd.DataFrame({
    skill: [svm_results[skill][metric] for metric in metrics]
    for skill in skill_names
}, index=metrics)

svm_metrics_df.plot(kind='bar', figsize=(12, 6))
plt.title('SVM Performance Metrics by Skill')
plt.ylabel('Score')
plt.xlabel('Metric')
plt.ylim(0, 1)
plt.legend(title='Skill')
plt.tight_layout()
plt.savefig('svm_performance.png')
plt.close()


## Hybrid Recommendation System: Combining ADD and SVM

def hybrid_skill_recommendation(user_id, df, svm_results, rules):
    """
    Generate skill recommendations using both association rules and SVM predictions
    """

    # Get user's current skills
    if user_id not in df['user_id'].values:
        return {"error": "User not found"}

    user_skills = df[df['user_id'] == user_id]['skills'].iloc[0]
    if not isinstance(user_skills, str):
        user_skills = ""
    user_skill_list = [skill.strip() for skill in user_skills.split(',')]

    # 1. Recommendations from Association Rules
    rule_recommendations = set()
    if not rules.empty:
        for idx, rule in rules.iterrows():
            antecedents = set(rule['antecedents'])
            if antecedents.issubset(set(user_skill_list)):
                consequents = set(rule['consequents'])
                # Add consequents that user doesn't already have
                for skill in consequents:
                    if skill not in user_skill_list:
                        rule_recommendations.add(skill)

    # 2. Recommendations from SVM models
    svm_recommendations = {}
    # Create feature vector for this user
    user_features = combined_features.loc[df['user_id'] == user_id]

    if not user_features.empty:
        for skill, data in svm_results.items():
            model = data['model']
            # Get probability of wanting this skill
            skill_prob = model.predict_proba(user_features)[0][1]
            if skill not in user_skill_list:  # Only recommend skills the user doesn't have
                svm_recommendations[skill] = skill_prob

    # Sort SVM recommendations by probability
    sorted_svm_recs = sorted(svm_recommendations.items(), key=lambda x: x[1], reverse=True)

    return {
        "association_rule_recommendations": list(rule_recommendations),
        "svm_recommendations": [(skill, prob) for skill, prob in sorted_svm_recs]
    }


# Test the hybrid recommendation system
test_user_id = 42  # Change to a user ID in your dataset

hybrid_recs = hybrid_skill_recommendation(test_user_id, df, svm_results, rules)

print(f"\nHybrid Skill Recommendations for User {test_user_id}:")
print("\nRecommendations from Association Rules:")
for skill in hybrid_recs["association_rule_recommendations"][:5]:
    print(f"- {skill}")

print("\nRecommendations from SVM (with probability):")
for skill, prob in hybrid_recs["svm_recommendations"][:5]:
    print(f"- {skill}: {prob:.3f}")


## Algorithm Comparison and Evaluation

# Function to evaluate recommendation performance
def evaluate_recommendations(df, test_size=0.2, random_state=42):
    """
    Split the data and evaluate both recommendation approaches
    """
    # Split users into train and test sets
    user_ids = df['user_id'].unique()
    test_users = np.random.RandomState(random_state).choice(
        user_ids, size=int(len(user_ids) * test_size), replace=False
    )
    train_users = np.array([uid for uid in user_ids if uid not in test_users])

    # Train data only includes train users
    train_df = df[df['user_id'].isin(train_users)]

    # Prepare transactions for association rules
    train_skill_transactions = prepare_transactions(train_df, 'skills')
    te_train = TransactionEncoder()
    te_train_ary = te_train.fit_transform(train_skill_transactions)
    train_skill_df = pd.DataFrame(te_train_ary, columns=te_train.columns_)

    # Find frequent itemsets and rules
    train_frequent_itemsets = apriori(train_skill_df, min_support=0.01, use_colnames=True)
    train_rules = association_rules(train_frequent_itemsets, metric="confidence", min_threshold=0.3)

    # Train SVM models
    train_svm_results = {}
    for skill in ["Machine Learning", "Python", "JavaScript", "Data Science", "AI"]:
        # Create binary features for skills using the training data
        train_skills_binary, _ = create_binary_features(train_df, 'skills')
        train_desired_skills_binary, _ = create_binary_features(train_df, 'desired_skills')
        train_courses_binary, _ = create_binary_features(train_df, 'joinedCourses')

        # Combine features
        train_combined = pd.concat([
            train_skills_binary,
            train_desired_skills_binary,
            train_courses_binary,
            train_df[['membershipDuration', 'course_count', 'skills_count', 'desired_skills_count',
                      'isVerified']].reset_index(drop=True)
        ], axis=1)

        # Scale numerical features
        train_combined[numerical_features] = scaler.transform(train_combined[numerical_features])
        train_combined['isVerified'] = train_combined['isVerified'].astype(int)

        # Create target variable
        train_df['wants_' + skill.replace(' ', '_')] = train_df['desired_skills'].apply(
            lambda x: 1 if skill in x else 0
        )

        # Train SVM
        svm_model = SVC(kernel='linear', probability=True, random_state=42)
        svm_model.fit(train_combined, train_df['wants_' + skill.replace(' ', '_')])
        train_svm_results[skill] = {'model': svm_model}

    # Evaluate on test users
    precision_add = []
    precision_svm = []
    recall_add = []
    recall_svm = []

    for user_id in test_users:
        # Get user's desired skills (ground truth)
        desired_skills = df[df['user_id'] == user_id]['desired_skills'].iloc[0]
        if not isinstance(desired_skills, str) or desired_skills == "Unknown":
            continue

        true_desired = set([skill.strip() for skill in desired_skills.split(',')])

        # Get recommendations using both methods
        hybrid_recs = hybrid_skill_recommendation(user_id, df, train_svm_results, train_rules)

        # ADD recommendations
        add_recs = set(hybrid_recs["association_rule_recommendations"])

        # SVM recommendations (top 5)
        svm_recs = set([skill for skill, _ in hybrid_recs["svm_recommendations"][:5]])

        # Calculate precision and recall if we have recommendations and desired skills
        if true_desired and add_recs:
            add_correct = len(add_recs.intersection(true_desired))
            p_add = add_correct / len(add_recs) if len(add_recs) > 0 else 0
            r_add = add_correct / len(true_desired) if len(true_desired) > 0 else 0
            precision_add.append(p_add)
            recall_add.append(r_add)

        if true_desired and svm_recs:
            svm_correct = len(svm_recs.intersection(true_desired))
            p_svm = svm_correct / len(svm_recs) if len(svm_recs) > 0 else 0
            r_svm = svm_correct / len(true_desired) if len(true_desired) > 0 else 0
            precision_svm.append(p_svm)
            recall_svm.append(r_svm)

    # Calculate average precision and recall
    avg_precision_add = np.mean(precision_add) if precision_add else 0
    avg_precision_svm = np.mean(precision_svm) if precision_svm else 0
    avg_recall_add = np.mean(recall_add) if recall_add else 0
    avg_recall_svm = np.mean(recall_svm) if recall_svm else 0

    # Calculate F1 scores
    f1_add = 2 * avg_precision_add * avg_recall_add / (avg_precision_add + avg_recall_add) if (
                                                                                                          avg_precision_add + avg_recall_add) > 0 else 0
    f1_svm = 2 * avg_precision_svm * avg_recall_svm / (avg_precision_svm + avg_recall_svm) if (
                                                                                                          avg_precision_svm + avg_recall_svm) > 0 else 0

    return {
        'ADD': {
            'precision': avg_precision_add,
            'recall': avg_recall_add,
            'f1': f1_add
        },
        'SVM': {
            'precision': avg_precision_svm,
            'recall': avg_recall_svm,
            'f1': f1_svm
        }
    }


# Run evaluation
evaluation_results = evaluate_recommendations(df)

print("\nAlgorithm Evaluation Results:")
print("\nAssociation Rule Discovery (ADD):")
print(f"Precision: {evaluation_results['ADD']['precision']:.3f}")



Top Association Rules (by lift):
SQL, Node.js, Python → CSS (Support: 0.011, Confidence: 0.593, Lift: 1.214)
CSS, SQL, Node.js → Python (Support: 0.011, Confidence: 0.359, Lift: 1.206)
CSS, AI, HTML → Python (Support: 0.010, Confidence: 0.351, Lift: 1.177)
CSS, JavaScript, Node.js → Python (Support: 0.014, Confidence: 0.348, Lift: 1.169)
SQL, AI, Excel → HTML (Support: 0.011, Confidence: 0.696, Lift: 1.167)
Blockchain, JavaScript → Excel (Support: 0.011, Confidence: 0.591, Lift: 1.163)
CSS, Java, Python → SQL (Support: 0.013, Confidence: 0.356, Lift: 1.160)
SQL, Java, Python → CSS (Support: 0.013, Confidence: 0.561, Lift: 1.149)
SQL, JavaScript, Java → Python (Support: 0.011, Confidence: 0.342, Lift: 1.148)
SQL, AI → HTML, Excel (Support: 0.011, Confidence: 0.350, Lift: 1.147)

SVM Performance for Skill Prediction:

Machine Learning:
Accuracy: 1.000
Precision: 1.000
Recall: 1.000
F1-Score: 1.000
Classification Report:
              precision    recall  f1-score   support

           0

<Figure size 1200x600 with 0 Axes>