# Importing Libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# For NLP chatbot
from sklearn.metrics.pairwise import cosine_similarity
import random

# Download necessary NLTK resources.
# Each call fetches one named NLTK data package; the three packages presumably
# back the word_tokenize, stopwords and WordNetLemmatizer imports above.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
# The last call is the cell's final expression, so its return value is what
# the notebook displays as the cell result.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farou\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Data Loading and Exploration


In [13]:
# Read the skill-exchange CSV into the working DataFrame `df`.
# The path is relative, so it resolves against the kernel's current working directory.
df = pd.read_csv('data/edited_skill_exchange_dataset.csv')


# Display basic information about the dataset


In [14]:
# Quick structural overview: dimensions, per-column dtypes, and the first rows.
# Each header and its value go through a single print call; sep="\n" places the
# value on the line after its header.
print("Dataset Shape:", df.shape)
print("\nData Types:", df.dtypes, sep="\n")
print("\nSample Data:", df.head(), sep="\n")

Dataset Shape: (10000, 6)

Data Types:
user_id            int64
joinedDate        object
joinedCourses     object
skills            object
desired_skills    object
isVerified          bool
dtype: object

Sample Data:
   user_id  joinedDate                            joinedCourses  \
0        1  2022-08-28  Machine Learning, CSS, Excel, SQL, HTML   
1        2  2023-12-04  Data Science, Excel, Python, JavaScript   
2        3  2023-04-10          JavaScript, Python, Excel, Java   
3        4  2022-01-30              AI, Machine Learning, Excel   
4        5  2022-09-07                                   Python   

                               skills  \
0                           HTML, SQL   
1   HTML, CSS, JavaScript, Excel, SQL   
2                     HTML, CSS, Java   
3  HTML, Excel, SQL, Java, Blockchain   
4                 CSS, JavaScript, AI   

                                      desired_skills  isVerified  
0  CSS, Java, Machine Learning, Blockchain, Data ...       False  

## 2. Data Preparation and Cleaning


In [15]:
# Parse the joinedDate strings into datetime values
df['joinedDate'] = pd.to_datetime(df['joinedDate'])

# Membership duration in whole days, measured against a fixed snapshot date
# so the feature does not drift with the day the notebook is executed
snapshot_date = pd.Timestamp('2025-03-12')
time_since_joining = snapshot_date - df['joinedDate']
df['membershipDuration'] = time_since_joining.dt.days

# Function to clean and standardize comma-separated text fields
def clean_text_list(text):
    """Normalize a comma-separated string to the canonical ``'a, b, c'`` form.

    Each entry is stripped of surrounding whitespace. Entries that are empty
    after stripping (produced by doubled, leading or trailing commas) are
    dropped so they do not appear as blank items in the rejoined string.

    Parameters
    ----------
    text : object
        The raw cell value. Only ``str`` values are processed.

    Returns
    -------
    object
        The normalized string for ``str`` input (``''`` when no non-empty
        entry remains); any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        # Missing values (None/NaN) and other non-text cells pass through untouched.
        return text
    stripped_items = (item.strip() for item in text.split(','))
    return ', '.join(item for item in stripped_items if item)

# Apply cleaning to text columns
for col in ['joinedCourses', 'skills', 'desired_skills']:
    df[col] = df[col].apply(clean_text_list)


def count_list_items(text):
    """Count the non-empty entries of a comma-separated string.

    ``''.split(',')`` yields ``['']``, so a plain ``len(text.split(','))``
    would report one item for an empty cell and would also count blank
    entries left by stray commas. Only entries with visible content count.

    Parameters
    ----------
    text : object
        The cell value; anything that is not a ``str`` counts as zero items.

    Returns
    -------
    int
        Number of non-blank comma-separated entries.
    """
    if not isinstance(text, str):
        return 0
    return sum(1 for item in text.split(',') if item.strip())


# Create count features
df['course_count'] = df['joinedCourses'].apply(count_list_items)
df['skills_count'] = df['skills'].apply(count_list_items)
df['desired_skills_count'] = df['desired_skills'].apply(count_list_items)

# Display the cleaned data
print("\nCleaned Dataset:")
print(df.head())


Cleaned Dataset:
   user_id joinedDate                            joinedCourses  \
0        1 2022-08-28  Machine Learning, CSS, Excel, SQL, HTML   
1        2 2023-12-04  Data Science, Excel, Python, JavaScript   
2        3 2023-04-10          JavaScript, Python, Excel, Java   
3        4 2022-01-30              AI, Machine Learning, Excel   
4        5 2022-09-07                                   Python   

                               skills  \
0                           HTML, SQL   
1   HTML, CSS, JavaScript, Excel, SQL   
2                     HTML, CSS, Java   
3  HTML, Excel, SQL, Java, Blockchain   
4                 CSS, JavaScript, AI   

                                      desired_skills  isVerified  \
0  CSS, Java, Machine Learning, Blockchain, Data ...       False   
1              JavaScript, Python, Java, Node.js, AI        True   
2                                  CSS, SQL, Node.js        True   
3  SQL, Node.js, Machine Learning, Blockchain, Da...        True  


## 3. Data Understanding and Visualization

In [16]:
# One histogram panel per numeric feature: (column, panel title, x-axis label).
# The order here is the order in which the 2x2 grid is filled.
hist_panels = [
    ('course_count', 'Distribution of Course Counts', 'Number of Courses'),
    ('skills_count', 'Distribution of Skill Counts', 'Number of Skills'),
    ('desired_skills_count', 'Distribution of Desired Skill Counts', 'Number of Desired Skills'),
    ('membershipDuration', 'Distribution of Membership Duration', 'Days'),
]

plt.figure(figsize=(15, 10))

for panel_idx, (panel_col, panel_title, panel_xlabel) in enumerate(hist_panels, start=1):
    # Subplot positions are 1-based
    plt.subplot(2, 2, panel_idx)
    sns.histplot(df[panel_col], kde=True)
    plt.title(panel_title)
    plt.xlabel(panel_xlabel)
    plt.ylabel('Frequency')

# Write the combined figure to disk and release it instead of rendering inline
plt.tight_layout()
plt.savefig('distributions.png')
plt.close()

# Function to extract all unique skills/courses from a column

In [17]:
def extract_unique_items(df, column_name):
    """Collect the distinct entries found in a comma-separated text column.

    Every string cell is split on commas and each entry is stripped of
    surrounding whitespace. Blank entries (from stray commas or empty cells)
    are not treated as items. The result is sorted so that it is identical
    across kernel restarts; iterating a set of strings directly does not
    guarantee a stable order between interpreter runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to scan.
    column_name : str
        Name of the column whose cells are comma-separated strings.
        Non-string cells (e.g. missing values) are skipped.

    Returns
    -------
    list of str
        The unique, non-empty entries in ascending order.
    """
    unique_items = {
        item.strip()
        for items_str in df[column_name]
        if isinstance(items_str, str)
        for item in items_str.split(',')
    }
    # An empty token is an artifact of the delimiter, not a real entry.
    unique_items.discard('')
    return sorted(unique_items)

# Get unique items


In [18]:
# Distinct entries per list-valued column, extracted in a single pass over the column names
all_courses, all_skills, all_desired_skills = [
    extract_unique_items(df, source_column)
    for source_column in ('joinedCourses', 'skills', 'desired_skills')
]

# Report the size of each vocabulary
print(f"\nTotal unique courses: {len(all_courses)}")
print(f"Total unique skills: {len(all_skills)}")
print(f"Total unique desired skills: {len(all_desired_skills)}")


Total unique courses: 12
Total unique skills: 13
Total unique desired skills: 13


# Top courses, skills, and desired skills visualization

In [19]:
def plot_top_items(df, column_name, title, n=10):
    """Save a bar chart of the ``n`` most frequent entries of a list column.

    String cells of ``df[column_name]`` are split on commas and the stripped
    entries are tallied. Counts are drawn on the x-axis with the entries on
    the y-axis, and the figure is written to ``top_<column_name>.png`` and
    then closed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarize.
    column_name : str
        Column of comma-separated strings; also used in the output file name.
    title : str
        Label inserted into the chart title after ``Top <n>``.
    n : int, default 10
        How many of the most frequent entries to show.
    """
    tokens = [
        token.strip()
        for cell in df[column_name]
        if isinstance(cell, str)
        for token in cell.split(',')
    ]

    top_counts = pd.Series(tokens).value_counts().head(n)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=top_counts.values, y=top_counts.index, ax=ax)
    ax.set_title(f'Top {n} {title}')
    ax.set_xlabel('Count')
    fig.tight_layout()
    fig.savefig(f'top_{column_name}.png')
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)

# Plot top items for each category: (source column, label used in the chart title)
for top_column, top_label in [
    ('joinedCourses', 'Courses'),
    ('skills', 'Skills'),
    ('desired_skills', 'Desired Skills'),
]:
    plot_top_items(df, top_column, top_label)