In [1]:
import pandas as pd

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Read the raw profile table from the CSV file in the working directory.
df = pd.read_csv("linked_in_data.csv")

# Non-numeric profile fields that are one-hot encoded below. Extend this list
# if the dataset gains further non-numeric columns.
# NOTE(review): fields such as "About" look like free text; one-hot encoding
# them presumably yields close to one column per distinct value -- confirm
# that this is intended.
categorical_columns = [
    "Workplace",
    "Location",
    "Experiences",
    "Activities",
    "About",
    "Photo",
]

# Cast all of the listed fields to the pandas category dtype in one pass.
df = df.astype({name: "category" for name in categorical_columns})

# Fit the encoder on the training rows. With handle_unknown='ignore', a value
# that was not present during fit is encoded as all zeros at transform time
# instead of raising an error.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_data = encoder.fit_transform(df[categorical_columns])

# Names of the numeric columns that are passed to the model as-is. The order
# here is the order of the leading columns of the feature matrix, so training
# and prediction must both use this same list. Extend it if the dataset gains
# further numeric columns.
numerical_features = [
    "Connections",
    "Followers",
    "Number of Experiences",
    "Number of Educations",
    "Number of Licenses",
    "Number of Volunteering",
    "Number of Skills",
    "Number of Recommendations",
    "Number of Projects",
    "Number of Publications",
    "Number of Courses",
    "Number of Honors",
    "Number of Scores",
    "Number of Languages",
    "Number of Organizations",
    "Number of Interests",
    "Number of Activities",
]

# Assemble the training feature matrix: the numeric columns first, followed by
# the one-hot indicator columns.
#
# pd.concat(axis=1) lines rows up by index label, not by position. A frame
# built from a bare array receives a fresh 0..n-1 index, so it is given df's
# index explicitly. Without this, a df whose index is not exactly 0..n-1 (for
# example after rows have been filtered out) would silently produce
# misaligned, NaN-padded rows.
X = pd.concat(
    [
        df[numerical_features],
        pd.DataFrame(encoded_data.toarray(), index=df.index),
    ],
    axis=1,
)

# The one-hot columns are labelled with integers (0, 1, ...) while the numeric
# columns have string names; cast every label to str so the column names are
# uniformly typed, which scikit-learn requires for named features.
X.columns = X.columns.astype(str)

# Target vector: one integer class code per profile.
# Lookup table from the textual label names to their class codes.
label_codes = {
    "LLPs": 0,
    "FLPs": 1,
    "CLPs based on legitimate profiles' statistics": 10,
    "CLPs based on fake profiles' statistics": 11,
}

# Translate each label through the table; a value without an entry (for
# example a label that is already a numeric code) is passed through unchanged,
# matching what Series.replace() did here before.
#
# Series.replace() is avoided because it relied on pandas silently downcasting
# the object-dtype result to integers, which is deprecated as of pandas 2.2;
# without that downcast y would remain an object column. The explicit cast
# guarantees an integer target and raises right here if any label could not be
# turned into a code.
y = (
    df["Label"]
    .map(lambda label: label_codes.get(label, label))
    .astype("int64")
)

# Random forest configuration: 100 trees, with a fixed seed so that repeated
# runs build the same forest.
forest_settings = {"n_estimators": 100, "random_state": 42}
classifier = RandomForestClassifier(**forest_settings)

# Fit on the complete feature matrix and label vector (no rows are held out).
classifier.fit(X=X, y=y)

# Example input: a single hand-written profile to classify. It is described
# as two dictionaries (text-valued fields and numeric fields) that are then
# merged into one record and wrapped in a one-row DataFrame.

# Text-valued fields of the example profile.
new_profile_categorical = {
    "Workplace": "XYZ Bot Development",
    "Location": "Nowhere, Internet",
    "Experiences": "AI Chatbot Developer, Automated Content Generator",
    "Activities": "Automated liking and commenting on posts",
    "About": "I am an AI-powered virtual assistant designed to optimize your LinkedIn experience.",
    # "No" marks a profile without a photo.
    "Photo": "No",
}

# Numeric fields of the example profile. The values were picked by the author
# to look atypical: very few connections and followers, many skills, and no
# licenses, projects or courses.
new_profile_numerical = {
    "Connections": 10,
    "Followers": 5,
    "Number of Experiences": 2,
    "Number of Educations": 1,
    "Number of Licenses": 0,
    "Number of Volunteering": 0,
    "Number of Skills": 10,
    "Number of Recommendations": 0,
    "Number of Projects": 0,
    "Number of Publications": 0,
    "Number of Courses": 0,
    "Number of Honors": 0,
    "Number of Scores": 0,
    "Number of Languages": 1,
    "Number of Organizations": 1,
    "Number of Interests": 2,
    "Number of Activities": 5,
}

# Merge both parts into one record: text fields first, numeric fields after.
new_profile = {}
new_profile.update(new_profile_categorical)
new_profile.update(new_profile_numerical)

# A list holding one dict becomes a DataFrame with exactly one row.
new_profile_df = pd.DataFrame([new_profile])

# Give the new profile the same categorical dtype cast that the training
# frame received.
new_profile_df = new_profile_df.astype(
    {name: "category" for name in categorical_columns}
)

# Encode with the already-fitted encoder (transform only, no refit) so the
# one-hot columns line up with the ones the model was trained on. Because the
# encoder was created with handle_unknown='ignore', any value it did not see
# during fit is encoded as all zeros.
encoded_data_new_profile = encoder.transform(new_profile_df[categorical_columns])

# Same layout as the training matrix: numeric columns first, one-hot columns
# second, and every column label cast to str.
numeric_part = new_profile_df[numerical_features]
onehot_part = pd.DataFrame(encoded_data_new_profile.toarray())
X_new_profile = pd.concat([numeric_part, onehot_part], axis=1)
X_new_profile.columns = X_new_profile.columns.astype(str)

# Classify the single-row matrix; the result is an array with one class code.
prediction = classifier.predict(X_new_profile)
print("Predicted class for new profile:", prediction)


Predicted class for new profile: [1]
