In [2]:
import pandas as pd
import numpy as np

In [2]:
# Build a reproducible synthetic placement dataset: one row per student, with
# grades, internship count, binary skill flags and a communication rating.
np.random.seed(42)
n = 200

# Technical skills (multi-select → binary), stored as [P(0), P(1)] per skill.
# Insertion order matters: it fixes both the column order of the frame and the
# order in which the seeded global generator is consumed.
skill_probs = {
    "ml": [0.6, 0.4],
    "dsa": [0.5, 0.5],
    "web_dev": [0.6, 0.4],
    "python": [0.4, 0.6],
    "sql": [0.5, 0.5],
    "java": [0.6, 0.4],
    "cpp": [0.7, 0.3],
    "data_science": [0.6, 0.4],
    "cloud": [0.75, 0.25],
    "devops": [0.8, 0.2],
}

generated = {
    "cgpa": np.round(np.random.uniform(6.0, 9.8, n), 2),
    "internships": np.random.randint(0, 5, n),
}
for skill, probs in skill_probs.items():
    generated[skill] = np.random.choice([0, 1], n, p=probs)

# Communication skills
generated["communication_skills"] = np.random.choice(
    ["Poor", "Average", "Good", "Excellent"],
    n,
    p=[0.15, 0.35, 0.35, 0.15],
)

df = pd.DataFrame(generated)

# Placement rule: a student is placed only when all four conditions hold.
good_grades = df["cgpa"] >= 7.5
has_internship = df["internships"] >= 1
has_core_skill = df[["dsa", "ml", "data_science"]].eq(1).any(axis=1)
communicates_well = df["communication_skills"].isin(["Good", "Excellent"])

df["placement"] = np.where(
    good_grades & has_internship & has_core_skill & communicates_well,
    "Yes",
    "No",
)

df.head()

Unnamed: 0,cgpa,internships,ml,dsa,web_dev,python,sql,java,cpp,data_science,cloud,devops,communication_skills,placement
0,7.42,3,1,0,1,1,1,1,0,1,0,0,Poor,No
1,9.61,2,1,0,1,0,1,1,0,0,0,0,Good,Yes
2,8.78,0,0,1,0,1,0,0,0,0,0,0,Average,No
3,8.27,3,0,0,1,1,0,0,0,0,0,0,Good,No
4,6.59,3,0,1,0,1,1,1,0,0,0,0,Good,No


In [3]:
# Persist the generated frame (without the index column) and report its size.
row_count, column_count = df.shape
df.to_csv("../data/placement_data.csv", index=False)
print("CSV created with", row_count, "rows and", column_count, "columns")


CSV created with 200 rows and 14 columns


In [4]:
# Summarize the generated frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cgpa                  200 non-null    float64
 1   internships           200 non-null    int32  
 2   ml                    200 non-null    int64  
 3   dsa                   200 non-null    int64  
 4   web_dev               200 non-null    int64  
 5   python                200 non-null    int64  
 6   sql                   200 non-null    int64  
 7   java                  200 non-null    int64  
 8   cpp                   200 non-null    int64  
 9   data_science          200 non-null    int64  
 10  cloud                 200 non-null    int64  
 11  devops                200 non-null    int64  
 12  communication_skills  200 non-null    object 
 13  placement             200 non-null    object 
dtypes: float64(1), int32(1), int64(10), object(2)
memory usage: 21.2+ KB


In [3]:
# Reload the dataset from the CSV file, replacing the in-memory frame.
# Per the df.info() outputs in this notebook, "internships" was int32 when
# generated and is int64 once read back from disk.
df = pd.read_csv("../data/placement_data.csv")
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,cgpa,internships,ml,dsa,web_dev,python,sql,java,cpp,data_science,cloud,devops,communication_skills,placement
0,7.42,3,1,0,1,1,1,1,0,1,0,0,Poor,No
1,9.61,2,1,0,1,0,1,1,0,0,0,0,Good,Yes
2,8.78,0,0,1,0,1,0,0,0,0,0,0,Average,No
3,8.27,3,0,0,1,1,0,0,0,0,0,0,Good,No
4,6.59,3,0,1,0,1,1,1,0,0,0,0,Good,No


##### Basic Inspection

In [4]:
# A notebook cell only echoes its last bare expression, so shape and column
# names are printed explicitly; as bare expressions they were silently dropped
# and only the df.info() summary appeared.
print("Shape:", df.shape)
print("Columns:", list(df.columns))

# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cgpa                  200 non-null    float64
 1   internships           200 non-null    int64  
 2   ml                    200 non-null    int64  
 3   dsa                   200 non-null    int64  
 4   web_dev               200 non-null    int64  
 5   python                200 non-null    int64  
 6   sql                   200 non-null    int64  
 7   java                  200 non-null    int64  
 8   cpp                   200 non-null    int64  
 9   data_science          200 non-null    int64  
 10  cloud                 200 non-null    int64  
 11  devops                200 non-null    int64  
 12  communication_skills  200 non-null    object 
 13  placement             200 non-null    object 
dtypes: float64(1), int64(11), object(2)
memory usage: 22.0+ KB


##### Check missing values

In [6]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

cgpa                    0
internships             0
ml                      0
dsa                     0
web_dev                 0
python                  0
sql                     0
java                    0
cpp                     0
data_science            0
cloud                   0
devops                  0
communication_skills    0
placement               0
dtype: int64

##### Handle missing values

In [7]:
# Impute missing *feature* values only. The target ("placement") is never
# imputed: filling it with its most frequent value would fabricate outcomes,
# so rows without a label are dropped instead.
df = df.dropna(subset=["placement"]).reset_index(drop=True)

# Numerical columns
num_cols = df.select_dtypes(include=np.number).columns
# Categorical columns
cat_cols = df.select_dtypes(include='object').columns

# Feature subsets actually imputed (target excluded whatever its current dtype).
num_features = [col for col in num_cols if col != "placement"]
cat_features = [col for col in cat_cols if col != "placement"]

# Numerical features: fill with the column median.
if num_features:
    df[num_features] = df[num_features].fillna(df[num_features].median())

# Categorical features: fill with the most frequent value.
# The guards keep the cell re-runnable: once the encoding cells have run there
# are no object columns left, and .mode().iloc[0] on an empty result would
# raise IndexError.
if cat_features:
    most_frequent = df[cat_features].mode()
    if not most_frequent.empty:
        df[cat_features] = df[cat_features].fillna(most_frequent.iloc[0])

#### Fix data types

##### Convert placement Yes/No → 0/1

In [8]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
# A dict-based .map() turns every value that is not a key into NaN, so running
# this cell a second time (when the column already holds 0/1) would wipe the
# whole target. The dtype guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df["placement"]):
    df["placement"] = df["placement"].map({"Yes": 1, "No": 0})

# Fail loudly if any label was neither "Yes" nor "No" (it would be NaN here).
assert df["placement"].isin([0, 1]).all(), "placement contains values other than Yes/No"


##### Encode communication skills

In [9]:
# Ordinal encoding of the communication rating, from worst (0) to best (3).
communication_levels = {
    "Poor": 0,
    "Average": 1,
    "Good": 2,
    "Excellent": 3
}

# A dict-based .map() turns every value that is not a key into NaN, so running
# this cell again on the already-encoded column would replace every rating with
# NaN. The dtype guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df["communication_skills"]):
    df["communication_skills"] = df["communication_skills"].map(communication_levels)

# Fail loudly if any rating was outside the four known levels (NaN after map).
assert df["communication_skills"].isin(list(communication_levels.values())).all(), \
    "communication_skills contains values outside Poor/Average/Good/Excellent"

##### Final Verification

In [12]:
# Confirm the cleaned frame: df.info() prints dtypes and non-null counts.
df.info()
# The per-column count of missing values is the cell's displayed result.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cgpa                  200 non-null    float64
 1   internships           200 non-null    int64  
 2   ml                    200 non-null    int64  
 3   dsa                   200 non-null    int64  
 4   web_dev               200 non-null    int64  
 5   python                200 non-null    int64  
 6   sql                   200 non-null    int64  
 7   java                  200 non-null    int64  
 8   cpp                   200 non-null    int64  
 9   data_science          200 non-null    int64  
 10  cloud                 200 non-null    int64  
 11  devops                200 non-null    int64  
 12  communication_skills  200 non-null    int64  
 13  placement             200 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 22.0 KB


cgpa                    0
internships             0
ml                      0
dsa                     0
web_dev                 0
python                  0
sql                     0
java                    0
cpp                     0
data_science            0
cloud                   0
devops                  0
communication_skills    0
placement               0
dtype: int64