# Data Preprocessing (Cleaning and Re-visualization)

##### Loading the Libraries and Dataset

In [None]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# Read the tech salary CSV into the working DataFrame
df_clean = pd.read_csv(filepath_or_buffer="data/tech_salary_data.csv")

##### Drop Irrelevant Columns and Level Column

In [2]:
# Drop columns that are not used in the analysis.
cols_to_drop = ["timestamp", "company", "tag", "otherdetails", "cityid", "dmaid", "rowNumber"]

# 'level' is dropped as well (too complex and not useful).
# errors="ignore" skips any listed name that is not present, so this cell can be
# re-run (or run on a file lacking some of these columns) without a KeyError.
# Previously only the columns in cols_to_drop were guarded; 'level' was not.
df_clean = df_clean.drop(columns=cols_to_drop + ["level"], errors="ignore")

##### Removing Rows where Race is Null

In [3]:
# Discard every row whose Race value is missing
df_clean = df_clean.dropna(subset=["Race"])

##### Removing Rows where Gender is not Male/Female

In [4]:
# Normalise the gender text (trim whitespace, Title Case), then keep only
# the rows whose normalised value is Male or Female
gender_clean = df_clean["gender"].str.strip().str.title()
df_clean["gender"] = gender_clean
df_clean = df_clean.loc[gender_clean.isin(["Male", "Female"])]

##### Changing Null Values in Education to None

In [5]:
# Missing Education entries become the literal string "None"
# (skipped entirely when the frame has no Education column)
if "Education" in df_clean:
    df_clean["Education"] = df_clean["Education"].fillna(value="None")

##### Recalculate Salary-Related Variables and Verify Mismatches
Expected relationship: `totalyearlycompensation = basesalary + bonus + stockgrantvalue`

In [6]:
# Negative bonus / stock-grant amounts are floored at zero
nonneg_cols = [name for name in ("bonus", "stockgrantvalue") if name in df_clean.columns]
for name in nonneg_cols:
    df_clean[name] = df_clean[name].clip(lower=0)

# Rebuild the total from its three components, counting a missing component as 0
component_cols = ["basesalary", "bonus", "stockgrantvalue"]
base_part, bonus_part, stock_part = (df_clean[name].fillna(0) for name in component_cols)
df_clean["totalyearlycompensation"] = base_part + bonus_part + stock_part

# Sanity check: count rows where the stored total and the row-wise component
# sum differ by more than a small tolerance
component_sum = df_clean[component_cols].sum(axis=1)
residual = (df_clean["totalyearlycompensation"] - component_sum).abs()
mismatch = residual.gt(1e-6)
print(f"Mismatched rows after recalculation: {mismatch.sum()}")

Mismatched rows after recalculation: 0


##### Synchronizing Education Binary Variables
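The binary flag columns (`Doctorate_Degree`, `Masters_Degree`, `Bachelors_Degree`, `Some_College`, `Highschool`) are re-derived from `Education` for every row that does not have exactly one flag set.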

In [7]:
# Synchronize education binary variables
# (Education-text prefix, binary flag column) pairs, checked in this order.
edu_bins = [
    ("Doctorate",       "Doctorate_Degree"),
    ("Masters",         "Masters_Degree"),
    ("Bachelors",       "Bachelors_Degree"),
    ("Some_College",    "Some_College"),
    ("Highschool",      "Highschool")
]

# Additional spellings that should set an existing flag column.
# NOTE(review): "PhD" is assumed to be how a doctorate appears in Education --
# confirm against the raw data.
_edu_aliases = [
    ("PhD", "Doctorate_Degree"),
]


def _edu_key(text):
    """Return `text` lower-cased with everything except letters/digits removed.

    Comparing on this key makes "Some College", "some_college" and
    "Master's Degree" line up with the labels in `edu_bins`.
    """
    return "".join(ch for ch in str(text) if ch.isalnum()).casefold()


def fix_education(row):
    """Make a row's education flag columns agree with its Education text.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row containing the flag columns named in `edu_bins`
        and, optionally, an "Education" entry.

    Returns
    -------
    pandas.Series
        The row itself when exactly one flag is already set. Otherwise a copy
        in which all flags are reset to 0 and the first flag whose label (or
        alias) prefixes the Education text is set to 1. If nothing matches
        (e.g. Education is "None"), all flags stay 0.
    """
    flag_cols = [col for _, col in edu_bins]

    # Exactly one flag set: treated as already consistent, left untouched.
    if row[flag_cols].sum() == 1:
        return row

    # Work on a copy so the object handed in by DataFrame.apply is not mutated.
    row = row.copy()
    row[flag_cols] = 0

    # The previous check title-cased the text and compared it to the raw label,
    # which could never match "Some College" (label has an underscore) or
    # "Master's Degree" (str.title() capitalises the letter after an apostrophe).
    level = _edu_key(row.get("Education", ""))
    for label, col in edu_bins + _edu_aliases:
        if level.startswith(_edu_key(label)):
            row[col] = 1
            break
    return row

# Run the row-level synchroniser over every row of the frame
df_clean = df_clean.apply(fix_education, axis="columns")

##### Duplication of Dataframe for Outlier Elimination (Tukey's Fence Method)

In [8]:
# Independent (deep) copy so outlier removal does not touch df_clean
df_cleaned_v2 = df_clean.copy(deep=True)

##### Identifying Numeric Columns and Computing Q1, Q3 and IQR for each Column

In [9]:
# Columns that take part in the outlier screen
numeric_cols = [
    'totalyearlycompensation',
    'basesalary',
    'bonus',
    'stockgrantvalue',
    'yearsofexperience',
    'yearsatcompany',
]

# Both quartiles in one pass; each row of the result is one quantile,
# indexed by column name
quartiles = df_cleaned_v2[numeric_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]

# Interquartile range per column
IQR = Q3 - Q1

##### Creating Mask for Outliers

In [10]:
# Tukey fences: a value is an outlier when it falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for its column
fence_low  = Q1 - 1.5 * IQR
fence_high = Q3 + 1.5 * IQR

# One boolean column per numeric column: True where the value breaks a fence
outlier_flags = pd.DataFrame({
    col: (df_cleaned_v2[col] < fence_low[col]) | (df_cleaned_v2[col] > fence_high[col])
    for col in numeric_cols
})

# A row is an outlier if ANY numeric column breaks its fence
outlier_mask = outlier_flags.any(axis=1)

# Keep-mask: True for rows inside the fences on EVERY numeric column
mask = ~outlier_mask

n_before = len(df_cleaned_v2)
n_after = mask.sum()
print(f"Dropping {n_before-n_after} / {n_before} rows as outliers ({(n_before-n_after)/n_before:.1%})")

Dropping 3446 / 22200 rows as outliers (15.5%)


##### Copying Outlier-free Data to new Data Frame

In [11]:
# Keep only the rows flagged True in the keep-mask, as an independent frame
df_no_outliers = df_cleaned_v2[mask].copy()

##### Removing Duplicate Entries

In [12]:
# Fully identical rows are collapsed to their first occurrence
df_no_outliers = df_no_outliers.drop_duplicates(keep="first")

##### Removing Salary Component Variables (already summed into totalyearlycompensation)

In [13]:
# Remove the three compensation components; absent names are skipped silently
df_no_outliers = df_no_outliers.drop(["basesalary", "stockgrantvalue", "bonus"], axis=1, errors="ignore")

##### Comparison of Original and Preprocessed Data and Exporting the Preprocessed Data

In [14]:
# Report the dataset size before/after cleaning, then write the result to disk
# NOTE(review): the "before" figures are hard-coded text, not computed from the raw file
print("Before cleaning: 62,642 rows × 24 columns.")
print(f"After cleaning: {df_no_outliers.shape[0]:,} rows × {df_no_outliers.shape[1]} columns.")

# Persist the cleaned frame without writing the index as a column
df_no_outliers.to_csv(path_or_buf="tech_salary_data_CLEANED.csv", index=False)
print("Dataset saved to tech_salary_data_CLEANED.csv.")

Before cleaning: 62,642 rows × 24 columns.
After cleaning: 18,570 rows × 13 columns.
Dataset saved to tech_salary_data_CLEANED.csv.


##### Column Types and Missing Values of Preprocessed Data

In [15]:
# Importing cleaned data
df = pd.read_csv("tech_salary_data_CLEANED.csv")

# Restore the "None" label in Education so it does not count as null.
# NOTE(review): read_csv can parse the literal text "None" as a missing value
# (it is one of the default NA markers in recent pandas versions) -- confirm
# for the pandas version in use.
df['Education'] = df['Education'].fillna("None")

# Show the column types and non-null counts.
# DataFrame.info() writes its report directly and returns None, so it must not
# be wrapped in print() -- doing so appended a stray "None" to the output.
print("=== Columns and Data Types ===")
df.info()
print()

# Show the missing value counts
print("=== Missing Value Counts ===")
print(df.isnull().sum(), "\n")

=== Columns and Data Types ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18570 entries, 0 to 18569
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   title                    18570 non-null  object 
 1   totalyearlycompensation  18570 non-null  float64
 2   location                 18570 non-null  object 
 3   yearsofexperience        18570 non-null  float64
 4   yearsatcompany           18570 non-null  float64
 5   gender                   18570 non-null  object 
 6   Masters_Degree           18570 non-null  int64  
 7   Bachelors_Degree         18570 non-null  int64  
 8   Doctorate_Degree         18570 non-null  int64  
 9   Highschool               18570 non-null  int64  
 10  Some_College             18570 non-null  int64  
 11  Race                     18570 non-null  object 
 12  Education                18570 non-null  object 
dtypes: float64(3), int64(5), object(5)
memory usa