In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score


In [4]:

# Load the dataset
data = pd.read_csv('processed_dataset.csv')

# # Preprocessing: Handle missing values (e.g., dropping or imputing)
# data = data.dropna()  # Drop rows with missing values
data.isna().sum()

year               0
hwtsupp            0
region             0
statefip           0
metro           9759
metarea       103939
pernum             0
relate             0
age                0
sex                0
race               0
marst              0
nativity       87824
educ99         87412
classwkr           0
wkswork1           0
hrswork        10555
uhrswork           0
union          42379
incwage            0
inclongj       42379
srcearn        42379
qinclong       42379
hisp               0
annhrs             0
incwageman         0
hrwage             0
perconexp          0
hdwfcoh            0
industry           0
occupation         0
education          0
dtype: int64

In [6]:
# Handle missing values
data['metro'].fillna(data['metro'].mode()[0], inplace=True)
data['metarea'].fillna('Unknown', inplace=True)  # New category for missing metropolitan area
data['nativity'].fillna(data['nativity'].mode()[0], inplace=True)
data['educ99'].fillna(data['educ99'].mode()[0], inplace=True)
data['union'].fillna('Unknown', inplace=True)

# Impute numerical columns
data['hrswork'].fillna(data['hrswork'].median(), inplace=True)
data['inclongj'].fillna(data['inclongj'].median(), inplace=True)
data['qinclong'].fillna(data['qinclong'].median(), inplace=True)
data['hrwage'].fillna(data['hrwage'].median(), inplace=True)
data['incwageman'].fillna(data['incwageman'].median(), inplace=True)
data['srcearn'].fillna('Unknown', inplace=True)

# Verify if there are still missing values
missing_summary_after = data.isnull().sum()

# Display the updated summary
print("Missing Values After Imputation:")
print(missing_summary_after)


Missing Values After Imputation:
year          0
hwtsupp       0
region        0
statefip      0
metro         0
metarea       0
pernum        0
relate        0
age           0
sex           0
race          0
marst         0
nativity      0
educ99        0
classwkr      0
wkswork1      0
hrswork       0
uhrswork      0
union         0
incwage       0
inclongj      0
srcearn       0
qinclong      0
hisp          0
annhrs        0
incwageman    0
hrwage        0
perconexp     0
hdwfcoh       0
industry      0
occupation    0
education     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['metro'].fillna(data['metro'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['nativity'].fillna(data['nativity'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec

In [9]:
data.head()
data.columns

Index(['year', 'hwtsupp', 'region', 'statefip', 'metro', 'metarea', 'pernum',
       'relate', 'age', 'sex', 'race', 'marst', 'nativity', 'educ99',
       'classwkr', 'wkswork1', 'hrswork', 'uhrswork', 'union', 'incwage',
       'inclongj', 'srcearn', 'qinclong', 'hisp', 'annhrs', 'incwageman',
       'hrwage', 'perconexp', 'hdwfcoh', 'industry', 'occupation',
       'education'],
      dtype='object')

In [17]:
for column in ['metarea', 'union', 'srcearn']:  # Replace with actual categorical column names
    if data[column].dtype == 'object':  # Check if the column is categorical
        data[column] = data[column].astype('category').cat.codes

In [30]:

# Ensure target column is defined
target_column = 'incwage'  # Replace with the correct target column name in your dataset
X = data.drop(columns=[target_column])
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate Lasso model
lasso = Lasso(alpha=1.0)  # Adjust alpha if needed
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)

lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)

# Train and evaluate Ridge model
ridge = Ridge(alpha=1.0)  # Adjust alpha if needed
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)

ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_r2 = r2_score(y_test, y_pred_ridge)

# Print metrics
print(f"Lasso: MSE = {lasso_mse:.2f}, R2 = {lasso_r2:.2f}")
print(f"Ridge: MSE = {ridge_mse:.2f}, R2 = {ridge_r2:.2f}")


Lasso: MSE = 10444.13, R2 = 1.00
Ridge: MSE = 1.09, R2 = 1.00


In [18]:

# Separate features (X) and target (y)
target_column = 'incwage'  # Replace 'salary' with the actual target column name
X = data.drop(columns=[target_column])
y = data[target_column]

# print("This is y", y)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print(X_train, X_test, y_train, y_test)


In [21]:

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(X_test)


[[ 0.59003675 -1.37926896 -1.5046312  ...  0.96285139  0.21397645
   0.67627484]
 [ 0.40529838 -0.57785687 -1.5046312  ... -1.44071028  0.94264169
   0.67627484]
 [-1.16497772  0.15227494 -1.5046312  ...  1.181357   -0.07748965
   0.67627484]
 ...
 [-0.33365508  1.25554115 -1.5046312  ... -1.65921589  1.23410779
   0.67627484]
 [-0.33365508 -0.77157636  0.35744519 ...  1.3998626   0.94264169
   0.67627484]
 [ 0.40529838 -1.17550533 -1.5046312  ... -1.44071028  1.23410779
  -1.67979904]]
[[ 0.40529838  0.46388622  1.28848339 ...  1.181357    0.65117559
   0.67627484]
 [ 0.77477511 -0.80215023 -0.573593   ...  0.30733457  0.21397645
   0.67627484]
 [-0.33365508  2.56007634  1.28848339 ...  0.74434578  0.65117559
  -0.89444108]
 ...
 [ 0.59003675  0.27540956  0.35744519 ...  0.30733457  0.21397645
   0.67627484]
 [-1.16497772 -1.24823155 -1.5046312  ... -1.00369907  1.08837474
   0.67627484]
 [ 0.95951348  0.22006742 -1.5046312  ...  0.74434578 -0.07748965
   0.67627484]]


In [27]:

# L1 Regularization (Lasso)
lasso = Lasso(alpha=1)  # Adjust alpha as needed
lasso.fit(X_train, y_train)
lasso_coefs = pd.Series(lasso.coef_, index=data.drop(columns=[target_column]).columns)

# L2 Regularization (Ridge)
ridge = Ridge(alpha=1)  # Adjust alpha as needed
ridge.fit(X_train, y_train)
ridge_coefs = pd.Series(ridge.coef_, index=data.drop(columns=[target_column]).columns)


In [28]:

# Evaluate both models
lasso_preds = lasso.predict(X_test)
ridge_preds = ridge.predict(X_test)


In [29]:

lasso_mse = mean_squared_error(y_test, lasso_preds)
lasso_r2 = r2_score(y_test, lasso_preds)

ridge_mse = mean_squared_error(y_test, ridge_preds)
ridge_r2 = r2_score(y_test, ridge_preds)

# Output insights
print("Lasso Regression Coefficients:")
print(lasso_coefs)

print("\nRidge Regression Coefficients:")
print(ridge_coefs)


Lasso Regression Coefficients:
year           2994.571375
hwtsupp          -7.153079
region           -0.000000
statefip         -0.000000
metro             0.000000
metarea          -0.120095
pernum           -0.000000
relate           -0.000000
age               0.000000
sex               0.000000
race             -0.000000
marst            -0.000000
nativity         -2.095597
educ99           -1.141103
classwkr          0.000000
wkswork1          0.000000
hrswork          -0.523068
uhrswork         -0.000000
union            -0.000000
inclongj          0.000000
srcearn        -206.640619
qinclong          0.000000
hisp             -0.000000
annhrs            0.000000
incwageman    45489.249665
hrwage            0.000000
perconexp     -3148.487717
hdwfcoh           0.000000
industry          0.000000
occupation       -0.000000
education        -0.000000
dtype: float64

Ridge Regression Coefficients:
year             -0.045052
hwtsupp          -0.004418
region            0.013717
stat

In [64]:

print("\nModel Performance:")
print(f"Lasso: MSE = {lasso_mse:.2f}, R2 = {lasso_r2:.2f}")
print(f"Ridge: MSE = {ridge_mse:.2f}, R2 = {ridge_r2:.2f}")

print("These are the lasso coeffs")
print(lasso_coefs)
print("These are the ridge coeffs")
print(ridge_coefs)
# Gender impact (assuming 'sex_1' represents one of the genders)
if 'sex_1' in lasso_coefs.index:
    print(f"\nImpact of gender on wages (Lasso): {lasso_coefs['sex_1']:.4f}")
if 'sex_1' in ridge_coefs.index:
    print(f"Impact of gender on wages (Ridge): {ridge_coefs['sex_1']:.4f}")



Model Performance:
Lasso: MSE = 10444.13, R2 = 1.00
Ridge: MSE = 1.09, R2 = 1.00
These are the lasso coeffs
year           2994.571375
hwtsupp          -7.153079
region           -0.000000
statefip         -0.000000
metro             0.000000
metarea          -0.120095
pernum           -0.000000
relate           -0.000000
age               0.000000
sex               0.000000
race             -0.000000
marst            -0.000000
nativity         -2.095597
educ99           -1.141103
classwkr          0.000000
wkswork1          0.000000
hrswork          -0.523068
uhrswork         -0.000000
union            -0.000000
inclongj          0.000000
srcearn        -206.640619
qinclong          0.000000
hisp             -0.000000
annhrs            0.000000
incwageman    45489.249665
hrwage            0.000000
perconexp     -3148.487717
hdwfcoh           0.000000
industry          0.000000
occupation       -0.000000
education        -0.000000
dtype: float64
These are the ridge coeffs
year        

In [35]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Ensure target column is defined
target_column = 'incwage'  # Replace with the correct target column name in your dataset
X = data.drop(columns=[target_column])
y = data[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape)
print(y_train.shape)


print(f"X shape: {X_train_scaled.shape}")
print(f"y shape: {y_train.shape}")

print(X_train_scaled[:5])
print(y_train[:5])

# X_train_scaled, y_train = X_train_scaled.align(y_train, join='inner', axis=0)


# Lasso and Ridge models
lasso = Lasso(alpha=1.0)
ridge = Ridge(alpha=1.0)

# Perform cross-validation
lasso_cv_scores = cross_val_score(lasso, X_train_scaled, y, cv=5, scoring='neg_mean_squared_error')
ridge_cv_scores = cross_val_score(ridge, X_train_scaled, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive
lasso_cv_mse = -np.mean(lasso_cv_scores)
ridge_cv_mse = -np.mean(ridge_cv_scores)

# R2 Scores
lasso_r2 = cross_val_score(lasso, X_train_scaled, y, cv=5, scoring='r2')
ridge_r2 = cross_val_score(ridge, X_train_scaled, y, cv=5, scoring='r2')

# Print the results
print(f"Lasso: MSE = {lasso_cv_mse:.2f}, R2 = {np.mean(lasso_r2):.2f}")
print(f"Ridge: MSE = {ridge_cv_mse:.2f}, R2 = {np.mean(ridge_r2):.2f}")


(275429, 31)
(275429,)
X shape: (275429, 31)
y shape: (275429,)
[[ 0.59003675 -1.37926896 -1.5046312  -0.3252048  -1.54479478  1.05044255
   0.5839638  -0.02927377  0.21817064  1.02158505 -0.59188653 -0.66178437
  -0.45523657 -0.58852591  2.04267158 -1.12455603 -2.09466951 -2.09602017
  -0.51676456 -0.70138772 -0.37416977  0.         -0.40911944 -2.02366145
  -0.65522029 -0.01398511  0.64872562  0.31125705  0.96285139  0.21397645
   0.67627484]
 [ 0.40529838 -0.57785687 -1.5046312  -1.21031081  1.5118163  -1.34979873
  -0.67338896 -0.44127847  1.08263319  1.02158505 -0.59188653  0.86296048
   2.37320118 -0.58852591 -0.46819625  0.34696049 -0.01137629 -0.05844287
  -0.51676456 -0.29381893 -0.37416977  0.         -0.40911944  0.11120659
  -0.25951102 -0.01225968  0.39006771  0.31125705 -1.44071028  0.94264169
   0.67627484]
 [-1.16497772  0.15227494 -1.5046312   0.49667936  0.49294594 -0.09004477
   0.5839638  -0.02927377 -0.74234331  1.02158505  0.31631295 -0.66178437
  -0.45523657 -0.5

AttributeError: 'numpy.ndarray' object has no attribute 'align'

In [46]:
# Split the dataset into male and female datasets based on the 'sex' column
df_male = data[data['sex'] == 1]
df_female = data[data['sex'] == 2]

print("********", df_female)
# Separate features (X) and target (y) for males and females
X_male = df_male.drop(columns=['sex', 'incwage'])
y_male = df_male['incwage']

print("This is dataframe for male", X_male)

X_female = df_female.drop(columns=['sex', 'incwage'])
y_female = df_female['incwage']

print("This is dataframe for female", X_female)


********         year      hwtsupp  region  statefip  metro  metarea  pernum  relate  \
175911  2009  2149.719971       3        53    3.0      362       1     101   
175912  2011  1723.689941       3         6    2.0      218       2     201   
175913  2007   854.080017       3         8    4.0      106       2     201   
175914  1990  1548.560059       3        41    1.0      417       2     201   
175915  2009  1755.280029       2        48    3.0      173       1     101   
...      ...          ...     ...       ...    ...      ...     ...     ...   
344282  1981   970.179993       1        39    3.0      417       2    1260   
344283  1999  1753.010010       3         6    3.0      332       4     301   
344284  1981  1971.020020       2        48    3.0      417       2     201   
344285  2007   715.510010       1        19    4.0       96       2     201   
344286  2013   820.739990       3        49    3.0      417       2     201   

        age  sex  ...  qinclong  hisp  ann

In [56]:

# Standardize the features
scaler = StandardScaler()
X_male_scaled = pd.DataFrame(scaler.fit_transform(X_male), columns=X_male.columns)
X_female_scaled = pd.DataFrame(scaler.fit_transform(X_female), columns=X_female.columns)

# Split data into training and testing sets for male and female datasets
X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(X_male_scaled, y_male, test_size=0.2, random_state=42)
X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(X_female_scaled, y_female, test_size=0.2, random_state=42)


In [57]:

# Initialize Lasso (L1 regularization) and Ridge (L2 regularization) models
lasso = Lasso(alpha=1.0)
ridge = Ridge(alpha=1.0)

# Train the Lasso model on the male data
lasso.fit(X_train_male, y_train_male)
y_pred_lasso_male = lasso.predict(X_test_male)
mse_lasso_male = mean_squared_error(y_test_male, y_pred_lasso_male)
r2_lasso_male = r2_score(y_test_male, y_pred_lasso_male)
lasso_coefs_male = pd.Series(lasso.coef_, index=X_train_male.columns)

# Train the Ridge model on the male data
ridge.fit(X_train_male, y_train_male)
y_pred_ridge_male = ridge.predict(X_test_male)
mse_ridge_male = mean_squared_error(y_test_male, y_pred_ridge_male)
r2_ridge_male = r2_score(y_test_male, y_pred_ridge_male)
ridge_coefs_male = pd.Series(ridge.coef_, index=X_train_male.columns)

# Train the Lasso model on the female data
lasso.fit(X_train_female, y_train_female)
y_pred_lasso_female = lasso.predict(X_test_female)
mse_lasso_female = mean_squared_error(y_test_female, y_pred_lasso_female)
r2_lasso_female = r2_score(y_test_female, y_pred_lasso_female)
lasso_coefs_female = pd.Series(lasso.coef_, index=X_train_female.columns)

# Train the Ridge model on the female data
ridge.fit(X_train_female, y_train_female)
y_pred_ridge_female = ridge.predict(X_test_female)
mse_ridge_female = mean_squared_error(y_test_female, y_pred_ridge_female)
r2_ridge_female = r2_score(y_test_female, y_pred_ridge_female)
ridge_coefs_female = pd.Series(ridge.coef_, index=X_train_female.columns)


In [62]:

# Print the results for both male and female
print(f"Male Data (Lasso): MSE = {mse_lasso_male:.2f}, R2 = {r2_lasso_male:.2f}")
print(f"Male Data (Ridge): MSE = {mse_ridge_male:.2f}, R2 = {r2_ridge_male:.2f}")
print(f"Female Data (Lasso): MSE = {mse_lasso_female:.2f}, R2 = {r2_lasso_female:.2f}")
print(f"Female Data (Ridge): MSE = {mse_ridge_female:.2f}, R2 = {r2_ridge_female:.2f}")

# Print the coefficients for Lasso and Ridge models
print("\nLasso Coefficients for Male Data:")
print(lasso_coefs_male)
print("\nRidge Coefficients for Male Data:")
print(ridge_coefs_male)

print("\nLasso Coefficients for Female Data:")
print(lasso_coefs_female)
print("\nRidge Coefficients for Female Data:")
print(ridge_coefs_female)


Male Data (Lasso): MSE = 17795.41, R2 = 1.00
Male Data (Ridge): MSE = 6.22, R2 = 1.00
Female Data (Lasso): MSE = 5414.45, R2 = 1.00
Female Data (Ridge): MSE = 7.72, R2 = 1.00

Lasso Coefficients for Male Data:
year           4036.899814
hwtsupp         -10.123084
region           -0.418743
statefip          0.000000
metro             0.487779
metarea          -0.578634
pernum           -0.212013
relate           -0.000000
age               0.000000
race             -0.000000
marst            -0.000000
nativity         -3.389625
educ99           -2.701959
classwkr          0.000000
wkswork1          0.172577
hrswork          -1.866760
uhrswork          0.000000
union            -0.000000
inclongj          0.000000
srcearn        -283.873991
qinclong          0.000000
hisp             -0.471488
annhrs            0.000000
incwageman    52780.348212
hrwage            0.000000
perconexp     -4251.744222
hdwfcoh           0.000000
industry         -0.000000
occupation       -0.000000
educati