# Import Libraries

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# Create the data in the form of DataFrame

In [2]:
data = {
    'Test_Score': [85, 90, 88, 92, 78, 82, 80, 76],
    'Study_Hours': [10, 12, 11, 13, 8, 9, 7, 6],
    'Gender': ['Male', 'Male', 'Male', 'Male', 'Female', 'Female', 'Female', 'Female']
}

# Create a DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Test_Score,Study_Hours,Gender
0,85,10,Male
1,90,12,Male
2,88,11,Male
3,92,13,Male
4,78,8,Female
5,82,9,Female
6,80,7,Female
7,76,6,Female


# Convert the categorical variable 'Gender' into dummy variables

In [3]:
df_dummies_Gender = pd.get_dummies(df['Gender'])
df_dummies_Gender

Unnamed: 0,Female,Male
0,False,True
1,False,True
2,False,True
3,False,True
4,True,False
5,True,False
6,True,False
7,True,False


# Convert the categorical variable 'Gender' into dummy variables and dropping the first dummy variable

In [4]:
df_dummies_Male = pd.get_dummies(df['Gender'], drop_first=True)
df_dummies_Male

Unnamed: 0,Male
0,True
1,True
2,True
3,True
4,False
5,False
6,False
7,False


# Concatenate dummies to original Dataset

In [5]:
df2 = pd.concat([df,df_dummies_Male], axis = 1)
df2

Unnamed: 0,Test_Score,Study_Hours,Gender,Male
0,85,10,Male,True
1,90,12,Male,True
2,88,11,Male,True
3,92,13,Male,True
4,78,8,Female,False
5,82,9,Female,False
6,80,7,Female,False
7,76,6,Female,False


# Drop the Gender column because we have created dummy variables for this column

In [6]:
df2.drop(['Gender'], axis = 1, inplace = True)
df2

Unnamed: 0,Test_Score,Study_Hours,Male
0,85,10,True
1,90,12,True
2,88,11,True
3,92,13,True
4,78,8,False
5,82,9,False
6,80,7,False
7,76,6,False


# Perform Linear Regression 

In [7]:
X = df2[['Study_Hours', 'Male']]
X = sm.add_constant(X)  # Add a constant term for the intercept
# Define the dependent variable (Test_Score)
y = df['Test_Score']
# Fit the regression model
model = sm.OLS(y, X.astype(float)).fit()

# Print the summary of the regression
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             Test_Score   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     65.37
Date:                Wed, 29 Oct 2025   Prob (F-statistic):           0.000260
Time:                        13:17:40   Log-Likelihood:                -11.699
No. Observations:                   8   AIC:                             29.40
Df Residuals:                       5   BIC:                             29.64
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          64.3750      3.202     20.106      

  return hypotest_fun_in(*args, **kwds)


# Categorize the categorical features into Nominal OR Ordinal

In [8]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [9]:
df

Unnamed: 0,Test_Score,Study_Hours,Gender
0,85,10,Male
1,90,12,Male
2,88,11,Male
3,92,13,Male
4,78,8,Female
5,82,9,Female
6,80,7,Female
7,76,6,Female


In [10]:
OHE = OneHotEncoder(drop='first', sparse_output = False)
df_OHE_Gender = OHE.fit_transform(df['Gender'].values.reshape(-1,1))

In [11]:
df_OHE_Gender

array([[1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.]])

# EXAMPLE 02

# Import the Libraries

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Import the Salaries.csv dataset

In [13]:
data_url = 'http://vincentarelbundock.github.io/Rdatasets/csv/carData/Salaries.csv'
df = pd.read_csv(data_url, index_col =0 )
df

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Prof,B,19,18,Male,139750
2,Prof,B,20,16,Male,173200
3,AsstProf,B,4,3,Male,79750
4,Prof,B,45,39,Male,115000
5,Prof,B,40,41,Male,141500
...,...,...,...,...,...,...
393,Prof,A,33,30,Male,103106
394,Prof,A,31,19,Male,150564
395,Prof,A,42,25,Male,101738
396,Prof,A,25,15,Male,95329


In [14]:
df.shape

(397, 6)

In [17]:
df['rank'].value_counts()

rank
Prof         266
AsstProf      67
AssocProf     64
Name: count, dtype: int64

# Create the dummy variables for categorical features

In [15]:
df_dummies = pd.get_dummies(df[['rank','sex','discipline']],prefix = '', prefix_sep ='', drop_first=True)
df_dummies.head()

Unnamed: 0_level_0,AsstProf,Prof,Male,B
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,False,True,True,True
2,False,True,True,True
3,True,False,True,True
4,False,True,True,True
5,False,True,True,True


# Concatenate the dummy variables to the original dataset

In [18]:
df2 = pd.concat([df,df_dummies], axis = 1)
df2

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,sex,salary,AsstProf,Prof,Male,B
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Prof,B,19,18,Male,139750,False,True,True,True
2,Prof,B,20,16,Male,173200,False,True,True,True
3,AsstProf,B,4,3,Male,79750,True,False,True,True
4,Prof,B,45,39,Male,115000,False,True,True,True
5,Prof,B,40,41,Male,141500,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...
393,Prof,A,33,30,Male,103106,False,True,True,False
394,Prof,A,31,19,Male,150564,False,True,True,False
395,Prof,A,42,25,Male,101738,False,True,True,False
396,Prof,A,25,15,Male,95329,False,True,True,False


## Drop rank, sex and discipline columns because their dummy variables have been created

In [19]:
df2.drop(['rank','sex','discipline'], axis = 1, inplace = True)

In [20]:
df2

Unnamed: 0_level_0,yrs.since.phd,yrs.service,salary,AsstProf,Prof,Male,B
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,19,18,139750,False,True,True,True
2,20,16,173200,False,True,True,True
3,4,3,79750,True,False,True,True
4,45,39,115000,False,True,True,True
5,40,41,141500,False,True,True,True
...,...,...,...,...,...,...,...
393,33,30,103106,False,True,True,False
394,31,19,150564,False,True,True,False
395,42,25,101738,False,True,True,False
396,25,15,95329,False,True,True,False


# Using Multilinear Regression, predict the salary from the given independent features

In [68]:
X = df2.drop(['salary'], axis = 1)
X = sm.add_constant(X)  # Add a constant term for the intercept

X

Unnamed: 0_level_0,const,yrs.since.phd,yrs.service,AsstProf,Prof,Male,B
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1.0,19,18,False,True,True,True
2,1.0,20,16,False,True,True,True
3,1.0,4,3,True,False,True,True
4,1.0,45,39,False,True,True,True
5,1.0,40,41,False,True,True,True
...,...,...,...,...,...,...,...
393,1.0,33,30,False,True,True,False
394,1.0,31,19,False,True,True,False
395,1.0,42,25,False,True,True,False
396,1.0,25,15,False,True,True,False


In [69]:
y = df2['salary']

In [70]:
y

rownames
1      139750
2      173200
3       79750
4      115000
5      141500
        ...  
393    103106
394    150564
395    101738
396     95329
397     81035
Name: salary, Length: 397, dtype: int64

In [71]:
# Fit the regression model
model = sm.OLS(y, X.astype(float)).fit()

# Print the summary of the regression
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.455
Model:                            OLS   Adj. R-squared:                  0.446
Method:                 Least Squares   F-statistic:                     54.20
Date:                Sun, 26 Oct 2025   Prob (F-statistic):           1.79e-48
Time:                        12:48:31   Log-Likelihood:                -4538.9
No. Observations:                 397   AIC:                             9092.
Df Residuals:                     390   BIC:                             9120.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          7.886e+04   4990.326     15.803

# Machine Learning Approach

In [72]:
X = df2.drop(['salary'], axis = 1).values
y = df2['salary'].values.reshape(-1, 1)

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [74]:
from sklearn.linear_model import LinearRegression

In [75]:
Model_Reg = LinearRegression()

In [76]:
Model_Reg.fit(X_train, y_train)

In [77]:
print(Model_Reg.coef_, Model_Reg.intercept_)

[[   506.78274727   -307.88522912 -11506.8305794   32699.54539855
    4365.9822887   16055.49743793]] [76061.88884465]


In [78]:
y_pred = Model_Reg.predict(X_test)

In [79]:
from sklearn import metrics
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

RMSE: 24181.920660962078
R2: 0.2386600734322184


# Categorize the categorical features into Nominal and Ordinal

In [21]:
df

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Prof,B,19,18,Male,139750
2,Prof,B,20,16,Male,173200
3,AsstProf,B,4,3,Male,79750
4,Prof,B,45,39,Male,115000
5,Prof,B,40,41,Male,141500
...,...,...,...,...,...,...
393,Prof,A,33,30,Male,103106
394,Prof,A,31,19,Male,150564
395,Prof,A,42,25,Male,101738
396,Prof,A,25,15,Male,95329


In [23]:
nominal_features = ["sex"]

ordinal_features = ["rank", "discipline"]

numerical_features = ["yrs.since.phd", "yrs.service"]

# Perform one hot encoding on nominal features

In [24]:
X = df.drop(['salary'], axis = 1)
y = df['salary']

## Split data before hot encoding

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Hot encoding after splitting data

In [26]:
OHE = OneHotEncoder(drop='first', sparse_output = False)
X_train_nominal = OHE.fit_transform(X_train[nominal_features])
X_test_nominal = OHE.transform(X_test[nominal_features])

In [27]:
# Convert to DataFrame
X_train_nominal_df = pd.DataFrame(X_train_nominal, columns=OHE.get_feature_names_out(nominal_features), index=X_train.index)
X_test_nominal_df = pd.DataFrame(X_test_nominal, columns=OHE.get_feature_names_out(nominal_features), index=X_test.index)

# Perform ordinal encoding on ordinal features

In [28]:
# Ordinal encode ordered features
OE = OrdinalEncoder()
X_train_ordinal = OE.fit_transform(X_train[ordinal_features])
X_test_ordinal = OE.transform(X_test[ordinal_features])

# Convert to DataFrame
X_train_ordinal_df = pd.DataFrame(X_train_ordinal, columns=ordinal_features, index=X_train.index)
X_test_ordinal_df = pd.DataFrame(X_test_ordinal, columns=ordinal_features, index=X_test.index)

# Keep numeric features as it is

In [29]:
X_train_numeric_df = X_train[numerical_features].copy()
X_test_numeric_df = X_test[numerical_features].copy()

# Concatenate all feature types

In [30]:
X_train_combined = pd.concat([X_train_nominal_df, X_train_ordinal_df, X_train_numeric_df], axis=1)
X_test_combined = pd.concat([X_test_nominal_df, X_test_ordinal_df, X_test_numeric_df], axis=1)

In [31]:
X_train_combined[:20]

Unnamed: 0_level_0,sex_Male,rank,discipline,yrs.since.phd,yrs.service
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
263,1.0,2.0,0.0,31,26
114,1.0,2.0,0.0,37,37
177,1.0,0.0,1.0,10,7
64,0.0,0.0,1.0,11,11
246,0.0,2.0,0.0,17,11
94,1.0,2.0,1.0,38,38
394,1.0,2.0,0.0,31,19
125,1.0,2.0,0.0,24,22
369,1.0,2.0,0.0,35,30
196,1.0,0.0,1.0,9,7


# Perform Linear Regression

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Train linear regression
lr = LinearRegression()
lr.fit(X_train_combined, y_train)

# Predict and evaluate
y_pred = lr.predict(X_test_combined)

R2 = r2_score(y_test, y_pred)

print("R² Score:", R2)

R² Score: 0.10796880791419672
