# BA 222 Project 3

## Importing packages

In [18]:
import pandas as pd
import statsmodels.api as sm

## Reading the data
We also drop the `duration` column and map the target variable `y` from yes/no to 1/0 so that it can be used as a numeric response.

In [19]:
# Load the training data in one chain: drop `duration`, then recode the
# yes/no target to 1/0 because the linear regression needs a numeric response.
odf = (
    pd.read_csv('bankdata_training.csv')
    .drop(columns=['duration'])
    .assign(y=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)

## Dummy Function
Creates dummy variables for all categorical variables.

In [20]:
def make_dummies(df: pd.DataFrame, target:str = 'y') -> pd.DataFrame:
    """
    Creates dummy variables for categorical variables in a DataFrame.

    Every column other than `target` whose dtype is object, a pandas string
    dtype, or categorical is replaced by one 0/1 integer column per level,
    named `<column>_<level>`. All other columns (including `target`) are
    passed through unchanged and stay at the front of the result; the dummy
    columns follow in the order of the columns they were derived from.
    The input DataFrame is never modified.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - target (str): The target variable column name. Default is 'y'.
      This column is never encoded, even if it is categorical.

    Returns:
    - df (DataFrame): A new DataFrame with dummy variables.

    Example:
    >>> df = pd.DataFrame({'Color': ['Red', 'Blue', 'Green'], 'Size': ['Small', 'Medium', 'Large']})
    >>> df = make_dummies(df)
    >>> print(df)
       Color_Blue  Color_Green  Color_Red  Size_Large  Size_Medium  Size_Small
    0           0            0          1           0            0           1
    1           1            0          0           0            1           0
    2           0            1          0           1            0           0
    """

    def _is_categorical(series: pd.Series) -> bool:
        # Comparing the dtype to 'object' alone misses pandas string dtypes
        # and `category` columns, which would then reach the regression
        # unencoded.
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    categorical_cols = [
        col for col in df.columns
        if col != target and _is_categorical(df[col])
    ]

    # Nothing to encode: still hand back a copy so callers can mutate freely.
    if not categorical_cols:
        return df.copy()

    # One vectorized call instead of concat + in-place drop per column.
    return pd.get_dummies(df, columns=categorical_cols, dtype=int)

## Finding the best model
Finds `num_cols` columns by greedy forward selection, aiming for the model with the highest $R$-squared.

On every iteration of the loop, we try adding each remaining variable to the current model and record the resulting $R$-squared. We then add the variable that gave the highest $R$-squared to the list of variables to keep. We keep going until we have added `num_cols` variables to the model.

In [21]:
num_cols = 5 # columns to find
target = 'y' # target variable
best_cols = [] # list of best columns

odf_dummies = make_dummies(odf) # create dummy variables

# Candidate predictors: every column except the target
remaining_cols = odf_dummies.drop(columns=target).columns.to_list()

# The response does not change between fits, so select it once
response = odf_dummies[target]

# Greedy forward selection: each pass adds the single candidate that gives
# the highest R-squared on top of the columns already chosen. Stop early if
# the candidates run out before `num_cols` is reached.
while len(best_cols) < num_cols and remaining_cols:

    # R-squared of the current model extended by each candidate in turn
    rsquared_by_col = {}
    for col in remaining_cols:
        cols = best_cols + [col]
        design = sm.add_constant(odf_dummies[cols])
        rsquared_by_col[col] = sm.OLS(response, design).fit().rsquared

    # Label-based lookup of the winner (ties resolve to the first candidate);
    # avoids positional indexing into a label-indexed Series.
    best = pd.Series(rsquared_by_col, dtype=float).idxmax()
    best_cols.append(best)
    remaining_cols.remove(best)

best_cols


['nr.employed', 'poutcome_success', 'month_may', 'month_mar', 'cons.conf.idx']

In [22]:
# Correlation of every feature with the target, largest first
(
    odf_dummies.corr()
    .loc[:, ['y']]
    .drop(index='y')
    .rename(columns={'y': 'Correlation with y'})
    .sort_values(by='Correlation with y', ascending=False)
)

Unnamed: 0,Correlation with y
poutcome_success,0.325804
previous,0.255697
month_mar,0.164775
contact_cellular,0.137401
month_sep,0.119395
...,...
poutcome_nonexistent,-0.207179
emp.var.rate,-0.283216
euribor3m,-0.298565
pdays,-0.332012


In [23]:
# Correlation of just the selected predictors with the target
(
    odf_dummies[best_cols + ['y']]
    .corr()
    .loc[:, ['y']]
    .drop(index='y')
    .rename(columns={'y': 'Correlation with y'})
)

Unnamed: 0,Correlation with y
nr.employed,-0.349241
poutcome_success,0.325804
month_may,-0.100321
month_mar,0.164775
cons.conf.idx,0.054393


## Creating the model
Using the variables we found in the previous step, we create the model.

In [24]:
# Response: the 0/1 target column
Y = odf_dummies.loc[:, 'y']
# Design matrix: the selected predictors plus an intercept column
X = sm.add_constant(odf_dummies.loc[:, best_cols])

# Fit ordinary least squares and display the regression table
model = sm.OLS(endog=Y, exog=X).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.199
Model:,OLS,Adj. R-squared:,0.198
Method:,Least Squares,F-statistic:,204.7
Date:,"Mon, 04 Dec 2023",Prob (F-statistic):,1.9400000000000003e-195
Time:,19:12:05,Log-Likelihood:,-592.62
No. Observations:,4119,AIC:,1197.0
Df Residuals:,4113,BIC:,1235.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.7578,0.351,19.256,0.000,6.070,7.446
nr.employed,-0.0012,6.62e-05,-18.828,0.000,-0.001,-0.001
poutcome_success,0.3393,0.026,13.028,0.000,0.288,0.390
month_may,-0.0855,0.010,-8.973,0.000,-0.104,-0.067
month_mar,0.2688,0.041,6.487,0.000,0.188,0.350
cons.conf.idx,0.0047,0.001,4.888,0.000,0.003,0.007

0,1,2,3
Omnibus:,1762.071,Durbin-Watson:,1.935
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7285.03
Skew:,2.127,Prob(JB):,0.0
Kurtosis:,7.935,Cond. No.,416000.0


## Predicting the values
Here we read the full CSV and predict the values using the model we created earlier. The resulting table shows the squared correlation between the actual `y` and the predicted `y_hat`, which summarizes the model's performance on the whole dataset.

In [25]:
# Load the evaluation set and convert the target variable to numbers
test = pd.read_csv('bankdata_full.csv')
test['y'] = test['y'].map({'yes': 1, 'no': 0})

# Build the design matrix the same way as for training: dummy-encode, keep
# only the selected columns, then add the intercept.
# has_constant='add' forces the intercept column to be added. With the
# default ('skip'), add_constant leaves it out whenever one of the selected
# columns happens to be constant in this data, and the matrix would no
# longer line up with the fitted model's parameters.
# NOTE(review): the file name suggests this set may contain the training
# rows as well - confirm before treating this as out-of-sample performance.
test_X = sm.add_constant(make_dummies(test)[best_cols], has_constant='add')

# Actual vs. predicted values side by side
results = pd.DataFrame({'y': test['y'], 'y_hat': model.predict(test_X)})

# Squared correlation between actual and predicted values
results.corr()**2

Unnamed: 0,y,y_hat
y,1.0,0.196678
y_hat,0.196678,1.0


In [26]:
# List the columns of the training frame as loaded (before dummy encoding)
odf.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [31]:
# Start from two numeric predictors and the target, then append every
# column whose name contains "month" or "poutcome".
test_cols = ["cons.conf.idx", "nr.employed", "y"] + [
    name for name in odf_dummies.columns
    if "month" in name or "poutcome" in name
]

# Correlation of each of those columns with the target, largest first
odf_dummies[test_cols].corr()[['y']].sort_values(by='y', ascending=False)

Unnamed: 0,y
y,1.0
poutcome_success,0.325804
month_mar,0.164775
month_sep,0.119395
month_oct,0.105684
month_dec,0.102309
cons.conf.idx,0.054393
month_apr,0.043551
poutcome_failure,0.042927
month_jun,0.023148


In [28]:
# Correlation of the test columns with three of the selected predictors,
# ordered by correlation with nr.employed
(
    odf_dummies[test_cols]
    .corr()
    .loc[:, ['poutcome_success', 'nr.employed', 'cons.conf.idx']]
    .sort_values(by='nr.employed', ascending=False)
)

Unnamed: 0,poutcome_success,nr.employed,cons.conf.idx
nr.employed,-0.35403,1.0,0.107054
poutcome_nonexistent,-0.459409,0.508717,0.100217
month_jul,-0.044054,0.301667,-0.190939
month_aug,0.007639,0.18236,0.450124
month_jun,-0.016975,0.153452,-0.089405
cons.conf.idx,0.087382,0.107054,1.0
month_nov,0.041216,0.011502,-0.076906
month_dec,0.077438,-0.135334,0.110264
month_mar,0.091076,-0.163187,-0.052415
month_may,-0.069113,-0.176529,-0.031903


In [29]:
# Correlation of the test columns with three of the selected predictors,
# ordered by correlation with cons.conf.idx
(
    odf_dummies[test_cols]
    .corr()
    .loc[:, ['poutcome_success', 'nr.employed', 'cons.conf.idx']]
    .sort_values(by='cons.conf.idx', ascending=False)
)

Unnamed: 0,poutcome_success,nr.employed,cons.conf.idx
cons.conf.idx,0.087382,0.107054,1.0
month_aug,0.007639,0.18236,0.450124
month_oct,0.047916,-0.264211,0.166906
month_sep,0.14841,-0.310118,0.164268
month_dec,0.077438,-0.135334,0.110264
nr.employed,-0.35403,1.0,0.107054
poutcome_nonexistent,-0.459409,0.508717,0.100217
poutcome_success,1.0,-0.35403,0.087382
y,0.325804,-0.349241,0.054393
month_may,-0.069113,-0.176529,-0.031903
