In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [4]:
donors = pd.read_csv('learningSet.txt')


In [5]:
# Checking the dtypes
donors.dtypes.value_counts()

int64      300
float64    107
object      74
dtype: int64

In [6]:
# Check for null values in the dataframe. Replace the null values using the methods learned in class.
# Because it is a big table, with over 400 rows, the best way is to to show the percentage of NaN values

nulls_percent_df = donors.isna().sum()/len(donors)

nulls_percent_df = pd.DataFrame(donors.isna().sum()/len(donors)).reset_index()

nulls_percent_df.columns = ['columns_name', 'nulls_percentage']
nulls_percent_df.sort_values(by = ['nulls_percentage'], ascending = False)

# Because we have a lot of rows, we I'm considering 30% of the data missing as a criteria to drop the columns

threshold = 0.30
condition = nulls_percent_df['nulls_percentage'] > threshold
columns_above_threshold = nulls_percent_df[condition]
drop_columns_list = list(columns_above_threshold['columns_name'])
donors = donors.drop(columns=drop_columns_list)

In [7]:
# Dealing with the Nan and empty spaces
# Repleacing the empty spaces for the most present
donors['DOMAIN'].replace(' ', 'R2', inplace = True)

# I'm replacing the nulls for the mode

mode_values = donors.mode().iloc[0]
donors = donors.fillna(mode_values)


In [8]:
# Split the data into numerical and catagorical. Decide if any columns need their dtype changed.

numerical = donors.select_dtypes(include='number')
categorical = donors.select_dtypes(include='object')


In [9]:
# I'm converting the ZIP column to numerical and adding it to the numerical column
# Because the column has some non numeric value, I'm selecting only the numbers
categorical['ZIP'] = categorical['ZIP'].str.extract('(\d+)', expand=False)
numerical['ZIP'] = categorical['ZIP']
categorical = categorical.drop('ZIP', axis=1)

In [10]:
categorical = categorical.reset_index(drop=True)
numerical = numerical.reset_index(drop=True)

In [11]:
# Concatenate numerical and categorical back together again for your X dataframe. Designate the Target as y.

X = pd.concat([numerical, categorical], axis=1)
X = X.drop(['TARGET_B', 'TARGET_D'], axis=1)
y = donors['TARGET_B']


In [12]:
numericalX = X.select_dtypes(include = [np.number])
categoricalX = X.select_dtypes(exclude = [np.number])

In [13]:
categoricalX = categoricalX.astype(str)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
encoded_categorical.columns = [str(col) if isinstance(col, int) else col for col in encoded_categorical.columns]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numericalX_norm = scaler.fit_transform(numericalX)




In [14]:
numericalX_norm = pd.DataFrame(numericalX_norm, columns=numericalX.columns)

In [15]:
X = pd.concat([numericalX_norm, encoded_categorical], axis=1)

In [16]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

In [17]:
X.columns = X.columns.astype(str)

In [18]:
#  Fit a logistic regression model on the training data
#  Check the accuracy on the test data.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import StandardScaler

# because the data was already scaled and fit in previous steps, I'm defininf the model

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)


In [19]:
# Compute the accuracy
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.96      0.98      0.97      1471
         1.0       0.10      0.05      0.06        65

    accuracy                           0.94      1536
   macro avg       0.53      0.51      0.52      1536
weighted avg       0.92      0.94      0.93      1536



In [20]:
# Managing imbalance in the dataset

# Check for the imbalance

y_train.value_counts()


0.0    5827
1.0     317
Name: TARGET_B, dtype: int64

In [21]:
df_train = pd.concat([X_train, y_train], axis = 1)

In [22]:
# Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.

category_0 = df_train[df_train["TARGET_B"] == 0]
category_1 = df_train[df_train["TARGET_B"] == 1]




In [23]:
category_0 = category_0.sample(len(category_1))

df_undersampled = pd.concat([category_0, category_1], axis = 0).reset_index(drop = True)


In [24]:
# Checking the balance after the undersample

df_undersampled['TARGET_B'].value_counts()

0.0    317
1.0    317
Name: TARGET_B, dtype: int64

In [25]:
X_train_under = df_undersampled.drop('TARGET_B', axis=1)
y_train_under = df_undersampled['TARGET_B']

# Model with the undersampled data
lr.fit(X_train_under, y_train_under)

# Predict on the test set
y_pred_under = lr.predict(X_test)



In [26]:
print(classification_report(y_test, y_pred_under))

              precision    recall  f1-score   support

         0.0       0.97      0.53      0.68      1471
         1.0       0.05      0.58      0.10        65

    accuracy                           0.53      1536
   macro avg       0.51      0.56      0.39      1536
weighted avg       0.93      0.53      0.66      1536



In [27]:
# Using SMOTE to upsample the data

from imblearn.over_sampling import SMOTE

# Applying SMOTE to the training data only

smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [28]:
y_train_sm.value_counts()

0.0    5827
1.0    5827
Name: TARGET_B, dtype: int64

In [29]:
# Model with the oversampled data
lr.fit(X_train_sm, y_train_sm)

# Predict on the test set
y_pred_sm = lr.predict(X_test)

print(classification_report(y_test, y_pred_sm))

              precision    recall  f1-score   support

         0.0       0.96      0.90      0.93      1471
         1.0       0.07      0.17      0.10        65

    accuracy                           0.87      1536
   macro avg       0.51      0.53      0.51      1536
weighted avg       0.92      0.87      0.89      1536

