In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Read the two feature tables from their CSV exports, in the order the names are bound.
numerical, categorical = (
    pd.read_csv(csv_path) for csv_path in ('numerical.csv', 'categorical.csv')
)

In [None]:
# Count the missing values in every column of `numerical`.
numerical.isnull().sum()

In [None]:
# Same count through isna(); isnull() is the pandas alias of isna().
numerical.isna().sum()

In [None]:
# Missing-value count for the WEALTH1 column alone.
numerical['WEALTH1'].isna().sum()

In [None]:
# Frequency of each WEALTH1 value (value_counts leaves NaN out by default).
numerical['WEALTH1'].value_counts()

In [None]:
# Count the missing values in every column of `categorical`.
categorical.isna().sum()

In [None]:
# Same count through isnull() (alias of isna()).
categorical.isnull().sum()

In [None]:
# Discard the SOLIH, VETERANS and OSOURCE columns in a single call.
categorical = categorical.drop(columns=['SOLIH', 'VETERANS', 'OSOURCE'])

# Label missing GENDER entries as 'U'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that form operates on an intermediate object, triggers a
# chained-assignment warning in pandas 2.x, and under Copy-on-Write leaves
# `categorical` unchanged.
categorical['GENDER'] = categorical['GENDER'].fillna('U')

In [None]:
# Re-count the missing values per column of `categorical` at this point in the notebook.
categorical.isnull().sum()

In [None]:
# Impute missing WEALTH1 values with the number 0.0.
# The fill value is numeric (not the string '0.0') so no text is mixed into a
# column that is min-max scaled as numbers afterwards, and the result is
# assigned back because fillna(inplace=True) on a column selection is
# unreliable under pandas Copy-on-Write.
numerical['WEALTH1'] = numerical['WEALTH1'].fillna(0.0)

In [None]:
# Rescale every column of `numerical` with MinMaxScaler (default range [0, 1]).
# MinMaxScaler computes its minimum/maximum per column, so one fit over the
# whole frame yields the same values as fitting column by column, while
# leaving `scaler` fitted on all columns instead of only the last one visited.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down — consider fitting it on the training rows only.
scaler = MinMaxScaler()

numerical = pd.DataFrame(
    scaler.fit_transform(numerical),
    columns=numerical.columns,
    index=numerical.index,
)

numerical.head()

In [None]:
# Number of distinct (non-missing) values in each column of `categorical`.
categorical.nunique()

In [None]:
# Leave ZIP out of the categorical features before they are one-hot encoded.
categorical = categorical.drop(columns=['ZIP'])

In [None]:
# One-hot encode every categorical column (values are cast to str first, so
# missing entries become the literal category 'nan').
encoder = OneHotEncoder()
encoded = encoder.fit_transform(categorical.astype(str))

# Take the output column names from the fitted encoder itself rather than
# rebuilding them from a separate sorted-unique pass: the names are still
# "<column>_<category>", but are guaranteed to line up with the encoded
# columns.
one_hot_names = list(encoder.get_feature_names_out(categorical.columns))

categorical = pd.DataFrame(
    encoded.toarray(),
    columns=one_hot_names,
    index=categorical.index,  # keep the row labels so the later concat stays aligned
)
categorical.head()

In [None]:
# Load the label table and discard its TARGET_D column in one chained expression.
targets = pd.read_csv('Y.csv').drop(columns="TARGET_D")

In [None]:
# Put the scaled numeric frame and the one-hot frame side by side
# (column-wise concat, aligned on the row index) to form the feature matrix.
feature_frames = [numerical, categorical]
X = pd.concat(feature_frames, axis="columns")
# What is left of `targets` serves as the labels.
y = targets

In [None]:
# Names of the feature columns that still contain at least one missing value.
has_missing = X.isna().any(axis=0)
columns_with_nan = X.columns[has_missing]
print(columns_with_nan)

In [None]:
# Remove these six columns from the feature matrix with one drop call.
X = X.drop(columns=['WEALTH2', 'MSA', 'ADI', 'DMA', 'NEXTDATE', 'TIMELAG'])

In [None]:
# Reserve 20% of the rows for evaluation; the fixed seed makes the partition repeatable.
# (train_test_split is already imported in the notebook's import cell.)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

In [None]:
# Baseline: logistic regression with default settings, trained on the
# un-resampled training split (LogisticRegression is imported in the
# notebook's import cell; fit() returns the fitted estimator).
model = LogisticRegression().fit(X_train, y_train)
# Class predictions for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Fraction of held-out rows predicted correctly, computed from the baseline
# predictions already stored in `y_pred` (the same quantity model.score returns).
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
# Number of rows for each distinct label value in `targets`.
targets.value_counts()

In [None]:
# oversampling
smote = SMOTE()

x_resampled,y_resampled=smote.fit_resample(X,targets)
y_resampled.value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
# undersampling
RUS=RandomUnderSampler(random_state=0)
x_resampled,y_resampled=RUS.fit_resample(X,targets)

y_resampled.value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
# tomeklinks
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl=tl.fit_resample(X,y)
y_tl.value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy